mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 22:19:02 +00:00
868 lines
32 KiB
C++
868 lines
32 KiB
C++
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "procs.h"
|
|
#include "subhelpers.h"
|
|
#include "harness/conversions.h"
|
|
#include "harness/typeWrappers.h"
|
|
|
|
static const char * any_source =
|
|
"__kernel void test_any(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_any(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * all_source =
|
|
"__kernel void test_all(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_all(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * bcast_source =
|
|
"__kernel void test_bcast(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" Type x = in[gid];\n"
|
|
" size_t loid = (size_t)((int)x % 100);\n"
|
|
" out[gid] = sub_group_broadcast(x, loid);\n"
|
|
"}\n";
|
|
|
|
static const char * redadd_source =
|
|
"__kernel void test_redadd(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_reduce_add(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * redmax_source =
|
|
"__kernel void test_redmax(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_reduce_max(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * redmin_source =
|
|
"__kernel void test_redmin(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_reduce_min(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * scinadd_source =
|
|
"__kernel void test_scinadd(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * scinmax_source =
|
|
"__kernel void test_scinmax(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * scinmin_source =
|
|
"__kernel void test_scinmin(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * scexadd_source =
|
|
"__kernel void test_scexadd(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * scexmax_source =
|
|
"__kernel void test_scexmax(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
|
|
"}\n";
|
|
|
|
static const char * scexmin_source =
|
|
"__kernel void test_scexmin(const __global Type *in, __global int2 *xy, __global Type *out)\n"
|
|
"{\n"
|
|
" int gid = get_global_id(0);\n"
|
|
" XY(xy,gid);\n"
|
|
" out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
|
|
"}\n";
|
|
|
|
// These need to stay in sync with the kernel source below
|
|
#define NUM_LOC 49
|
|
#define INST_LOC_MASK 0x7f
|
|
#define INST_OP_SHIFT 0
|
|
#define INST_OP_MASK 0xf
|
|
#define INST_LOC_SHIFT 4
|
|
#define INST_VAL_SHIFT 12
|
|
#define INST_VAL_MASK 0x7ffff
|
|
#define INST_END 0x0
|
|
#define INST_STORE 0x1
|
|
#define INST_WAIT 0x2
|
|
#define INST_COUNT 0x3
|
|
|
|
static const char * ifp_source =
|
|
"#define NUM_LOC 49\n"
|
|
"#define INST_LOC_MASK 0x7f\n"
|
|
"#define INST_OP_SHIFT 0\n"
|
|
"#define INST_OP_MASK 0xf\n"
|
|
"#define INST_LOC_SHIFT 4\n"
|
|
"#define INST_VAL_SHIFT 12\n"
|
|
"#define INST_VAL_MASK 0x7ffff\n"
|
|
"#define INST_END 0x0\n"
|
|
"#define INST_STORE 0x1\n"
|
|
"#define INST_WAIT 0x2\n"
|
|
"#define INST_COUNT 0x3\n"
|
|
"\n"
|
|
"__kernel void\n"
|
|
"test_ifp(const __global int *in, __global int2 *xy, __global int *out)\n"
|
|
"{\n"
|
|
" __local atomic_int loc[NUM_LOC];\n"
|
|
"\n"
|
|
" // Don't run if there is only one sub group\n"
|
|
" if (get_num_sub_groups() == 1)\n"
|
|
" return;\n"
|
|
"\n"
|
|
" // First initialize loc[]\n"
|
|
" int lid = (int)get_local_id(0);\n"
|
|
"\n"
|
|
" if (lid < NUM_LOC)\n"
|
|
" atomic_init(loc+lid, 0);\n"
|
|
"\n"
|
|
" work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
"\n"
|
|
" // Compute pointer to this sub group's \"instructions\"\n"
|
|
" const __global int *pc = in +\n"
|
|
" ((int)get_group_id(0)*(int)get_enqueued_num_sub_groups() +\n"
|
|
" (int)get_sub_group_id()) *\n"
|
|
" (NUM_LOC+1);\n"
|
|
"\n"
|
|
" // Set up to \"run\"\n"
|
|
" bool ok = (int)get_sub_group_local_id() == 0;\n"
|
|
" bool run = true;\n"
|
|
"\n"
|
|
" while (run) {\n"
|
|
" int inst = *pc++;\n"
|
|
" int iop = (inst >> INST_OP_SHIFT) & INST_OP_MASK;\n"
|
|
" int iloc = (inst >> INST_LOC_SHIFT) & INST_LOC_MASK;\n"
|
|
" int ival = (inst >> INST_VAL_SHIFT) & INST_VAL_MASK;\n"
|
|
"\n"
|
|
" switch (iop) {\n"
|
|
" case INST_STORE:\n"
|
|
" if (ok)\n"
|
|
" atomic_store(loc+iloc, ival);\n"
|
|
" break;\n"
|
|
" case INST_WAIT:\n"
|
|
" if (ok) {\n"
|
|
" while (atomic_load(loc+iloc) != ival)\n"
|
|
" ;\n"
|
|
" }\n"
|
|
" break;\n"
|
|
" case INST_COUNT:\n"
|
|
" if (ok) {\n"
|
|
" int i;\n"
|
|
" for (i=0;i<ival;++i)\n"
|
|
" atomic_fetch_add(loc+iloc, 1);\n"
|
|
" }\n"
|
|
" break;\n"
|
|
" case INST_END:\n"
|
|
" run = false;\n"
|
|
" break;\n"
|
|
" }\n"
|
|
"\n"
|
|
" sub_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
" }\n"
|
|
"\n"
|
|
" work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
"\n"
|
|
" // Save this group's result\n"
|
|
" __global int *op = out + (int)get_group_id(0)*NUM_LOC;\n"
|
|
" if (lid < NUM_LOC)\n"
|
|
" op[lid] = atomic_load(loc+lid);\n"
|
|
"}\n";
|
|
|
|
// Any/All test functions
|
|
template <int Which>
|
|
struct AA {
|
|
static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int i, ii, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
int e;
|
|
|
|
ii = 0;
|
|
for (k=0; k<ng; ++k) {
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
e = (int)(genrand_int32(gMTdata) % 3);
|
|
|
|
// Initialize data matrix indexed by local id and sub group id
|
|
switch (e) {
|
|
case 0:
|
|
memset(&t[ii], 0, n*sizeof(cl_int));
|
|
break;
|
|
case 1:
|
|
memset(&t[ii], 0, n*sizeof(cl_int));
|
|
i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
|
|
t[ii + i] = 41;
|
|
break;
|
|
case 2:
|
|
memset(&t[ii], 0xff, n*sizeof(cl_int));
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Now map into work group using map from device
|
|
for (j=0;j<nw;++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
x[j] = t[i];
|
|
}
|
|
|
|
x += nw;
|
|
m += 2*nw;
|
|
}
|
|
}
|
|
|
|
static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int ii, i, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
cl_int taa, raa;
|
|
|
|
log_info(" sub_group_%s...\n", Which == 0 ? "any" : "all");
|
|
|
|
for (k=0; k<ng; ++k) {
|
|
// Map to array indexed to array indexed by local ID and sub group
|
|
for (j=0; j<nw; ++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
mx[i] = x[j];
|
|
my[i] = y[j];
|
|
}
|
|
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
|
|
// Compute target
|
|
if (Which == 0) {
|
|
taa = 0;
|
|
for (i=0; i<n; ++i)
|
|
taa |= mx[ii + i] != 0;
|
|
} else {
|
|
taa = 1;
|
|
for (i=0; i<n; ++i)
|
|
taa &= mx[ii + i] != 0;
|
|
}
|
|
|
|
// Check result
|
|
for (i=0; i<n; ++i) {
|
|
raa = my[ii+i] != 0;
|
|
if (raa != taa) {
|
|
log_error("ERROR: sub_group_%s mismatch for local id %d in sub group %d in group %d\n",
|
|
Which == 0 ? "any" : "all", i, j, k);
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
x += nw;
|
|
y += nw;
|
|
m += 2*nw;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
// Reduce functions
|
|
template <typename Ty, int Which>
|
|
struct RED {
|
|
static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int i, ii, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
|
|
ii = 0;
|
|
for (k=0; k<ng; ++k) {
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
|
|
for (i=0; i<n; ++i)
|
|
t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
|
|
}
|
|
|
|
// Now map into work group using map from device
|
|
for (j=0;j<nw;++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
x[j] = t[i];
|
|
}
|
|
|
|
x += nw;
|
|
m += 2*nw;
|
|
}
|
|
}
|
|
|
|
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int ii, i, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
Ty tr, rr;
|
|
|
|
log_info(" sub_group_reduce_%s(%s)...\n", Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val());
|
|
|
|
for (k=0; k<ng; ++k) {
|
|
// Map to array indexed to array indexed by local ID and sub group
|
|
for (j=0; j<nw; ++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
mx[i] = x[j];
|
|
my[i] = y[j];
|
|
}
|
|
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
|
|
// Compute target
|
|
if (Which == 0) {
|
|
// add
|
|
tr = mx[ii];
|
|
for (i=1; i<n; ++i)
|
|
tr += mx[ii + i];
|
|
} else if (Which == 1) {
|
|
// max
|
|
tr = mx[ii];
|
|
for (i=1; i<n; ++i)
|
|
tr = tr > mx[ii + i] ? tr : mx[ii + i];
|
|
} else if (Which == 2) {
|
|
// min
|
|
tr = mx[ii];
|
|
for (i=1; i<n; ++i)
|
|
tr = tr > mx[ii + i] ? mx[ii + i] : tr;
|
|
}
|
|
|
|
// Check result
|
|
for (i=0; i<n; ++i) {
|
|
rr = my[ii+i];
|
|
if (rr != tr) {
|
|
log_error("ERROR: sub_group_reduce_%s(%s) mismatch for local id %d in sub group %d in group %d\n",
|
|
Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val(), i, j, k);
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
x += nw;
|
|
y += nw;
|
|
m += 2*nw;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
// Scan Inclusive functions
|
|
template <typename Ty, int Which>
|
|
struct SCIN {
|
|
static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int i, ii, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
|
|
ii = 0;
|
|
for (k=0; k<ng; ++k) {
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
|
|
for (i=0; i<n; ++i)
|
|
// t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
|
|
t[ii+i] = (Ty)i;
|
|
}
|
|
|
|
// Now map into work group using map from device
|
|
for (j=0;j<nw;++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
x[j] = t[i];
|
|
}
|
|
|
|
x += nw;
|
|
m += 2*nw;
|
|
}
|
|
}
|
|
|
|
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int ii, i, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
Ty tr, rr;
|
|
|
|
log_info(" sub_group_scan_inclusive_%s(%s)...\n", Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val());
|
|
|
|
for (k=0; k<ng; ++k) {
|
|
// Map to array indexed to array indexed by local ID and sub group
|
|
for (j=0; j<nw; ++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
mx[i] = x[j];
|
|
my[i] = y[j];
|
|
}
|
|
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
|
|
// Check result
|
|
for (i=0; i<n; ++i) {
|
|
if (Which == 0) {
|
|
tr = i == 0 ? mx[ii] : tr + mx[ii + i];
|
|
} else if (Which == 1) {
|
|
tr = i == 0 ? mx[ii] : (tr > mx[ii + i] ? tr : mx[ii + i]);
|
|
} else {
|
|
tr = i == 0 ? mx[ii] : (tr > mx[ii + i] ? mx[ii + i] : tr);
|
|
}
|
|
|
|
rr = my[ii+i];
|
|
if (rr != tr) {
|
|
log_error("ERROR: sub_group_scan_inclusive_%s(%s) mismatch for local id %d in sub group %d in group %d\n",
|
|
Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val(), i, j, k);
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
x += nw;
|
|
y += nw;
|
|
m += 2*nw;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
// Scan Exclusive functions
|
|
template <typename Ty, int Which>
|
|
struct SCEX {
|
|
static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int i, ii, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
|
|
ii = 0;
|
|
for (k=0; k<ng; ++k) {
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
|
|
for (i=0; i<n; ++i)
|
|
t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
|
|
}
|
|
|
|
// Now map into work group using map from device
|
|
for (j=0;j<nw;++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
x[j] = t[i];
|
|
}
|
|
|
|
x += nw;
|
|
m += 2*nw;
|
|
}
|
|
}
|
|
|
|
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int ii, i, j, k, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
Ty tr, trt, rr;
|
|
|
|
log_info(" sub_group_scan_exclusive_%s(%s)...\n", Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val());
|
|
|
|
for (k=0; k<ng; ++k) {
|
|
// Map to array indexed to array indexed by local ID and sub group
|
|
for (j=0; j<nw; ++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
mx[i] = x[j];
|
|
my[i] = y[j];
|
|
}
|
|
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
|
|
// Check result
|
|
for (i=0; i<n; ++i) {
|
|
if (Which == 0) {
|
|
tr = i == 0 ? TypeIdentity<Ty,Which>::val() : tr + trt;
|
|
} else if (Which == 1) {
|
|
tr = i == 0 ? TypeIdentity<Ty,Which>::val() : (trt > tr ? trt : tr);
|
|
} else {
|
|
tr = i == 0 ? TypeIdentity<Ty,Which>::val() : (trt > tr ? tr : trt);
|
|
}
|
|
trt = mx[ii+i];
|
|
rr = my[ii+i];
|
|
|
|
if (rr != tr) {
|
|
log_error("ERROR: sub_group_scan_exclusive_%s(%s) mismatch for local id %d in sub group %d in group %d\n",
|
|
Which == 0 ? "add" : (Which == 1 ? "max" : "min"), TypeName<Ty>::val(), i, j, k);
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
x += nw;
|
|
y += nw;
|
|
m += 2*nw;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
// Broadcast functios
|
|
template <typename Ty>
|
|
struct BC {
|
|
static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int i, ii, j, k, l, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
int d = ns > 100 ? 100 : ns;
|
|
|
|
ii = 0;
|
|
for (k=0; k<ng; ++k) {
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
l = (int)(genrand_int32(gMTdata) & 0x7fffffff) % (d > n ? n : d);
|
|
|
|
for (i=0; i<n; ++i)
|
|
t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) % 100 * 100 + l);
|
|
}
|
|
|
|
// Now map into work group using map from device
|
|
for (j=0;j<nw;++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
x[j] = t[i];
|
|
}
|
|
|
|
x += nw;
|
|
m += 2*nw;
|
|
}
|
|
}
|
|
|
|
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, int ng)
|
|
{
|
|
int ii, i, j, k, l, n;
|
|
int nj = (nw + ns - 1)/ns;
|
|
Ty tr, rr;
|
|
|
|
log_info(" sub_group_broadcast(%s)...\n", TypeName<Ty>::val());
|
|
|
|
for (k=0; k<ng; ++k) {
|
|
// Map to array indexed to array indexed by local ID and sub group
|
|
for (j=0; j<nw; ++j) {
|
|
i = m[2*j+1]*ns + m[2*j];
|
|
mx[i] = x[j];
|
|
my[i] = y[j];
|
|
}
|
|
|
|
for (j=0; j<nj; ++j) {
|
|
ii = j*ns;
|
|
n = ii + ns > nw ? nw - ii : ns;
|
|
l = (int)mx[ii] % 100;
|
|
tr = mx[ii+l];
|
|
|
|
// Check result
|
|
for (i=0; i<n; ++i) {
|
|
rr = my[ii+i];
|
|
if (rr != tr) {
|
|
log_error("ERROR: sub_group_broadcast(%s) mismatch for local id %d in sub group %d in group %d\n",
|
|
TypeName<Ty>::val(), i, j, k);
|
|
return -1;
|
|
}
|
|
}
|
|
}
|
|
|
|
x += nw;
|
|
y += nw;
|
|
m += 2*nw;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
// Independent forward progress stuff
|
|
// Note:
|
|
// Output needs num_groups * NUM_LOC elements
|
|
// local_size must be > NUM_LOC
|
|
// Input needs num_groups * num_sub_groups * (NUM_LOC+1) elements
|
|
|
|
static inline int
|
|
inst(int op, int loc, int val)
|
|
{
|
|
return (val << INST_VAL_SHIFT) | (loc << INST_LOC_SHIFT) | (op << INST_OP_SHIFT);
|
|
}
|
|
|
|
void gen_insts(cl_int *x, cl_int *p, int n)
|
|
{
|
|
int i, j0, j1;
|
|
int val;
|
|
int ii[NUM_LOC];
|
|
|
|
// Create a random permutation of 0...NUM_LOC-1
|
|
ii[0] = 0;
|
|
for (i=1; i<NUM_LOC;++i) {
|
|
j0 = random_in_range(0, i, gMTdata);
|
|
if (j0 != i)
|
|
ii[i] = ii[j0];
|
|
ii[j0] = i;
|
|
}
|
|
|
|
// Initialize "instruction pointers"
|
|
memset(p, 0, n*4);
|
|
|
|
for (i=0; i<NUM_LOC; ++i) {
|
|
// Randomly choose 2 different sub groups
|
|
// One does a random amount of work, and the other waits for it
|
|
j0 = random_in_range(0, n-1, gMTdata);
|
|
|
|
do
|
|
j1 = random_in_range(0, n-1, gMTdata);
|
|
while (j1 == j0);
|
|
|
|
// Randomly choose a wait value and assign "instructions"
|
|
val = random_in_range(100, 200 + 10*NUM_LOC, gMTdata);
|
|
x[j0*(NUM_LOC+1) + p[j0]] = inst(INST_COUNT, ii[i], val);
|
|
x[j1*(NUM_LOC+1) + p[j1]] = inst(INST_WAIT, ii[i], val);
|
|
++p[j0];
|
|
++p[j1];
|
|
}
|
|
|
|
// Last "inst" for each sub group is END
|
|
for (i=0; i<n; ++i)
|
|
x[i*(NUM_LOC+1) + p[i]] = inst(INST_END, 0, 0);
|
|
}
|
|
|
|
// Execute one group's "instructions"
|
|
void run_insts(cl_int *x, cl_int *p, int n)
|
|
{
|
|
int i, nend;
|
|
bool scont;
|
|
cl_int loc[NUM_LOC];
|
|
|
|
// Initialize result and "instruction pointers"
|
|
memset(loc, 0, sizeof(loc));
|
|
memset(p, 0, 4*n);
|
|
|
|
// Repetitively loop over subgroups with each executing "instructions" until blocked
|
|
// The loop terminates when all subgroups have hit the "END instruction"
|
|
do {
|
|
nend = 0;
|
|
for (i=0; i<n; ++i) {
|
|
do {
|
|
cl_int inst = x[i*(NUM_LOC+1) + p[i]];
|
|
cl_int iop = (inst >> INST_OP_SHIFT) & INST_OP_MASK;
|
|
cl_int iloc = (inst >> INST_LOC_SHIFT) & INST_LOC_MASK;
|
|
cl_int ival = (inst >> INST_VAL_SHIFT) & INST_VAL_MASK;
|
|
scont = false;
|
|
|
|
switch (iop) {
|
|
case INST_STORE:
|
|
loc[iloc] = ival;
|
|
++p[i];
|
|
scont = true;
|
|
break;
|
|
case INST_WAIT:
|
|
if (loc[iloc] == ival) {
|
|
++p[i];
|
|
scont = true;
|
|
}
|
|
break;
|
|
case INST_COUNT:
|
|
loc[iloc] += ival;
|
|
++p[i];
|
|
scont = true;
|
|
break;
|
|
case INST_END:
|
|
++nend;
|
|
break;
|
|
}
|
|
} while (scont);
|
|
}
|
|
} while (nend < n);
|
|
|
|
// Return result, reusing "p"
|
|
memcpy(p, loc, sizeof(loc));
|
|
}
|
|
|
|
|
|
struct IFP {
|
|
static void gen(cl_int *x, cl_int *t, cl_int *, int ns, int nw, int ng)
|
|
{
|
|
int k;
|
|
int nj = (nw + ns - 1) / ns;
|
|
|
|
// We need at least 2 sub groups per group for this test
|
|
if (nj == 1)
|
|
return;
|
|
|
|
for (k=0; k<ng; ++k) {
|
|
gen_insts(x, t, nj);
|
|
x += nj * (NUM_LOC+1);
|
|
}
|
|
}
|
|
|
|
static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, int ns, int nw, int ng)
|
|
{
|
|
int i, k;
|
|
int nj = (nw + ns - 1) / ns;
|
|
|
|
// We need at least 2 sub groups per group for this tes
|
|
if (nj == 1)
|
|
return 0;
|
|
|
|
log_info(" independent forward progress...\n");
|
|
|
|
for (k=0; k<ng; ++k) {
|
|
run_insts(x, t, nj);
|
|
for (i=0; i<NUM_LOC; ++i) {
|
|
if (t[i] != y[i]) {
|
|
log_error("ERROR: mismatch at element %d in work group %d\n", i, k);
|
|
return -1;
|
|
}
|
|
}
|
|
x += nj * (NUM_LOC+1);
|
|
y += NUM_LOC;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
};
|
|
|
|
|
|
// Entry point from main
|
|
int
|
|
test_work_group_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
|
|
{
|
|
int error;
|
|
|
|
// Adjust these individually below if desired/needed
|
|
#define G 2000
|
|
#define L 200
|
|
|
|
error = test<int, AA<0>, G, L>::run(device, context, queue, num_elements, "test_any", any_source);
|
|
error |= test<int, AA<1>, G, L>::run(device, context, queue, num_elements, "test_all", all_source);
|
|
|
|
// error |= test<cl_half, BC<cl_half>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
|
|
error |= test<cl_uint, BC<cl_uint>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
|
|
error |= test<cl_int, BC<cl_int>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
|
|
error |= test<cl_ulong, BC<cl_ulong>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
|
|
error |= test<cl_long, BC<cl_long>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
|
|
error |= test<float, BC<float>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
|
|
error |= test<double, BC<double>, G, L>::run(device, context, queue, num_elements, "test_bcast", bcast_source);
|
|
|
|
// error |= test<cl_half, RED<cl_half,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
|
|
error |= test<cl_uint, RED<cl_uint,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
|
|
error |= test<cl_int, RED<cl_int,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
|
|
error |= test<cl_ulong, RED<cl_ulong,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
|
|
error |= test<cl_long, RED<cl_long,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
|
|
error |= test<float, RED<float,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
|
|
error |= test<double, RED<double,0>, G, L>::run(device, context, queue, num_elements, "test_redadd", redadd_source);
|
|
|
|
// error |= test<cl_half, RED<cl_half,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
|
|
error |= test<cl_uint, RED<cl_uint,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
|
|
error |= test<cl_int, RED<cl_int,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
|
|
error |= test<cl_ulong, RED<cl_ulong,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
|
|
error |= test<cl_long, RED<cl_long,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
|
|
error |= test<float, RED<float,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
|
|
error |= test<double, RED<double,1>, G, L>::run(device, context, queue, num_elements, "test_redmax", redmax_source);
|
|
|
|
// error |= test<cl_half, RED<cl_half,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
|
|
error |= test<cl_uint, RED<cl_uint,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
|
|
error |= test<cl_int, RED<cl_int,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
|
|
error |= test<cl_ulong, RED<cl_ulong,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
|
|
error |= test<cl_long, RED<cl_long,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
|
|
error |= test<float, RED<float,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
|
|
error |= test<double, RED<double,2>, G, L>::run(device, context, queue, num_elements, "test_redmin", redmin_source);
|
|
|
|
// error |= test<cl_half, SCIN<cl_half,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
|
|
error |= test<cl_uint, SCIN<cl_uint,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
|
|
error |= test<cl_int, SCIN<cl_int,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
|
|
error |= test<cl_ulong, SCIN<cl_ulong,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
|
|
error |= test<cl_long, SCIN<cl_long,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
|
|
error |= test<float, SCIN<float,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
|
|
error |= test<double, SCIN<double,0>, G, L>::run(device, context, queue, num_elements, "test_scinadd", scinadd_source);
|
|
|
|
// error |= test<cl_half, SCIN<cl_half,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
|
|
error |= test<cl_uint, SCIN<cl_uint,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
|
|
error |= test<cl_int, SCIN<cl_int,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
|
|
error |= test<cl_ulong, SCIN<cl_ulong,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
|
|
error |= test<cl_long, SCIN<cl_long,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
|
|
error |= test<float, SCIN<float,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
|
|
error |= test<double, SCIN<double,1>, G, L>::run(device, context, queue, num_elements, "test_scinmax", scinmax_source);
|
|
|
|
// error |= test<cl_half, SCIN<cl_half,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
|
|
error |= test<cl_uint, SCIN<cl_uint,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
|
|
error |= test<cl_int, SCIN<cl_int,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
|
|
error |= test<cl_ulong, SCIN<cl_ulong,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
|
|
error |= test<cl_long, SCIN<cl_long,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
|
|
error |= test<float, SCIN<float,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
|
|
error |= test<double, SCIN<double,2>, G, L>::run(device, context, queue, num_elements, "test_scinmin", scinmin_source);
|
|
|
|
// error |= test<cl_half, SCEX<cl_half,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
|
|
error |= test<cl_uint, SCEX<cl_uint,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
|
|
error |= test<cl_int, SCEX<cl_int,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
|
|
error |= test<cl_ulong, SCEX<cl_ulong,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
|
|
error |= test<cl_long, SCEX<cl_long,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
|
|
error |= test<float, SCEX<float,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
|
|
error |= test<double, SCEX<double,0>, G, L>::run(device, context, queue, num_elements, "test_scexadd", scexadd_source);
|
|
|
|
// error |= test<cl_half, SCEX<cl_half,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
|
|
error |= test<cl_uint, SCEX<cl_uint,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
|
|
error |= test<cl_int, SCEX<cl_int,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
|
|
error |= test<cl_ulong, SCEX<cl_ulong,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
|
|
error |= test<cl_long, SCEX<cl_long,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
|
|
error |= test<float, SCEX<float,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
|
|
error |= test<double, SCEX<double,1>, G, L>::run(device, context, queue, num_elements, "test_scexmax", scexmax_source);
|
|
|
|
// error |= test<cl_half, SCEX<cl_half,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
|
|
error |= test<cl_uint, SCEX<cl_uint,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
|
|
error |= test<cl_int, SCEX<cl_int,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
|
|
error |= test<cl_ulong, SCEX<cl_ulong,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
|
|
error |= test<cl_long, SCEX<cl_long,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
|
|
error |= test<float, SCEX<float,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
|
|
error |= test<double, SCEX<double,2>, G, L>::run(device, context, queue, num_elements, "test_scexmin", scexmin_source);
|
|
|
|
error |= test<cl_int, IFP, G, L>::run(device, context, queue, num_elements, "test_ifp", ifp_source, NUM_LOC + 1);
|
|
return error;
|
|
}
|
|
|