Use maximum subgroup size in sub_group_ballot tests (#1344)

sub_group_ballot_bit_count() and sub_group_ballot_find_msb() mask
their input according to a subgroup size, which is assumed to be the
maximum subgroup size, and not the actual subgroup size excluding
non-existent work-items in the "remainder" subgroup.

Fix this as per the clarification made to the OpenCL C specification
in revision 3.0.9 for issue KhronosGroup/OpenCL-Docs#626 by pull request
KhronosGroup/OpenCL-Docs#689.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
This commit is contained in:
Stuart Brady
2021-11-25 13:36:20 +00:00
committed by GitHub
parent c25709f396
commit 3eb0f50d85

View File

@@ -496,7 +496,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
| (bs128(mx[wg_offset + wi_id].s1) << 32)
| (bs128(mx[wg_offset + wi_id].s2) << 64)
| (bs128(mx[wg_offset + wi_id].s3) << 96);
bs &= getImportantBits(wi_id, current_sbs);
bs &= getImportantBits(wi_id, sbs);
device_result = my[wg_offset + wi_id].s0;
if (operation == BallotOp::ballot_inclusive_scan
|| operation == BallotOp::ballot_exclusive_scan
@@ -516,7 +516,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
}
else if (operation == BallotOp::ballot_find_lsb)
{
for (int id = 0; id < current_sbs; ++id)
for (int id = 0; id < sbs; ++id)
{
if (bs.test(id))
{
@@ -537,7 +537,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND
}
else if (operation == BallotOp::ballot_find_msb)
{
for (int id = current_sbs - 1; id >= 0; --id)
for (int id = sbs - 1; id >= 0; --id)
{
if (bs.test(id))
{