// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include #include #include #include #if !defined(_WIN32) #include #endif #include #include #include "procs.h" const char *barrier_with_localmem_kernel_code[] = { "__kernel void compute_sum_with_localmem(__global int *a, int n, __local int *tmp_sum, __global int *sum)\n" "{\n" " int tid = get_local_id(0);\n" " int lsize = get_local_size(0);\n" " int i;\n" "\n" " tmp_sum[tid] = 0;\n" " for (i=tid; i max_local_workgroup_size[0]) kwgsize = max_local_workgroup_size[0]; // err = clSetKernelArgs(context, kernel, 4, NULL, values, sizes); err = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]); err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements); err |= clSetKernelArg(kernel, 2, wgsize * sizeof(cl_int), NULL); err |= clSetKernelArg(kernel, 3, sizeof streams[1], &streams[1]); if (err != CL_SUCCESS) { log_error("clSetKernelArgs failed\n"); return -1; } global_threads[0] = wgsize; local_threads[0] = wgsize; // Adjust the local thread size to fit and be a nice multiple. if (kwgsize < wgsize) { log_info("Adjusting wgsize down from %lu to %lu.\n", wgsize, kwgsize); local_threads[0] = kwgsize; } while (global_threads[0] % local_threads[0] != 0) local_threads[0]--; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL); if (err != CL_SUCCESS) { log_error("clEnqueueNDRangeKernel failed\n"); return -1; } err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, out_length, output_ptr, 0, NULL, NULL); if (err != CL_SUCCESS) { log_error("clEnqueueReadBuffer failed\n"); return -1; } err = verify_sum(input_ptr, tmp_ptr, output_ptr, num_elements); // cleanup clReleaseMemObject(streams[0]); clReleaseMemObject(streams[1]); clReleaseKernel(kernel); clReleaseProgram(program); free(input_ptr); free(tmp_ptr); free(output_ptr); return err; } int test_local_kernel_def(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { cl_mem streams[2]; cl_program program; cl_kernel kernel; cl_int *input_ptr, *output_ptr, *tmp_ptr; size_t global_threads[1], local_threads[1]; size_t wgsize, kwgsize; int err, i; char *program_source = (char*)malloc(sizeof(char)*2048); MTdata d = init_genrand( gRandomSeed ); size_t max_local_workgroup_size[3]; memset(program_source, 0, 2048); err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof wgsize, &wgsize, NULL); if (err) { log_error("clGetDeviceInfo failed, %d\n\n", err); return -1; } wgsize/=2; if (wgsize < 1) wgsize = 1; size_t in_length = sizeof(cl_int) * num_elements; size_t out_length = sizeof(cl_int) * wgsize; input_ptr = (cl_int *)malloc(in_length); output_ptr = (cl_int *)malloc(out_length); tmp_ptr = (cl_int *)malloc(out_length); streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, in_length, NULL, NULL); if (!streams[0]) { log_error("clCreateBuffer failed\n"); return -1; } streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, out_length, NULL, NULL); if (!streams[1]) { log_error("clCreateBuffer failed\n"); return -1; } for (i=0; i max_local_workgroup_size[0]) kwgsize = max_local_workgroup_size[0]; // err = clSetKernelArgs(context, kernel, 4, NULL, values, sizes); err = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]); err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements); err |= clSetKernelArg(kernel, 2, sizeof streams[1], &streams[1]); if (err != CL_SUCCESS) { log_error("clSetKernelArgs failed\n"); return -1; } global_threads[0] = wgsize; local_threads[0] = wgsize; // Adjust the local thread size to fit and be a nice multiple. if (kwgsize < wgsize) { log_info("Adjusting wgsize down from %lu to %lu.\n", wgsize, kwgsize); local_threads[0] = kwgsize; } while (global_threads[0] % local_threads[0] != 0) local_threads[0]--; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL); if (err != CL_SUCCESS) { log_error("clEnqueueNDRangeKernel failed\n"); return -1; } err = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, out_length, output_ptr, 0, NULL, NULL); if (err != CL_SUCCESS) { log_error("clEnqueueReadBuffer failed\n"); return -1; } err = verify_sum(input_ptr, tmp_ptr, output_ptr, num_elements); // cleanup clReleaseMemObject(streams[0]); clReleaseMemObject(streams[1]); clReleaseKernel(kernel); clReleaseProgram(program); free(input_ptr); free(tmp_ptr); free(output_ptr); return err; }