OpenCL-CTS/test_conformance/basic/test_vector_creation.cpp

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "../../test_common/harness/conversions.h"
#include "../../test_common/harness/typeWrappers.h"
#include "../../test_common/harness/errorHelpers.h"


#define DEBUG 0
#define DEPTH 16
// Limit the maximum code size for any given kernel.
#define MAX_CODE_SIZE (1024*32)

const int sizes[] = {1, 2, 3, 4, 8, 16, -1, -1, -1, -1};
const char *size_names[] = {"", "2", "3", "4", "8", "16" , "!!a", "!!b", "!!c", "!!d"};

// Creates a kernel by enumerating all possible ways of building the vector out of vloads
// skip_to_results will skip results up to a given number. If the amount of code generated
// is greater than MAX_CODE_SIZE, this function will return the number of results used,
// which can then be used as the skip_to_result value to continue where it left off.
int create_kernel(ExplicitType type, int output_size, char *program, int *number_of_results, int skip_to_result) {

    int number_of_sizes;

    switch (output_size) {
        case 1:
            number_of_sizes = 1;
            break;
        case 2:
            number_of_sizes = 2;
            break;
        case 3:
            number_of_sizes = 3;
            break;
        case 4:
            number_of_sizes = 4;
            break;
        case 8:
            number_of_sizes = 5;
            break;
        case 16:
            number_of_sizes = 6;
            break;
        default:
            log_error("Invalid size: %d\n", output_size);
            return -1;
    }

    int total_results = 0;
    int current_result = 0;
    int total_vloads = 0;
    int total_program_length = 0;
    int aborted_due_to_size = 0;

    if (skip_to_result < 0)
        skip_to_result = 0;

    // The line of code for the vector creation
    char line[1024];
    // Keep track of what size vector we are using in each position so we can iterate through all fo them
    int pos[DEPTH];
    int max_size = output_size;
    if (DEBUG > 1) log_info("max_size: %d\n", max_size);

    program[0] = '\0';
    sprintf(program, "%s\n__kernel void test_vector_creation(__global %s *src, __global %s%s *result) {\n",
            type == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
            get_explicit_type_name(type), get_explicit_type_name(type), ( number_of_sizes == 3 ) ? "" : size_names[number_of_sizes-1]);
    total_program_length += (int)strlen(program);

    char storePrefix[ 128 ], storeSuffix[ 128 ];

    // Start out trying sizes 1,1,1,1,1...
    for (int i=0; i<DEPTH; i++)
        pos[i] = 0;

    int done = 0;
    while (!done) {
        if (DEBUG > 1) {
            log_info("pos size[] = [");
            for (int k=0; k<DEPTH; k++)
                log_info(" %d ", pos[k]);
            log_info("]\n");
        }

        // Go through the selected vector sizes and see if the first n of them fit the
        //  required size exactly.
        int size_so_far = 0;
        int vloads;
        for ( vloads=0; vloads<DEPTH; vloads++) {
            if (size_so_far + sizes[pos[vloads]] <= max_size) {
                size_so_far += sizes[pos[vloads]];
            } else {
                break;
            }
        }
        if (DEBUG > 1)  log_info("vloads: %d, size_so_far:%d\n", vloads, size_so_far);

        // If they did not fit the required size exactly it is too long, so there is no point in checking any other combinations
        //  of the sizes to the right. Prune them from the search.
        if (size_so_far != max_size) {
            // Zero all the sizes to the right
            for (int k=vloads+1; k<DEPTH; k++) {
                pos[k] = 0;
            }
            // Increment this current size and propagate the values up if needed
            for (int d=vloads; d>=0; d--) {
                pos[d]++;
                if (pos[d] >= number_of_sizes) {
                    pos[d] = 0;
                    if (d == 0) {
                        // If we rolled over then we are done
                        done = 1;
                        break;
                    }
                } else {
                    break;
                }
            }
            // Go on to the next size since this one (and all others "under" it) didn't fit
            continue;
        }


        // Generate the actual load line if we are building this part
        line[0]= '\0';
        if (skip_to_result == 0 || total_results >= skip_to_result) {
            if( number_of_sizes == 3 )
            {
                sprintf( storePrefix, "vstore3( " );
                sprintf( storeSuffix, ", %d, result )", current_result );
            }
            else
            {
                sprintf( storePrefix, "result[%d] = ", current_result );
                storeSuffix[ 0 ] = 0;
            }

            sprintf(line, "\t%s(%s%d)(", storePrefix, get_explicit_type_name(type), output_size);
            current_result++;

            int offset = 0;
            for (int i=0; i<vloads; i++) {
                if (pos[i] == 0)
                    sprintf(line + strlen(line), "src[%d]", offset);
                else
                    sprintf(line + strlen(line), "vload%s(0,src+%d)", size_names[pos[i]], offset);
                offset += sizes[pos[i]];
                if (i<(vloads-1))
                    sprintf(line + strlen(line), ",");
            }
            sprintf(line + strlen(line), ")%s;\n", storeSuffix);

            strcat(program, line);
            total_vloads += vloads;
        }
        total_results++;
        total_program_length += (int)strlen(line);
        if (total_program_length > MAX_CODE_SIZE) {
            aborted_due_to_size = 1;
            done = 1;
        }


        if (DEBUG) log_info("line is: %s", line);

        // If we did not use all of them, then we ignore any changes further to the right.
        // We do this by causing those loops to skip on the next iteration.
        if (vloads < DEPTH) {
            if (DEBUG > 1) log_info("done with this depth\n");
            for (int k=vloads; k<DEPTH; k++)
                pos[k] = number_of_sizes;
        }

        // Increment the far right size by 1, rolling over as needed
        for (int d=DEPTH-1; d>=0; d--) {
            pos[d]++;
            if (pos[d] >= number_of_sizes) {
                pos[d] = 0;
                if (d == 0) {
                    // If we rolled over at the far-left then we are done
                    done = 1;
                    break;
                }
            } else {
                break;
            }
        }
        if (done)
            break;

        // Continue until we are done.
    }
    strcat(program, "}\n\n"); //log_info("%s\n", program);
    total_program_length += 3;
    if (DEBUG) log_info("\t\t(Program for vector type %s%s contains %d vector creations, of total program length %gkB, with a total of %d vloads.)\n",
                        get_explicit_type_name(type), size_names[number_of_sizes-1], total_results, total_program_length/1024.0, total_vloads);
    *number_of_results = current_result;
    if (aborted_due_to_size)
        return total_results;
    return 0;
}


int test_vector_creation(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble };
    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16};

    char *program_source;
    int error;
    int total_errors = 0;

    cl_int input_data_int[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    cl_double input_data_double[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    void *input_data_converted;
    void *output_data;

    int number_of_results;;

    input_data_converted = malloc(sizeof(cl_double)*16);
    program_source = (char*)malloc(sizeof(char)*1024*1024*4);

    // Iterate over all the types
    for (int type_index=0; type_index<10; type_index++) {
    if(!gHasLong && ((vecType[type_index] == kLong)  || (vecType[type_index] == kULong)))
    {
      log_info("Long/ULong data type not supported on this device\n");
      continue;
    }

        clMemWrapper input;

        if (vecType[type_index] == kDouble) {
            if (!is_extension_available(deviceID, "cl_khr_fp64")) {
                log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
                continue;
            }
            log_info("Testing doubles.\n");
        }

        // Convert the data to the right format for the test.
        memset(input_data_converted, 0xff, sizeof(cl_double)*16);
        if (vecType[type_index] != kDouble) {
            for (int j=0; j<16; j++) {
                convert_explicit_value(&input_data_int[j], ((char*)input_data_converted)+get_explicit_type_size(vecType[type_index])*j,
                                       kInt, 0, kRoundToEven, vecType[type_index]);
            }
        } else {
            memcpy(input_data_converted, &input_data_double, sizeof(cl_double)*16);
        }

        input = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, get_explicit_type_size(vecType[type_index])*16,
                               (vecType[type_index] != kDouble) ? input_data_converted : input_data_double, &error);
        if (error) {
            print_error(error, "clCreateBuffer failed");
            total_errors++;
            continue;
        }

        // Iterate over all the vector sizes.
        for (int size_index=1; size_index< 5; size_index++) {
            size_t global[] = {1,1,1};
            int number_generated = -1;
            int previous_number_generated = 0;

            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[type_index]), size_names[size_index]);
            while (number_generated != 0) {
                clMemWrapper output;
                clKernelWrapper kernel;
                clProgramWrapper program;

                number_generated = create_kernel(vecType[type_index], vecSizes[size_index], program_source, &number_of_results, number_generated);
                if (number_generated != 0) {
                    if (previous_number_generated == 0)
                        log_info("Code size greater than %gkB; splitting test into multiple kernels.\n", MAX_CODE_SIZE/1024.0);
                    log_info("\tExecuting vector permutations %d to %d...\n", previous_number_generated, number_generated-1);
                }

                error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&program_source, "test_vector_creation");
                if (error) {
                    log_error("create_single_kernel_helper failed.\n");
                    total_errors++;
                    break;
                }

                output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                        number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index],
                                        NULL, &error);
                if (error) {
                    print_error(error, "clCreateBuffer failed");
                    total_errors++;
                    break;
                }

                error = clSetKernelArg(kernel, 0, sizeof(input), &input);
                error |= clSetKernelArg(kernel, 1, sizeof(output), &output);
                if (error) {
                    print_error(error, "clSetKernelArg failed");
                    total_errors++;
                    break;
                }

                error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
                if (error) {
                    print_error(error, "clEnqueueNDRangeKernel failed");
                    total_errors++;
                    break;
                }

                error = clFinish(queue);
                if (error) {
                    print_error(error, "clFinish failed");
                    total_errors++;
                    break;
                }

                output_data = malloc(number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index]);
                if (output_data == NULL) {
                    log_error("Failed to allocate memory for output data.\n");
                    total_errors++;
                    break;
                }
                memset(output_data, 0xff, number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index]);
                error = clEnqueueReadBuffer(queue, output, CL_TRUE, 0,
                                            number_of_results*get_explicit_type_size(vecType[type_index])*vecSizes[size_index],
                                            output_data, 0, NULL, NULL);
                if (error) {
                    print_error(error, "clEnqueueReadBuffer failed");
                    total_errors++;
                    free(output_data);
                    break;
                }

                // Check the results
                char *res = (char *)output_data;
                char *exp = (char *)input_data_converted;
                for (int i=0; i<number_of_results; i++) {
                    // If they do not match, then print out why
                    if (memcmp(input_data_converted,
                               res + i*(get_explicit_type_size(vecType[type_index])*vecSizes[size_index]),
                               get_explicit_type_size(vecType[type_index])*vecSizes[size_index])
                        ) {
                        log_error("Data failed to validate for result %d\n", i);

                        // Find the line in the program that failed. This is ugly.
                        char search[32];
                        char found_line[1024];
                        found_line[0]='\0';
                        search[0]='\0';
                        sprintf(search, "result[%d] = (", i);
                        char *start_loc = strstr(program_source, search);
                        if (start_loc == NULL)
                            log_error("Failed to find program source for failure for %s in \n%s", search, program_source);
                        else {
                          char *end_loc = strstr(start_loc, "\n");
                          memcpy(&found_line, start_loc, (end_loc-start_loc));
                          found_line[end_loc-start_loc]='\0';
                          log_error("Failed vector line: %s\n", found_line);
                        }

                        for (int j=0; j<(int)vecSizes[size_index]; j++) {
                            char expected_value[64];
                            char returned_value[64];
                            expected_value[0]='\0';
                            returned_value[0]='\0';
                            print_type_to_string(vecType[type_index], (void*)(res+get_explicit_type_size(vecType[type_index])*(i*vecSizes[size_index]+j)), returned_value);
                            print_type_to_string(vecType[type_index], (void*)(exp+get_explicit_type_size(vecType[type_index])*j), expected_value);
                            log_error("index [%d, component %d]: got: %s expected: %s\n", i, j,
                                      returned_value, expected_value);
                        }

                        total_errors++;
                    }
                }
                free(output_data);
                previous_number_generated = number_generated;
            } // number_generated != 0

        } // vector sizes
    } // vector types

    free(input_data_converted);
    free(program_source);

    return total_errors;
}