Initial open source release of OpenCL 2.2 CTS.

2026-03-19 06:09:01 +00:00 · 2017-05-16 18:25:37 +05:30
parent 6911ba5116
commit 2821bf1323
1035 changed files with 343518 additions and 0 deletions
--- a/test_conformance/CMakeCommon.txt
+++ b/test_conformance/CMakeCommon.txt
@@ -0,0 +1,13 @@
+# Shared build recipe pulled in by test directories (e.g. SVM) after they have
+# set MODULE_NAME and ${MODULE_NAME}_SOURCES.
+# NOTE(review): this call names no source files and has no PROPERTIES keyword,
+# so it probably does not attach -msse2 to anything -- confirm the intent.
+set_source_files_properties(COMPILE_FLAGS -msse2)
+
+# Executable name: <prefix><lower-cased module name><suffix>.
+string(TOLOWER ${MODULE_NAME} MODULE_NAME_LOWER)
+set(${MODULE_NAME}_OUT
+    ${CONFORMANCE_PREFIX}${MODULE_NAME_LOWER}${CONFORMANCE_SUFFIX})
+
+add_executable(${${MODULE_NAME}_OUT} ${${MODULE_NAME}_SOURCES})
+
+# Compile every listed source as C++.
+set_source_files_properties(${${MODULE_NAME}_SOURCES}
+    PROPERTIES LANGUAGE CXX)
+
+# IDE folder that groups the conformance executables.
+set_target_properties(${${MODULE_NAME}_OUT}
+    PROPERTIES FOLDER "CONFORMANCE${CONFORMANCE_SUFFIX}")
+
+target_link_libraries(${${MODULE_NAME}_OUT} ${CLConform_LIBRARIES})
--- a/test_conformance/CMakeLists.txt
+++ b/test_conformance/CMakeLists.txt
@@ -0,0 +1,92 @@
+# Record this directory (`test_conformance') for later use.
+set(CLConf_Install_Base_Dir "${CMAKE_CURRENT_SOURCE_DIR}")
+
+# Test suites, added in the original order. Suites that depend on a graphics
+# API are only added when the matching *_IS_SUPPORTED variable is true.
+add_subdirectory(allocations)
+add_subdirectory(api)
+add_subdirectory(atomics)
+add_subdirectory(basic)
+add_subdirectory(buffers)
+add_subdirectory(commonfns)
+add_subdirectory(compatibility)
+add_subdirectory(compiler)
+add_subdirectory(computeinfo)
+add_subdirectory(contractions)
+add_subdirectory(conversions)
+if(D3D10_IS_SUPPORTED)
+    add_subdirectory(d3d10)
+endif()
+if(D3D11_IS_SUPPORTED)
+    add_subdirectory(d3d11)
+endif()
+add_subdirectory(device_partition)
+add_subdirectory(events)
+add_subdirectory(geometrics)
+if(GL_IS_SUPPORTED)
+    add_subdirectory(gl)
+endif()
+if(GLES_IS_SUPPORTED)
+    add_subdirectory(gles)
+endif()
+add_subdirectory(half)
+add_subdirectory(headers)
+add_subdirectory(images)
+add_subdirectory(integer_ops)
+add_subdirectory(math_brute_force)
+add_subdirectory(mem_host_flags)
+add_subdirectory(multiple_device_context)
+add_subdirectory(printf)
+add_subdirectory(profiling)
+add_subdirectory(relationals)
+add_subdirectory(select)
+add_subdirectory(thread_dimensions)
+add_subdirectory(vec_align)
+add_subdirectory(vec_step)
+add_subdirectory(c11_atomics)
+add_subdirectory(device_execution)
+add_subdirectory(non_uniform_work_group)
+add_subdirectory(SVM)
+add_subdirectory(generic_address_space)
+add_subdirectory(subgroups)
+add_subdirectory(workgroups)
+add_subdirectory(pipes)
+add_subdirectory(device_timer)
+add_subdirectory(clcpp)
+
+# CSV test lists that are copied into this directory's build tree.
+set(CSV_FILES
+    opencl_conformance_tests_21_full_spirv.csv
+    opencl_conformance_tests_21_legacy_wimpy.csv
+    opencl_conformance_tests_22.csv
+    opencl_conformance_tests_generate_spirv.csv
+    opencl_conformance_tests_conversions.csv
+    opencl_conformance_tests_d3d.csv
+    opencl_conformance_tests_full.csv
+    opencl_conformance_tests_full_no_math_or_conversions.csv
+    opencl_conformance_tests_math.csv
+    opencl_conformance_tests_quick.csv
+)
+# Scripts that are copied alongside the CSV files.
+set(PY_FILES
+    run_conformance.py
+)
+
+# Copy .csv files
+foreach(FILE ${CSV_FILES})
+    configure_file(${FILE} ${FILE} COPYONLY)
+endforeach()
+
+# Copy test run script
+foreach(FILE ${PY_FILES})
+    if(WIN32)
+        configure_file(${FILE} ${FILE} COPYONLY)
+    else()
+        # Stage a copy under CMakeFiles first ...
+        configure_file(${FILE} ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${FILE} COPYONLY)
+        # ... then copy it next to the CSV files with execute permissions set.
+        # CMAKE_CURRENT_BINARY_DIR is used so the script lands in the same
+        # directory as the other copies even when this project is not built
+        # at <top-level binary dir>/test_conformance.
+        file(COPY ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/${FILE}
+             DESTINATION ${CMAKE_CURRENT_BINARY_DIR}
+             FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ
+             GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
+    endif()
+endforeach()
--- a/test_conformance/Jamfile
+++ b/test_conformance/Jamfile
@@ -0,0 +1,24 @@
+# Requirements applied to the targets of this project: use the harness
+# library and switch compiler warnings off.
+project
+    : requirements
+      <library>/harness//harness
+      <warnings>off
+    ;
+
+# Bind the /harness project id to the shared harness directory.
+use-project /harness : ../test_common/harness ;
+
+# Sub-projects built from this directory.
+proj_lst = allocations api atomics basic buffers commonfns compiler
+           computeinfo contractions conversions events geometrics gl
+           half images integer_ops math_brute_force multiple_device_context
+           profiling relationals select thread_dimensions ;
+
+for proj in $(proj_lst)
+{
+   build-project $(proj) ;
+}
+
+# Install the .csv and .py files found in this directory into the
+# per-variant tests/test_conformance directory under $(DIST).
+install data
+    : [ glob *.csv ] [ glob *.py ]
+    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance
+      <variant>release:<location>$(DIST)/release/tests/test_conformance
+    ;
+
--- a/test_conformance/Makefile
+++ b/test_conformance/Makefile
@@ -0,0 +1,61 @@
+
+# Test directories handled by the "all" and "clean" rules below; the rules
+# change into each listed directory and run make there.
+PRODUCTS = \
+	allocations/ \
+	api/ \
+	atomics/ \
+	basic/ \
+	buffers/ \
+	commonfns/ \
+  compatibility/test_conformance/ \
+	compiler/ \
+	computeinfo/ \
+	contractions/ \
+	conversions/ \
+	device_partition/ \
+	events/ \
+	geometrics/ \
+	gl/ \
+	half/ \
+	headers/ \
+	images/ \
+	integer_ops/ \
+	math_brute_force/ \
+	mem_host_flags/ \
+	multiple_device_context/ \
+	printf/ \
+	profiling/ \
+	relationals/ \
+	select/ \
+	thread_dimensions/ \
+	vec_align/ \
+	vec_step/ \
+	workgroups/
+
+ 
+TOP=$(shell pwd)
+
+# Default target: build every product directory.
+all: $(PRODUCTS)
+
+# Run "make clean" in every product directory that exists. $(MAKE) is used
+# instead of a literal "make" so the sub-make is the same program as the
+# parent and inherits its command-line flags.
+clean:
+	@for testdir in $(dir $(PRODUCTS))  ; \
+		do ( \
+			echo "==================================================================================" ; \
+			echo "Cleaning $$testdir" ; \
+			echo "==================================================================================" ; \
+			if test -d $$testdir; \
+				then cd $$testdir && $(MAKE) clean; \
+				else echo "Warning: Directory '$$testdir' Does Not Exist"; \
+			fi; \
+			); \
+		done
+
+# Build one product directory; -i keeps going past failing commands.
+$(PRODUCTS): 
+	@echo "==================================================================================" ;
+	@echo "(`date "+%H:%M:%S"`) Make $@" ;
+	@echo "==================================================================================" ;
+	@if test -d $@; \
+		then cd $(dir $@) && $(MAKE) -i; \
+		else echo "Warning: Directory '$@' Does Not Exist"; \
+		fi
+
+.PHONY: clean $(PRODUCTS)  all
--- a/test_conformance/SVM/CMakeLists.txt
+++ b/test_conformance/SVM/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Module name; CMakeCommon.txt derives the executable name from it.
+set(MODULE_NAME SVM)
+
+# Sources of the SVM test executable: the tests in this directory plus the
+# shared harness sources they are built with.
+set(${MODULE_NAME}_SOURCES
+    main.cpp
+    test_allocate_shared_buffer.cpp
+    test_byte_granularity.cpp
+    test_cross_buffer_pointers.cpp
+    test_enqueue_api.cpp
+    test_fine_grain_memory_consistency.cpp
+    test_fine_grain_sync_buffers.cpp
+    test_pointer_passing.cpp
+    test_set_kernel_exec_info_svm_ptrs.cpp
+    test_shared_address_space_coarse_grain.cpp
+    test_shared_address_space_fine_grain.cpp
+    test_shared_address_space_fine_grain_buffers.cpp
+    test_shared_sub_buffers.cpp
+    test_migrate.cpp
+    ../../test_common/harness/testHarness.c
+    ../../test_common/harness/errorHelpers.c
+    ../../test_common/harness/kernelHelpers.c
+    ../../test_common/harness/mt19937.c
+    ../../test_common/harness/msvc9.c
+    ../../test_common/harness/parseParameters.cpp
+)
+
+# Create the executable target from the variables set above.
+include(../CMakeCommon.txt)
--- a/test_conformance/SVM/Makefile
+++ b/test_conformance/SVM/Makefile
@@ -0,0 +1,54 @@
+# Optional ATF framework support, enabled by defining BUILD_WITH_ATF.
+ifdef BUILD_WITH_ATF
+ATF = -framework ATF
+USE_ATF = -DUSE_ATF
+endif
+
+# Sources of the SVM test. The entry point lives in main.cpp (there is no
+# main.c in this directory), so it must be listed with the .cpp suffix.
+SRCS = main.cpp \
+	test_allocate_shared_buffer.cpp \
+	test_byte_granularity.cpp \
+	test_cross_buffer_pointers.cpp \
+	test_enqueue_api.cpp \
+	test_fine_grain_memory_consistency.cpp \
+	test_fine_grain_sync_buffers.cpp \
+	test_pointer_passing.cpp \
+	test_set_kernel_exec_info_svm_ptrs.cpp \
+	test_shared_address_space_coarse_grain.cpp \
+	test_shared_address_space_fine_grain_buffers.cpp \
+	test_shared_address_space_fine_grain.cpp \
+	test_shared_sub_buffers.cpp \
+	test_migrate.cpp \
+	../../test_common/harness/errorHelpers.c \
+	../../test_common/harness/threadTesting.c \
+	../../test_common/harness/testHarness.c \
+	../../test_common/harness/kernelHelpers.c \
+	../../test_common/harness/typeWrappers.cpp \
+	../../test_common/harness/mt19937.c
+
+# Preprocessor symbols; each entry is turned into -D<entry> in CFLAGS/CXXFLAGS.
+DEFINES = DONT_TEST_GARBAGE_POINTERS
+
+SOURCES = $(abspath $(SRCS))
+# Library search paths and frameworks below target the macOS OpenCL framework.
+LIBPATH += -L/System/Library/Frameworks/OpenCL.framework/Libraries
+LIBPATH += -L.
+HEADERS = 
+TARGET = test_SVM
+INCLUDE = 
+COMPILERFLAGS = -c -Wall -g -Wshorten-64-to-32
+# The C++ driver is used for every source, including the .c harness files.
+CC = c++
+CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+CXXFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+LIBRARIES = -framework OpenCL -framework OpenGL -framework GLUT -framework AppKit ${ATF}
+
+# Object list: replace the .c and then the .cpp suffix of every source with .o.
+OBJECTS := ${SOURCES:.c=.o}
+OBJECTS := ${OBJECTS:.cpp=.o}
+
+TARGETOBJECT =
+all: $(TARGET)
+
+# Link step; the objects themselves are produced by make's implicit rules.
+$(TARGET): $(OBJECTS)
+	$(CC) $(RC_CFLAGS) $(OBJECTS) -o $@ $(LIBPATH) $(LIBRARIES)
+
+clean:
+	rm -f $(TARGET) $(OBJECTS)
+
+# Fallback for any requested target that has no rule.
+.DEFAULT:
+	@echo The target \"$@\" does not exist in Makefile.
--- a/test_conformance/SVM/common.h
+++ b/test_conformance/SVM/common.h
@@ -0,0 +1,101 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include "../../test_common/harness/compat.h"
+#include "../../test_common/harness/testHarness.h"
+#include "../../test_common/harness/errorHelpers.h"
+#include "../../test_common/harness/kernelHelpers.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+    #include <windows.h>
+#endif
+
+// Memory-order selector accepted by the host-side atomic wrappers below.
+typedef enum {
+    memory_order_relaxed,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} cl_memory_order;
+
+// Host-side atomic helpers; their definitions are in main.cpp.
+cl_int AtomicLoadExplicit(volatile cl_int * pValue, cl_memory_order order);
+cl_int AtomicFetchAddExplicit(volatile cl_int *object, cl_int operand, cl_memory_order o);
+
+// Host-side strong compare-and-exchange.
+//
+// Atomically replaces *a with `desired` when *a equals *expected.
+// Returns true when the exchange happened. Otherwise stores the value that
+// was observed in *a into *expected and returns false.
+// The two memory-order arguments are accepted for interface symmetry but are
+// not used: every implemented path relies on a full-barrier primitive.
+template <typename T>
+bool AtomicCompareExchangeStrongExplicit(volatile T *a, T *expected, T desired,
+                                  cl_memory_order order_success,
+                                  cl_memory_order order_failure)
+{
+    T tmp;
+#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32))
+    // NOTE(review): InterlockedCompareExchange operates on a 32-bit LONG, so this
+    // path presumably expects sizeof(T) == sizeof(LONG) -- confirm against callers.
+    tmp = (T)InterlockedCompareExchange((volatile LONG *)a, (LONG)desired, *(LONG *)expected);
+#elif defined(__GNUC__)
+    // Let the builtin pick the operand width from T. Casting the pointer to
+    // intptr_t* made the CAS 8 bytes wide on LP64 targets even for a 4-byte T,
+    // touching the neighbouring memory and comparing against the wrong value.
+    tmp = __sync_val_compare_and_swap(a, *expected, desired);
+#else
+    log_info("Host function not implemented: atomic_compare_exchange\n");
+    tmp = 0;
+#endif
+    if(tmp == *expected)
+        return true;
+    *expected = tmp;
+    return false;
+}
+
+// this checks for a NULL ptr and/or an error code
+#define test_error2(error_code, ptr, msg)  { if(error != 0)  { test_error(error_code, msg); } else  { if(NULL == ptr)  {print_null_error(msg); return -1;} } }
+#define print_null_error(msg) log_error("ERROR: %s! (NULL pointer detected %s:%d)\n", msg, __FILE__, __LINE__ );
+
+// max possible number of queues needed, 1 for each device in platform.
+#define MAXQ 32
+
+// Host-side linked-list node; it has the same fields as the Node struct declared
+// in the OpenCL kernel source in main.cpp.
+typedef struct Node{
+    cl_int global_id;         // index of the list this node belongs to
+    cl_int position_in_list;  // 0 for the head, increasing along the list
+    struct Node* pNext;       // following node in the list
+} Node;
+
+// Host-side linked-list construction and verification (defined in main.cpp).
+extern void   create_linked_lists(Node* pNodes, size_t num_lists, int list_length);
+extern cl_int verify_linked_lists(Node* pNodes, size_t num_lists, int list_length);
+
+// Device-side linked-list construction and verification helpers.
+extern cl_int        create_linked_lists_on_device(int qi, cl_command_queue q, cl_mem allocator,     cl_kernel k, size_t numLists  );
+extern cl_int        verify_linked_lists_on_device(int qi, cl_command_queue q, cl_mem num_correct,   cl_kernel k, cl_int ListLength, size_t numLists  );
+extern cl_int create_linked_lists_on_device_no_map(int qi, cl_command_queue q, size_t *pAllocator,   cl_kernel k, size_t numLists  );
+extern cl_int verify_linked_lists_on_device_no_map(int qi, cl_command_queue q, cl_int *pNum_correct, cl_kernel k, cl_int ListLength, size_t numLists  );
+
+// Test entry points; all share the signature used by the harness function table in main.cpp.
+extern int    test_byte_granularity(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_set_kernel_exec_info_svm_ptrs(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_fine_grain_memory_consistency(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_fine_grain_sync_buffers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_shared_address_space_coarse_grain_old_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_shared_address_space_coarse_grain_new_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_shared_address_space_fine_grain_buffers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_shared_address_space_fine_grain(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_cross_buffer_pointers_coarse_grain(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_svm_pointer_passing(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_allocate_shared_buffer(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_shared_sub_buffers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_enqueue_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int    test_migrate(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+// Creates a context over the SVM-capable devices of the harness platform, one
+// queue per device and, when ppCodeString is non-NULL, a program (defined in main.cpp).
+extern cl_int create_cl_objects(cl_device_id device_from_harness, const char** ppCodeString, cl_context* context, cl_program *program, cl_command_queue *queues, cl_uint *num_devices, cl_device_svm_capabilities required_svm_caps);
+
+// OpenCL C source of the create_linked_lists / verify_linked_lists kernels (defined in main.cpp).
+extern const char *linked_list_create_and_verify_kernels[];
+
+#endif    // #ifndef __COMMON_H__
+
--- a/test_conformance/SVM/main.cpp
+++ b/test_conformance/SVM/main.cpp
@@ -0,0 +1,317 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <vector>
+#include <sstream>
+#include "../../test_common/harness/testHarness.h"
+#include "../../test_common/harness/kernelHelpers.h"
+
+#include "common.h"
+
+// SVM Atomic wrappers.
+// Platforms that support SVM atomics (atomics that work across the host and devices) need to implement these host side functions correctly.
+// Platforms that do not support SVM atomics can simply implement these functions as empty stubs since the functions will not be called.
+// Implementations are provided for MSVC on Windows and for GCC-compatible compilers; add support for other platforms as needed.
+// Atomically reads *pValue and returns it. The `order` argument is not used.
+// On an unsupported compiler an error is logged and -1 is returned.
+cl_int AtomicLoadExplicit(volatile cl_int * pValue, cl_memory_order order)
+{
+#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
+  return *pValue;  // provided the value is aligned x86 doesn't need anything more than this for seq_cst.
+#elif defined(__GNUC__)
+	// Non-x86 GCC-compatible targets: adding zero atomically returns the current value.
+	return __sync_add_and_fetch(pValue, 0);
+#else
+  log_error("ERROR: AtomicLoadExplicit function not implemented\n");
+  return -1;
+#endif
+}
+// all the x86 atomics are seq_cst, so don't need to do anything with the memory order parameter.
+// Atomically adds `operand` to *object and returns the value *object held before
+// the addition. On an unsupported compiler an error is logged and -1 is returned.
+cl_int AtomicFetchAddExplicit(volatile cl_int *object, cl_int operand, cl_memory_order o)
+{
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+  return InterlockedExchangeAdd( (volatile LONG*) object, operand);
+#elif defined(__GNUC__)
+  return __sync_fetch_and_add(object, operand);
+#else
+  log_error("ERROR: AtomicFetchAddExplicit function not implemented\n");
+  return -1;
+#endif
+}
+
+// Atomically stores `desired` into *object and returns the previous value.
+// The `mo` argument is not used; the exchange is performed with full-barrier
+// strength. On an unsupported compiler an error is logged and -1 is returned.
+cl_int AtomicExchangeExplicit(volatile cl_int *object, cl_int desired, cl_memory_order mo)
+{
+#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
+  return InterlockedExchange( (volatile LONG*) object, desired);
+#elif defined(__GNUC__)
+  // __sync_lock_test_and_set is documented as an acquire barrier only, so issue
+  // a full barrier first; earlier stores then cannot be reordered past the exchange.
+  __sync_synchronize();
+  return __sync_lock_test_and_set(object, desired);
+#else
+  log_error("ERROR: AtomicExchangeExplicit function not implemented\n");
+  return -1;
+#endif
+}
+
+
+// OpenCL C source for the linked-list kernels. The adjacent string literals are
+// concatenated by the compiler, so the array holds a single source string.
+const char *linked_list_create_and_verify_kernels[] = {
+  "typedef struct Node {\n"
+  "    int global_id;\n"
+  "    int position_in_list;\n"
+  "    __global struct Node* pNext;\n"
+  "} Node;\n"
+  "\n"
+  // The allocation_index parameter must be initialized on the host to N work-items
+  // The first N nodes in pNodes will be the heads of the lists.
+  "__kernel void create_linked_lists(__global Node* pNodes, volatile __attribute__((nosvm)) __global int* allocation_index, int list_length)\n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "    __global Node *pNode = &pNodes[i];\n"
+  "\n"
+  "    pNode->global_id = i;\n"
+  "    pNode->position_in_list = 0;\n"
+  "\n"
+  "    __global Node *pNew;\n"
+  "    for(int j=1; j < list_length; j++)\n"
+  "    {\n"
+  "        pNew = &pNodes[ atomic_inc(allocation_index) ];// allocate a new node\n"
+  "        pNew->global_id = i;\n"
+  "        pNew->position_in_list = j;\n"
+  "        pNode->pNext = pNew;  // link new node onto end of list\n"
+  "        pNode = pNew;   // move to end of list\n"
+  "    }\n"
+  "}\n"
+
+  // Each work-item walks the list whose head is pNodes[global id] and increments
+  // num_correct once per node that carries the expected list id and position.
+  "__kernel void verify_linked_lists(__global Node* pNodes, volatile __global uint* num_correct, int list_length)\n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "    __global Node *pNode = &pNodes[i];\n"
+  "\n"
+  "    for(int j=0; j < list_length; j++)\n"
+  "    {\n"
+  "        if( pNode->global_id == i && pNode->position_in_list == j)\n"
+  "        {\n"
+  "            atomic_inc(num_correct);\n"
+  "        } \n"
+  "        else {\n"
+  "            break;\n"
+  "        }\n"
+  "        pNode = pNode->pNext;\n"
+  "    }\n"
+  "}\n"
+};
+
+
+// Builds num_lists singly linked lists of list_length nodes each inside pNodes.
+// Nodes [0, num_lists) are the list heads; the remaining nodes are handed out
+// sequentially, starting right after the heads.
+void create_linked_lists(Node* pNodes, size_t num_lists, int list_length)
+{
+  Node *pNextFree = pNodes + num_lists;  // first node after the block of heads
+
+  for(cl_uint list = 0; list < num_lists; ++list)
+  {
+    Node *pTail = pNodes + list;
+    pTail->global_id = list;
+    pTail->position_in_list = 0;
+
+    int pos = 1;
+    while(pos < list_length)
+    {
+      Node *pFresh = pNextFree++;      // take the next unused node
+      pFresh->global_id = list;
+      pFresh->position_in_list = pos++;
+      pTail->pNext = pFresh;           // append it to the list
+      pTail = pFresh;                  // it is the new tail
+    }
+  }
+}
+
+// Walks every list on the host and counts the nodes whose global_id and
+// position_in_list match their list and position. Returns CL_SUCCESS when the
+// count equals list_length * num_lists and -1 otherwise.
+cl_int verify_linked_lists(Node* pNodes, size_t num_lists, int list_length)
+{
+  int matched = 0;
+
+  log_info(" and verifying on host ");
+  for(cl_uint list = 0; list < num_lists; ++list)
+  {
+    Node *pCur = pNodes + list;
+    int pos = 0;
+    // Stop walking a list at the first node that does not match.
+    while(pos < list_length && pCur->global_id == list && pCur->position_in_list == pos)
+    {
+      ++matched;
+      pCur = pCur->pNext;
+      ++pos;
+    }
+  }
+
+  if(matched == list_length * (cl_uint)num_lists)
+  {
+    log_info("Passed\n");
+    return CL_SUCCESS;
+  }
+
+  log_info("Failed\n");
+  return -1;
+}
+
+// Note that we don't use the context provided by the test harness since it doesn't support multiple devices,
+// so we create our own context here that has all devices; we use the same platform that the harness used.
+// Creates the OpenCL objects shared by the SVM tests.
+//
+//   device_from_harness  device chosen by the harness; it selects the platform
+//                        and is always considered first.
+//   ppCodeString         kernel source, or NULL when no program is needed.
+//   context              receives the newly created context.
+//   program              receives the built program when ppCodeString is given.
+//   queues               receives one command queue per selected device; callers
+//                        are expected to provide room for MAXQ entries.
+//   num_devices          receives the number of selected devices.
+//   required_svm_caps    SVM capability bits a device must have to be selected.
+//
+// Returns 0 on success, 1 when no device on the platform offers
+// required_svm_caps (nothing is created; callers decide how to treat this), and
+// another non-zero value on failure.
+cl_int create_cl_objects(cl_device_id device_from_harness, const char** ppCodeString, cl_context* context, cl_program *program, cl_command_queue *queues, cl_uint *num_devices, cl_device_svm_capabilities required_svm_caps)
+{
+  cl_int error;
+
+  cl_platform_id platform_id;
+  // find out what platform the harness is using.
+  error = clGetDeviceInfo(device_from_harness, CL_DEVICE_PLATFORM,sizeof(cl_platform_id),&platform_id,NULL);
+  test_error(error,"clGetDeviceInfo failed");
+
+  error = clGetDeviceIDs(platform_id,  CL_DEVICE_TYPE_ALL, 0, NULL, num_devices );
+  test_error(error, "clGetDeviceIDs failed");
+
+  std::vector<cl_device_id> devicesTmp(*num_devices), devices, capable_devices;
+
+  error = clGetDeviceIDs(platform_id,  CL_DEVICE_TYPE_ALL, *num_devices, &devicesTmp[0], NULL );
+  test_error(error, "clGetDeviceIDs failed");
+
+  // Put the harness device first, followed by every other platform device.
+  devices.push_back(device_from_harness);
+  for (size_t i = 0; i < devicesTmp.size(); ++i)
+  {
+    if (device_from_harness != devicesTmp[i])
+      devices.push_back(devicesTmp[i]);
+  }
+
+  // Select only the devices that support the SVM level needed for the test.
+  // Note that if requested SVM capabilities are not supported by any device then the test still passes (even though it does not execute).
+  cl_device_svm_capabilities caps;
+  cl_uint num_capable_devices = 0;
+  // Iterate over the list actually built above, so no device is dropped when the
+  // harness device is not part of the platform's device list.
+  for(size_t i = 0; i < devices.size(); i++)
+  {
+    size_t ret_len = 0;
+    error = clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, 0, 0, &ret_len);
+    if (error != CL_SUCCESS)
+    {
+      log_error("clGetDeviceInfo failed %s\n", IGetErrorString(error));
+      return -1;
+    }
+
+    std::vector<char> oclVersion(ret_len + 1);
+    error = clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(char) * oclVersion.size(), &oclVersion[0], 0);
+    if (error != CL_SUCCESS)
+    {
+      log_error("clGetDeviceInfo failed %s\n", IGetErrorString(error));
+      return -1;
+    }
+
+    // The version string starts with the 7 characters "OpenCL "; parse the number
+    // that follows. A string too short to hold the prefix is treated as 0.0
+    // instead of being indexed out of bounds.
+    double version = 0.0;
+    if (oclVersion.size() > 7)
+    {
+      std::stringstream stream;
+      stream << std::string(&oclVersion[7]);
+      stream >> version;
+    }
+
+    // Devices other than the harness device are skipped when they predate OpenCL 2.0.
+    if(device_from_harness != devices[i] && version < 2.0)
+    {
+      continue;
+    }
+
+    error = clGetDeviceInfo(devices[i], CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, NULL);
+    test_error(error,"clGetDeviceInfo failed for CL_DEVICE_SVM_CAPABILITIES");
+    if(caps & (~(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER |  CL_DEVICE_SVM_FINE_GRAIN_SYSTEM | CL_DEVICE_SVM_ATOMICS)))
+    {
+      log_error("clGetDeviceInfo returned an invalid cl_device_svm_capabilities value");
+      return -1;
+    }
+    if((caps & required_svm_caps) == required_svm_caps)
+    {
+      capable_devices.push_back(devices[i]);
+      ++num_capable_devices;
+    }
+  }
+  devices = capable_devices;  // the only devices we care about from here on are the ones capable of supporting the requested SVM level.
+  *num_devices = num_capable_devices;
+  if(num_capable_devices == 0)
+  {
+    log_info("Requested SVM level not supported by any device on this platform, test not executed.\n");
+    return 1; // 1 indicates do not execute, but counts as passing.
+  }
+
+  // One queue is written per selected device; refuse to write past the MAXQ
+  // entries that callers allocate for `queues`.
+  if(num_capable_devices > MAXQ)
+  {
+    log_error("Number of SVM capable devices (%u) exceeds MAXQ (%d)\n", num_capable_devices, MAXQ);
+    return -1;
+  }
+
+  // The property list is terminated by the integer 0 (not the pointer constant NULL).
+  cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0 };
+  *context = clCreateContext(context_properties, *num_devices, &devices[0], NULL, NULL, &error);
+  test_error(error, "Unable to create context" );
+
+  for(cl_uint i = 0; i < *num_devices; i++)
+  {
+    queues[i] = clCreateCommandQueueWithProperties(*context, devices[i], 0, &error);
+    test_error(error, "clCreateCommandQueueWithProperties failed");
+  }
+
+  if(ppCodeString)
+  {
+    error = create_single_kernel_helper(*context, program, 0, 1, ppCodeString, 0, "-cl-std=CL2.0");
+    test_error( error, "failed to create program" );
+  }
+
+  return 0;
+}
+
+// Test functions handed to the harness. Entry N corresponds to basefn_names[N].
+basefn    basefn_list[] = {
+  test_byte_granularity,
+  test_set_kernel_exec_info_svm_ptrs,
+  test_fine_grain_memory_consistency,
+  test_fine_grain_sync_buffers,
+  test_shared_address_space_fine_grain,
+  test_shared_sub_buffers,
+  test_shared_address_space_fine_grain_buffers,
+  test_allocate_shared_buffer,
+  test_shared_address_space_coarse_grain_old_api,
+  test_shared_address_space_coarse_grain_new_api,
+  test_cross_buffer_pointers_coarse_grain,
+  test_svm_pointer_passing,
+  test_enqueue_api,
+  test_migrate,
+};
+
+// Test names, in the same order as basefn_list.
+const char    *basefn_names[] = {
+  "svm_byte_granularity",
+  "svm_set_kernel_exec_info_svm_ptrs",
+  "svm_fine_grain_memory_consistency",
+  "svm_fine_grain_sync_buffers",
+  "svm_shared_address_space_fine_grain",
+  "svm_shared_sub_buffers",
+  "svm_shared_address_space_fine_grain_buffers",
+  "svm_allocate_shared_buffer",
+  "svm_shared_address_space_coarse_grain_old_api",
+  "svm_shared_address_space_coarse_grain_new_api",
+  "svm_cross_buffer_pointers_coarse_grain",
+  "svm_pointer_passing",
+  "svm_enqueue_api",
+  "svm_migrate_mem",
+};
+
+// Compile-time check that the two tables above have the same number of entries.
+ct_assert((sizeof(basefn_names) / sizeof(basefn_names[0])) == (sizeof(basefn_list) / sizeof(basefn_list[0])));
+
+// Number of registered tests.
+int    num_fns = sizeof(basefn_names) / sizeof(char *);
+
+
+// Program entry point: passes the command line and the test tables to the harness
+// and returns its result.
+int main(int argc, const char *argv[])
+{
+  return runTestHarness( argc, argv, num_fns, basefn_list, basefn_names, false, true, 0 );
+}
+
+
+
--- a/test_conformance/SVM/test_allocate_shared_buffer.cpp
+++ b/test_conformance/SVM/test_allocate_shared_buffer.cpp
@@ -0,0 +1,107 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// Flag combinations passed to clSVMAlloc by test_allocate_shared_buffer.
+const cl_mem_flags flag_set[] = {
+  CL_MEM_READ_WRITE,
+  CL_MEM_WRITE_ONLY,
+  CL_MEM_READ_ONLY,
+  CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+  CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+  CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+  CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+  CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+  CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+  0
+};
+// Printable form of each flag_set entry; must stay in the same order as flag_set.
+const char* flag_set_names[] = {
+  "CL_MEM_READ_WRITE",
+  "CL_MEM_WRITE_ONLY",
+  "CL_MEM_READ_ONLY",
+  "CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER",
+  "CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER",
+  "CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER",
+  "CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS",
+  "CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS",
+  "CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS",
+  "0"
+};
+
+
+// For every flag combination in flag_set that the device supports, allocates an
+// SVM buffer with clSVMAlloc, wraps it in a cl_mem created with
+// CL_MEM_USE_HOST_PTR, maps that cl_mem and checks that the mapped pointer is
+// the SVM pointer.
+//
+// The context2, queue and num_elements arguments are part of the harness test
+// signature and are not used; the test creates its own context and queues.
+// Returns 0 on success and a non-zero value on failure.
+int test_allocate_shared_buffer(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      err = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  cl_device_svm_capabilities caps;
+  err = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, NULL);
+  test_error(err,"clGetDeviceInfo failed for CL_DEVICE_SVM_CAPABILITIES");
+
+  // Any non-zero result, including "no capable device", fails this test.
+  err = create_cl_objects(deviceID, NULL, &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+  if(err) return -1;
+
+  const size_t size = 1024;
+
+  // iteration over flag combos
+  const int num_flags = sizeof(flag_set)/sizeof(flag_set[0]);
+  for(int i = 0; i < num_flags; i++)
+  {
+    // Skip combinations that need SVM features the device does not report.
+    if (((flag_set[i] & CL_MEM_SVM_FINE_GRAIN_BUFFER) != 0 && (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0)
+        || ((flag_set[i] & CL_MEM_SVM_ATOMICS) != 0 && (caps & CL_DEVICE_SVM_ATOMICS) == 0))
+    {
+      log_info("Skipping clSVMalloc with flags: %s\n", flag_set_names[i]);
+      continue;
+    }
+
+    log_info("Testing clSVMalloc with flags: %s\n", flag_set_names[i]);
+    cl_char *pBufData1 = (cl_char*) clSVMAlloc(context, flag_set[i], size, 0);
+    if(pBufData1 == NULL)
+    {
+      log_error("SVMalloc returned NULL");
+      return -1;
+    }
+
+    {
+      // Scoped so the cl_mem is released before the SVM allocation is freed.
+      clMemWrapper buf = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size, pBufData1, &err);
+      test_error(err,"clCreateBuffer failed");
+
+      // Start from read+write access and remove whatever the host-access flags
+      // forbid. (The previous code started from CL_MAP_READ alone and toggled
+      // bits with XOR, which produced the opposite of the intended flags.)
+      cl_map_flags flags = CL_MAP_READ | CL_MAP_WRITE;
+      if(flag_set[i] & CL_MEM_HOST_READ_ONLY) flags &= ~(cl_map_flags)CL_MAP_WRITE;
+      if(flag_set[i] & CL_MEM_HOST_WRITE_ONLY) flags &= ~(cl_map_flags)CL_MAP_READ;
+
+      if(!(flag_set[i] & CL_MEM_HOST_NO_ACCESS))
+      {
+        cl_char *pBufData2 = (cl_char*) clEnqueueMapBuffer(queues[0], buf, CL_TRUE, flags, 0, size, 0, NULL,NULL, &err);
+        test_error(err, "clEnqueueMapBuffer failed");
+
+        // Release the mapping before the buffer and the SVM memory go away; the
+        // pointer value is still valid for the comparison below.
+        err = clEnqueueUnmapMemObject(queues[0], buf, pBufData2, 0, NULL, NULL);
+        test_error(err, "clEnqueueUnmapMemObject failed");
+        err = clFinish(queues[0]);
+        test_error(err, "clFinish failed");
+
+        if(NULL == pBufData2 || pBufData2 != pBufData1)
+        {
+          log_error("SVM pointer returned by clEnqueueMapBuffer doesn't match pointer returned by clSVMalloc");
+          return -1;
+        }
+      }
+    }
+
+    clSVMFree(context, pBufData1);
+  }
+
+  return 0;
+}
--- a/test_conformance/SVM/test_byte_granularity.cpp
+++ b/test_conformance/SVM/test_byte_granularity.cpp
@@ -0,0 +1,148 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// OpenCL C source for the byte-granularity test.  The adjacent string literals below are
+// concatenated by the compiler, so this array has a single element holding both kernels.
+const char *byte_manipulation_kernels[] = {
+  // Each device will write its id into the bytes that it "owns"; ownership is based on round robin (global_id % num_id).
+  // num_id is equal to the number of SVM devices in the system plus one (for the host code).
+  // id is the index (id) of the device that this kernel is executing on.
+  // For example, if there are 2 SVM devices and the host, the buffer should look like this after each device and the host write their ids:
+  // 0, 1, 2, 0, 1, 2, 0, 1, 2...
+  "__kernel void write_owned_locations(__global char* a, uint num_id, uint id)\n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "   int owner = i % num_id;\n"
+  "    if(id == owner) \n"
+  "       a[i] = id;\n"  // modify location if it belongs to this device, write id
+  "}\n"
+
+  // Verify that a device can see the byte-sized updates from the other devices: sum up the device ids and see if they match the expected value.
+  // Note: this must be called with a reduced NDRange so that neighbor accesses don't go past the end of the buffer.
+  // For example, if there are two SVM devices and the host (3 total devices) the buffer should look like this:
+  // 0,1,2,0,1,2...
+  // and the expected sum at each point is 0+1+2 = 3.
+  // Every mismatch increments *error_count atomically.
+  "__kernel void sum_neighbor_locations(__global char* a, uint num_devices, volatile __global uint* error_count)\n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "    uint expected_sum = (num_devices * (num_devices - 1))/2;\n"
+  "    uint sum = 0;\n"
+  "    for(uint j=0; j<num_devices; j++) {\n"
+  "        sum += a[i + j];\n" // add my neighbors to the right
+  "    }\n"
+  "    if(sum != expected_sum)\n"
+  "        atomic_inc(error_count);\n"
+  "}\n"
+};
+
+
+
+// Fine-grain buffer SVM test: every SVM device and the host concurrently write their own id
+// into the bytes they "own" (round robin over a single allocation); afterwards each device
+// and the host check that every window of neighbouring bytes sums to 0+1+...+num_devices,
+// i.e. that no byte-sized write was lost.
+//
+// deviceID     - device handed to create_cl_objects to build the SVM context.
+// c, queue     - not used; the test works on the context/queues from create_cl_objects.
+// num_elements - number of bytes in the shared buffer.
+//
+// Returns 0 on success, or when create_cl_objects returns 1 (no device offers
+// CL_DEVICE_SVM_FINE_GRAIN_BUFFER); returns a non-zero value otherwise.
+int    test_byte_granularity(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper context;
+  clProgramWrapper program;
+  clKernelWrapper k1,k2;
+  clCommandQueueWrapper queues[MAXQ];
+
+  cl_uint     num_devices = 0;
+  cl_int      err = CL_SUCCESS;
+
+  err = create_cl_objects(deviceID, &byte_manipulation_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
+  if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
+  if(err < 0) return -1; // fail test.
+
+  cl_uint num_devices_plus_host = num_devices + 1;
+
+  // The unsigned subtractions further down (num_elements - num_devices, ...) would wrap around
+  // and cause out-of-bounds accesses if the buffer were smaller than one full id pattern.
+  if(num_elements <= 0 || (cl_uint)num_elements < num_devices_plus_host)
+  {
+    log_error("num_elements (%d) is too small for %u devices plus the host\n", num_elements, num_devices);
+    return -1;
+  }
+
+  k1 = clCreateKernel(program, "write_owned_locations", &err);
+  test_error(err, "clCreateKernel failed");
+  k2 = clCreateKernel(program, "sum_neighbor_locations", &err);
+  test_error(err, "clCreateKernel failed");
+
+
+  cl_char *pA = (cl_char*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_char) * num_elements, 0);
+  if(pA == NULL)
+  {
+    log_error("clSVMAlloc failed for the shared byte buffer\n");
+    return -1;
+  }
+
+  // one error counter per device, each in its own SVM allocation so a kernel can update it.
+  cl_uint **error_counts =  (cl_uint**) malloc(sizeof(cl_uint*) * num_devices);
+  if(error_counts == NULL && num_devices > 0)
+  {
+    log_error("malloc failed for the error counter table\n");
+    clSVMFree(context, pA);
+    return -1;
+  }
+
+  for(cl_uint i=0; i < num_devices; i++) {
+    error_counts[i] = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint), 0);
+    if(error_counts[i] == NULL)
+    {
+      log_error("clSVMAlloc failed for an error counter\n");
+      for(cl_uint j=0; j < i; j++) clSVMFree(context, error_counts[j]);
+      free(error_counts);
+      clSVMFree(context, pA);
+      return -1;
+    }
+    *error_counts[i] = 0;
+  }
+  // -1 marks a byte that nobody has written yet.
+  for(int i=0; i < num_elements; i++) pA[i] = -1;
+
+  err |= clSetKernelArgSVMPointer(k1, 0, pA);
+  err |= clSetKernelArg(k1, 1, sizeof(cl_uint), &num_devices_plus_host);
+  test_error(err, "clSetKernelArg failed");
+
+  // get all the devices going simultaneously
+  size_t element_num = num_elements;
+  for(cl_uint d=0; d < num_devices; d++)  // device ids are 0 .. num_devices-1.
+  {
+    err = clSetKernelArg(k1, 2, sizeof(cl_uint), &d);
+    test_error(err, "clSetKernelArg failed");
+    err = clEnqueueNDRangeKernel(queues[d], k1, 1, NULL, &element_num, NULL, 0, NULL, NULL);
+    test_error(err,"clEnqueueNDRangeKernel failed");
+  }
+
+  for(cl_uint d=0; d < num_devices; d++) clFlush(queues[d]);
+
+  // While the kernels run, the host writes its own id into the bytes it owns.
+  cl_uint host_id = num_devices;  // host code will take the id above the devices.
+  for(int i = (int)num_devices; i < num_elements; i+= num_devices_plus_host) pA[i] = host_id;
+
+  for(cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);
+
+  // now check that each device can see the byte writes made by the other devices.
+
+  err |= clSetKernelArgSVMPointer(k2, 0, pA);
+  err |= clSetKernelArg(k2, 1, sizeof(cl_uint), &num_devices_plus_host);
+  test_error(err, "clSetKernelArg failed");
+
+  // adjusted so k2 doesn't read past end of buffer
+  size_t adjusted_num_elements = num_elements - num_devices;
+  for(cl_uint id = 0; id < num_devices; id++)
+  {
+    err = clSetKernelArgSVMPointer(k2, 2, error_counts[id]);
+    test_error(err, "clSetKernelArg failed");
+
+    err = clEnqueueNDRangeKernel(queues[id], k2, 1, NULL, &adjusted_num_elements, NULL, 0, NULL, NULL);
+    test_error(err,"clEnqueueNDRangeKernel failed");
+  }
+
+  for(cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);
+
+  bool failed = false;
+
+  // see if any of the devices found errors
+  for(cl_uint i=0; i < num_devices; i++) {
+    if(*error_counts[i] > 0)
+      failed = true;
+  }
+  cl_uint expected = (num_devices_plus_host * (num_devices_plus_host - 1))/2;
+  // check that host can see the byte writes made by the devices.
+  for(cl_uint i = 0; i < num_elements - num_devices_plus_host; i++)
+  {
+    int sum = 0;
+    for(cl_uint j=0; j < num_devices_plus_host; j++) sum += pA[i+j];
+    if(sum != expected)
+      failed = true;
+  }
+
+  clSVMFree(context, pA);
+  for(cl_uint i=0; i < num_devices; i++) clSVMFree(context, error_counts[i]);
+  free(error_counts);  // the pointer table itself comes from malloc, not clSVMAlloc.
+
+  if(failed)
+    return -1;
+  return 0;
+}
--- a/test_conformance/SVM/test_cross_buffer_pointers.cpp
+++ b/test_conformance/SVM/test_cross_buffer_pointers.cpp
@@ -0,0 +1,219 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// OpenCL C source that creates and verifies linked lists whose nodes come from two different buffers.
+// The adjacent string literals below are concatenated into the single element of this array.
+const char *SVMCrossBufferPointers_test_kernel[] = {
+  "\n"
+  // Device-side node layout: owning work-item id, position within that work-item's list, and the link to the next node.
+  "typedef struct Node {\n"
+  "    int global_id;\n"
+  "    int position_in_list;\n"
+  "    __global struct Node* pNext;\n"
+  "} Node;\n"
+  "\n"
+  // Takes the next free index from the shared atomic counter and returns that slot from one of the two node buffers.
+  "__global Node* allocate_node(__global Node* pNodes1, __global Node* pNodes2, volatile __global int* allocation_index, size_t i)\n"
+  "{\n"
+  // mix things up, adjacent work items will allocate from different buffers
+  "    if(i & 0x1)\n"
+  "        return &pNodes1[atomic_inc(allocation_index)];\n"
+  "    else\n"
+  "        return &pNodes2[atomic_inc(allocation_index)];\n"
+  "}\n"
+  "\n"
+  // The allocation_index parameter must be initialized on the host to N work-items
+  // The first N nodes in pNodes will be the heads of the lists.
+  // Each work-item i builds one list of list_length nodes starting at pNodes[i].
+  "__kernel void create_linked_lists(__global Node* pNodes, __global Node* pNodes2, volatile __global int* allocation_index, int list_length)\n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "    __global Node *pNode = &pNodes[i];\n"
+  "\n"
+  "    pNode->global_id = i;\n"
+  "    pNode->position_in_list = 0;\n"
+  "\n"
+  "    __global Node *pNew;\n"
+  "    for(int j=1; j < list_length; j++)\n"
+  "    {\n"
+  "        pNew = allocate_node(pNodes, pNodes2, allocation_index, i);\n"
+  "        pNew->global_id = i;\n"
+  "        pNew->position_in_list = j;\n"
+  "        pNode->pNext = pNew;  // link new node onto end of list\n"
+  "        pNode = pNew;   // move to end of list\n"
+  "    }\n"
+  "}\n"
+  "\n"
+  // Each work-item i walks the list headed at pNodes[i] and atomically increments *num_correct for every node
+  // that carries the expected (global_id, position_in_list) pair; the walk stops at the first mismatch.
+  // pNodes2 is not referenced directly here; its nodes are reached only through the pNext links.
+  "__kernel void verify_linked_lists(__global Node* pNodes, __global Node* pNodes2, volatile __global uint* num_correct, int list_length)\n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "    __global Node *pNode = &pNodes[i];\n"
+  "\n"
+  "    for(int j=0; j < list_length; j++)\n"
+  "    {\n"
+  "        if( pNode->global_id == i && pNode->position_in_list == j)\n"
+  "        {\n"
+  "            atomic_inc(num_correct);\n"
+  "        }\n"
+  "        else {\n"
+  "            break;\n"
+  "        }\n"
+  "        pNode = pNode->pNext;\n"
+  "    }\n"
+  "}\n"
+};
+
+
+// Builds the linked lists with host code.
+// Both node buffers are mapped (blocking, read/write) for the duration of the call, the lists
+// are built by create_linked_lists() on the first mapping, and both buffers are unmapped again
+// before the queue is drained.
+//
+// cmdq         - queue used for the map/unmap commands.
+// nodes/nodes2 - the two node buffers.
+// ListLength   - number of nodes per list.
+// numLists     - number of lists.
+//
+// Returns CL_SUCCESS, or whatever test_error/test_error2 return when a CL call fails.
+cl_int create_linked_lists_on_host(cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
+{
+  cl_int status = CL_SUCCESS;
+  // Both buffers are mapped in full.
+  const size_t mapBytes = sizeof(Node)*ListLength*numLists;
+
+  log_info("SVM: creating linked list on host ");
+
+  Node *hostNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, mapBytes, 0, NULL, NULL, &status);
+  test_error2(status, hostNodes, "clEnqueueMapBuffer failed");
+
+  Node *hostNodes2 = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, mapBytes, 0, NULL, NULL, &status);
+  test_error2(status, hostNodes2, "clEnqueueMapBuffer failed");
+
+  // Only the first mapping is handed to the builder; the second buffer just stays mapped meanwhile.
+  create_linked_lists(hostNodes, numLists, ListLength);
+
+  status = clEnqueueUnmapMemObject(cmdq, nodes, hostNodes, 0, NULL, NULL);
+  test_error(status, "clEnqueueUnmapMemObject failed");
+
+  status = clEnqueueUnmapMemObject(cmdq, nodes2, hostNodes2, 0, NULL, NULL);
+  test_error(status, "clEnqueueUnmapMemObject failed");
+
+  status = clFinish(cmdq);
+  test_error(status, "clFinish failed");
+
+  return status;
+}
+
+// Verifies the linked lists with host code.
+// Both node buffers are mapped (blocking), verify_linked_lists() checks the lists through the
+// first mapping, and both buffers are always unmapped again - also when verification fails.
+//
+// ci           - not used by this function.
+// cmdq         - queue used for the map/unmap commands.
+// nodes/nodes2 - the two node buffers.
+// ListLength   - number of nodes per list.
+// numLists     - number of lists.
+//
+// Returns CL_SUCCESS when the lists are correct, -1 when verify_linked_lists() reports a
+// problem, or whatever test_error/test_error2 return when a CL call fails.
+cl_int verify_linked_lists_on_host(int ci, cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
+{
+  cl_int error = CL_SUCCESS;
+  const size_t mapBytes = sizeof(Node)*ListLength * numLists;
+
+  Node *pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, mapBytes, 0, NULL,NULL, &error);
+  test_error2(error, pNodes, "clEnqueueMapBuffer failed");
+  Node *pNodes2 = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, mapBytes, 0, NULL,NULL, &error);
+  // check the pointer that was just mapped (pNodes2), not the first one again.
+  test_error2(error, pNodes2, "clEnqueueMapBuffer failed");
+
+  // Remember the verdict but unmap before reporting it, so a failed verification
+  // does not leave the buffers mapped.
+  cl_int verifyResult = verify_linked_lists(pNodes, numLists, ListLength);
+
+  error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed");
+  error = clEnqueueUnmapMemObject(cmdq, nodes2, pNodes2, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed");
+  error = clFinish(cmdq);
+  test_error(error, "clFinish failed");
+
+  if(verifyResult) return -1;
+  return error;
+}
+
+// Tests that shared buffers can contain pointers that point into other shared buffers, i.e.
+// that all devices and the host share a common address space, using only the coarse-grain
+// SVM features.
+// A linked list is created on one device (or the host) and its correctness is then verified
+// on another device (or the host).  The list nodes are allocated from two different buffers
+// to ensure that cross-buffer pointers work correctly.
+// This basic test is performed for all combinations of devices and the host.
+//
+// Returns 0 on success and a non-zero value on failure.
+int test_cross_buffer_pointers_coarse_grain(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      status = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  status = create_cl_objects(deviceID, &SVMCrossBufferPointers_test_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+  if(status) return -1;
+
+  size_t numLists =  num_elements;
+  cl_int ListLength = 32;
+  // Size in bytes of each of the two node pools.
+  const size_t nodeBytes = sizeof(Node)*ListLength*numLists;
+
+  clKernelWrapper createKernel = clCreateKernel(program, "create_linked_lists", &status);
+  test_error(status, "clCreateKernel failed");
+
+  clKernelWrapper verifyKernel = clCreateKernel(program, "verify_linked_lists", &status);
+  test_error(status, "clCreateKernel failed");
+
+  // Two SVM allocations; each one holds some of the linked list nodes.
+  Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, nodeBytes, 0);
+  Node* pNodes2 = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, nodeBytes, 0);
+
+  // Inner scope: the clMemWrapper objects below go out of scope before the SVM memory is freed.
+  {
+    clMemWrapper nodes = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, nodeBytes, pNodes, &status);
+    test_error(status, "clCreateBuffer failed.");
+
+    clMemWrapper nodes2 = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, nodeBytes, pNodes2, &status);
+    test_error(status, "clCreateBuffer failed.");
+
+    // holds the index into the nodes buffer that is used for node allocation
+    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &status);
+    test_error(status, "clCreateBuffer failed.");
+
+    // holds the count of correct nodes which is computed by the verify kernel.
+    clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &status);
+    test_error(status, "clCreateBuffer failed.");
+
+    // arguments of the list-creation kernel
+    status |= clSetKernelArg(createKernel, 0, sizeof(void*), (void *) &nodes);
+    status |= clSetKernelArg(createKernel, 1, sizeof(void*), (void *) &nodes2);
+    status |= clSetKernelArg(createKernel, 2, sizeof(void*), (void *) &allocator);
+    status |= clSetKernelArg(createKernel, 3, sizeof(cl_int),   (void *) &ListLength);
+
+    // arguments of the list-verification kernel
+    status |= clSetKernelArg(verifyKernel, 0, sizeof(void*), (void *) &nodes);
+    status |= clSetKernelArg(verifyKernel, 1, sizeof(void*), (void *) &nodes2);
+    status |= clSetKernelArg(verifyKernel, 2, sizeof(void*), (void *) &num_correct);
+    status |= clSetKernelArg(verifyKernel, 3, sizeof(cl_int),   (void *) &ListLength);
+    test_error(status, "clSetKernelArg failed");
+
+    // Indices 0 .. num_devices-1 select a device queue; index num_devices stands for the host.
+    const int hostIndex = (int)num_devices;
+
+    // Create the lists on one participant and verify them on another, for every ordered pair.
+    for (int creator = 0; creator <= hostIndex; creator++)
+    {
+      for (int verifier = 0; verifier <= hostIndex; verifier++)
+      {
+        if(creator != hostIndex)
+        {
+          status = create_linked_lists_on_device(creator, queues[creator], allocator, createKernel, numLists);
+          if(status) return -1;
+        }
+        else
+        {
+          status = create_linked_lists_on_host(queues[0], nodes, nodes2, ListLength, numLists);
+          if(status) return -1;
+        }
+
+        if(verifier != hostIndex)
+        {
+          status = verify_linked_lists_on_device(verifier, queues[verifier], num_correct, verifyKernel, ListLength, numLists);
+          if(status) return -1;
+        }
+        else
+        {
+          status = verify_linked_lists_on_host(verifier, queues[0], nodes, nodes2, ListLength, numLists);
+          if(status) return -1;
+        }
+      }
+    }
+  }
+
+  clSVMFree(context, pNodes2);
+  clSVMFree(context, pNodes);
+
+  return 0;
+}
--- a/test_conformance/SVM/test_enqueue_api.cpp
+++ b/test_conformance/SVM/test_enqueue_api.cpp
@@ -0,0 +1,254 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+#include "../../test_common/harness/mt19937.h"
+
+#include <vector>
+
+// State shared between test_enqueue_api and the clEnqueueSVMFree callback.
+struct CallbackData
+{
+  // 0 until the callback has run; the callback sets it to 1 as its last step.
+  cl_uint status;
+  // Pointer count that the runtime passed to the callback.
+  cl_uint num_svm_pointers;
+  // Copy of the SVM pointers that the runtime passed to the callback.
+  std::vector<void *> svm_pointers;
+};
+
+// Fills data[0 .. size-1] with pseudo-random bytes taken from the generator state `seed`.
+// Every 32-bit value returned by genrand_int32() supplies four bytes, low byte first; a fresh
+// value is drawn once all 32 bits have been consumed.
+// `data` must already hold at least `size` elements.
+void generate_data(std::vector<cl_uchar> &data, size_t size, MTdata seed)
+{
+  cl_uint randomData = genrand_int32(seed);
+  cl_uint bitsLeft = 32;
+
+  for( size_t i = 0; i < size; i++ )
+  {
+    if( 0 == bitsLeft)
+    {
+      randomData = genrand_int32(seed);
+      bitsLeft = 32;
+    }
+    data[i] = (cl_uchar)( randomData & 255 );
+    randomData >>= 8;
+    // Account for the consumed byte.  The original subtracted 8 from randomData instead,
+    // which corrupted the random value and meant the refill above never happened.
+    bitsLeft -= 8;
+  }
+}
+
+// Callback passed to the clEnqueueSVMFree command.
+// Records in *user_data (a CallbackData) how many pointers the runtime handed over and which
+// ones, frees every pointer with clSVMFree, and finally sets status to 1 to tell the host that
+// the callback has run.
+//
+// If the context cannot be queried, nothing is freed and the recorded pointers stay null, but
+// status is still set: the host spins on status, so leaving it at 0 would hang the test
+// instead of letting its pointer comparison report the failure.
+void CL_CALLBACK callback_svm_free(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void * user_data)
+{
+  CallbackData *data = (CallbackData *)user_data;
+  data->num_svm_pointers = num_svm_pointers;
+  data->svm_pointers.resize(num_svm_pointers, 0);
+
+  // clSVMFree needs the context that owns the allocations.
+  cl_context context;
+  if(clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, 0) != CL_SUCCESS)
+  {
+    log_error("clGetCommandQueueInfo failed in the callback\n");
+    data->status = 1;  // unblock the waiting host; svm_pointers are still null.
+    return;
+  }
+
+  for (size_t i = 0; i < num_svm_pointers; ++i)
+  {
+    data->svm_pointers[i] = svm_pointers[i];
+    clSVMFree(context, svm_pointers[i]);
+  }
+
+  data->status = 1;
+}
+
+// Exercises the SVM enqueue commands clEnqueueSVMMemFill, clEnqueueSVMMemcpy, clEnqueueSVMMap,
+// clEnqueueSVMUnmap and clEnqueueSVMFree.
+//
+// For the size of every scalar/vector type listed below it
+//   1. chains fill -> copy -> map through events (the fill is gated by a user event),
+//   2. checks that the mapped destination holds the repeated fill pattern,
+//   3. enqueues further fills/copies on both halves of the buffers,
+//   4. frees both buffers with clEnqueueSVMFree (no callback), and
+//   5. checks the command type reported for each event.
+// Finally it frees a batch of buffers with clEnqueueSVMFree using a callback and checks the
+// pointers reported to that callback.
+//
+// deviceID - device handed to create_cl_objects.
+// c, queue, num_elements - not used as inputs (queue is overwritten with the test's own queue).
+//
+// Returns 0 on success and a non-zero value on failure.
+int test_enqueue_api(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper context = NULL;
+  clCommandQueueWrapper queues[MAXQ];
+  cl_uint num_devices = 0;
+  const size_t elementNum = 1024;
+  const size_t numSVMBuffers = 32;
+  cl_int error = CL_SUCCESS;
+  RandomSeed seed(0);
+
+  error = create_cl_objects(deviceID, NULL, &context, NULL, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+  if(error) return -1;
+
+  queue = queues[0];
+
+  //all possible sizes of vectors and scalars
+  size_t typeSizes[] = {
+    sizeof(cl_uchar),
+    sizeof(cl_uchar2),
+    sizeof(cl_uchar3),
+    sizeof(cl_uchar4),
+    sizeof(cl_uchar8),
+    sizeof(cl_uchar16),
+    sizeof(cl_ushort),
+    sizeof(cl_ushort2),
+    sizeof(cl_ushort3),
+    sizeof(cl_ushort4),
+    sizeof(cl_ushort8),
+    sizeof(cl_ushort16),
+    sizeof(cl_uint),
+    sizeof(cl_uint2),
+    sizeof(cl_uint3),
+    sizeof(cl_uint4),
+    sizeof(cl_uint8),
+    sizeof(cl_uint16),
+    sizeof(cl_ulong),
+    sizeof(cl_ulong2),
+    sizeof(cl_ulong3),
+    sizeof(cl_ulong4),
+    sizeof(cl_ulong8),
+    sizeof(cl_ulong16),
+  };
+
+  for (size_t i = 0; i < ( sizeof(typeSizes) / sizeof(typeSizes[0]) ); ++i)
+  {
+    const size_t typeSize = typeSizes[i];            // fill pattern size in bytes
+    const size_t bufferBytes = elementNum * typeSize;
+    const size_t halfBytes = bufferBytes / 2;
+
+    //generate initial data
+    std::vector<cl_uchar> fillData0(typeSize), fillData2(typeSize);
+    generate_data(fillData0, typeSize, seed);
+    generate_data(fillData2, typeSize, seed);
+
+    cl_uchar *srcBuffer = (cl_uchar *)clSVMAlloc(context, CL_MEM_READ_WRITE, bufferBytes, 0);
+    cl_uchar *dstBuffer = (cl_uchar *)clSVMAlloc(context, CL_MEM_READ_WRITE, bufferBytes, 0);
+    if (srcBuffer == NULL || dstBuffer == NULL)
+    {
+      log_error("clSVMAlloc failed\n");
+      // clSVMFree takes no action for a NULL pointer, so both can be passed unconditionally.
+      clSVMFree(context, srcBuffer);
+      clSVMFree(context, dstBuffer);
+      return -1;
+    }
+
+    // The user event holds back the fill until the copy depending on it has been enqueued.
+    clEventWrapper userEvent = clCreateUserEvent(context, &error);
+    test_error(error, "clCreateUserEvent failed");
+
+    clEventWrapper eventMemFill;
+    error = clEnqueueSVMMemFill(queue, srcBuffer, &fillData0[0], typeSize, bufferBytes, 1, &userEvent, &eventMemFill);
+    test_error(error, "clEnqueueSVMMemFill failed");
+
+    clEventWrapper eventMemcpy;
+    error = clEnqueueSVMMemcpy(queue, CL_FALSE, dstBuffer, srcBuffer, bufferBytes, 1, &eventMemFill, &eventMemcpy);
+    test_error(error, "clEnqueueSVMMemcpy failed");
+
+    error = clSetUserEventStatus(userEvent, CL_COMPLETE);
+    test_error(error, "clSetUserEventStatus failed");
+
+    clEventWrapper eventMap;
+    error = clEnqueueSVMMap(queue, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, dstBuffer, bufferBytes, 1, &eventMemcpy, &eventMap);
+    test_error(error, "clEnqueueSVMMap failed");
+
+    error = clWaitForEvents(1, &eventMap);
+    test_error(error, "clWaitForEvents failed");
+
+    //data verification: the destination must hold the fill pattern repeated elementNum times
+    for (size_t j = 0; j < bufferBytes; ++j)
+    {
+      if (dstBuffer[j] != fillData0[j % typeSize])
+      {
+        // j is a size_t; cast it so that it matches the %ld conversion on every platform.
+        log_error("Invalid data at index %ld, expected %d, got %d\n", (long)j, fillData0[j % typeSize], dstBuffer[j]);
+        return -1;
+      }
+    }
+
+    clEventWrapper eventUnmap;
+    error = clEnqueueSVMUnmap(queue, dstBuffer, 0, 0, &eventUnmap);
+    test_error(error, "clEnqueueSVMUnmap failed");
+
+    // second round: fills and copies on the two halves of the buffers, without events.
+    error = clEnqueueSVMMemFill(queue, srcBuffer, &fillData2[0], typeSize, halfBytes, 0, 0, 0);
+    test_error(error, "clEnqueueSVMMemFill failed");
+
+    error = clEnqueueSVMMemFill(queue, dstBuffer + halfBytes, &fillData2[0], typeSize, halfBytes, 0, 0, 0);
+    test_error(error, "clEnqueueSVMMemFill failed");
+
+    error = clEnqueueSVMMemcpy(queue, CL_FALSE, dstBuffer, srcBuffer, halfBytes, 0, 0, 0);
+    test_error(error, "clEnqueueSVMMemcpy failed");
+
+    error = clEnqueueSVMMemcpy(queue, CL_TRUE, dstBuffer + halfBytes, srcBuffer + halfBytes, halfBytes, 0, 0, 0);
+    test_error(error, "clEnqueueSVMMemcpy failed");
+
+    void *ptrs[] = {(void *)srcBuffer, (void *)dstBuffer};
+
+    // no callback is given here, so the runtime frees the pointers itself.
+    clEventWrapper eventFree;
+    error = clEnqueueSVMFree(queue, 2, ptrs, NULL, NULL, 0, NULL, &eventFree);
+    test_error(error, "clEnqueueSVMFree failed");
+
+    error = clWaitForEvents(1, &eventFree);
+    test_error(error, "clWaitForEvents failed");
+
+    //event info verification for new SVM commands
+    cl_command_type commandType;
+    error = clGetEventInfo(eventMemFill, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
+    test_error(error, "clGetEventInfo failed");
+    if (commandType != CL_COMMAND_SVM_MEMFILL)
+    {
+      log_error("Invalid command type returned for clEnqueueSVMMemFill\n");
+      return -1;
+    }
+
+    error = clGetEventInfo(eventMemcpy, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
+    test_error(error, "clGetEventInfo failed");
+    if (commandType != CL_COMMAND_SVM_MEMCPY)
+    {
+      log_error("Invalid command type returned for clEnqueueSVMMemcpy\n");
+      return -1;
+    }
+
+    error = clGetEventInfo(eventMap, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
+    test_error(error, "clGetEventInfo failed");
+    if (commandType != CL_COMMAND_SVM_MAP)
+    {
+      log_error("Invalid command type returned for clEnqueueSVMMap\n");
+      return -1;
+    }
+
+    error = clGetEventInfo(eventUnmap, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
+    test_error(error, "clGetEventInfo failed");
+    if (commandType != CL_COMMAND_SVM_UNMAP)
+    {
+      log_error("Invalid command type returned for clEnqueueSVMUnmap\n");
+      return -1;
+    }
+
+    error = clGetEventInfo(eventFree, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
+    test_error(error, "clGetEventInfo failed");
+    if (commandType != CL_COMMAND_SVM_FREE)
+    {
+      log_error("Invalid command type returned for clEnqueueSVMFree\n");
+      return -1;
+    }
+  }
+
+  std::vector<void *> buffers(numSVMBuffers, 0);
+  for(size_t i = 0; i < numSVMBuffers; ++i)
+  {
+    buffers[i] = clSVMAlloc(context, CL_MEM_READ_WRITE, elementNum, 0);
+    if (buffers[i] == NULL)
+    {
+      log_error("clSVMAlloc failed\n");
+      for(size_t k = 0; k < i; ++k) clSVMFree(context, buffers[k]);
+      return -1;
+    }
+  }
+
+  //verify if callback is triggered correctly
+  CallbackData data;
+  data.status = 0;
+
+  error = clEnqueueSVMFree(queue, (cl_uint)buffers.size(), &buffers[0], callback_svm_free, &data, 0, 0, 0);
+  test_error(error, "clEnqueueSVMFree failed");
+
+  error = clFinish(queue);
+  test_error(error, "clFinish failed");
+
+  //wait for the callback; the flag is written from the callback, so read it through a
+  //volatile lvalue to keep the compiler from hoisting the load out of the loop.
+  while(*(volatile cl_uint *)&data.status == 0) { }
+
+  //check if number of SVM pointers returned in the callback matches with expected
+  if (data.num_svm_pointers != buffers.size())
+  {
+    log_error("Invalid number of SVM pointers returned in the callback, expected: %ld, got: %d\n", (long)buffers.size(), data.num_svm_pointers);
+    return -1;
+  }
+
+  //check if pointers returned in callback are correct
+  for (size_t i = 0; i < buffers.size(); ++i)
+  {
+    if (data.svm_pointers[i] != buffers[i])
+    {
+      log_error("Invalid SVM pointer returned in the callback, idx: %ld\n", (long)i);
+      return -1;
+    }
+  }
+
+  return 0;
+}
--- a/test_conformance/SVM/test_fine_grain_memory_consistency.cpp
+++ b/test_conformance/SVM/test_fine_grain_memory_consistency.cpp
@@ -0,0 +1,168 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// OpenCL C source for the memory-consistency test.  The adjacent string literals below are
+// concatenated into the single element of this array.
+const char *hash_table_kernel[] = {
+  // Device-side node: a value plus the link to the next node in the same bin, stored as an atomic integer.
+  "typedef struct BinNode {\n"
+  " int value;\n"
+  " atomic_uintptr_t pNext;\n"
+  "} BinNode;\n"
+
+  // Each work-item claims a fresh node through the shared atomic counter pNumNodes, stores input[i] in it and
+  // pushes it onto the head of bin (input[i] % numBins) with a compare-exchange retry loop.
+  // pNodes[b] serves as the list head of bin b.
+  "__kernel void build_hash_table(__global uint* input, __global BinNode* pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n"
+  "{\n"
+  " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, 1, memory_order_relaxed, memory_scope_all_svm_devices) ];\n"
+  " uint i = get_global_id(0);\n"
+  " uint b = input[i] % numBins;\n"
+  " pNew->value = input[i];\n"
+  " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), memory_order_seq_cst, memory_scope_all_svm_devices);\n"
+  " do\n"
+  " {\n"
+  "   atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, memory_scope_all_svm_devices);\n" // always inserting at head of list
+  " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), &next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, memory_scope_all_svm_devices));\n"
+  "}\n"
+};
+
+// Host-side counterpart of the BinNode declared in hash_table_kernel: a value followed by the
+// link to the next node of the same bin.  The member order must not be changed.
+struct BinNode
+{
+  cl_uint value;
+  BinNode* pNext;
+};
+
+// Host-side counterpart of the build_hash_table kernel: inserts every input value into the
+// shared hash table while the devices are doing the same.
+//
+// c         - not used.
+// input     - values to insert; inputSize is their count.
+// pNodes    - node pool; pNodes[0 .. numBins-1] are the list heads of the bins.
+// pNumNodes - shared atomic counter giving the index of the next unused node.
+// numBins   - number of bins.
+void build_hash_table_on_host(cl_context c, cl_uint* input, size_t inputSize, BinNode* pNodes, cl_int volatile *pNumNodes, cl_uint numBins)
+{
+  for(cl_uint i = 0; i < inputSize; i++)
+  {
+    // claim a fresh node; the counter is shared with the device kernels.
+    BinNode *pNew = &pNodes[ AtomicFetchAddExplicit(pNumNodes, 1, memory_order_relaxed) ];
+    cl_uint b = input[i] % numBins;
+    pNew->value = input[i];
+
+    BinNode *next = pNodes[b].pNext;
+    do {
+        pNew->pNext = next;  // always inserting at head of list
+    // Same orders as the device kernel's compare-exchange: seq_cst on success so the node's
+    // contents are published together with the new head, relaxed on failure (the loop simply
+    // retries).  A failure order stronger than the success order is not a valid combination.
+    // NOTE(review): assumes the helper takes (success, failure) in that order, like the
+    // OpenCL C builtin used in the kernel - confirm against its declaration.
+    } while(!AtomicCompareExchangeStrongExplicit(&(pNodes[b].pNext), &next, pNew, memory_order_seq_cst, memory_order_relaxed));
+  }
+}
+
+
+// Runs one round of the hash table test: every device and the host concurrently insert all
+// num_pixels input values into one shared hash table with numBins bins, then the host walks
+// every bin and checks that each node sits in the right bin and that no insertion was lost.
+//
+// context, queues, kernel - objects prepared by the caller; queues[0 .. num_devices-1] are used.
+// num_devices - number of device queues to launch the kernel on.
+// numBins     - number of bins; the first numBins nodes of the pool are the list heads.
+// num_pixels  - number of input values (and work-items per device).
+//
+// Returns 0 on success and a non-zero value on failure.
+int launch_kernels_and_verify(clContextWrapper &context, clCommandQueueWrapper* queues, clKernelWrapper &kernel, cl_uint num_devices, cl_uint numBins, size_t num_pixels)
+{
+  int err = CL_SUCCESS;
+  // every device and the host insert all of the pixels.
+  const size_t expected_items = num_pixels * (num_devices + 1);
+
+  cl_uint *pInputImage = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY  | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0);
+  BinNode *pNodes      = (BinNode*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(BinNode) * (expected_items + numBins), 0);
+  cl_int *pNumNodes       = (cl_int*)  clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int), 0);
+  if(pInputImage == NULL || pNodes == NULL || pNumNodes == NULL)
+  {
+    log_error("clSVMAlloc failed\n");
+    // clSVMFree takes no action for a NULL pointer, so all three can be passed unconditionally.
+    clSVMFree(context, pInputImage);
+    clSVMFree(context, pNodes);
+    clSVMFree(context, pNumNodes);
+    return -1;
+  }
+
+  *pNumNodes = numBins;  // using the first numBins nodes to hold the list heads.
+  for(cl_uint i=0;i<numBins;i++) {
+    pNodes[i].pNext = NULL;
+  }
+
+  for(cl_uint i=0; i < num_pixels; i++) pInputImage[i] = i;
+
+  err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage);
+  err |= clSetKernelArgSVMPointer(kernel, 1, pNodes);
+  err |= clSetKernelArgSVMPointer(kernel, 2, pNumNodes);
+  err |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &numBins);
+
+  test_error(err, "clSetKernelArg failed");
+
+  // get all the devices going simultaneously, each device (and the host) will insert all the pixels.
+  for(cl_uint d=0; d<num_devices; d++)
+  {
+    // No event is requested: completion is awaited with clFinish below, and an event returned
+    // here would have to be released by this function.
+    err = clEnqueueNDRangeKernel(queues[d], kernel, 1, NULL, &num_pixels, NULL, 0, NULL, NULL);
+    test_error(err,"clEnqueueNDRangeKernel failed");
+  }
+  for(cl_uint d=0; d<num_devices; d++) clFlush(queues[d]);
+
+  // wait until we see some activity from a device (try to run host side simultaneously).
+  while(numBins == AtomicLoadExplicit(pNumNodes, memory_order_relaxed));
+
+  build_hash_table_on_host(context, pInputImage, num_pixels, pNodes, pNumNodes, numBins);
+
+  for(cl_uint d=0; d<num_devices; d++) clFinish(queues[d]);
+
+  cl_uint num_items = 0;
+  // check correctness of each bin in the hash table.
+  for(cl_uint i = 0; i < numBins; i++)
+  {
+    BinNode *pNode = pNodes[i].pNext;
+    while(pNode)
+    {
+      if((pNode->value % numBins) != i)
+      {
+        // stop walking this bin; the remaining nodes stay uncounted.
+        log_error("Something went wrong, item is in wrong hash bucket\n");
+        break;
+      }
+      num_items++;
+      pNode = pNode->pNext;
+    }
+  }
+
+  clSVMFree(context, pInputImage);
+  clSVMFree(context, pNodes);
+  clSVMFree(context, pNumNodes);
+  // each device and the host inserted all of the pixels, check that none are missing.
+  if(num_items != expected_items )
+  {
+    // expected_items is a size_t; cast it so that it matches the %d conversion.
+    log_error("The hash table is not correct, num items %d, expected num items: %d\n", num_items, (int)expected_items);
+    return -1; // test did not pass
+  }
+  return 0;
+}
+
+// This tests for memory consistency across devices and the host.
+// Each device and the host simultaneously insert values into a single hash table.
+// Each bin in the hash table is a linked list.  Each bin is protected against simultaneous
+// update using a lock free technique.  The correctness of the list is verified on the host.
+// This test requires the new OpenCL 2.0 atomic operations that implement the new seq_cst memory ordering.
+//
+// Returns 0 when the test passes or is skipped, and a non-zero value on failure.
+int    test_fine_grain_memory_consistency(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper context;
+  clProgramWrapper program;
+  clKernelWrapper kernel;
+  clCommandQueueWrapper queues[MAXQ];
+
+  cl_uint     num_devices = 0;
+  cl_int      err = CL_SUCCESS;
+
+  // With 8-byte host pointers the test is skipped unless both 64-bit atomics extensions are present.
+  if (sizeof(void *) == 8)
+  {
+    const bool hasBase     = is_extension_available(deviceID, "cl_khr_int64_base_atomics");
+    const bool hasExtended = hasBase && is_extension_available(deviceID, "cl_khr_int64_extended_atomics");
+    if (!hasExtended)
+    {
+      log_info("WARNING: test skipped. 'cl_khr_int64_base_atomics' and 'cl_khr_int64_extended_atomics' extensions are not supported\n");
+      return 0;
+    }
+  }
+
+  err = create_cl_objects(deviceID, &hash_table_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS);
+  if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
+  if(err < 0) return -1; // fail test.
+
+  kernel = clCreateKernel(program, "build_hash_table", &err);
+  test_error(err, "clCreateKernel failed");
+  size_t num_pixels = num_elements;
+
+  // Bin counts to exercise, in this order:
+  //    1 - all work groups in all devices and the host code hammer on one lock.
+  //    2 - two locks within the same cache line get hit from different devices and the host.
+  //   29 - the locks span a few cache lines.
+  const cl_uint binCounts[] = { 1, 2, 29 };
+
+  int result = 0;
+  for(size_t round = 0; round < sizeof(binCounts)/sizeof(binCounts[0]); round++)
+  {
+    result = launch_kernels_and_verify(context, queues, kernel, num_devices, binCounts[round], num_pixels);
+    if(result == -1) return result;
+  }
+
+  return result;
+}
--- a/test_conformance/SVM/test_fine_grain_sync_buffers.cpp
+++ b/test_conformance/SVM/test_fine_grain_sync_buffers.cpp
@@ -0,0 +1,105 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// OpenCL C source of the "find_targets" kernel.
+// Each work-item compares image[get_global_id(0)] with `target`. On a match it atomically
+// increments *numTargetsFound (relaxed order, device scope) and then atomically stores its own
+// index into targetLocations[<previous counter value>] (relaxed order, all-SVM-devices scope).
+const char *find_targets_kernel[] = {
+
+  "__kernel void find_targets(__global uint* image, uint target, volatile __global atomic_uint *numTargetsFound, volatile __global atomic_uint *targetLocations)\n"
+  "{\n"
+  " size_t i = get_global_id(0);\n"
+  " uint index;\n"
+  " if(image[i] == target) {\n"
+  "   index = atomic_fetch_add_explicit(numTargetsFound, 1, memory_order_relaxed, memory_scope_device); \n"
+  "   atomic_exchange_explicit(&targetLocations[index], i, memory_order_relaxed, memory_scope_all_svm_devices); \n"
+  " }\n"
+  "}\n"
+};
+
+
+// Placeholder for the host-side work that would analyze a target found at `location`.
+// It intentionally does nothing; the only candidate action (a debug print) is commented out.
+void spawnAnalysisTask(int location)
+{
+  //    printf("found target at location %d\n", location);
+}
+
+// Number of slots in the target-location table shared between host and device.
+#define MAX_TARGETS 1024
+
+// Goals: demonstrate use of SVM's atomics to do fine grain synchronization between the device and host.
+// Concept: a device kernel is used to search an input image for regions that match a target pattern.
+// The device immediately notifies the host when it finds a target (via an atomic operation that works across host and devices).
+// The host is then able to spawn a task that further analyzes the target while the device continues searching for more targets.
+//
+// Three targets are planted in the image; the test passes (returns 0) when the host observes
+// exactly three filled slots. It also returns 0 when no device supports the required SVM level,
+// and -1 (or the error propagated by test_error) on failure.
+int    test_fine_grain_sync_buffers(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      err = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  err = create_cl_objects(deviceID, &find_targets_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS);
+  if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
+  if(err < 0) return -1; // fail test.
+
+  clKernelWrapper kernel = clCreateKernel(program, "find_targets", &err);
+  test_error(err, "clCreateKernel failed");
+
+  // Targets are planted at pixels 0, 3 and num_pixels-1, so at least 5 pixels are needed for
+  // three distinct, in-bounds locations.
+  size_t num_pixels = (num_elements < 5) ? 5 : (size_t)num_elements;
+
+  cl_uint *pInputImage      = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY  | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0);
+  cl_uint *pNumTargetsFound = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_uint), 0);
+  cl_int  *pTargetLocations = (cl_int* ) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int) * MAX_TARGETS, 0);
+  if (pInputImage == NULL || pNumTargetsFound == NULL || pTargetLocations == NULL)
+  {
+    log_error("ERROR: clSVMAlloc returned NULL at %s:%d\n", __FILE__, __LINE__);
+    // clSVMFree is a no-op for NULL pointers, so all three can be passed unconditionally.
+    clSVMFree(context, pInputImage);
+    clSVMFree(context, pNumTargetsFound);
+    clSVMFree(context, pTargetLocations);
+    return -1;
+  }
+
+  cl_uint targetDescriptor = 777;
+  *pNumTargetsFound = 0;
+  cl_uint i;
+  for(i=0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1;  // -1 marks an unused slot.
+  for(size_t p=0; p < num_pixels; p++) pInputImage[p] = 0;
+  pInputImage[0] = targetDescriptor;
+  pInputImage[3] = targetDescriptor;
+  pInputImage[num_pixels - 1] = targetDescriptor;
+
+  err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage);
+  err |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &targetDescriptor);
+  err |= clSetKernelArgSVMPointer(kernel, 2, pNumTargetsFound);
+  err |= clSetKernelArgSVMPointer(kernel, 3, pTargetLocations);
+  test_error(err, "clSetKernelArg failed");
+
+  cl_event done;
+  err = clEnqueueNDRangeKernel(queues[0], kernel, 1, NULL, &num_pixels, NULL, 0, NULL, &done);
+  test_error(err,"clEnqueueNDRangeKernel failed");
+  clFlush(queues[0]);
+
+
+  i=0;
+  int result = 0;
+  cl_int status = CL_QUEUED;
+  // check for new targets, if found spawn a task to analyze target.
+  do {
+    err = clGetEventInfo(done,CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL);
+    if (err != CL_SUCCESS)
+    {
+      log_error("ERROR: clGetEventInfo failed (%d)\n", err);
+      result = -1;
+      break;
+    }
+    if (status < 0)
+    {
+      // A negative status means the kernel terminated abnormally and will never reach
+      // CL_COMPLETE; bail out instead of polling forever.
+      log_error("ERROR: find_targets kernel terminated with execution status %d\n", status);
+      result = -1;
+      break;
+    }
+    if( i < MAX_TARGETS && AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1)  // -1 indicates slot not used yet.
+    {
+      spawnAnalysisTask(pTargetLocations[i]);
+      i++;
+    }
+    // Keep polling until the kernel is done and every filled slot has been consumed.
+  } while (status != CL_COMPLETE || (i < MAX_TARGETS && AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1));
+
+  // On the error paths the kernel state is uncertain; drain the queue before freeing memory it may still use.
+  if (result != 0) clFinish(queues[0]);
+
+  clReleaseEvent(done);
+
+  clSVMFree(context, pInputImage);
+  clSVMFree(context, pNumTargetsFound);
+  clSVMFree(context, pTargetLocations);
+
+  if (result != 0) return -1;
+  if(i != 3)
+  {
+    log_error("ERROR: expected 3 targets but the host observed %u\n", i);
+    return -1;
+  }
+  return 0;
+}
--- a/test_conformance/SVM/test_migrate.cpp
+++ b/test_conformance/SVM/test_migrate.cpp
@@ -0,0 +1,330 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+#include "../../test_common/harness/mt19937.h"
+#include <vector>
+
+#define GLOBAL_SIZE 65536
+
+// OpenCL C source of "migrate_kernel": each work-item XORs element get_global_id(0) of the
+// three buffers a, b and c with a fixed per-buffer constant. Because XOR with a constant is its
+// own inverse, an even number of launches leaves the buffers unchanged.
+static const char *sources[] = {
+"__kernel void migrate_kernel(__global uint * restrict a, __global uint * restrict b, __global uint * restrict c)\n"
+"{\n"
+"    size_t i = get_global_id(0);\n"
+"    a[i] ^= 0x13579bdf;\n"
+"    b[i] ^= 0x2468ace0;\n"
+"    c[i] ^= 0x731fec8f;\n"
+"}\n"
+};
+
+// Fills p[0..n) with successive 32-bit values drawn from the generator `seed`.
+static void
+fill_buffer(cl_uint* p, size_t n, MTdata seed)
+{
+    cl_uint* const end = p + n;
+    for (cl_uint* cur = p; cur != end; ++cur)
+        *cur = (cl_uint)genrand_int32(seed);
+}
+
+// Compares the actual words a[0..n) against the expected words e[0..n).
+// Logs the first mismatch (tagged with `s`, expected value marked with '*') and stops there.
+// Returns true when every word matches.
+static bool
+check(const char* s, cl_uint* a, cl_uint* e, size_t n)
+{
+    for (size_t word = 0; word < n; ++word) {
+        if (a[word] == e[word])
+            continue;
+        log_error("ERROR: %s mismatch at word %u, *%08x vs %08x\n", s, (unsigned int)word, e[word], a[word]);
+        return false;
+    }
+    return true;
+}
+
+// Waits for evs[0..n) and then releases every one of them.
+// When the wait reports CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, the first event whose
+// execution status is not CL_COMPLETE is logged (tagged with `s`) and its status is returned.
+// Unlike a plain early return, the events are still released on that path so they do not leak.
+// Returns 0 on success and a non-zero OpenCL error code / execution status on failure.
+static int
+wait_and_release(const char* s, cl_event* evs, int n)
+{
+    int result = 0;
+
+    cl_int error = clWaitForEvents(n, evs);
+    if (error == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) {
+        for (int i=0; i<n && result == 0; ++i) {
+            cl_int e;
+            error = clGetEventInfo(evs[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &e, NULL);
+            if (error != CL_SUCCESS) {
+                log_error("ERROR: clGetEventInfo failed for %s event %d: %s\n", s, i, IGetErrorString(error));
+                result = error;
+            } else if (e != CL_COMPLETE) {
+                log_error("ERROR: %s event %d execution status was %s\n", s, i, IGetErrorString(e));
+                result = e;
+            }
+        }
+    } else {
+        // Any other wait failure (e.g. an invalid event list) is reported as-is; the events
+        // are not touched because their validity is unknown.
+        test_error(error, "clWaitForEvents failed");
+    }
+
+    // Release every event, even after a failure, and keep going if one release fails.
+    for (int i=0; i<n; ++i) {
+        error = clReleaseEvent(evs[i]);
+        if (error != CL_SUCCESS && result == 0) {
+            log_error("ERROR: clReleaseEvent failed for %s event %d: %s\n", s, i, IGetErrorString(error));
+            result = error;
+        }
+    }
+
+    return result;
+}
+
+// Exercises clEnqueueSVMMigrateMem on three coarse-grain SVM buffers using two command queues
+// (two devices when available, otherwise two queues on the same device).
+//
+// Flow: fill the buffers on queue0, migrate them with various flags/sub-ranges, map them and
+// copy in random host data, then alternate migrations and "migrate_kernel" launches between the
+// two queues. The kernel XORs each buffer with a constant and is launched four times, so the
+// final contents must equal the host data again. Finally the buffers are unmapped and freed via
+// clEnqueueSVMFree.
+//
+// Returns 0 on success, -1 (or the error propagated by test_error) on failure.
+int
+test_migrate(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
+{
+    // Host-side reference copies live on the heap: 256 KiB per array is too much to place
+    // safely on the stack.
+    std::vector<cl_uint> amem(GLOBAL_SIZE);
+    std::vector<cl_uint> bmem(GLOBAL_SIZE);
+    std::vector<cl_uint> cmem(GLOBAL_SIZE);
+    cl_event evs[20];
+
+    const size_t global_size = GLOBAL_SIZE;
+
+    RandomSeed seed(0);
+
+    clContextWrapper context = NULL;
+    clCommandQueueWrapper queues[MAXQ];
+    cl_uint num_devices = 0;
+    clProgramWrapper program;
+    cl_int error;
+
+    error = create_cl_objects(deviceID, &sources[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+    if (error)
+        return -1;
+
+    // queue0/queue1 are plain handles. Ownership stays with `queues[]`, or with
+    // `ownedQueue1` for the extra queue created below, so each queue has exactly one owner
+    // regardless of how the wrapper class handles copies.
+    cl_command_queue queue0 = queues[0];
+    cl_command_queue queue1 = NULL;
+    clCommandQueueWrapper ownedQueue1;
+
+    if (num_devices > 1) {
+        log_info("  Running on two devices.\n");
+        queue1 = queues[1];
+    } else {
+        // Ensure we have two distinct queues
+        cl_device_id did;
+        error = clGetCommandQueueInfo(queue0, CL_QUEUE_DEVICE, sizeof(did), (void *)&did, NULL);
+        test_error(error, "clGetCommandQueueInfo failed");
+
+        cl_command_queue_properties cqp;
+        error = clGetCommandQueueInfo(queue0, CL_QUEUE_PROPERTIES, sizeof(cqp), &cqp, NULL);
+        test_error(error, "clGetCommandQueueInfo failed");
+
+        cl_queue_properties qp[3] = { CL_QUEUE_PROPERTIES, cqp, 0 };
+        ownedQueue1 = clCreateCommandQueueWithProperties(context, did, qp, &error);
+        test_error(error, "clCreateCommandQueueWithProperties failed");
+        queue1 = ownedQueue1;
+    }
+
+    clKernelWrapper kernel = clCreateKernel(program, "migrate_kernel", &error);
+    test_error(error, "clCreateKernel failed");
+
+    char* asvm = (char*)clSVMAlloc(context, CL_MEM_READ_WRITE, global_size*sizeof(cl_uint), 16);
+    if (asvm == NULL) {
+        log_error("ERROR: clSVMAlloc returned NULL at %s:%d\n", __FILE__, __LINE__);
+        return -1;
+    }
+
+    char* bsvm = (char *)clSVMAlloc(context, CL_MEM_READ_WRITE, global_size*sizeof(cl_uint), 16);
+    if (bsvm == NULL) {
+        log_error("ERROR: clSVMAlloc returned NULL at %s:%d\n", __FILE__, __LINE__);
+        clSVMFree(context, asvm);
+        return -1;
+    }
+
+    char* csvm = (char *)clSVMAlloc(context, CL_MEM_READ_WRITE, global_size*sizeof(cl_uint), 16);
+    if (csvm == NULL) {
+        log_error("ERROR: clSVMAlloc returned NULL at %s:%d\n", __FILE__, __LINE__);
+        clSVMFree(context, bsvm);
+        clSVMFree(context, asvm);
+        return -1;
+    }
+
+    error = clSetKernelArgSVMPointer(kernel, 0, (void*)asvm);
+    test_error(error, "clSetKernelArgSVMPointer failed");
+
+    error = clSetKernelArgSVMPointer(kernel, 1, (void*)bsvm);
+    test_error(error, "clSetKernelArgSVMPointer failed");
+
+    error = clSetKernelArgSVMPointer(kernel, 2, (void*)csvm);
+    test_error(error, "clSetKernelArgSVMPointer failed");
+
+    // Initialize host copy of data (and result)
+    fill_buffer(&amem[0], global_size, seed);
+    fill_buffer(&bmem[0], global_size, seed);
+    fill_buffer(&cmem[0], global_size, seed);
+
+    // Now we're ready to start
+    {
+        // First, fill in the data on device0
+        cl_uint patt[] = { 0, 0, 0, 0};
+        error = clEnqueueSVMMemFill(queue0, (void *)asvm, patt, sizeof(patt), global_size*sizeof(cl_uint), 0, NULL, &evs[0]);
+        test_error(error, "clEnqueueSVMMemFill failed");
+
+        error = clEnqueueSVMMemFill(queue0, (void *)bsvm, patt, sizeof(patt), global_size*sizeof(cl_uint), 0, NULL, &evs[1]);
+        test_error(error, "clEnqueueSVMMemFill failed");
+
+        error = clEnqueueSVMMemFill(queue0, (void *)csvm, patt, sizeof(patt), global_size*sizeof(cl_uint), 0, NULL, &evs[2]);
+        test_error(error, "clEnqueueSVMMemFill failed");
+    }
+
+    {
+        // Now migrate fully to device 1 and discard the data
+        char* ptrs[] = { asvm, bsvm, csvm };
+        error = clEnqueueSVMMigrateMem(queue1, 3, (const void**)ptrs, NULL, CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, 1, &evs[2], &evs[3]);
+        test_error(error, "clEnqueueSVMMigrateMem failed");
+    }
+
+    {
+        // Test host flag
+        char *ptrs[] = { asvm+1, bsvm+3, csvm+5 };
+        const size_t szs[] = { 1, 1, 0 };
+        error = clEnqueueSVMMigrateMem(queue0, 3, (const void**)ptrs, szs, CL_MIGRATE_MEM_OBJECT_HOST, 1, &evs[3], &evs[4]);
+        test_error(error, "clEnqueueSVMMigrateMem failed");
+    }
+
+    {
+        // Next fill with known data
+        error = clEnqueueSVMMap(queue1, CL_FALSE, CL_MAP_WRITE, (void*)asvm, global_size*sizeof(cl_uint), 1, &evs[4], &evs[5]);
+        test_error(error, "clEnqueueSVMMap failed");
+
+        error = clEnqueueSVMMap(queue1, CL_FALSE, CL_MAP_WRITE, (void*)bsvm, global_size*sizeof(cl_uint), 0, NULL, &evs[6]);
+        test_error(error, "clEnqueueSVMMap failed");
+
+        error = clEnqueueSVMMap(queue1, CL_FALSE, CL_MAP_WRITE, (void*)csvm, global_size*sizeof(cl_uint), 0, NULL, &evs[7]);
+        test_error(error, "clEnqueueSVMMap failed");
+    }
+
+    error = clFlush(queue0);
+    test_error(error, "clFlush failed");
+
+    error = clFlush(queue1);
+    test_error(error, "clFlush failed");
+
+    // evs[0..7]: three fills, two migrations, three maps.
+    error = wait_and_release("first batch", evs, 8);
+    if (error)
+        return -1;
+
+    // The buffers are mapped for writing now, so the host may copy the reference data in.
+    memcpy((void *)asvm, (void *)&amem[0], global_size*sizeof(cl_uint));
+    memcpy((void *)bsvm, (void *)&bmem[0], global_size*sizeof(cl_uint));
+    memcpy((void *)csvm, (void *)&cmem[0], global_size*sizeof(cl_uint));
+
+    {
+        error = clEnqueueSVMUnmap(queue1, (void *)asvm, 0, NULL, &evs[0]);
+        test_error(error, "clEnqueueSVMUnmap failed");
+
+        error = clEnqueueSVMUnmap(queue1, (void *)bsvm, 0, NULL, &evs[1]);
+        test_error(error, "clEnqueueSVMUnmap failed");
+
+        error = clEnqueueSVMUnmap(queue1, (void *)csvm, 0, NULL, &evs[2]);
+        test_error(error, "clEnqueueSVMUnmap failed");
+    }
+
+
+    {
+        // Now try some overlapping regions, and operate on the result
+        // NOTE(review): only the first 3 of the 6 entries are passed, and those three do not
+        // overlap each other; confirm whether 6 was intended.
+        char *ptrs[] = { asvm+100, bsvm+17, csvm+1000, asvm+101, bsvm+19, csvm+1017 };
+        const size_t szs[] = { 13, 23, 43, 3, 7, 11 };
+
+        error = clEnqueueSVMMigrateMem(queue0, 3, (const void**)ptrs, szs, 0, 1, &evs[2], &evs[3]);
+        test_error(error, "clEnqueueSVMMigrateMem failed");
+
+        error = clEnqueueNDRangeKernel(queue0, kernel, 1, NULL, &global_size, NULL, 0, NULL, &evs[4]);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+    }
+
+    {
+        // Now another pair
+        char *ptrs[] = { asvm+8, bsvm+17, csvm+31, csvm+83 };
+        const size_t szs[] = { 0, 1, 3, 7 };
+
+        error = clEnqueueSVMMigrateMem(queue1, 4, (const void**)ptrs, szs, 0, 1, &evs[4], &evs[5]);
+        test_error(error, "clEnqueueSVMMigrateMem failed");
+
+        error = clEnqueueNDRangeKernel(queue1, kernel, 1, NULL, &global_size, NULL, 0, NULL, &evs[6]);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+    }
+
+    {
+        // Another pair
+        char *ptrs[] = { asvm+64, asvm+128, bsvm+64, bsvm+128, csvm, csvm+64 };
+        const size_t szs[] = { 64, 64, 64, 64, 64, 64 };
+
+        error = clEnqueueSVMMigrateMem(queue0, 6, (const void**)ptrs, szs, 0, 1, &evs[6], &evs[7]);
+        test_error(error, "clEnqueueSVMMigrateMem failed");
+
+        error = clEnqueueNDRangeKernel(queue0, kernel, 1, NULL, &global_size, NULL, 0, NULL, &evs[8]);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+    }
+
+    {
+        // Final pair
+        char *ptrs[] = { asvm, asvm, bsvm, csvm, csvm };
+        const size_t szs[] = { 0, 1, 0, 1, 0 };
+
+        error = clEnqueueSVMMigrateMem(queue1, 5, (const void**)ptrs, szs, 0, 1, &evs[8], &evs[9]);
+        test_error(error, "clEnqueueSVMMigrateMem failed");
+
+        error = clEnqueueNDRangeKernel(queue1, kernel, 1, NULL, &global_size, NULL, 0, NULL, &evs[10]);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+    }
+
+    {
+        error = clEnqueueSVMMap(queue1, CL_FALSE, CL_MAP_READ, (void*)asvm, global_size*sizeof(cl_uint), 0, NULL, &evs[11]);
+        test_error(error, "clEnqueueSVMMap failed");
+
+        error = clEnqueueSVMMap(queue1, CL_FALSE, CL_MAP_READ, (void*)bsvm, global_size*sizeof(cl_uint), 0, NULL, &evs[12]);
+        test_error(error, "clEnqueueSVMMap failed");
+
+        error = clEnqueueSVMMap(queue1, CL_FALSE, CL_MAP_READ, (void*)csvm, global_size*sizeof(cl_uint), 0, NULL, &evs[13]);
+        test_error(error, "clEnqueueSVMMap failed");
+    }
+
+    error = clFlush(queue0);
+    test_error(error, "clFlush failed");
+
+    error = clFlush(queue1);
+    test_error(error, "clFlush failed");
+
+    // evs[0..13]: three unmaps, four migrate/kernel pairs, three maps.
+    error = wait_and_release("batch 2", evs, 14);
+    if (error)
+        return -1;
+
+    // Check kernel results: four XOR passes with the same constant restore the original data.
+    bool ok = check("memory a", (cl_uint *)asvm, &amem[0], global_size);
+    ok &= check("memory b", (cl_uint *)bsvm, &bmem[0], global_size);
+    ok &= check("memory c", (cl_uint *)csvm, &cmem[0], global_size);
+
+    {
+        void *ptrs[] = { asvm, bsvm, csvm };
+
+        error = clEnqueueSVMUnmap(queue1, (void *)asvm, 0, NULL, &evs[0]);
+        test_error(error, "clEnqueueSVMUnmap failed");
+
+        error = clEnqueueSVMUnmap(queue1, (void *)bsvm, 0, NULL, &evs[1]);
+        test_error(error, "clEnqueueSVMUnmap failed");
+
+        error = clEnqueueSVMUnmap(queue1, (void *)csvm, 0, NULL, &evs[2]);
+        test_error(error, "clEnqueueSVMUnmap failed");
+
+        // With a NULL callback the runtime frees the three SVM allocations itself.
+        error = clEnqueueSVMFree(queue1, 3, ptrs, NULL, NULL, 0, NULL, &evs[3]);
+        test_error(error, "clEnqueueSVMFree failed");
+    }
+
+    error = clFlush(queue1);
+    test_error(error, "clFlush failed");
+
+    error = wait_and_release("batch 3", evs, 4);
+    if (error)
+        return -1;
+
+    // asvm/bsvm/csvm were released by clEnqueueSVMFree above; calling clSVMFree on them
+    // again would be a double free.
+
+    // The wrappers will clean up the rest
+    return ok ? 0 : -1;
+}
+
--- a/test_conformance/SVM/test_pointer_passing.cpp
+++ b/test_conformance/SVM/test_pointer_passing.cpp
@@ -0,0 +1,115 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// OpenCL C source of the "verify_char" kernel.
+// Only work-item 0 does anything: it writes 1 to *num_correct when *pChar equals `expected`
+// and 0 otherwise.
+const char *SVMPointerPassing_test_kernel[] = {
+  "__kernel void verify_char(__global uchar* pChar, volatile __global uint* num_correct, uchar expected)\n"
+  "{\n"
+  "    if(0 == get_global_id(0))\n"
+  "    {\n"
+  "        *num_correct = 0;\n"
+  "        if(*pChar == expected)\n"
+  "        {\n"
+  "                    *num_correct=1;\n"
+  "        }\n"
+  "    }\n"
+  "}\n"
+};
+
+
+// Test that arbitrarily aligned char pointers into shared buffers can be passed directly to a kernel.
+// This iterates through a buffer passing a pointer to each location to the kernel.
+// The buffer is initialized to known values at each location.
+// The kernel checks that it finds the expected value at each location.
+// Returns 0 on success and -1 (or the error propagated by test_error) on failure.
+// TODO: possibly make this work across all base types (including typeN?), also check ptr arithmetic ++,--.
+int test_svm_pointer_passing(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      error = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  error = create_cl_objects(deviceID, &SVMPointerPassing_test_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+  if(error) return -1;
+
+  clKernelWrapper kernel_verify_char = clCreateKernel(program, "verify_char", &error);
+  test_error(error,"clCreateKernel failed");
+
+  size_t bufSize = 256;
+  char   *pbuf        = (char*)   clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(cl_uchar)*bufSize, 0);
+  cl_int *pNumCorrect = (cl_int*) clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(cl_int), 0);
+  if (pbuf == NULL || pNumCorrect == NULL)
+  {
+    log_error("ERROR: clSVMAlloc returned NULL at %s:%d\n", __FILE__, __LINE__);
+    // clSVMFree is a no-op for NULL pointers.
+    clSVMFree(context, pbuf);
+    clSVMFree(context, pNumCorrect);
+    return -1;
+  }
+
+  int result = 0;
+  {
+    // Wrap the SVM allocations in buffer objects so they can be mapped with the buffer API.
+    clMemWrapper buf = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar)*bufSize, pbuf, &error);
+    test_error(error, "clCreateBuffer failed.");
+
+    clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(cl_int), pNumCorrect, &error);
+    test_error(error, "clCreateBuffer failed.");
+
+    error = clSetKernelArg(kernel_verify_char, 1, sizeof(void*), (void *) &num_correct);
+    test_error(error, "clSetKernelArg failed");
+
+    // put values into buf so that we can expect to see these values in the kernel when we pass a pointer to them.
+    cl_command_queue cmdq = queues[0];
+    cl_uchar* pBuf = (cl_uchar*) clEnqueueMapBuffer(cmdq, buf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_uchar)*bufSize, 0, NULL,NULL, &error);
+    test_error2(error, pBuf, "clEnqueueMapBuffer failed");
+    for(int i = 0; i<(int)bufSize; i++)
+    {
+      pBuf[i]= (cl_uchar)i;
+    }
+    error = clEnqueueUnmapMemObject(cmdq, buf, pBuf, 0,NULL,NULL);
+    test_error(error, "clEnqueueUnmapMemObject failed.");
+
+    for (cl_uint ii = 0; ii<num_devices && result == 0; ++ii)  // iterate over all devices in the platform.
+    {
+      cmdq = queues[ii];
+      for(int i = 0; i<(int)bufSize; i++)
+      {
+        // Address the SVM allocation itself rather than the (already unmapped) mapping result.
+        cl_uchar* pChar = ((cl_uchar*) pbuf) + i;
+        // The kernel parameter is a uchar, so hand over a real cl_uchar; taking the first byte
+        // of an int would pick the wrong byte on big-endian hosts.
+        cl_uchar expected = (cl_uchar) i;
+
+        error = clSetKernelArgSVMPointer(kernel_verify_char, 0, pChar); // pass a pointer to a location within the buffer
+        test_error(error, "clSetKernelArg failed");
+        error = clSetKernelArg(kernel_verify_char, 2, sizeof(cl_uchar), (void *) &expected );  // pass the expected value at the above location.
+        test_error(error, "clSetKernelArg failed");
+        error = clEnqueueNDRangeKernel(cmdq, kernel_verify_char, 1, NULL, &bufSize, NULL, 0, NULL, NULL);
+        test_error(error,"clEnqueueNDRangeKernel failed");
+
+        // Use a separate variable for the mapping so the SVM pointer freed below is never overwritten.
+        cl_int *pMapped = (cl_int*) clEnqueueMapBuffer(cmdq, num_correct, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
+        test_error2(error, pMapped, "clEnqueueMapBuffer failed");
+        cl_int correct_count = *pMapped;
+        error = clEnqueueUnmapMemObject(cmdq, num_correct, pMapped, 0,NULL,NULL);
+        test_error(error, "clEnqueueUnmapMemObject failed.");
+
+        if(correct_count != 1)
+        {
+          log_error("Passing pointer directly to kernel for byte #%d failed on device %d\n", i, (int)ii);
+          result = -1;  // fall through to the common cleanup instead of leaking the SVM buffers.
+          break;
+        }
+      }
+    }
+
+    error = clFinish(cmdq);
+    test_error(error, "clFinish failed");
+  }
+
+  // The buffer objects wrapping the allocations were released at the end of the scope above.
+  clSVMFree(context, pbuf);
+  clSVMFree(context, pNumCorrect);
+
+  return result;
+}
--- a/test_conformance/SVM/test_set_kernel_exec_info_svm_ptrs.cpp
+++ b/test_conformance/SVM/test_set_kernel_exec_info_svm_ptrs.cpp
@@ -0,0 +1,153 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// Host-side bundle of three int-buffer pointers. The kernel source below declares a struct of
+// the same name with three __global int* members in the same order.
+typedef struct {
+  cl_int *pA;
+  cl_int *pB;
+  cl_int *pC;
+} BufPtrs;
+
+// OpenCL C source of "set_kernel_exec_info_test": each work-item increments element
+// get_global_id(0) of the three buffers reached through the pointers stored in *pBufs.
+// The buffers are never passed as kernel arguments; they are only reachable indirectly.
+const char *set_kernel_exec_info_svm_ptrs_kernel[] = {
+  "struct BufPtrs;\n"
+  "\n"
+  "typedef struct {\n"
+  "    __global int *pA;\n"
+  "    __global int *pB;\n"
+  "    __global int *pC;\n"
+  "} BufPtrs;\n"
+  "\n"
+  "__kernel void set_kernel_exec_info_test(__global BufPtrs* pBufs)\n"
+  "{\n"
+  "    size_t i;\n"
+  "   i = get_global_id(0);\n"
+  "    pBufs->pA[i]++;\n"
+  "    pBufs->pB[i]++;\n"
+  "    pBufs->pC[i]++;\n"
+  "}\n"
+};
+
+// Test that clSetKernelExecInfo works correctly with CL_KERNEL_EXEC_INFO_SVM_PTRS flag.
+//
+// Three zero-initialized SVM int buffers are reachable by the kernel only through pointers
+// stored in a fourth SVM buffer (a BufPtrs struct). The three buffers are announced with
+// clSetKernelExecInfo, the kernel increments every element once, and the host then checks that
+// the three values at each index sum to 3.
+// Returns 0 on success or when no device supports the required SVM level, -1 (or the error
+// propagated by test_error) on failure.
+int test_set_kernel_exec_info_svm_ptrs(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    c = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      error = CL_SUCCESS;
+  // The sibling SVM tests hand create_cl_objects an array of MAXQ queues and index it by
+  // device, so a single wrapper here could be overrun on multi-device platforms.
+  clCommandQueueWrapper queues[MAXQ];
+
+  error = create_cl_objects(deviceID, &set_kernel_exec_info_svm_ptrs_kernel[0], &c, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+  if(error == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
+  if(error < 0) return -1; // fail test.
+
+  cl_command_queue q = queues[0];  // only the first queue is used; `queues` keeps ownership.
+
+  clKernelWrapper k = clCreateKernel(program, "set_kernel_exec_info_test", &error);
+  test_error(error, "clCreateKernel failed");
+
+  size_t size = num_elements*sizeof(int);
+  int* pA = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
+  int* pB = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
+  int* pC = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
+  BufPtrs* pBuf = (BufPtrs*) clSVMAlloc(c, CL_MEM_READ_WRITE, sizeof(BufPtrs), 0);
+  if (pA == NULL || pB == NULL || pC == NULL || pBuf == NULL)
+  {
+    log_error("ERROR: clSVMAlloc returned NULL at %s:%d\n", __FILE__, __LINE__);
+    // clSVMFree is a no-op for NULL pointers.
+    clSVMFree(c, pA);
+    clSVMFree(c, pB);
+    clSVMFree(c, pC);
+    clSVMFree(c, pBuf);
+    return -1;
+  }
+
+  bool failed = false;
+  {
+    // Wrap the SVM allocations in buffer objects so the host can access them through map/unmap.
+    clMemWrapper ba,bb,bc,bBuf;
+    ba = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pA, &error);
+    test_error(error, "clCreateBuffer failed");
+    bb = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pB, &error);
+    test_error(error, "clCreateBuffer failed");
+    bc = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pC, &error);
+    test_error(error, "clCreateBuffer failed");
+    bBuf = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, sizeof(BufPtrs), pBuf, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    clEnqueueMapBuffer(q, ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
+    test_error(error, "clEnqueueMapBuffer failed");
+    clEnqueueMapBuffer(q, bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
+    test_error(error, "clEnqueueMapBuffer failed");
+    clEnqueueMapBuffer(q, bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
+    test_error(error, "clEnqueueMapBuffer failed");
+    clEnqueueMapBuffer(q, bBuf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(BufPtrs), 0, NULL, NULL, &error);
+    test_error(error, "clEnqueueMapBuffer failed");
+
+    for(int i = 0; i < num_elements; i++) pA[i] = pB[i] = pC[i] = 0;
+
+    pBuf->pA = pA;
+    pBuf->pB = pB;
+    pBuf->pC = pC;
+
+    error = clEnqueueUnmapMemObject(q, ba, pA, 0,NULL,NULL);
+    test_error(error, " clEnqueueUnmapMemObject failed.");
+    error = clEnqueueUnmapMemObject(q, bb, pB, 0,NULL,NULL);
+    test_error(error, " clEnqueueUnmapMemObject failed.");
+    error = clEnqueueUnmapMemObject(q, bc, pC, 0,NULL,NULL);
+    test_error(error, " clEnqueueUnmapMemObject failed.");
+    error = clEnqueueUnmapMemObject(q, bBuf, pBuf, 0,NULL,NULL);
+    test_error(error, " clEnqueueUnmapMemObject failed.");
+
+
+    error = clSetKernelArgSVMPointer(k, 0, pBuf);
+    test_error(error, "clSetKernelArg failed");
+
+    // Announce the indirectly used SVM buffers. The list is kept in ordinary host memory so
+    // the runtime never has to read from the coarse-grain SVM buffer that was just unmapped.
+    void *svmPtrs[] = { pA, pB, pC };
+    error = clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(svmPtrs), svmPtrs);
+    test_error(error, "clSetKernelExecInfo failed");
+
+    size_t range =  num_elements;
+    error = clEnqueueNDRangeKernel(q, k, 1, NULL, &range, NULL, 0, NULL, NULL);
+    test_error(error,"clEnqueueNDRangeKernel failed");
+
+    error = clFinish(q);
+    test_error(error, "clFinish failed.");
+
+    clEnqueueMapBuffer(q, ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
+    test_error(error, "clEnqueueMapBuffer failed");
+    clEnqueueMapBuffer(q, bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
+    test_error(error, "clEnqueueMapBuffer failed");
+    clEnqueueMapBuffer(q, bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
+    test_error(error, "clEnqueueMapBuffer failed");
+
+    // Every element started at 0 and should have been incremented exactly once in each buffer.
+    for(int i = 0; i < num_elements; i++)
+    {
+      if(pA[i] + pB[i] + pC[i] != 3)
+        failed = true;
+    }
+
+    error = clEnqueueUnmapMemObject(q, ba, pA, 0,NULL,NULL);
+    test_error(error, " clEnqueueUnmapMemObject failed.");
+    error = clEnqueueUnmapMemObject(q, bb, pB, 0,NULL,NULL);
+    test_error(error, " clEnqueueUnmapMemObject failed.");
+    error = clEnqueueUnmapMemObject(q, bc, pC, 0,NULL,NULL);
+    test_error(error, " clEnqueueUnmapMemObject failed.");
+  }
+
+  error = clFinish(q);
+  test_error(error, " clFinish failed.");
+
+  clSVMFree(c, pA);
+  clSVMFree(c, pB);
+  clSVMFree(c, pC);
+  clSVMFree(c, pBuf);
+
+  if(failed) return -1;
+
+  return 0;
+}
--- a/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp
+++ b/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp
@@ -0,0 +1,282 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// Builds the linked lists from host code.
+// When useNewAPI is CL_FALSE the node storage is reached by mapping the cl_mem `nodes`;
+// otherwise the SVM pointer pNodes2 is mapped directly with clEnqueueSVMMap.
+// The region is unmapped again and the queue drained before returning.
+// Returns CL_SUCCESS when every call succeeds; failures leave through the
+// test_error / test_error2 macros.
+cl_int create_linked_lists_on_host(cl_command_queue cmdq, cl_mem nodes, Node *pNodes2, cl_int ListLength, size_t numLists, cl_bool useNewAPI )
+{
+  cl_int error = CL_SUCCESS;
+  const bool throughBuffer = (useNewAPI == CL_FALSE);
+  const size_t listBytes = sizeof(Node)*ListLength*numLists;   // size of the whole node array
+  Node *pNodes = pNodes2;                                       // replaced below when mapping the cl_mem
+
+  log_info("SVM: creating linked list on host ");
+
+  // Make the node storage accessible to the host (blocking map in both paths).
+  if (!throughBuffer)
+  {
+    error = clEnqueueSVMMap(cmdq, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, pNodes2, listBytes, 0, NULL,NULL);
+    test_error2(error, pNodes, "clEnqueueSVMMap failed");
+  }
+  else
+  {
+    pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, listBytes, 0, NULL,NULL, &error);
+    test_error2(error, pNodes, "clEnqMapBuffer failed");
+  }
+
+  create_linked_lists(pNodes, numLists, ListLength);
+
+  // Hand the storage back using the API that matches the map call above.
+  if (!throughBuffer)
+  {
+    error = clEnqueueSVMUnmap(cmdq, pNodes2, 0, NULL, NULL);
+    test_error(error, "clEnqueueSVMUnmap failed.");
+  }
+  else
+  {
+    error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
+    test_error(error, "clEnqueueUnmapMemObject failed.");
+  }
+
+  error = clFinish(cmdq);
+  test_error(error, "clFinish failed.");
+  return error;
+}
+
+// Purpose: uses host code to verify correctness of the linked list.
+// When useNewAPI is CL_FALSE the nodes are reached by mapping the cl_mem `nodes`;
+// otherwise the SVM pointer pNodes2 is mapped with clEnqueueSVMMap.
+// `ci` is not used by this function.
+// Returns CL_SUCCESS when the lists verify and all OpenCL calls succeed, -1 when
+// verification fails; OpenCL failures leave through test_error / test_error2.
+cl_int verify_linked_lists_on_host(int ci, cl_command_queue cmdq, cl_mem nodes, Node *pNodes2, cl_int ListLength, size_t numLists, cl_bool useNewAPI )
+{
+  cl_int error = CL_SUCCESS;
+
+  Node *pNodes;
+  if (useNewAPI == CL_FALSE)
+  {
+    pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
+    test_error2(error, pNodes, "clEnqueueMapBuffer failed");
+  }
+  else
+  {
+    pNodes = pNodes2;
+    error = clEnqueueSVMMap(cmdq, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, pNodes2, sizeof(Node)*ListLength * numLists, 0, NULL,NULL);
+    test_error2(error, pNodes, "clEnqueueSVMMap failed");
+  }
+
+  // Remember the verdict but do not return yet: the region must be unmapped
+  // even when verification fails, otherwise the mapping is left outstanding.
+  cl_int verifyResult = verify_linked_lists(pNodes, numLists, ListLength);
+
+  if (useNewAPI == CL_FALSE)
+  {
+    error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
+    test_error(error, "clEnqueueUnmapMemObject failed.");
+  }
+  else
+  {
+    error = clEnqueueSVMUnmap(cmdq, pNodes2, 0,NULL,NULL);
+    test_error(error, "clEnqueueSVMUnmap failed.");
+  }
+
+  error = clFinish(cmdq);
+  test_error(error, "clFinish failed.");
+
+  if(verifyResult) return -1;
+  return error;
+}
+
+// Resets the allocation index held in the `allocator` buffer and runs the
+// list-creation kernel with one work-item per list on the given queue.
+//   ci                   index of the device being used (only printed)
+//   cmdq                 queue on which the map/unmap and the kernel are enqueued
+//   allocator            buffer holding the next free node index; only sizeof(cl_int) bytes are mapped
+//   kernel_create_lists  kernel whose arguments have already been set by the caller
+//   numLists             global work size, also the initial value of the allocation index
+// Returns CL_SUCCESS on success; OpenCL failures leave through test_error / test_error2.
+cl_int create_linked_lists_on_device(int ci, cl_command_queue cmdq, cl_mem allocator, cl_kernel kernel_create_lists, size_t numLists  )
+{
+  cl_int error = CL_SUCCESS;
+  log_info("SVM: creating linked list on device: %d ", ci);
+
+  // Only sizeof(cl_int) bytes are mapped, so the index must be written through a
+  // cl_int pointer; storing a size_t here would write past the mapped region on 64-bit hosts.
+  cl_int *pAllocator = (cl_int*) clEnqueueMapBuffer(cmdq, allocator, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
+  test_error2(error, pAllocator, "clEnqueueMapBuffer failed");
+  // reset allocator index
+  *pAllocator = (cl_int)numLists;   // the first numLists elements of the nodes array are already allocated (they hold the head of each list).
+  error = clEnqueueUnmapMemObject(cmdq, allocator, pAllocator, 0,NULL,NULL);
+  test_error(error, " clEnqueueUnmapMemObject failed.");
+
+  error = clEnqueueNDRangeKernel(cmdq, kernel_create_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
+  test_error(error, "clEnqueueNDRange failed.");
+  error = clFinish(cmdq);
+  test_error(error, "clFinish failed.");
+
+  return error;
+}
+
+// Runs the verification kernel on one device and checks the node count it reports.
+//   vi                   index of the device being used (only printed)
+//   cmdq                 queue on which the map/unmap and the kernel are enqueued
+//   num_correct          buffer holding a cl_int counter; zeroed here before the kernel runs
+//   kernel_verify_lists  kernel whose arguments have already been set by the caller
+//   ListLength, numLists the expected counter value is ListLength * numLists
+// Returns CL_SUCCESS when the counter matches, -1 when it does not; OpenCL failures
+// leave through test_error / test_error2.
+cl_int verify_linked_lists_on_device(int vi, cl_command_queue cmdq,cl_mem num_correct, cl_kernel kernel_verify_lists, cl_int ListLength, size_t numLists  )
+{
+  cl_int error = CL_SUCCESS;
+
+  log_info(" and verifying on device: %d ", vi);
+
+  cl_int *pNumCorrect = (cl_int*) clEnqueueMapBuffer(cmdq, num_correct, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
+  test_error2(error, pNumCorrect, "clEnqueueMapBuffer failed");
+
+  *pNumCorrect = 0;     // reset numCorrect to zero
+
+  error = clEnqueueUnmapMemObject(cmdq, num_correct, pNumCorrect, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed.");
+
+  error = clEnqueueNDRangeKernel(cmdq, kernel_verify_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
+  test_error(error,"clEnqueueNDRangeKernel failed");
+
+  // Blocking map: read back the counter written by the kernel.
+  pNumCorrect = (cl_int*) clEnqueueMapBuffer(cmdq, num_correct, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
+  test_error2(error, pNumCorrect, "clEnqueueMapBuffer failed");
+  cl_int correct_count = *pNumCorrect;
+  error = clEnqueueUnmapMemObject(cmdq, num_correct, pNumCorrect, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed");
+  // Store the status so the check below tests clFinish itself rather than the unmap result.
+  error = clFinish(cmdq);
+  test_error(error,"clFinish failed");
+
+  if(correct_count != ListLength * (cl_uint)numLists)
+  {
+    error = -1;
+    log_info("Failed\n");
+  }
+  else
+    log_info("Passed\n");
+
+  return error;
+}
+
+// This tests that all devices and the host share a common address space; using only the coarse-grain features.
+// This is done by creating a linked list on a device and then verifying the correctness of the list
+// on another device or the host.  This basic test is performed for all combinations of devices and the host that exist within
+// the platform.  The test passes only if every combination passes.
+//
+// useNewAPI selects how the node storage is handed to the create kernel and to the host:
+//   CL_FALSE - through a cl_mem created over the SVM allocation with CL_MEM_USE_HOST_PTR
+//              (also checks that CL_MEM_USES_SVM_POINTER reports CL_TRUE for it),
+//   CL_TRUE  - through the raw SVM pointer (clSetKernelArgSVMPointer / clEnqueueSVMMap).
+// context2 and queue are not used; the function builds its own context and queues
+// via create_cl_objects.  Returns 0 on success and non-zero on failure.
+int shared_address_space_coarse_grain(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements, cl_bool useNewAPI)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      error = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  error = create_cl_objects(deviceID, &linked_list_create_and_verify_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+  if(error) return -1;
+
+  size_t numLists =  num_elements;
+  cl_int ListLength = 32;
+
+  clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  // this buffer holds the linked list nodes.
+  Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(Node)*ListLength*numLists, 0);
+  if (pNodes == NULL)
+  {
+    // clSVMAlloc reports failure only through a NULL return; stop before the pointer is used.
+    log_error("clSVMAlloc failed to allocate the linked list nodes.\n");
+    return -1;
+  }
+
+  // Inner scope: the clMemWrapper objects declared here go out of scope before clSVMFree below.
+  {
+    cl_bool usesSVMpointer = CL_FALSE;
+    clMemWrapper nodes;
+    if (useNewAPI == CL_FALSE)
+    {
+      nodes = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Node)*ListLength*numLists, pNodes, &error);
+      test_error(error, "clCreateBuffer failed.");
+
+      // verify if buffer uses SVM pointer
+      size_t paramSize = 0;
+      error = clGetMemObjectInfo(nodes, CL_MEM_USES_SVM_POINTER, 0, 0, &paramSize);
+      test_error(error, "clGetMemObjectInfo failed.");
+
+      if (paramSize != sizeof(cl_bool))
+      {
+        log_error("clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) returned wrong size.");
+        return -1;
+      }
+
+      error = clGetMemObjectInfo(nodes, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), &usesSVMpointer, 0);
+      test_error(error, "clGetMemObjectInfo failed.");
+
+      if (usesSVMpointer != CL_TRUE)
+      {
+        log_error("clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) returned CL_FALSE for buffer created from SVM pointer.");
+        return -1;
+      }
+    }
+
+    // this buffer holds an index into the nodes buffer, it is used for node allocation
+    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
+    test_error(error, "clCreateBuffer failed.");
+
+    // an ordinary buffer must not report that it uses an SVM pointer
+    error = clGetMemObjectInfo(allocator, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), &usesSVMpointer, 0);
+    test_error(error, "clGetMemObjectInfo failed.");
+
+    if (usesSVMpointer != CL_FALSE)
+    {
+      log_error("clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) returned CL_TRUE for non-SVM buffer.");
+      return -1;
+    }
+
+    // this buffer holds the count of correct nodes, which is computed by the verify kernel.
+    clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
+    test_error(error, "clCreateBuffer failed.");
+
+    // The create kernel receives the nodes either as SVM pointer or as cl_mem, depending on the API under test.
+    if (useNewAPI == CL_TRUE)
+      error |= clSetKernelArgSVMPointer(kernel_create_lists, 0, pNodes);
+    else
+      error |= clSetKernelArg(kernel_create_lists, 0, sizeof(void*), (void *) &nodes);
+
+    error |= clSetKernelArg(kernel_create_lists, 1, sizeof(void*), (void *) &allocator);
+    error |= clSetKernelArg(kernel_create_lists, 2, sizeof(cl_int),   (void *) &ListLength);
+
+    // The verify kernel always receives the raw SVM pointer.
+    error |= clSetKernelArgSVMPointer(kernel_verify_lists, 0, pNodes);
+    error |= clSetKernelArg(kernel_verify_lists, 1, sizeof(void*), (void *) &num_correct);
+    error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(cl_int),   (void *) &ListLength);
+    test_error(error, "clSetKernelArg failed");
+
+    // Create linked list on one device and verify on another device (or the host).
+    // Do this for all possible combinations of devices and host within the platform.
+    for (int ci=0; ci<(int)num_devices+1; ci++)  // ci is CreationIndex, index of device/q to create linked list on
+    {
+      for (int vi=0; vi<(int)num_devices+1; vi++)  // vi is VerificationIndex, index of device/q to verify linked list on
+      {
+        if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
+        {
+          error = create_linked_lists_on_host(queues[0], nodes, pNodes, ListLength, numLists, useNewAPI);
+          if(error) return -1;
+        }
+        else
+        {
+          error = create_linked_lists_on_device(ci, queues[ci], allocator, kernel_create_lists, numLists);
+          if(error) return -1;
+        }
+
+        if(vi == num_devices)
+        {
+          error = verify_linked_lists_on_host(vi, queues[0], nodes, pNodes, ListLength, numLists, useNewAPI);
+          if(error) return -1;
+        }
+        else
+        {
+          error = verify_linked_lists_on_device(vi, queues[vi], num_correct, kernel_verify_lists, ListLength, numLists);
+          if(error) return -1;
+        }
+      }
+    }
+  }
+
+  clSVMFree(context, pNodes);
+
+  return 0;
+}
+
+// Test entry point: coarse-grain shared address space test with useNewAPI = CL_FALSE,
+// i.e. the node storage is accessed through a cl_mem wrapping the SVM allocation.
+int test_shared_address_space_coarse_grain_old_api(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  const cl_bool useNewAPI = CL_FALSE;
+  int result = shared_address_space_coarse_grain(deviceID, context2, queue, num_elements, useNewAPI);
+  return result;
+}
+
+// Test entry point: coarse-grain shared address space test with useNewAPI = CL_TRUE,
+// i.e. the node storage is accessed through the raw SVM pointer.
+int test_shared_address_space_coarse_grain_new_api(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  const cl_bool useNewAPI = CL_TRUE;
+  int result = shared_address_space_coarse_grain(deviceID, context2, queue, num_elements, useNewAPI);
+  return result;
+}
--- a/test_conformance/SVM/test_shared_address_space_fine_grain.cpp
+++ b/test_conformance/SVM/test_shared_address_space_fine_grain.cpp
@@ -0,0 +1,101 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+
+// This tests that all devices and the host share a common address space using fine-grain mode with no buffers.
+// This is done by creating a linked list on a device and then verifying the correctness of the list
+// on another device or the host.  This basic test is performed for all combinations of devices and the host that exist within
+// the platform.  The test passes only if every combination passes.
+//
+// All storage comes from align_malloc (no clSVMAlloc, no cl_mem) and is passed to the
+// kernels with clSetKernelArgSVMPointer.  context2 and queue are not used; the function
+// builds its own context and queues via create_cl_objects.
+// Returns 0 on success (or when no device supports the requested SVM level), non-zero on failure.
+int test_shared_address_space_fine_grain(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      error = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  error = create_cl_objects(deviceID, &linked_list_create_and_verify_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
+  if(error == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
+  if(error < 0) return -1; // fail test.
+
+  size_t numLists =  num_elements;
+  cl_int ListLength = 32;
+
+  clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  // this allocation holds the linked list nodes.
+  // FIXME: remove the alignment once prototype can handle it
+  Node* pNodes = (Node*) align_malloc(numLists*ListLength*sizeof(Node),128);
+  test_error2(error, pNodes, "malloc failed");
+
+  // this allocation holds an index into the nodes buffer, it is used for node allocation.
+  // It is sized as size_t because create_linked_lists_on_device_no_map stores a size_t
+  // through this pointer; a cl_int-sized block would be overrun on 64-bit hosts.
+  size_t* pAllocator = (size_t*) align_malloc(sizeof(size_t), 128);
+  test_error2(error, pAllocator, "malloc failed");
+
+  // this allocation holds the count of correct nodes, which is computed by the verify kernel.
+  cl_int* pNum_correct = (cl_int*) align_malloc(sizeof(cl_int), 128);
+  test_error2(error, pNum_correct, "malloc failed");
+
+
+  error |= clSetKernelArgSVMPointer(kernel_create_lists, 0, pNodes);
+  error |= clSetKernelArgSVMPointer(kernel_create_lists, 1, pAllocator);
+  error |= clSetKernelArg(kernel_create_lists, 2, sizeof(cl_int),(void *) &ListLength);
+
+  error |= clSetKernelArgSVMPointer(kernel_verify_lists, 0, pNodes);
+  error |= clSetKernelArgSVMPointer(kernel_verify_lists, 1, pNum_correct);
+  error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(cl_int),   (void *) &ListLength);
+  test_error(error, "clSetKernelArg failed");
+
+  // Create linked list on one device and verify on another device (or the host).
+  // Do this for all possible combinations of devices and host within the platform.
+  for (int ci=0; ci<(int)num_devices+1; ci++)  // ci is CreationIndex, index of device/q to create linked list on
+  {
+    for (int vi=0; vi<(int)num_devices+1; vi++)  // vi is VerificationIndex, index of device/q to verify linked list on
+    {
+      if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
+      {
+        log_info("creating linked list on host ");
+        create_linked_lists(pNodes, numLists, ListLength);
+      }
+      else
+      {
+        error = create_linked_lists_on_device_no_map(ci, queues[ci], pAllocator, kernel_create_lists, numLists);
+        if(error) return -1;
+      }
+
+      if(vi == num_devices)
+      {
+        error = verify_linked_lists(pNodes, numLists, ListLength);
+        if(error) return -1;
+      }
+      else
+      {
+        error = verify_linked_lists_on_device_no_map(vi, queues[vi], pNum_correct, kernel_verify_lists, ListLength, numLists);
+        if(error) return -1;
+      }
+    }
+  }
+
+  align_free(pNodes);
+  align_free(pAllocator);
+  align_free(pNum_correct);
+  return 0;
+}
--- a/test_conformance/SVM/test_shared_address_space_fine_grain_buffers.cpp
+++ b/test_conformance/SVM/test_shared_address_space_fine_grain_buffers.cpp
@@ -0,0 +1,138 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+
+
+
+// Device-side list creation for allocations the host can write directly (no map/unmap).
+// Stores numLists into *pAllocator, launches the create kernel with numLists work-items
+// on cmdq and waits for the queue to drain.  `ci` is only used for the log line.
+// Returns CL_SUCCESS on success; OpenCL failures leave through test_error.
+cl_int create_linked_lists_on_device_no_map(int ci, cl_command_queue cmdq, size_t* pAllocator, cl_kernel kernel_create_lists, size_t numLists  )
+{
+  log_info("SVM: creating linked list on device: %d ", ci);
+
+  // The heads of the lists occupy the first numLists nodes, so allocation starts right after them.
+  *pAllocator = numLists;
+
+  cl_int status = clEnqueueNDRangeKernel(cmdq, kernel_create_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
+  test_error(status, "clEnqueueNDRange failed.");
+
+  status = clFinish(cmdq);
+  test_error(status, "clFinish failed.");
+
+  return status;
+}
+
+// Device-side verification for allocations the host can access directly (no map/unmap).
+// Zeroes *pNumCorrect, launches the verify kernel with numLists work-items on cmdq,
+// waits for completion and compares the counter against ListLength * numLists.
+// `vi` is only used for the log line.
+// Returns CL_SUCCESS when the counter matches, -1 when it does not; OpenCL failures
+// leave through test_error.
+cl_int verify_linked_lists_on_device_no_map(int vi, cl_command_queue cmdq,cl_int* pNumCorrect, cl_kernel kernel_verify_lists, cl_int ListLength, size_t numLists  )
+{
+  cl_int error = CL_SUCCESS;
+
+  log_info(" and verifying on device: %d ", vi);
+
+  *pNumCorrect = 0;     // reset numCorrect to zero
+
+  error = clEnqueueNDRangeKernel(cmdq, kernel_verify_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
+  test_error(error,"clEnqueueNDRangeKernel failed");
+  // Store the status so the check below tests clFinish itself; the counter is only
+  // read after the queue has been confirmed drained.
+  error = clFinish(cmdq);
+  test_error(error,"clFinish failed");
+
+  cl_int correct_count = *pNumCorrect;
+  if(correct_count != ListLength * (cl_uint)numLists)
+  {
+    error = -1;
+    log_info("Failed\n");
+  }
+  else
+    log_info("Passed\n");
+
+  return error;
+}
+
+// This tests that all devices and the host share a common address space; using only the fine-grain with buffers mode.
+// This is done by creating a linked list on a device and then verifying the correctness of the list
+// on another device or the host.  This basic test is performed for all combinations of devices and the host that exist within
+// the platform.  The test passes only if every combination passes.
+//
+// All storage comes from clSVMAlloc with CL_MEM_SVM_FINE_GRAIN_BUFFER and is accessed by the
+// host without map/unmap.  context2 and queue are not used; the function builds its own
+// context and queues via create_cl_objects.
+// Returns 0 on success (or when no device supports the requested SVM level), non-zero on failure.
+int test_shared_address_space_fine_grain_buffers(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      error = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  error = create_cl_objects(deviceID, &linked_list_create_and_verify_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
+  if(error == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
+  if(error < 0) return -1; // fail test.
+
+  size_t numLists =  num_elements;
+  cl_int ListLength = 32;
+
+  clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  // this buffer holds the linked list nodes.
+  Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(Node)*ListLength*numLists, 0);
+
+  // this buffer holds an index into the nodes buffer, it is used for node allocation
+  size_t *pAllocator = (size_t*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(size_t), 0);
+
+  // this buffer holds the count of correct nodes, which is computed by the verify kernel.
+  cl_int *pNumCorrect = (cl_int*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_int), 0);
+
+  // clSVMAlloc reports failure only through a NULL return; the host dereferences these
+  // pointers below, so stop here if any allocation failed.
+  if (pNodes == NULL || pAllocator == NULL || pNumCorrect == NULL)
+  {
+    log_error("clSVMAlloc failed.\n");
+    // clSVMFree takes no action for a NULL pointer, so all three can be released unconditionally.
+    clSVMFree(context, pNodes);
+    clSVMFree(context, pAllocator);
+    clSVMFree(context, pNumCorrect);
+    return -1;
+  }
+
+  error |= clSetKernelArgSVMPointer(kernel_create_lists, 0, pNodes);
+  error |= clSetKernelArgSVMPointer(kernel_create_lists, 1, pAllocator);
+  error |= clSetKernelArg(kernel_create_lists, 2, sizeof(cl_int),   (void *) &ListLength);
+
+  error |= clSetKernelArgSVMPointer(kernel_verify_lists, 0, pNodes);
+  error |= clSetKernelArgSVMPointer(kernel_verify_lists, 1, pNumCorrect);
+  error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(cl_int),   (void *) &ListLength);
+  test_error(error, "clSetKernelArg failed");
+
+  // Create linked list on one device and verify on another device (or the host).
+  // Do this for all possible combinations of devices and host within the platform.
+  for (int ci=0; ci<(int)num_devices+1; ci++)  // ci is CreationIndex, index of device/q to create linked list on
+  {
+    for (int vi=0; vi<(int)num_devices+1; vi++)  // vi is VerificationIndex, index of device/q to verify linked list on
+    {
+      if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
+      {
+        log_info("SVM: creating linked list on host ");
+        create_linked_lists(pNodes, numLists, ListLength);
+      }
+      else
+      {
+        error = create_linked_lists_on_device_no_map(ci, queues[ci], pAllocator, kernel_create_lists, numLists);
+        if(error) return -1;
+      }
+
+      if(vi == num_devices)
+      {
+        error = verify_linked_lists(pNodes, numLists, ListLength);
+        if(error) return -1;
+      }
+      else
+      {
+        error = verify_linked_lists_on_device_no_map(vi, queues[vi], pNumCorrect, kernel_verify_lists, ListLength, numLists);
+        if(error) return -1;
+      }
+    }
+  }
+
+  clSVMFree(context, pNodes);
+  clSVMFree(context, pAllocator);
+  clSVMFree(context, pNumCorrect);
+
+  return 0;
+}
--- a/test_conformance/SVM/test_shared_sub_buffers.cpp
+++ b/test_conformance/SVM/test_shared_sub_buffers.cpp
@@ -0,0 +1,241 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "common.h"
+
+// OpenCL C source for the sub-buffer test.  The adjacent string literals below are not
+// separated by commas, so they are concatenated by the compiler into a single array element.
+// The program defines the Node struct, a node-allocation helper and the two kernels
+// ("create_linked_lists" and "verify_linked_lists") that the test looks up by name.
+const char *shared_sub_buffers_test_kernel[] = {
+  "typedef struct Node {\n"
+  "    int global_id;\n"
+  "    int position_in_list;\n"
+  "    __global struct Node* pNext;\n"
+  "} Node;\n"
+
+  // create linked lists that use nodes from 2 different buffers
+  // (both buffers are indexed with the same shared allocation counter)
+  "__global Node* allocate_node(__global Node* pNodes1, __global Node* pNodes2, volatile __global int* allocation_index, size_t i)\n"
+  "{\n"
+  // mix things up, adjacent work items will allocate from different buffers
+  "    if(i & 0x1)\n"
+  "        return &pNodes1[atomic_inc(allocation_index)];\n"
+  "    else\n"
+  "        return &pNodes2[atomic_inc(allocation_index)];\n"
+  "}\n"
+
+  // The allocation_index parameter must be initialized on the host to N work-items
+  // The first N nodes in pNodes will be the heads of the lists.
+  // This tests passing 4 different sub-buffers that come from two parent buffers.
+  // Note that we have arguments that appear to be unused, but they are required so that system knows to get all the sub-buffers on to the device
+  "__kernel void create_linked_lists(__global Node* pNodes_sub1, __global Node* pNodes2_sub1, __global Node* pNodes_sub2, __global Node* pNodes2_sub2, volatile __global int* allocation_index, int list_length) \n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "    __global Node *pNode = &pNodes_sub1[i];\n"
+  "    pNode->global_id = i;\n"
+  "    pNode->position_in_list = 0;\n"
+  "    __global Node *pNew;\n"
+  "    for(int j=1; j < list_length; j++) {\n"
+  "        pNew = allocate_node(pNodes_sub1, pNodes2_sub1, allocation_index, i);\n"
+  "        pNew->global_id = i;\n"
+  "        pNew->position_in_list = j;\n"
+  "        pNode->pNext = pNew;  // link new node onto end of list\n"
+  "        pNode = pNew;   // move to end of list\n"
+  "    }\n"
+  "}\n"
+  // Each work-item walks its own list from the head and bumps num_correct once per node
+  // whose global_id and position_in_list match; it stops at the first mismatch.
+  // Note that we have arguments that appear to be unused, but they are required so that system knows to get all the sub-buffers on to the device
+  "__kernel void verify_linked_lists(__global Node* pNodes_sub1, __global Node* pNodes2_sub1, __global Node* pNodes_sub2, __global Node* pNodes2_sub2, volatile __global uint* num_correct, int list_length)\n"
+  "{\n"
+  "    size_t i = get_global_id(0);\n"
+  "    __global Node *pNode = &pNodes_sub1[i];\n"
+  "    for(int j=0; j < list_length; j++) {\n"
+  "        if( pNode->global_id == i && pNode->position_in_list == j)\n"
+  "            atomic_inc(num_correct);\n"
+  "        else \n"
+  "            break;\n"
+  "        pNode = pNode->pNext;\n"
+  "    }\n"
+  "}\n"
+};
+
+
+// Host-side list construction for the sub-buffer test.
+// Both parent buffers are mapped (blocking) for the duration of the call;
+// create_linked_lists() is handed the mapping of `nodes` only.
+// Both mappings are released and the queue drained before returning.
+// Returns CL_SUCCESS on success; OpenCL failures leave through test_error / test_error2.
+cl_int create_linked_lists_on_host_sb(cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
+{
+  const size_t bufferBytes = sizeof(Node)*ListLength*numLists;   // size of each parent buffer
+  const cl_map_flags rw = CL_MAP_READ | CL_MAP_WRITE;
+  cl_int error = CL_SUCCESS;
+
+  log_info("SVM: creating linked list on host ");
+
+  Node *pFirst = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, rw, 0, bufferBytes, 0, NULL,NULL, &error);
+  test_error2(error, pFirst, "clEnqueueMapBuffer failed");
+
+  Node *pSecond = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, rw, 0, bufferBytes, 0, NULL,NULL, &error);
+  test_error2(error, pSecond, "clEnqueueMapBuffer failed");
+
+  create_linked_lists(pFirst, numLists, ListLength);
+
+  // Release in the same order the regions were mapped.
+  error = clEnqueueUnmapMemObject(cmdq, nodes, pFirst, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed");
+
+  error = clEnqueueUnmapMemObject(cmdq, nodes2, pSecond, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed");
+
+  error = clFinish(cmdq);
+  test_error(error, "clFinish failed");
+
+  return error;
+}
+
+// Verify correctness of the linked list using host code.
+// Both parent buffers are mapped (blocking) so that nodes living in either one can be
+// followed; verify_linked_lists() starts from the mapping of `nodes`.
+// `ci` is not used by this function.
+// Returns CL_SUCCESS when the lists verify and all OpenCL calls succeed, -1 when
+// verification fails; OpenCL failures leave through test_error / test_error2.
+cl_int verify_linked_lists_on_host_sb(int ci, cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
+{
+  cl_int error = CL_SUCCESS;
+
+  //log_info(" and verifying on host ");
+
+  Node *pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
+  test_error2(error, pNodes, "clEnqueueMapBuffer failed");
+  Node *pNodes2 = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
+  // check the pointer returned by this second map, not the first one
+  test_error2(error, pNodes2, "clEnqueueMapBuffer failed");
+
+  // Remember the verdict but do not return yet: both regions must be unmapped
+  // even when verification fails, otherwise the mappings are left outstanding.
+  cl_int verifyResult = verify_linked_lists(pNodes, numLists, ListLength);
+
+  error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed");
+  error = clEnqueueUnmapMemObject(cmdq, nodes2, pNodes2, 0,NULL,NULL);
+  test_error(error, "clEnqueueUnmapMemObject failed");
+  error = clFinish(cmdq);
+  test_error(error, "clFinish failed");
+
+  if(verifyResult) return -1;
+  return error;
+}
+
+
+// This tests that shared sub-buffers can be created and that they inherit the flags from the parent buffer when no flags are specified.
+// This tests that passing only the sub-buffers to a kernel works.
+// The test is derived from the cross-buffer pointers test which
+// tests that shared buffers are able to contain pointers that point to other shared buffers.
+// This tests that all devices and the host share a common address space; using only the coarse-grain features.
+// This is done by creating a linked list on a device and then verifying the correctness of the list
+// on another device or the host.
+// The linked list nodes are allocated from two different buffers; this is done to ensure that cross buffer pointers work correctly.
+// This basic test is performed for all combinations of devices and the host.
+//
+// context2 and queue are not used; the function builds its own context and queues via
+// create_cl_objects.  Returns 0 on success and non-zero on failure.
+int test_shared_sub_buffers(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
+{
+  clContextWrapper    context = NULL;
+  clProgramWrapper    program = NULL;
+  cl_uint     num_devices = 0;
+  cl_int      error = CL_SUCCESS;
+  clCommandQueueWrapper queues[MAXQ];
+
+  error = create_cl_objects(deviceID, &shared_sub_buffers_test_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
+  if(error) return -1;
+
+  size_t numLists =  num_elements;
+  if(numLists & 0x1) numLists++; // force even size, so we can easily create two sub-buffers of same size.
+
+  cl_int ListLength = 32;
+
+  clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
+  test_error(error, "clCreateKernel failed");
+
+  size_t nodes_bufsize = sizeof(Node)*ListLength*numLists;
+  Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, nodes_bufsize, 0);
+  Node* pNodes2 = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, nodes_bufsize, 0);
+
+  // clSVMAlloc reports failure only through a NULL return; stop before wrapping the
+  // pointers in buffers.
+  if (pNodes == NULL || pNodes2 == NULL)
+  {
+    log_error("clSVMAlloc failed to allocate the linked list node storage.\n");
+    // clSVMFree takes no action for a NULL pointer, so both can be released unconditionally.
+    clSVMFree(context, pNodes2);
+    clSVMFree(context, pNodes);
+    return -1;
+  }
+
+  // Inner scope: the clMemWrapper objects declared here go out of scope before clSVMFree below.
+  {
+    // this buffer holds some of the linked list nodes.
+    clMemWrapper nodes = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, nodes_bufsize, pNodes, &error);
+    test_error(error, "clCreateBuffer failed.");
+
+    // split the first parent buffer into two equally sized halves
+    cl_buffer_region r;
+    r.origin = 0;
+    r.size = nodes_bufsize / 2;
+    // this should inherit the flag settings from nodes buffer
+    clMemWrapper nodes_sb1 = clCreateSubBuffer(nodes, 0, CL_BUFFER_CREATE_TYPE_REGION, (void*)&r, &error);
+    test_error(error, "clCreateSubBuffer");
+    r.origin = nodes_bufsize / 2;
+    clMemWrapper nodes_sb2 = clCreateSubBuffer(nodes, 0, CL_BUFFER_CREATE_TYPE_REGION, (void*)&r, &error);
+    test_error(error, "clCreateSubBuffer");
+
+
+    // this buffer holds some of the linked list nodes.
+    clMemWrapper nodes2 = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, nodes_bufsize, pNodes2, &error);
+    test_error(error, "clCreateBuffer failed.");
+    r.origin = 0;
+    r.size = nodes_bufsize / 2;
+    // this should inherit the flag settings from nodes2 buffer
+    clMemWrapper nodes2_sb1 = clCreateSubBuffer(nodes2, 0, CL_BUFFER_CREATE_TYPE_REGION, (void*)&r, &error);
+    test_error(error, "clCreateSubBuffer");
+    r.origin = nodes_bufsize / 2;
+    clMemWrapper nodes2_sb2 = clCreateSubBuffer(nodes2, 0, CL_BUFFER_CREATE_TYPE_REGION,(void*)&r, &error);
+    test_error(error, "clCreateSubBuffer");
+
+
+
+    // this buffer holds the index into the nodes buffer that is used for node allocation
+    clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
+    test_error(error, "clCreateBuffer failed.");
+
+    // this buffer holds the count of correct nodes which is computed by the verify kernel.
+    clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
+    test_error(error, "clCreateBuffer failed.");
+
+    // Only the four sub-buffers (never the parents) are passed to the kernels.
+    error |= clSetKernelArg(kernel_create_lists, 0, sizeof(void*), (void *) &nodes_sb1);
+    error |= clSetKernelArg(kernel_create_lists, 1, sizeof(void*), (void *) &nodes2_sb1);
+    error |= clSetKernelArg(kernel_create_lists, 2, sizeof(void*), (void *) &nodes_sb2);
+    error |= clSetKernelArg(kernel_create_lists, 3, sizeof(void*), (void *) &nodes2_sb2);
+    error |= clSetKernelArg(kernel_create_lists, 4, sizeof(void*), (void *) &allocator);
+    error |= clSetKernelArg(kernel_create_lists, 5, sizeof(cl_int),(void *) &ListLength);
+
+    error |= clSetKernelArg(kernel_verify_lists, 0, sizeof(void*), (void *) &nodes_sb1);
+    error |= clSetKernelArg(kernel_verify_lists, 1, sizeof(void*), (void *) &nodes2_sb1);
+    error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(void*), (void *) &nodes_sb2);
+    error |= clSetKernelArg(kernel_verify_lists, 3, sizeof(void*), (void *) &nodes2_sb2);
+    error |= clSetKernelArg(kernel_verify_lists, 4, sizeof(void*), (void *) &num_correct);
+    error |= clSetKernelArg(kernel_verify_lists, 5, sizeof(cl_int),(void *) &ListLength);
+    test_error(error, "clSetKernelArg failed");
+
+    // Create linked list on one device and verify on another device (or the host).
+    // Do this for all possible combinations of devices and host within the platform.
+    for (int ci=0; ci<(int)num_devices+1; ci++)  // ci is CreationIndex, index of device/q to create linked list on
+    {
+      for (int vi=0; vi<(int)num_devices+1; vi++)  // vi is VerificationIndex, index of device/q to verify linked list on
+      {
+        if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
+        {
+          error = create_linked_lists_on_host_sb(queues[0], nodes, nodes2, ListLength, numLists);
+          if(error) return -1;
+        }
+        else
+        {
+          error = create_linked_lists_on_device(ci, queues[ci], allocator, kernel_create_lists, numLists);
+          if(error) return -1;
+        }
+
+        if(vi == num_devices)
+        {
+          error = verify_linked_lists_on_host_sb(vi, queues[0], nodes, nodes2, ListLength, numLists);
+          if(error) return -1;
+        }
+        else
+        {
+          error = verify_linked_lists_on_device(vi, queues[vi], num_correct, kernel_verify_lists, ListLength, numLists);
+          if(error) return -1;
+        }
+      } // inner loop, vi
+    } // outer loop, ci
+  }
+  clSVMFree(context, pNodes2);
+  clSVMFree(context, pNodes);
+
+  return 0;
+}
--- a/test_conformance/allocations/CMakeLists.txt
+++ b/test_conformance/allocations/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Module name for this test directory. ../CMakeCommon.txt derives the source
+# list variable name and the output executable name from it.
+set(MODULE_NAME ALLOCATIONS)
+
+# Sources of the allocations test followed by the shared harness sources
+# (under test_common/harness) that are compiled into the same executable.
+set(${MODULE_NAME}_SOURCES
+        main.cpp
+        allocation_execute.cpp
+        allocation_fill.cpp
+        allocation_functions.cpp
+        allocation_utils.cpp
+        ../../test_common/harness/errorHelpers.c
+        ../../test_common/harness/threadTesting.c
+        ../../test_common/harness/kernelHelpers.c
+        ../../test_common/harness/testHarness.c
+        ../../test_common/harness/typeWrappers.cpp
+        ../../test_common/harness/mt19937.c
+        ../../test_common/harness/msvc9.c
+        ../../test_common/harness/parseParameters.cpp
+)
+
+# Shared rules: adds the executable built from ${MODULE_NAME}_SOURCES, marks the
+# sources as C++ and links the target against ${CLConform_LIBRARIES}.
+include(../CMakeCommon.txt)
--- a/test_conformance/allocations/Jamfile
+++ b/test_conformance/allocations/Jamfile
@@ -0,0 +1,19 @@
+# Boost.Build project for the allocations conformance test.
+project
+    : requirements
+# Disabled per-toolset flags that would force the sources to be compiled as C++.
+#      <toolset>gcc:<cflags>-xc++
+#      <toolset>msvc:<cflags>"/TP"
+    ;
+
+# The test executable, built from the allocation test sources in this directory.
+exe test_allocations
+    : allocation_execute.cpp
+      allocation_fill.cpp
+      allocation_functions.cpp
+      allocation_utils.cpp
+      main.cpp
+    ;
+
+# Installs test_allocations into a per-variant (debug/release) directory under $(DIST).
+install dist
+    : test_allocations
+    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance/allocations
+      <variant>release:<location>$(DIST)/release/tests/test_conformance/allocations
+    ;
--- a/test_conformance/allocations/Makefile
+++ b/test_conformance/allocations/Makefile
@@ -0,0 +1,46 @@
+# Optional ATF support: define BUILD_WITH_ATF to link the ATF framework and
+# compile with -DUSE_ATF.
+ifdef BUILD_WITH_ATF
+ATF = -framework ATF
+USE_ATF = -DUSE_ATF
+endif
+
+# Test sources followed by the shared harness sources they are built with.
+SRCS = main.cpp \
+		allocation_functions.cpp \
+		allocation_fill.cpp  \
+		allocation_utils.cpp \
+		allocation_execute.cpp \
+		  ../../test_common/harness/errorHelpers.c \
+		  ../../test_common/harness/threadTesting.c \
+		  ../../test_common/harness/kernelHelpers.c \
+		  ../../test_common/harness/testHarness.c \
+                  ../../test_common/harness/mt19937.c \
+		  ../../test_common/harness/typeWrappers.cpp
+
+# Preprocessor symbols; each entry is passed to the compiler as -D<entry>.
+DEFINES = DONT_TEST_GARBAGE_POINTERS
+
+SOURCES = $(abspath $(SRCS))
+LIBPATH += -L/System/Library/Frameworks/OpenCL.framework/Libraries
+LIBPATH += -L.
+FRAMEWORK = $(SOURCES)
+HEADERS =
+TARGET = test_allocations
+INCLUDE =
+COMPILERFLAGS = -c -Wall -g -Wshorten-64-to-32 -Os
+# Both .c and .cpp sources go through the C++ driver.
+CC = c++
+CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+CXXFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+LIBRARIES = -framework OpenCL -framework OpenGL -framework GLUT -framework AppKit ${ATF}
+
+# Object list: every .c/.cpp source mapped to a .o next to it.
+OBJECTS := ${SOURCES:.c=.o}
+OBJECTS := ${OBJECTS:.cpp=.o}
+
+TARGETOBJECT =
+
+# all and clean name actions, not files; declaring them phony keeps a stray
+# file called "all" or "clean" from silently disabling the target.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(OBJECTS)
+	$(CC) $(RC_CFLAGS) $(OBJECTS) -o $@ $(LIBPATH) $(LIBRARIES)
+
+clean:
+	rm -f $(TARGET) $(OBJECTS)
+
+# Fallback for any target without a rule: report it instead of failing silently.
+.DEFAULT:
+	@echo The target \"$@\" does not exist in Makefile.
--- a/test_conformance/allocations/allocation_execute.cpp
+++ b/test_conformance/allocations/allocation_execute.cpp
@@ -0,0 +1,333 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "allocation_execute.h"
+#include "allocation_functions.h"
+
+
+// printf-style template for the buffer test kernel. The first %s receives the
+// generated "__global uint *bufferN, " parameter list and the second %s the
+// generated per-buffer accumulation statements (both built in execute_kernel).
+const char *buffer_kernel_pattern = {
+    "__kernel void sample_test(%s __global uint *result, __global uint *array_sizes, uint per_item)\n"
+    "{\n"
+    "\tint tid = get_global_id(0);\n"
+    "\tuint r = 0;\n"
+    "\tuint i;\n"
+    "\tfor(i=tid*per_item; i<(1+tid)*per_item; i++) {\n"
+    "%s"
+    "\t}\n"
+    "\tresult[tid] = r;\n"
+    "}\n" };
+
+// printf-style template for the image test kernels. The three %s receive, in
+// order: the generated image parameter list, a one-line declaration
+// (sampler_pattern for reads, offset_pattern for writes) and the generated
+// per-image access code.
+const char *image_kernel_pattern = {
+    "__kernel void sample_test(%s __global uint *result)\n"
+    "{\n"
+    "\tuint4 color;\n"
+    "\tcolor = (uint4)(0);\n"
+    "%s"
+    "\tint x, y;\n"
+    "%s"
+    "\tresult[get_global_id(0)] += color.x + color.y + color.z + color.w;\n"
+    "}\n" };
+
+// Per-image read code. Format arguments: image index (%d), the modulo operator
+// passed as the string "%" (%s), then the image index twice more (%d, %d).
+const char *read_pattern = {
+    "\tfor(y=0; y<get_image_height(image%d); y++)\n"
+    "\t\tif (y %s get_global_size(0) == get_global_id(0))\n"
+    "\t\t\tfor (x=0; x<get_image_width(image%d); x++) {\n"
+    "\t\t\t\tcolor += read_imageui(image%d, sampler, (int2)(x,y));\n"
+    "\t\t\t}\n"
+};
+
+// Declaration inserted into image write kernels: the per-component offset
+// added to x*y by write_pattern.
+const char *offset_pattern =
+"\tconst uint4 offset = (uint4)(0,1,2,3);\n";
+
+// Declaration inserted into image read kernels: the sampler used by read_pattern.
+const char *sampler_pattern =
+"\tconst sampler_t sampler = CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE;\n";
+
+
+// Per-image write code. Takes the same format arguments as read_pattern. Each
+// pixel (x,y) is written as x*y+offset, which is the value check_image expects.
+const char *write_pattern = {
+    "\tfor(y=0; y<get_image_height(image%d); y++)\n"
+    "\t\tif (y %s get_global_size(0) == get_global_id(0))\n"
+    "\t\t\tfor (x=0; x<get_image_width(image%d); x++) {\n"
+    "\t\t\t\tcolor = (uint4)x*(uint4)y+offset;\n"
+    "\t\t\t\twrite_imageui(image%d, (int2)(x,y), color);\n"
+    "\t\t\t}\n"
+    "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"
+};
+
+
+// Reads a 2D image back one row at a time and verifies that component j of
+// pixel (x, y) equals x*y+j, i.e. the values produced by the image write kernel.
+//
+// queue - command queue used for the blocking row reads.
+// mem   - memory object to verify; must be a 2D image.
+//
+// Returns 0 when every component matches, FAILED_ABORT when the host row buffer
+// cannot be allocated, the OpenCL error code when a row read fails, and -1 for
+// any other failure (query error, non-image2d object, mismatching pixel).
+int check_image(cl_command_queue queue, cl_mem mem) {
+    int error;
+    cl_mem_object_type type;
+    size_t width = 0, height = 0;
+    size_t origin[3], region[3], x, j;
+    cl_uint *data;
+
+    error = clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(type), &type, NULL);
+    if (error) {
+        print_error(error, "clGetMemObjectInfo failed for CL_MEM_TYPE.");
+        return -1;
+    }
+
+    if (type == CL_MEM_OBJECT_BUFFER) {
+        log_error("Expected image object, not buffer.\n");
+        return -1;
+    } else if (type == CL_MEM_OBJECT_IMAGE2D) {
+        error = clGetImageInfo(mem, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
+        if (error) {
+            print_error(error, "clGetImageInfo failed for CL_IMAGE_WIDTH.");
+            return -1;
+        }
+        error = clGetImageInfo(mem, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+        if (error) {
+            print_error(error, "clGetImageInfo failed for CL_IMAGE_HEIGHT.");
+            return -1;
+        }
+    } else {
+        // Any other object type has no width/height queried above, so there is
+        // nothing meaningful to verify; fail instead of using unset dimensions.
+        log_error("Expected 2D image object, got CL_MEM_TYPE %d.\n", (int)type);
+        return -1;
+    }
+
+
+    // One row of RGBA pixels with a cl_uint per component.
+    data = (cl_uint*)malloc(width*4*sizeof(cl_uint));
+    if (data == NULL) {
+        log_error("Failed to malloc host buffer for writing into image.\n");
+        return FAILED_ABORT;
+    }
+    origin[0] = 0;
+    origin[1] = 0;
+    origin[2] = 0;
+    region[0] = width;
+    region[1] = 1;
+    region[2] = 1;
+    for (origin[1] = 0; origin[1] < height; origin[1]++) {
+        error = clEnqueueReadImage(queue, mem, CL_TRUE, origin, region, 0, 0, data, 0, NULL, NULL);
+        if (error) {
+            print_error(error, "clEnqueueReadImage failed");
+            free(data);
+            return error;
+        }
+
+        for (x=0; x<width; x++) {
+            for (j=0; j<4; j++) {
+                if (data[x*4+j] != (cl_uint)(x*origin[1]+j)) {
+                    log_error("Pixel %d, %d, component %d, expected %u, got %u.\n",
+                              (int)x, (int)origin[1], (int)j, (cl_uint)(x*origin[1]+j), data[x*4+j]);
+                    // Release the row buffer on the mismatch path as well.
+                    free(data);
+                    return -1;
+                }
+            }
+        }
+    }
+    free(data);
+    return 0;
+}
+
+
+// Number of work-items launched by execute_kernel and number of entries in its
+// result buffer. Parenthesized so it expands safely inside larger expressions.
+#define NUM_OF_WORK_ITEMS (8192*2)
+
+// Generates, builds and runs the "sample_test" kernel over the given memory
+// objects and optionally verifies what it computed.
+//
+// context, queue, device_id - OpenCL objects to run on; queue is passed by
+//                             pointer because check_allocation_error takes it
+//                             that way.
+// test                - selects the kernel flavour: BUFFER / BUFFER_NON_BLOCKING,
+//                       IMAGE_READ / IMAGE_READ_NON_BLOCKING or
+//                       IMAGE_WRITE / IMAGE_WRITE_NON_BLOCKING.
+// mems                - memory objects bound as kernel arguments 0..n-1.
+// number_of_mems_used - number of valid entries in mems
+//                       (presumably <= MAX_NUMBER_TO_ALLOCATE; the source
+//                       scratch buffers are sized for that many).
+// verify_checksum     - when zero, the kernel is only executed; otherwise the
+//                       summed results are compared against the global
+//                       `checksum` (buffer and image-read tests) or the image
+//                       contents are checked with check_image (image-write tests).
+//
+// Returns SUCCEEDED on success, the check_allocation_error() classification
+// when building, enqueueing or finishing fails, FAILED_ABORT when a host
+// allocation or the verification fails, and whatever the test_error* macros
+// return for the remaining OpenCL call failures.
+int execute_kernel(cl_context context, cl_command_queue *queue, cl_device_id device_id, int test, cl_mem mems[], int number_of_mems_used, int verify_checksum) {
+
+    char *argument_string;
+    char *access_string;
+    char *kernel_string;
+    int i, error, result;
+    clKernelWrapper kernel;
+    clProgramWrapper program;
+    clMemWrapper result_mem;
+    char *ptr;
+    size_t global_dims[3];
+    size_t pattern_length;
+    cl_uint per_item;
+    cl_uint returned_results[NUM_OF_WORK_ITEMS], final_result;
+    clEventWrapper event;
+    cl_int event_status;
+
+    // The per-image access code is generated from either read_pattern or
+    // write_pattern; size the scratch strings for the longer of the two so the
+    // write kernels cannot overrun them.
+    pattern_length = strlen(read_pattern);
+    if (strlen(write_pattern) > pattern_length)
+        pattern_length = strlen(write_pattern);
+
+    // Allocate memory for the kernel source
+    argument_string = (char*)malloc(sizeof(char)*MAX_NUMBER_TO_ALLOCATE*64);
+    access_string = (char*)malloc(sizeof(char)*MAX_NUMBER_TO_ALLOCATE*(pattern_length+10));
+    kernel_string = (char*)malloc(sizeof(char)*MAX_NUMBER_TO_ALLOCATE*(pattern_length+10+64)+1024);
+    if (argument_string == NULL || access_string == NULL || kernel_string == NULL) {
+        log_error("Failed to malloc host memory for the kernel source.\n");
+        free(argument_string);
+        free(access_string);
+        free(kernel_string);
+        return FAILED_ABORT;
+    }
+    argument_string[0] = '\0';
+    access_string[0] = '\0';
+    kernel_string[0] = '\0';
+
+    // Zero the results.
+    for (i=0; i<NUM_OF_WORK_ITEMS; i++)
+        returned_results[i] = 0;
+
+    // Build the kernel source
+    if (test == BUFFER || test == BUFFER_NON_BLOCKING) {
+        for(i=0; i<number_of_mems_used; i++) {
+            sprintf(argument_string + strlen(argument_string), " __global uint *buffer%d, ", i);
+            sprintf(access_string + strlen( access_string), "\t\tif (i<array_sizes[%d]) r += buffer%d[i];\n", i, i);
+        }
+        sprintf(kernel_string, buffer_kernel_pattern, argument_string, access_string);
+    }
+    else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING) {
+        for(i=0; i<number_of_mems_used; i++) {
+            sprintf(argument_string + strlen(argument_string), " read_only image2d_t image%d, ", i);
+            // "%" is passed as an argument because the pattern is itself a format string.
+            sprintf(access_string + strlen(access_string), read_pattern, i, "%", i, i);
+        }
+        sprintf(kernel_string, image_kernel_pattern, argument_string, sampler_pattern, access_string);
+    }
+    else if (test == IMAGE_WRITE || test == IMAGE_WRITE_NON_BLOCKING) {
+        for(i=0; i<number_of_mems_used; i++) {
+            sprintf(argument_string + strlen(argument_string), " write_only image2d_t image%d, ", i);
+            sprintf(access_string + strlen( access_string), write_pattern, i, "%", i, i);
+        }
+        sprintf(kernel_string, image_kernel_pattern, argument_string, offset_pattern, access_string);
+    }
+    ptr = kernel_string;
+
+    // Create the kernel
+    error = create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&ptr, "sample_test" );
+
+    free(argument_string);
+    free(access_string);
+    free(kernel_string);
+
+    result = check_allocation_error(context, device_id, error, queue);
+    if (result != SUCCEEDED) {
+        if (result == FAILED_TOO_BIG)
+            log_info("\t\tCreate kernel failed: %s.\n", IGetErrorString(error));
+        else
+            print_error(error, "Create kernel and program failed");
+        return result;
+    }
+
+    // Set the arguments
+    for (i=0; i<number_of_mems_used; i++) {
+        error = clSetKernelArg(kernel, i, sizeof(cl_mem), &mems[i]);
+        test_error(error, "clSetKernelArg failed");
+    }
+
+    // Set the result; i == number_of_mems_used here, the slot right after the mems.
+    result_mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint)*NUM_OF_WORK_ITEMS, &returned_results, &error);
+    test_error(error, "clCreateBuffer failed");
+    // The argument is a cl_mem handle, so pass sizeof(cl_mem) rather than the
+    // size of the wrapper object.
+    error = clSetKernelArg(kernel, i, sizeof(cl_mem), &result_mem);
+    test_error(error, "clSetKernelArg failed");
+
+    // Thread dimensions for execution
+    global_dims[0] = NUM_OF_WORK_ITEMS; global_dims[1] = 1; global_dims[2] = 1;
+
+    // We have extra arguments for the buffer kernel because we need to pass in the buffer sizes
+    clMemWrapper buffer_sizes;
+    if (test == BUFFER || test == BUFFER_NON_BLOCKING) {
+        cl_uint max_size = 0;
+        int size_overflow = 0;
+        cl_uint *sizes = (cl_uint*)malloc(sizeof(cl_uint)*number_of_mems_used);
+        if (sizes == NULL) {
+            log_error("Failed to malloc host buffer for the buffer sizes.\n");
+            return FAILED_ABORT;
+        }
+        for (i=0; i<number_of_mems_used; i++) {
+            size_t size;
+            size_t elements;
+            error = clGetMemObjectInfo(mems[i], CL_MEM_SIZE, sizeof(size), &size, NULL);
+            // test_error_abort is expected to leave the function on failure, so
+            // release the host array before handing control to it.
+            if (error != CL_SUCCESS)
+                free(sizes);
+            test_error_abort(error, "clGetMemObjectInfo failed for CL_MEM_SIZE.");
+            elements = size/sizeof(cl_uint);
+            // The kernel receives the element counts as uint; remember if one does not fit.
+            if (elements > CL_UINT_MAX)
+                size_overflow = 1;
+            sizes[i] = (cl_uint)elements;
+            if (elements > max_size)
+                max_size = (cl_uint)elements;
+        }
+        buffer_sizes = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_uint)*number_of_mems_used, sizes, &error);
+        // CL_MEM_COPY_HOST_PTR copies the host data while the buffer is created,
+        // so the host array is no longer needed whether or not creation succeeded.
+        free(sizes);
+        test_error_abort(error, "clCreateBuffer failed");
+        error = clSetKernelArg(kernel, number_of_mems_used+1, sizeof(cl_mem), &buffer_sizes);
+        test_error(error, "clSetKernelArg failed");
+        if (size_overflow)
+            log_error("Size is too large for a uint parameter to the kernel. Expect invalid results.\n");
+        // Number of elements each work-item has to walk to cover the largest buffer.
+        per_item = (cl_uint)ceil((double)max_size/global_dims[0]);
+        error = clSetKernelArg(kernel, number_of_mems_used+2, sizeof(per_item), &per_item);
+        test_error(error, "clSetKernelArg failed");
+    }
+
+    size_t local_dims[3] = {1,1,1};
+    error = get_max_common_work_group_size(context, kernel, global_dims[0], &local_dims[0]);
+    test_error(error, "get_max_common_work_group_size failed");
+
+    // Execute the kernel
+    error = clEnqueueNDRangeKernel(*queue, kernel, 1, NULL, global_dims, local_dims, 0, NULL, &event);
+    result = check_allocation_error(context, device_id, error, queue);
+    if (result != SUCCEEDED) {
+        if (result == FAILED_TOO_BIG)
+            log_info("\t\tExecute kernel failed: %s (global dim: %ld, local dim: %ld)\n", IGetErrorString(error), (long)global_dims[0], (long)local_dims[0]);
+        else
+            print_error(error, "clEnqueueNDRangeKernel failed");
+        return result;
+    }
+
+    // Finish the test
+    error = clFinish(*queue);
+
+    result = check_allocation_error(context, device_id, error, queue);
+
+    if (result != SUCCEEDED) {
+        if (result == FAILED_TOO_BIG)
+            log_info("\t\tclFinish failed: %s.\n", IGetErrorString(error));
+        else
+            print_error(error, "clFinish failed");
+        return result;
+    }
+
+    // Verify that the event from the execution did not have an error
+    error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
+    test_error_abort(error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed");
+    if (event_status < 0) {
+        result = check_allocation_error(context, device_id, event_status, queue);
+        if (result != SUCCEEDED) {
+            if (result == FAILED_TOO_BIG)
+                log_info("\t\tEvent returned from kernel execution indicates failure: %s.\n", IGetErrorString(event_status));
+            else
+                print_error(event_status, "clEnqueueNDRangeKernel failed");
+            return result;
+        }
+    }
+
+    // If we are not verifying the checksum return here
+    if (!verify_checksum) {
+        log_info("Note: Allocations were not initialized so kernel execution can not verify correct results.\n");
+        return SUCCEEDED;
+    }
+
+    // Verify the checksum.
+    // Read back the result
+    error = clEnqueueReadBuffer(*queue, result_mem, CL_TRUE, 0, sizeof(cl_uint)*NUM_OF_WORK_ITEMS, &returned_results, 0, NULL, NULL);
+    test_error_abort(error, "clEnqueueReadBuffer failed");
+    final_result = 0;
+    if (test == BUFFER || test == IMAGE_READ || test == BUFFER_NON_BLOCKING || test == IMAGE_READ_NON_BLOCKING) {
+        // For buffers or read images we are just looking at the sum of what each thread summed up
+        for (i=0; i<NUM_OF_WORK_ITEMS; i++) {
+            final_result += returned_results[i];
+        }
+        if (final_result != checksum) {
+            log_error("\t\tChecksum failed to verify. Expected %u got %u.\n", checksum, final_result);
+            return FAILED_ABORT;
+        }
+        log_info("\t\tChecksum verified (%u == %u).\n", checksum, final_result);
+    } else {
+        // For write images we need to verify the values
+        for (i=0; i<number_of_mems_used; i++) {
+            if (check_image(*queue, mems[i])) {
+                log_error("\t\tImage contents failed to verify for image %d.\n", (int)i);
+                return FAILED_ABORT;
+            }
+        }
+        log_info("\t\tImage contents verified.\n");
+    }
+
+    // Finish the test
+    error = clFinish(*queue);
+    result = check_allocation_error(context, device_id, error, queue);
+    if (result != SUCCEEDED) {
+        if (result == FAILED_TOO_BIG)
+            log_info("\t\tclFinish failed: %s.\n", IGetErrorString(error));
+        else
+            print_error(error, "clFinish failed");
+        return result;
+    }
+
+    return SUCCEEDED;
+}
+
+
--- a/test_conformance/allocations/allocation_execute.h
+++ b/test_conformance/allocations/allocation_execute.h
@@ -0,0 +1,22 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "allocation_utils.h"
+
+
+// Generates, builds and runs the "sample_test" kernel over mems[0..number_of_mems_used-1].
+// `test` selects the buffer, image-read or image-write kernel flavour; when
+// verify_checksum is non-zero the kernel output is verified after execution.
+// Returns SUCCEEDED on success and a failure code otherwise.
+int execute_kernel(cl_context context, cl_command_queue *queue, cl_device_id device_id, int test, cl_mem mems[], int number_of_mems_used, int verify_checksum);
+
+
--- a/test_conformance/allocations/allocation_fill.cpp
+++ b/test_conformance/allocations/allocation_fill.cpp
@@ -0,0 +1,338 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "allocation_fill.h"
+
+// Largest number of bytes staged on the host and written per clEnqueueWriteBuffer call.
+// Parenthesized so the product expands safely inside larger expressions.
+#define BUFFER_CHUNK_SIZE (8*1024*1024)
+// Largest number of image rows staged on the host and written per clEnqueueWriteImage call.
+#define IMAGE_LINES 8
+
+#include "../../test_common/harness/compat.h"
+
+// Writes `bytes` bytes from `data` into `mem` at byte offset `offset`, using
+// either a blocking write or a non-blocking write followed by a wait on its
+// event. Every OpenCL status is classified with check_allocation_error().
+// Returns that classification for the first failing call, or for the last call
+// when all succeed. On failure the queue is drained with clFinish; releasing
+// `data` and `mem` is left to the caller.
+static int write_buffer_chunk(cl_context context, cl_device_id device_id, cl_command_queue *queue, cl_mem mem, size_t offset, size_t bytes, const cl_uint *data, cl_bool blocking_write) {
+  int error, result;
+  cl_event event;
+
+  if (blocking_write) {
+    error = clEnqueueWriteBuffer(*queue, mem, CL_TRUE, offset, bytes, data, 0, NULL, NULL);
+    result = check_allocation_error(context, device_id, error, queue);
+
+    if (result == FAILED_ABORT) {
+      print_error(error, "clEnqueueWriteBuffer failed.");
+    }
+
+    if (result != SUCCEEDED) {
+      clFinish(*queue);
+    }
+    return result;
+  }
+
+  error = clEnqueueWriteBuffer(*queue, mem, CL_FALSE, offset, bytes, data, 0, NULL, &event);
+  result = check_allocation_error(context, device_id, error, queue);
+
+  if (result == FAILED_ABORT) {
+    print_error(error, "clEnqueueWriteBuffer failed.");
+  }
+
+  if (result != SUCCEEDED) {
+    // No event to release here: the enqueue itself failed.
+    clFinish(*queue);
+    return result;
+  }
+
+  error = clWaitForEvents(1, &event);
+  result = check_allocation_error(context, device_id, error, queue);
+
+  if (result == FAILED_ABORT) {
+    print_error(error, "clWaitForEvents failed.");
+  }
+
+  if (result != SUCCEEDED) {
+    clFinish(*queue);
+  }
+
+  clReleaseEvent(event);
+  return result;
+}
+
+
+// Fills the first `size` bytes of buffer `mem` with random cl_uint values drawn
+// from `d`, writing at most BUFFER_CHUNK_SIZE bytes per transfer, and adds the
+// sum of the written values to the global `checksum`.
+//
+// blocking_write selects blocking writes or non-blocking writes that are waited
+// on individually.
+//
+// Returns SUCCEEDED on success. On failure returns FAILED_ABORT (host allocation
+// failed) or the check_allocation_error() classification of the failing call;
+// in the latter case `mem` has been released and `checksum` is left untouched.
+int fill_buffer_with_data(cl_context context, cl_device_id device_id, cl_command_queue *queue, cl_mem mem, size_t size, MTdata d, cl_bool blocking_write) {
+  size_t i, j;
+  cl_uint *data;
+  int result;
+  cl_uint checksum_delta = 0;
+
+  size_t size_to_use = BUFFER_CHUNK_SIZE;
+  if (size_to_use > size)
+    size_to_use = size;
+
+  data = (cl_uint*)malloc(size_to_use);
+  if (data == NULL) {
+    log_error("Failed to malloc host buffer for writing into buffer.\n");
+    return FAILED_ABORT;
+  }
+
+  // Full chunks; the final (possibly partial) chunk is handled after the loop.
+  for (i=0; i<size-size_to_use; i+=size_to_use) {
+    // Put values in the data, and keep a checksum as we go along.
+    for (j=0; j<size_to_use/sizeof(cl_uint); j++) {
+      data[j] = genrand_int32(d);
+      checksum_delta += data[j];
+    }
+
+    result = write_buffer_chunk(context, device_id, queue, mem, i, size_to_use, data, blocking_write);
+    if (result != SUCCEEDED) {
+      free(data);
+      clReleaseMemObject(mem);
+      return result;
+    }
+  }
+
+  // Deal with any leftover bits
+  if (i < size) {
+    // Put values in the data, and keep a checksum as we go along.
+    for (j=0; j<(size-i)/sizeof(cl_uint); j++) {
+      data[j] = (cl_uint)genrand_int32(d);
+      checksum_delta += data[j];
+    }
+
+    result = write_buffer_chunk(context, device_id, queue, mem, i, size-i, data, blocking_write);
+    if (result != SUCCEEDED) {
+      free(data);
+      clReleaseMemObject(mem);
+      return result;
+    }
+  }
+
+  free(data);
+  // Only update the checksum if this succeeded.
+  checksum += checksum_delta;
+  return SUCCEEDED;
+}
+
+
+// Writes the rows described by `origin`/`region` from `data` into image `mem`.
+// A blocking write is followed by clFinish; a non-blocking write is followed by
+// a wait on its event. If the wait reports
+// CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, the event's own execution
+// status is used as the error instead. Every status is classified with
+// check_allocation_error(), and that classification is returned for the first
+// failing call, or for the last call when all succeed. Releasing `data` and
+// `mem` is left to the caller.
+static int write_image_rows(cl_context context, cl_device_id device_id, cl_command_queue *queue, cl_mem mem, const size_t *origin, const size_t *region, const cl_uint *data, cl_bool blocking_write) {
+  int error, result;
+  cl_event event;
+
+  if (blocking_write) {
+    error = clEnqueueWriteImage(*queue, mem, CL_TRUE, origin, region, 0, 0, data, 0, NULL, NULL);
+    result = check_allocation_error(context, device_id, error, queue);
+
+    if (result == FAILED_ABORT) {
+      print_error(error, "clEnqueueWriteImage failed.");
+    }
+
+    if (result != SUCCEEDED) {
+      clFinish(*queue);
+      return result;
+    }
+
+    // Classify the clFinish status like every other call so callers only ever
+    // see SUCCEEDED / FAILED_* codes, and report the status that actually failed.
+    error = clFinish(*queue);
+    result = check_allocation_error(context, device_id, error, queue);
+    if (result == FAILED_ABORT) {
+      print_error(error, "clFinish failed after successful enquing filling buffer with data.");
+    }
+    return result;
+  }
+
+  error = clEnqueueWriteImage(*queue, mem, CL_FALSE, origin, region, 0, 0, data, 0, NULL, &event);
+  result = check_allocation_error(context, device_id, error, queue);
+
+  if (result == FAILED_ABORT) {
+    print_error(error, "clEnqueueWriteImage failed.");
+  }
+
+  if (result != SUCCEEDED) {
+    // No event to release here: the enqueue itself failed.
+    clFinish(*queue);
+    return result;
+  }
+
+  error = clWaitForEvents(1, &event);
+
+  // Dig out execution error if that is the problem
+  if (error == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) {
+    cl_int err, exec_status;
+    err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(exec_status), &exec_status, NULL);
+    // test_error is expected to return on failure, so drop the event first.
+    if (err != CL_SUCCESS) {
+      clReleaseEvent(event);
+    }
+    test_error(err, "clGetEventInfo failed getting CL_EVENT_COMMAND_EXECUTION_STATUS from failed event");
+    error = exec_status;
+  }
+
+  result = check_allocation_error(context, device_id, error, queue);
+
+  if (result == FAILED_ABORT) {
+    print_error(error, "clWaitForEvents failed.");
+  }
+
+  if (result != SUCCEEDED) {
+    clFinish(*queue);
+  }
+
+  clReleaseEvent(event);
+  return result;
+}
+
+
+// Fills the width x height 2D image `mem` with random cl_uint components drawn
+// from `d` (4 per pixel), writing at most IMAGE_LINES rows per transfer, and
+// adds the sum of the written values to the global `checksum`.
+//
+// blocking_write selects blocking writes or non-blocking writes that are waited
+// on individually.
+//
+// Returns SUCCEEDED on success. On failure returns FAILED_ABORT (host allocation
+// failed) or the failure code of the failing write; in the latter case `mem`
+// has been released and `checksum` is left untouched.
+int fill_image_with_data(cl_context context, cl_device_id device_id, cl_command_queue *queue, cl_mem mem, size_t width, size_t height, MTdata d, cl_bool blocking_write) {
+  size_t origin[3], region[3], j;
+  int result;
+  cl_uint *data;
+  cl_uint checksum_delta = 0;
+
+  size_t image_lines_to_use;
+  image_lines_to_use = IMAGE_LINES;
+  if (image_lines_to_use > height)
+      image_lines_to_use = height;
+
+  data = (cl_uint*)malloc(width*4*sizeof(cl_uint)*image_lines_to_use);
+  if (data == NULL) {
+    log_error("Failed to malloc host buffer for writing into image.\n");
+    return FAILED_ABORT;
+  }
+  origin[0] = 0;
+  origin[1] = 0;
+  origin[2] = 0;
+  region[0] = width;
+  region[1] = image_lines_to_use;
+  region[2] = 1;
+
+  // Full groups of rows; the final (possibly partial) group is handled after the loop.
+  for (origin[1] = 0; origin[1] < height - image_lines_to_use; origin[1] += image_lines_to_use) {
+    // Put values in the data, and keep a checksum as we go along.
+    for (j=0; j<width*4*image_lines_to_use; j++) {
+      data[j] = (cl_uint)genrand_int32(d);
+      checksum_delta += data[j];
+    }
+
+    result = write_image_rows(context, device_id, queue, mem, origin, region, data, blocking_write);
+    if (result != SUCCEEDED) {
+      clReleaseMemObject(mem);
+      free(data);
+      return result;
+    }
+  }
+
+  // Deal with any leftover bits
+  if (origin[1] < height) {
+    // Put values in the data, and keep a checksum as we go along.
+    for (j=0; j<width*4*(height-origin[1]); j++) {
+      data[j] = (cl_uint)genrand_int32(d);
+      checksum_delta += data[j];
+    }
+
+    region[1] = height-origin[1];
+    result = write_image_rows(context, device_id, queue, mem, origin, region, data, blocking_write);
+    if (result != SUCCEEDED) {
+      clReleaseMemObject(mem);
+      free(data);
+      return result;
+    }
+  }
+
+  free(data);
+  // Only update the checksum if this succeeded.
+  checksum += checksum_delta;
+  return SUCCEEDED;
+}
+
+
+
+// Fills `mem` with random data, dispatching on its CL_MEM_TYPE: buffers go to
+// fill_buffer_with_data with their CL_MEM_SIZE, 2D images go to
+// fill_image_with_data with their width and height. Any other object type is
+// reported and yields FAILED_ABORT. Otherwise the result of the selected fill
+// routine is returned.
+int fill_mem_with_data(cl_context context, cl_device_id device_id, cl_command_queue *queue, cl_mem mem, MTdata d, cl_bool blocking_write) {
+  cl_mem_object_type mem_type;
+  int status;
+
+  status = clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(mem_type), &mem_type, NULL);
+  test_error_abort(status, "clGetMemObjectInfo failed for CL_MEM_TYPE.");
+
+  switch (mem_type) {
+    case CL_MEM_OBJECT_BUFFER: {
+      size_t byte_count;
+
+      status = clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(byte_count), &byte_count, NULL);
+      test_error_abort(status, "clGetMemObjectInfo failed for CL_MEM_SIZE.");
+      return fill_buffer_with_data(context, device_id, queue, mem, byte_count, d, blocking_write);
+    }
+
+    case CL_MEM_OBJECT_IMAGE2D: {
+      size_t image_width, image_height;
+
+      status = clGetImageInfo(mem, CL_IMAGE_WIDTH, sizeof(image_width), &image_width, NULL);
+      test_error_abort(status, "clGetImageInfo failed for CL_IMAGE_WIDTH.");
+      status = clGetImageInfo(mem, CL_IMAGE_HEIGHT, sizeof(image_height), &image_height, NULL);
+      test_error_abort(status, "clGetImageInfo failed for CL_IMAGE_HEIGHT.");
+      return fill_image_with_data(context, device_id, queue, mem, image_width, image_height, d, blocking_write);
+    }
+
+    default:
+      break;
+  }
+
+  // Neither a buffer nor a 2D image.
+  log_error("Invalid CL_MEM_TYPE: %d\n", mem_type);
+  return FAILED_ABORT;
+}
+
+
+
--- a/test_conformance/allocations/allocation_fill.h
+++ b/test_conformance/allocations/allocation_fill.h
@@ -0,0 +1,19 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "allocation_utils.h"
+
+// Fills a buffer or 2D image with random values drawn from `d`, using blocking
+// or non-blocking writes as selected by blocking_write. Returns SUCCEEDED on
+// success and a failure code otherwise (FAILED_ABORT for other object types).
+int fill_mem_with_data(cl_context context, cl_device_id device_id, cl_command_queue *queue, cl_mem mem, MTdata d, cl_bool blocking_write);
--- a/test_conformance/allocations/allocation_functions.cpp
+++ b/test_conformance/allocations/allocation_functions.cpp
@@ -0,0 +1,287 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "allocation_functions.h"
+#include "allocation_fill.h"
+
+
+// Format used for every image created in this file: RGBA with one cl_uint per
+// channel, which is why sizes below are computed as sizeof(cl_uint)*4 per pixel.
+static cl_image_format    image_format = { CL_RGBA, CL_UNSIGNED_INT32 };
+
+// Creates a CL_MEM_READ_WRITE buffer of size_to_allocate bytes in *mem and hands
+// the creation status to check_allocation_error() for classification.
+// blocking_write is not used by this function.
+int allocate_buffer(cl_context context, cl_command_queue *queue, cl_device_id device_id, cl_mem *mem, size_t size_to_allocate, cl_bool blocking_write) {
+  int create_status;
+
+  *mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size_to_allocate, NULL, &create_status);
+
+  return check_allocation_error(context, device_id, create_status, queue);
+}
+
+
+// Chooses 2D image dimensions whose RGBA/cl_uint payload is as close as possible to
+// (without exceeding) size_to_allocate bytes, within the device's 2D image limits.
+//
+//   width, height - receive the chosen dimensions; written only on SUCCEEDED.
+//   max_size      - optional (may be NULL). On SUCCEEDED receives the byte size of the
+//                   chosen image; on FAILED_TOO_BIG receives the byte size of the
+//                   largest image the device supports.
+//
+// Returns SUCCEEDED, FAILED_TOO_BIG when the request exceeds the largest possible
+// image, or FAILED_ABORT when images are unsupported, the request is zero bytes, or
+// a device query fails.
+int find_good_image_size(cl_device_id device_id, size_t size_to_allocate, size_t *width, size_t *height, size_t* max_size) {
+  const size_t bytes_per_pixel = sizeof(cl_uint)*4;
+  size_t max_width, max_height, num_pixels, found_width, found_height;
+  cl_ulong max_pixels, max_bytes;
+  int error;
+
+  if (checkForImageSupport(device_id)) {
+    log_info("Can not allocate an image on this device because it does not support images.\n");
+    return FAILED_ABORT;
+  }
+
+  if (size_to_allocate == 0) {
+    log_error("Trying to allocate a zero sized image.\n");
+    return FAILED_ABORT;
+  }
+
+  error = clGetDeviceInfo( device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( max_width ), &max_width, NULL );
+  test_error_abort(error, "clGetDeviceInfo failed.");
+  error = clGetDeviceInfo( device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( max_height ), &max_height, NULL );
+  test_error_abort(error, "clGetDeviceInfo failed.");
+
+  num_pixels = size_to_allocate / bytes_per_pixel;
+
+  // Do the limit arithmetic in 64 bits: with a 32-bit size_t the product
+  // max_width * max_height * 16 can wrap around.
+  max_pixels = (cl_ulong)max_width * (cl_ulong)max_height;
+
+  if ((cl_ulong)num_pixels > max_pixels) {
+    if(NULL != max_size) {
+      max_bytes = max_pixels * (cl_ulong)bytes_per_pixel;
+      *max_size = (max_bytes > (cl_ulong)SIZE_MAX) ? (size_t)SIZE_MAX : (size_t)max_bytes;
+    }
+    return FAILED_TOO_BIG;
+  }
+
+  // We want a close-to-square aspect ratio.
+  found_width = (size_t)sqrt( (double) num_pixels );
+  if( found_width > max_width ) {
+    found_width = max_width;
+  }
+  if (found_width == 0)
+    found_width = 1;
+
+  found_height = num_pixels/found_width;
+  if (found_height > max_height) {
+    // The near-square shape is taller than the device allows. Pin the height and
+    // widen instead; the result cannot exceed max_width because
+    // num_pixels <= max_width * max_height was established above.
+    found_height = max_height;
+    found_width = num_pixels/found_height;
+  }
+  if (found_height == 0)
+    found_height = 1;
+
+  *width = found_width;
+  *height = found_height;
+
+  if(NULL != max_size) {
+    *max_size = found_width * found_height * bytes_per_pixel;
+  }
+
+  return SUCCEEDED;
+}
+
+
+// Creates a CL_MEM_READ_ONLY 2D image sized by find_good_image_size() in *mem.
+// Returns the sizing result if it is not SUCCEEDED, otherwise the classification of
+// the creation status from check_allocation_error(). blocking_write only affects
+// the log message.
+int allocate_image2d_read(cl_context context, cl_command_queue *queue, cl_device_id device_id, cl_mem *mem, size_t size_to_allocate, cl_bool blocking_write) {
+  size_t img_width, img_height;
+  int status = find_good_image_size(device_id, size_to_allocate, &img_width, &img_height, NULL);
+
+  if (status != SUCCEEDED) {
+    return status;
+  }
+
+  const char *write_mode = blocking_write ? "blocking" : "non-blocking";
+  log_info("\t\tAttempting to allocate a %gMB read-only image (%d x %d) and fill with %s writes.\n",
+          (size_to_allocate/(1024.0*1024.0)), (int)img_width, (int)img_height, write_mode);
+
+  *mem = create_image_2d(context, CL_MEM_READ_ONLY, &image_format, img_width, img_height, 0, NULL, &status);
+  return check_allocation_error(context, device_id, status, queue);
+}
+
+
+// Creates a CL_MEM_WRITE_ONLY 2D image sized by find_good_image_size() in *mem.
+// Returns the sizing result if it is not SUCCEEDED, otherwise the classification of
+// the creation status from check_allocation_error(). blocking_write is not used.
+int allocate_image2d_write(cl_context context, cl_command_queue *queue, cl_device_id device_id, cl_mem *mem, size_t size_to_allocate, cl_bool blocking_write) {
+  size_t img_width, img_height;
+  int status = find_good_image_size(device_id, size_to_allocate, &img_width, &img_height, NULL);
+
+  if (status != SUCCEEDED) {
+    return status;
+  }
+
+  *mem = create_image_2d(context, CL_MEM_WRITE_ONLY, &image_format, img_width, img_height, 0, NULL, &status);
+  return check_allocation_error(context, device_id, status, queue);
+}
+
+// Dispatches to the allocator matching `type` (one of the BUFFER / IMAGE_* constants),
+// passing true for the blocking variants and false for the *_NON_BLOCKING ones.
+// Logs an error and returns FAILED_ABORT for any other value of `type`.
+int do_allocation(cl_context context, cl_command_queue *queue, cl_device_id device_id, size_t size_to_allocate, int type, cl_mem *mem) {
+  switch (type) {
+    case BUFFER:
+      return allocate_buffer(context, queue, device_id, mem, size_to_allocate, true);
+    case IMAGE_READ:
+      return allocate_image2d_read(context, queue, device_id, mem, size_to_allocate, true);
+    case IMAGE_WRITE:
+      return allocate_image2d_write(context, queue, device_id, mem, size_to_allocate, true);
+    case BUFFER_NON_BLOCKING:
+      return allocate_buffer(context, queue, device_id, mem, size_to_allocate, false);
+    case IMAGE_READ_NON_BLOCKING:
+      return allocate_image2d_read(context, queue, device_id, mem, size_to_allocate, false);
+    case IMAGE_WRITE_NON_BLOCKING:
+      return allocate_image2d_write(context, queue, device_id, mem, size_to_allocate, false);
+    default:
+      break;
+  }
+
+  log_error("Invalid allocation type: %d\n", type);
+  return FAILED_ABORT;
+}
+
+
+// Allocates up to size_to_allocate bytes of memory objects of the given `type`
+// (one of the BUFFER / IMAGE_* constants) and stores them in mems[].
+//
+// When multiple_allocations is non-zero the request may be spread over up to
+// MAX_NUMBER_TO_ALLOCATE objects (fewer for images, see below); otherwise exactly one
+// object is attempted. Each object starts at the largest permitted size and is
+// shrunk step by step while creation, or the optional fill, reports FAILED_TOO_BIG.
+//
+// Outputs:
+//   *number_of_mems - number of live objects in mems[]; the caller must release them,
+//                     also when this function fails.
+//   *final_size     - bytes allocated so far (0 if nothing could be allocated).
+//
+// force_fill non-zero makes every created object be written via fill_mem_with_data()
+// using the generator `d`.
+//
+// Returns SUCCEEDED when the allocation loop completes (the full amount was reached or
+// the object limit was hit), FAILED_TOO_BIG when an object could not be created at any
+// size, and FAILED_ABORT on any other error.
+int allocate_size(cl_context context, cl_command_queue *queue, cl_device_id device_id, int multiple_allocations, size_t size_to_allocate,
+                  int type, cl_mem mems[], int *number_of_mems, size_t *final_size, int force_fill, MTdata d) {
+
+  cl_ulong max_individual_allocation_size, global_mem_size;
+  int error, result;
+  size_t amount_allocated;
+  size_t reduction_amount;
+  int current_allocation;
+  size_t allocation_this_time, actual_allocation;
+
+  // Initialise both outputs up front so that an early return never leaves the caller
+  // with a garbage object count or a garbage/stale size.
+  *number_of_mems = 0;
+  *final_size = 0;
+
+  error = clGetDeviceInfo(device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_individual_allocation_size), &max_individual_allocation_size, NULL);
+  test_error_abort( error, "clGetDeviceInfo failed for CL_DEVICE_MAX_MEM_ALLOC_SIZE");
+  error = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, NULL);
+  test_error_abort( error, "clGetDeviceInfo failed for CL_DEVICE_GLOBAL_MEM_SIZE");
+
+  // A size_t cannot describe more than SIZE_MAX bytes, so cap the comparison value.
+  if (global_mem_size > (cl_ulong)SIZE_MAX) {
+    global_mem_size = (cl_ulong)SIZE_MAX;
+  }
+
+  if (size_to_allocate > global_mem_size) {
+    log_error("Can not allocate more than the global memory size.\n");
+    return FAILED_ABORT;
+  }
+
+  amount_allocated = 0;
+  current_allocation = 0;
+
+  // If allocating for images, reduce the maximum allocation size to the maximum image size.
+  // Otherwise CL_DEVICE_MAX_MEM_ALLOC_SIZE can exceed the largest image the device can
+  // create, the created image comes out smaller than requested, and the size check in
+  // the allocation loop below fails.
+  if(type == IMAGE_READ || type == IMAGE_READ_NON_BLOCKING || type == IMAGE_WRITE || type == IMAGE_WRITE_NON_BLOCKING) {
+    size_t width;
+    size_t height;
+    size_t max_size;
+    error = find_good_image_size(device_id, size_to_allocate, &width, &height, &max_size);
+    // Both outcomes below fill in max_size; anything else is a hard failure.
+    if (!(error == SUCCEEDED || error == FAILED_TOO_BIG))
+      return error;
+    if(max_size < max_individual_allocation_size)
+      max_individual_allocation_size = max_size;
+  }
+
+  // Step by which a failing sub-allocation is shrunk. It must never be 0, otherwise
+  // the shrink loop below could not make progress.
+  reduction_amount = (size_t)max_individual_allocation_size/16;
+  if (reduction_amount == 0)
+    reduction_amount = 1;
+
+  if (type == BUFFER || type == BUFFER_NON_BLOCKING) log_info("\tAttempting to allocate a buffer of size %gMB.\n", toMB(size_to_allocate));
+  else if (type == IMAGE_READ || type == IMAGE_READ_NON_BLOCKING) log_info("\tAttempting to allocate a read-only image of size %gMB.\n", toMB(size_to_allocate));
+  else if (type == IMAGE_WRITE || type == IMAGE_WRITE_NON_BLOCKING) log_info("\tAttempting to allocate a write-only image of size %gMB.\n", toMB(size_to_allocate));
+
+  // If we are only doing a single allocation, only allow 1
+  int max_to_allocate = multiple_allocations ? MAX_NUMBER_TO_ALLOCATE : 1;
+
+  // Make sure that the maximum number of images allocated is constrained by the
+  // maximum that may be passed to a kernel
+  if (type != BUFFER && type != BUFFER_NON_BLOCKING) {
+    cl_device_info param_name = (type == IMAGE_READ || type == IMAGE_READ_NON_BLOCKING) ?
+      CL_DEVICE_MAX_READ_IMAGE_ARGS : CL_DEVICE_MAX_WRITE_IMAGE_ARGS;
+
+    cl_uint max_image_args;
+    error = clGetDeviceInfo(device_id, param_name, sizeof(max_image_args), &max_image_args, NULL);
+    // Use the *_abort variant so that this function only ever returns its own
+    // SUCCEEDED / FAILED_* codes rather than a raw OpenCL error code.
+    test_error_abort( error, "clGetDeviceInfo failed for CL_DEVICE_MAX IMAGE_ARGS");
+
+    if ((int)max_image_args < max_to_allocate) {
+      log_info("\t\tMaximum number of images per kernel limited to %d\n",(int)max_image_args);
+      max_to_allocate =  max_image_args;
+    }
+  }
+
+
+  // Try to allocate the requested amount.
+  while (amount_allocated != size_to_allocate && current_allocation < max_to_allocate) {
+
+    // Determine how much more is needed
+    allocation_this_time = size_to_allocate - amount_allocated;
+
+    // Bound by the individual allocation size
+    if (allocation_this_time > max_individual_allocation_size)
+      allocation_this_time = (size_t)max_individual_allocation_size;
+
+    // Allocate the largest object possible
+    result = FAILED_TOO_BIG;
+    while (result == FAILED_TOO_BIG && allocation_this_time != 0) {
+
+      // Create the object
+      result = do_allocation(context, queue, device_id, allocation_this_time, type, &mems[current_allocation]);
+      if (result == SUCCEEDED) {
+        // Allocation succeeded, another memory object was added to the array
+        *number_of_mems = (current_allocation+1);
+
+        // Verify the size is correct to within 1MB.
+        actual_allocation = get_actual_allocation_size(mems[current_allocation]);
+        if (fabs((double)allocation_this_time - (double)actual_allocation) > 1024.0*1024.0) {
+          log_error("Allocation not of expected size. Expected %gMB, got %gMB.\n", toMB(allocation_this_time), toMB( actual_allocation));
+          return FAILED_ABORT;
+        }
+
+        // If we are filling the allocation for verification do so
+        if (force_fill) {
+          cl_bool blocking_write = CL_TRUE;
+          if (type == BUFFER_NON_BLOCKING || type == IMAGE_READ_NON_BLOCKING || type == IMAGE_WRITE_NON_BLOCKING) {
+            blocking_write = CL_FALSE;
+          }
+          result = fill_mem_with_data(context, device_id, queue, mems[current_allocation], d, blocking_write);
+
+          if (result == FAILED_TOO_BIG) {
+            // The object exists but could not be filled. It is about to be replaced
+            // by a smaller one in the same slot, so release it now instead of
+            // leaking it, and stop counting it as a live object.
+            clReleaseMemObject(mems[current_allocation]);
+            mems[current_allocation] = NULL;
+            *number_of_mems = current_allocation;
+          }
+        }
+      }
+
+      // If creation (or the fill) failed, try to create a smaller object
+      if (result == FAILED_TOO_BIG) {
+        if (allocation_this_time > reduction_amount)
+          allocation_this_time -= reduction_amount;
+        else if (reduction_amount > 1) {
+          // The step is larger than what is left: refine the step instead.
+          reduction_amount /= 2;
+        }
+        else {
+          // Nothing smaller left to try; terminates the retry loop.
+          allocation_this_time = 0;
+        }
+
+      }
+    }
+
+    if (result == FAILED_ABORT) {
+      log_error("\t\tAllocation failed.\n");
+      return FAILED_ABORT;
+    }
+
+    if (!allocation_this_time) {
+      log_info("\t\tFailed to allocate %gMB across several objects.\n", toMB(size_to_allocate));
+      return FAILED_TOO_BIG;
+    }
+
+    // Otherwise we succeeded
+    if (result != SUCCEEDED) {
+      log_error("Test logic error.");
+      test_finish();
+      exit(-1);
+    }
+    amount_allocated += allocation_this_time;
+
+    *final_size = amount_allocated;
+
+    current_allocation++;
+  }
+
+  log_info("\t\tSucceeded in allocating %gMB using %d memory objects.\n", toMB(amount_allocated), current_allocation);
+  return SUCCEEDED;
+}
--- a/test_conformance/allocations/allocation_functions.h
+++ b/test_conformance/allocations/allocation_functions.h
@@ -0,0 +1,24 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "allocation_utils.h"
+
+// Prototypes for the allocators defined in allocation_functions.cpp. The three
+// allocate_* helpers take a trailing blocking_write flag, matching their definitions;
+// without it these lines would declare separate overloads that are never defined.
+int do_allocation(cl_context context, cl_command_queue *queue, cl_device_id device_id, size_t size_to_allocate, int type, cl_mem *mem);
+int allocate_buffer(cl_context context, cl_command_queue *queue, cl_device_id device_id, cl_mem *mem, size_t size_to_allocate, cl_bool blocking_write);
+int allocate_image2d_read(cl_context context, cl_command_queue *queue, cl_device_id device_id, cl_mem *mem, size_t size_to_allocate, cl_bool blocking_write);
+int allocate_image2d_write(cl_context context, cl_command_queue *queue, cl_device_id device_id, cl_mem *mem, size_t size_to_allocate, cl_bool blocking_write);
+int allocate_size(cl_context context, cl_command_queue *queue, cl_device_id device_id, int multiple_allocations, size_t size_to_allocate,
+                  int type, cl_mem mems[], int *number_of_mems, size_t *final_size, int force_fill, MTdata d);
--- a/test_conformance/allocations/allocation_utils.cpp
+++ b/test_conformance/allocations/allocation_utils.cpp
@@ -0,0 +1,87 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "allocation_utils.h"
+
+// Releases the queue in *queue and replaces it with a newly created one for the same
+// context and device. The creation status is written to *error; the new queue is both
+// stored in *queue and returned.
+cl_command_queue reset_queue(cl_context context, cl_device_id device_id, cl_command_queue *queue, int *error)
+{
+  log_info("Invalid command queue. Releasing and recreating the command queue.\n");
+
+  clReleaseCommandQueue(*queue);
+
+  cl_command_queue fresh_queue = clCreateCommandQueueWithProperties(context, device_id, 0, error);
+  *queue = fresh_queue;
+  return fresh_queue;
+}
+
+// Maps an OpenCL status code from an allocation-related call onto the test's own codes:
+//   CL_SUCCESS                                  -> SUCCEEDED
+//   allocation / resource / image-size failures -> FAILED_TOO_BIG (caller retries smaller)
+//   CL_INVALID_COMMAND_QUEUE                    -> the queue is recreated; FAILED_TOO_BIG
+//                                                  if that works, FAILED_ABORT otherwise
+//   anything else                               -> FAILED_ABORT
+int check_allocation_error(cl_context context, cl_device_id device_id, int error, cl_command_queue *queue) {
+  switch (error) {
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+    case CL_OUT_OF_RESOURCES:
+    case CL_OUT_OF_HOST_MEMORY:
+    case CL_INVALID_IMAGE_SIZE:
+      return FAILED_TOO_BIG;
+
+    case CL_INVALID_COMMAND_QUEUE: {
+      // Starts out as the incoming status and is overwritten by the queue re-creation.
+      int reset_status = error;
+      *queue = reset_queue(context, device_id, queue, &reset_status);
+      if (reset_status != CL_SUCCESS) {
+        log_error("Failed to reset command queue after corrupted queue: %s\n", IGetErrorString(reset_status));
+        return FAILED_ABORT;
+      }
+      // Try again with smaller resources.
+      return FAILED_TOO_BIG;
+    }
+
+    case CL_SUCCESS:
+      return SUCCEEDED;
+
+    default:
+      log_error("Allocation failed with %s.\n", IGetErrorString(error));
+      return FAILED_ABORT;
+  }
+}
+
+
+// Converts a byte count to megabytes (1MB = 1024*1024 bytes) as a double.
+double toMB(cl_ulong size_in) {
+  const double bytes_per_mb = 1024.0*1024.0;
+  return (double)size_in/bytes_per_mb;
+}
+
+// Returns the size in bytes of `mem` as reported by the OpenCL runtime:
+//   - buffers:   CL_MEM_SIZE
+//   - 2D images: width * height * 4 * sizeof(cl_uint), i.e. it assumes four cl_uint
+//                channels per pixel
+// Returns 0 (after logging) when a query fails or the object is of any other type.
+size_t get_actual_allocation_size(cl_mem mem) {
+  int error;
+  cl_mem_object_type type;
+  size_t size, width, height;
+
+  error = clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(type), &type, NULL);
+  if (error) {
+    print_error(error, "clGetMemObjectInfo failed for CL_MEM_TYPE.");
+    return 0;
+  }
+
+  if (type == CL_MEM_OBJECT_BUFFER) {
+    error = clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(size), &size, NULL);
+    if (error) {
+      print_error(error, "clGetMemObjectInfo failed for CL_MEM_SIZE.");
+      return 0;
+    }
+    return size;
+  } else if (type == CL_MEM_OBJECT_IMAGE2D) {
+    // The messages below name clGetImageInfo, the call that actually failed.
+    error = clGetImageInfo(mem, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
+    if (error) {
+      print_error(error, "clGetImageInfo failed for CL_IMAGE_WIDTH.");
+      return 0;
+    }
+    error = clGetImageInfo(mem, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+    if (error) {
+      print_error(error, "clGetImageInfo failed for CL_IMAGE_HEIGHT.");
+      return 0;
+    }
+    return width*height*4*sizeof(cl_uint);
+  }
+
+  log_error("Invalid CL_MEM_TYPE: %d\n", type);
+  return 0;
+}
+
+
--- a/test_conformance/allocations/allocation_utils.h
+++ b/test_conformance/allocations/allocation_utils.h
@@ -0,0 +1,24 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+// Global checksum defined in main.cpp, which resets it to 0 before every allocation
+// attempt. NOTE(review): how it is accumulated is not visible from this header.
+extern cl_uint checksum;
+
+// Classifies an OpenCL status code as SUCCEEDED, FAILED_TOO_BIG or FAILED_ABORT; may
+// replace *queue with a new command queue when the status is CL_INVALID_COMMAND_QUEUE.
+int check_allocation_error(cl_context context, cl_device_id device_id, int error, cl_command_queue *queue);
+// Converts a byte count to megabytes (divides by 1024*1024).
+double toMB(cl_ulong size_in);
+// Returns the size in bytes of a buffer or 2D image, or 0 if it cannot be determined.
+size_t get_actual_allocation_size(cl_mem mem);
+
+
--- a/test_conformance/allocations/main.cpp
+++ b/test_conformance/allocations/main.cpp
@@ -0,0 +1,411 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+#include "allocation_functions.h"
+#include "allocation_fill.h"
+#include "allocation_execute.h"
+#include "../../test_common/harness/testHarness.h"
+#include "../../test_common/harness/parseParameters.h"
+#include <time.h>
+
+// Shorthand used to cast values for the %llu printf conversions below.
+typedef long long unsigned llu;
+
+// Device under test, selected in init_cl().
+cl_device_id g_device_id;
+// Device type requested from clGetDeviceIDs; can be overridden on the command line.
+cl_device_type g_device_type = CL_DEVICE_TYPE_DEFAULT;
+// Context and command queue created in init_cl().
+clContextWrapper g_context;
+clCommandQueueWrapper g_queue;
+// Number of times each selected test is repeated ("numReps" argument).
+int g_repetition_count = 1;
+// Bitmask of BUFFER / IMAGE_* values naming the tests to run; 0 means none selected.
+int g_tests_to_run = 0;
+// Percentage applied to the target allocation size ("reduction%" argument).
+int g_reduction_percentage = 100;
+// Non-zero: fill every allocation with data (cleared by "do_not_force_fill").
+int g_write_allocations = 1;
+// Non-zero: spread the target size over several objects ("multiple" argument).
+int g_multiple_allocations = 0;
+// Non-zero: run a kernel over the allocations (cleared by "do_not_execute").
+int g_execute_kernel = 1;
+
+// Reset to 0 in main() before every allocation attempt; declared extern in
+// allocation_utils.h for use by the other source files.
+cl_uint checksum;
+
+// Prints the command line help. execName is typically argv[0]; any leading directory
+// part (up to the last '/') is stripped before it is shown. Every option listed here
+// is one that main() parses.
+void printUsage( const char *execName )
+{
+    const char *p = strrchr( execName, '/' );
+    if( p != NULL )
+        execName = p + 1;
+
+    log_info( "Usage: %s [deviceType] [single|multiple] [randomize] [numReps] [reduction%%] allocType [do_not_force_fill] [do_not_execute]\n", execName );
+    log_info( "Where:\n" );
+    log_info( "\tdeviceType - Optional device type: cpu, gpu, accelerator, or a CL_DEVICE_TYPE_{CPU,GPU,ACCELERATOR,DEFAULT} name\n" );
+    log_info( "\n" );
+    log_info( "\tsingle - Tests using a single allocation as large as possible\n" );
+    log_info( "\tmultiple - Tests using as many allocations as possible\n" );
+    log_info( "\n" );
+    log_info( "\trandomize - Optional, seeds the random number generator from the current time\n" );
+    log_info( "\tnumReps - Optional integer specifying the number of repetitions to run and average the result (defaults to 1)\n" );
+    log_info( "\treduction%% - Optional integer, followed by a %% sign, that acts as a multiplier for the target amount of memory.\n" );
+    log_info( "\t              Example: target amount of 512MB and a reduction of 75%% will result in a target of 384MB.\n" );
+    log_info( "\n" );
+    log_info( "\tallocType - Allocation type to test with. Can be one of the following:\n" );
+    log_info( "\t\tbuffer\n");
+    log_info( "\t\timage2d_read\n");
+    log_info( "\t\timage2d_write\n");
+    log_info( "\t\tbuffer_non_blocking\n");
+    log_info( "\t\timage2d_read_non_blocking\n");
+    log_info( "\t\timage2d_write_non_blocking\n");
+    log_info( "\t\tall (runs all of the above in sequence)\n" );
+    log_info( "\tdo_not_force_fill - Disable explicitly write data to all memory objects after creating them.\n" );
+    log_info( "\t Without this, the kernel execution can not verify its checksum.\n" );
+    log_info( "\tdo_not_execute - Disable executing a kernel that accesses all of the memory objects.\n" );
+}
+
+
+// Sets up the global OpenCL state: takes the first platform, the first device of type
+// g_device_type on it, then creates g_context and g_queue for that device.
+// Returns CL_SUCCESS (0) on success; on failure the test_error macro reports the
+// problem and returns from this function.
+int init_cl() {
+    int status;
+    cl_platform_id first_platform;
+
+    status = clGetPlatformIDs(1, &first_platform, NULL);
+    test_error(status, "clGetPlatformIDs failed");
+
+    status = clGetDeviceIDs(first_platform, g_device_type, 1, &g_device_id, NULL);
+    test_error(status, "clGetDeviceIDs failed");
+
+    // Context for the single selected device.
+    g_context = clCreateContext( NULL, 1, &g_device_id, notify_callback, NULL, &status );
+    test_error(status, "clCreateContext failed");
+
+    // Command queue with default properties.
+    g_queue = clCreateCommandQueueWithProperties( g_context, g_device_id, 0, &status );
+    test_error(status, "clCreateCommandQueue failed");
+
+    return status;
+}
+
+
+// Entry point of the allocations test.
+//
+// Parses the command line (see printUsage), initialises OpenCL, derives a target
+// allocation size from CL_DEVICE_MAX_MEM_ALLOC_SIZE (single mode) or a fraction of
+// CL_DEVICE_GLOBAL_MEM_SIZE (multiple mode), and then runs every selected test
+// g_repetition_count times. Each repetition retries at progressively smaller sizes
+// while the allocation reports FAILED_TOO_BIG, down to 1/8 of the target.
+//
+// Returns the number of recorded failures (0 on success), or -1 when setup fails.
+int main(int argc, const char *argv[])
+{
+    int error;
+    int count;
+    cl_mem mems[MAX_NUMBER_TO_ALLOCATE];
+    cl_ulong max_individual_allocation_size, global_mem_size;
+    char            str[ 128 ],  *endPtr;
+    int r;
+    int number_of_mems_used;
+    int failure_counts = 0;
+    int test, test_to_run = 0;
+    int randomize = 0;
+    size_t final_size, max_size, current_test_size;
+    // Clears the low 20 bits of a size, i.e. rounds it down to a whole number of MB.
+    const size_t whole_mb_mask = ~(size_t)0xFFFFF;
+
+    test_start();
+
+    argc = parseCustomParam(argc, argv);
+    if (argc == -1)
+    {
+        test_finish();
+        return -1;
+    }
+
+    // Parse arguments
+    checkDeviceTypeOverride( &g_device_type );
+    for( int i = 1; i < argc; i++ )
+    {
+        // strncpy does not terminate the destination when the source is too long,
+        // so terminate explicitly.
+        strncpy( str, argv[ i ], sizeof( str ) - 1 );
+        str[ sizeof( str ) - 1 ] = '\0';
+
+        if( strcmp( str, "cpu" ) == 0 || strcmp( str, "CL_DEVICE_TYPE_CPU" ) == 0 )
+            g_device_type = CL_DEVICE_TYPE_CPU;
+        else if( strcmp( str, "gpu" ) == 0 || strcmp( str, "CL_DEVICE_TYPE_GPU" ) == 0 )
+            g_device_type = CL_DEVICE_TYPE_GPU;
+        else if( strcmp( str, "accelerator" ) == 0 || strcmp( str, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 )
+            g_device_type = CL_DEVICE_TYPE_ACCELERATOR;
+        else if( strcmp( str, "CL_DEVICE_TYPE_DEFAULT" ) == 0 )
+            g_device_type = CL_DEVICE_TYPE_DEFAULT;
+
+        else if( strcmp( str, "multiple" ) == 0 )
+            g_multiple_allocations = 1;
+        else if( strcmp( str, "randomize" ) == 0 )
+            randomize = 1;
+        else if( strcmp( str, "single" ) == 0 )
+            g_multiple_allocations = 0;
+
+        // These two flags are checked before the allocation-type branch so that they
+        // are accepted in any position, not only after allocType.
+        else if( strcmp( str, "do_not_force_fill" ) == 0 )
+        {
+            g_write_allocations = 0;
+        }
+
+        else if( strcmp( str, "do_not_execute" ) == 0 )
+        {
+            g_execute_kernel = 0;
+        }
+
+        else if( ( r = (int)strtol( str, &endPtr, 10 ) ) && ( endPtr != str ) && ( *endPtr == 0 ) )
+        {
+            // By spec, that means the entire string was an integer, so take it as a repetition count
+            g_repetition_count = r;
+        }
+
+        else if( strcmp( str, "all" ) == 0 )
+        {
+            g_tests_to_run = BUFFER | IMAGE_READ | IMAGE_WRITE | BUFFER_NON_BLOCKING | IMAGE_READ_NON_BLOCKING | IMAGE_WRITE_NON_BLOCKING;
+        }
+
+        else if( strchr( str, '%' ) != NULL )
+        {
+            // Reduction percentage (let strtol ignore the percentage)
+            g_reduction_percentage = (int)strtol( str, NULL, 10 );
+        }
+
+        else if( g_tests_to_run == 0 )
+        {
+            if( strcmp( str, "buffer" ) == 0 )
+            {
+                g_tests_to_run |= BUFFER;
+            }
+            else if( strcmp( str, "image2d_read" ) == 0 )
+            {
+                g_tests_to_run |= IMAGE_READ;
+            }
+            else if( strcmp( str, "image2d_write" ) == 0 )
+            {
+                g_tests_to_run |= IMAGE_WRITE;
+            }
+            else if( strcmp( str, "buffer_non_blocking" ) == 0 )
+            {
+                g_tests_to_run |= BUFFER_NON_BLOCKING;
+            }
+            else if( strcmp( str, "image2d_read_non_blocking" ) == 0 )
+            {
+                g_tests_to_run |= IMAGE_READ_NON_BLOCKING;
+            }
+            else if( strcmp( str, "image2d_write_non_blocking" ) == 0 )
+            {
+                g_tests_to_run |= IMAGE_WRITE_NON_BLOCKING;
+            }
+            if( g_tests_to_run == 0 )
+                break;    // Argument is invalid; break to print usage
+        }
+
+    }
+
+    if( randomize )
+    {
+        gRandomSeed = (cl_uint) time( NULL );
+        log_info( "Random seed: %u.\n", gRandomSeed );
+        gReSeed = 1;
+    }
+
+    if( g_tests_to_run == 0 )
+    {
+        // Allocation type was never specified, or one of the arguments was invalid. Print usage and bail
+        printUsage( argv[ 0 ] );
+        test_finish();
+        return -1;
+    }
+
+    // All ready to go, so set up an environment
+    error = init_cl();
+    if (error) {
+        test_finish();
+        return -1;
+    }
+
+    if( printDeviceHeader( g_device_id ) != CL_SUCCESS )
+    {
+        test_finish();
+        return -1;
+    }
+
+
+    error = clGetDeviceInfo(g_device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_individual_allocation_size), &max_individual_allocation_size, NULL);
+    if ( error ) {
+        print_error( error, "clGetDeviceInfo failed for CL_DEVICE_MAX_MEM_ALLOC_SIZE");
+        test_finish();
+        return -1;
+    }
+    error = clGetDeviceInfo(g_device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(global_mem_size), &global_mem_size, NULL);
+    if ( error ) {
+        print_error( error, "clGetDeviceInfo failed for CL_DEVICE_GLOBAL_MEM_SIZE");
+        test_finish();
+        return -1;
+    }
+
+    log_info("Device reports CL_DEVICE_MAX_MEM_ALLOC_SIZE=%llu bytes (%gMB), CL_DEVICE_GLOBAL_MEM_SIZE=%llu bytes (%gMB).\n",
+             llu( max_individual_allocation_size ), toMB(max_individual_allocation_size),
+             llu( global_mem_size ), toMB(global_mem_size));
+
+    // Sizes are handled as size_t from here on, so cap at what a size_t can express.
+    if (global_mem_size > (cl_ulong)SIZE_MAX) {
+      global_mem_size = (cl_ulong)SIZE_MAX;
+    }
+
+    if( max_individual_allocation_size > global_mem_size )
+    {
+        log_error( "FAILURE:  CL_DEVICE_MAX_MEM_ALLOC_SIZE (%llu) is greater than the CL_DEVICE_GLOBAL_MEM_SIZE (%llu)\n", llu( max_individual_allocation_size ), llu( global_mem_size ) );
+        test_finish();
+        return -1;
+    }
+
+    // We may need to back off the global_mem_size on unified memory devices to leave room for application and operating system code
+    // and associated data in the working set, so we dont start pathologically paging.
+    // Check to see if we are a unified memory device
+    cl_bool hasUnifiedMemory = CL_FALSE;
+    if( ( error = clGetDeviceInfo( g_device_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof( hasUnifiedMemory ), &hasUnifiedMemory, NULL )))
+    {
+        print_error( error, "clGetDeviceInfo failed for CL_DEVICE_HOST_UNIFIED_MEMORY");
+        test_finish();
+        return -1;
+    }
+    // we share unified memory so back off to 1/2 the global memory size.
+    if( CL_TRUE == hasUnifiedMemory )
+    {
+        global_mem_size -= global_mem_size /2;
+        log_info( "Device shares memory with the host, so backing off the maximum combined allocation size to be %gMB to avoid rampant paging.\n", toMB( global_mem_size ) );
+    }
+    else
+    {
+        // Lets just use 60% of total available memory as framework/driver may not allow using all of it
+        // e.g. vram on GPU is used by window server and even for this test, we need some space for context,
+        // queue, kernel code on GPU.
+        global_mem_size = (cl_ulong)( (double)global_mem_size * 0.60 );
+    }
+
+    // Pick the baseline size based on whether we are doing a single large or multiple allocations
+    if (!g_multiple_allocations) {
+        max_size = (size_t)max_individual_allocation_size;
+    } else {
+        max_size = (size_t)global_mem_size;
+    }
+
+
+    // Adjust based on the percentage
+    if (g_reduction_percentage != 100) {
+        log_info("NOTE: reducing max allocations to %d%%.\n", g_reduction_percentage);
+        max_size = (size_t)((double)max_size * (double)g_reduction_percentage/100.0);
+    }
+
+    // Round down to a whole number of MB.
+    max_size &= whole_mb_mask;
+
+    log_info("** Target allocation size (rounded to nearest MB) is: %llu bytes (%gMB).\n", llu( max_size ), toMB(max_size));
+
+    // Run all the requested tests
+    RandomSeed seed( gRandomSeed );
+    for (test=0; test<6; test++) {
+        if (test == 0) test_to_run = BUFFER;
+        if (test == 1) test_to_run = IMAGE_READ;
+        if (test == 2) test_to_run = IMAGE_WRITE;
+        if (test == 3) test_to_run = BUFFER_NON_BLOCKING;
+        if (test == 4) test_to_run = IMAGE_READ_NON_BLOCKING;
+        if (test == 5) test_to_run = IMAGE_WRITE_NON_BLOCKING;
+        if (!(g_tests_to_run & test_to_run))
+            continue;
+
+        // Skip image tests if we don't support images on the device
+        if (test > 0 && checkForImageSupport(g_device_id)) {
+            log_info("Can not test image allocation because device does not support images.\n");
+            continue;
+        }
+
+        // Largest single allocation for THIS test. Kept in a per-test variable so that
+        // the image limit applied below does not leak into a buffer test run later.
+        cl_ulong test_max_allocation_size = max_individual_allocation_size;
+
+        // If CL_DEVICE_MAX_MEM_ALLOC_SIZE is much greater than CL_DEVICE_IMAGE2D_MAX_WIDTH * CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+        // image allocations would be requested at a size no image can have, so limit
+        // image tests to the largest possible 2D image.
+        if ( (test_to_run != BUFFER) && (test_to_run != BUFFER_NON_BLOCKING) ) {
+          size_t max_width, max_height;
+          cl_ulong max_image2d_size;
+          error = clGetDeviceInfo(g_device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( max_width ), &max_width, NULL );
+          if ( error ) {
+            print_error( error, "clGetDeviceInfo failed for CL_DEVICE_IMAGE2D_MAX_WIDTH");
+            test_finish();
+            return -1;
+          }
+          error = clGetDeviceInfo(g_device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( max_height ), &max_height, NULL );
+          if ( error ) {
+            print_error( error, "clGetDeviceInfo failed for CL_DEVICE_IMAGE2D_MAX_HEIGHT");
+            test_finish();
+            return -1;
+          }
+          max_image2d_size = (cl_ulong)max_height*max_width*4*sizeof(cl_uint);
+
+          if (test_max_allocation_size > max_image2d_size)
+          {
+            test_max_allocation_size = max_image2d_size;
+          }
+        }
+
+        // Pick the baseline size based on whether we are doing a single large or multiple allocations
+        if (!g_multiple_allocations) {
+            max_size = (size_t)test_max_allocation_size;
+        } else {
+            max_size = (size_t)global_mem_size;
+        }
+
+        // Adjust based on the percentage
+        if (g_reduction_percentage != 100) {
+            log_info("NOTE: reducing max allocations to %d%%.\n", g_reduction_percentage);
+            max_size = (size_t)((double)max_size * (double)g_reduction_percentage/100.0);
+        }
+
+        // Round down to a whole number of MB.
+        max_size &= whole_mb_mask;
+
+        log_info("** Target allocation size (rounded to nearest MB) is: %llu bytes (%gMB).\n", llu( max_size ), toMB(max_size));
+
+        if (test_to_run == BUFFER || test_to_run == BUFFER_NON_BLOCKING) log_info("** Allocating buffer(s) to size %gMB.\n", toMB(max_size));
+        else if (test_to_run == IMAGE_READ || test_to_run == IMAGE_READ_NON_BLOCKING) log_info("** Allocating read-only image(s) to size %gMB.\n", toMB(max_size));
+        else if (test_to_run == IMAGE_WRITE || test_to_run == IMAGE_WRITE_NON_BLOCKING) log_info("** Allocating write-only image(s) to size %gMB.\n", toMB(max_size));
+        else {log_error("Test logic error.\n"); test_finish(); return -1;}
+
+        // Run the test the requested number of times
+        for (count = 0; count < g_repetition_count; count++) {
+            current_test_size = max_size;
+            error = FAILED_TOO_BIG;
+            log_info("  => Allocation %d\n", count+1);
+
+            while (error == FAILED_TOO_BIG && current_test_size > max_size/8) {
+                // Reset our checksum for each allocation
+                checksum = 0;
+
+                // Start from known values so that a failure inside allocate_size()
+                // before anything was allocated is seen as "0 bytes, 0 objects"
+                // rather than as stale or uninitialised data.
+                final_size = 0;
+                number_of_mems_used = 0;
+
+                // Do the allocation
+                error = allocate_size(g_context, &g_queue, g_device_id, g_multiple_allocations, current_test_size, test_to_run, mems, &number_of_mems_used, &final_size, g_write_allocations, seed);
+
+                // If we succeeded and we're supposed to execute a kernel, do so.
+                if (error == SUCCEEDED && g_execute_kernel) {
+                    log_info("\tExecuting kernel with memory objects.\n");
+                    error = execute_kernel(g_context, &g_queue, g_device_id, test_to_run, mems, number_of_mems_used, g_write_allocations);
+                }
+
+                // If we failed to allocate more than 1/8th of the requested amount return a failure.
+                if (final_size < (size_t)max_size/8) {
+                    failure_counts++;
+                }
+                // Clean up.
+                for (int i=0; i<number_of_mems_used; i++)
+                    clReleaseMemObject(mems[i]);
+
+                if (error == FAILED_ABORT) {
+                    log_error("  => Allocation %d failed.\n", count+1);
+                    failure_counts++;
+                }
+
+                if (error == FAILED_TOO_BIG) {
+                    // Retry with 1/16 of the target less.
+                    current_test_size -= max_size/16;
+                }
+            }
+            if (error == SUCCEEDED && current_test_size == max_size)
+                log_info("\tPASS: Allocation succeeded.\n");
+            else if (error == SUCCEEDED && current_test_size > max_size/8)
+                log_info("\tPASS: Allocation succeeded at reduced size.\n");
+            else {
+                log_error("\tFAIL: Allocation failed.\n");
+                failure_counts++;
+            }
+        }
+    }
+
+    if (failure_counts)
+        log_error("FAILED allocations test.\n");
+    else
+        log_info("PASSED allocations test.\n");
+
+    test_finish();
+    return failure_counts;
+}
+
+
--- a/test_conformance/allocations/testBase.h
+++ b/test_conformance/allocations/testBase.h
@@ -0,0 +1,62 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef _testBase_h
+#define _testBase_h
+
+#include "../../test_common/harness/compat.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+#include "../../test_common/harness/errorHelpers.h"
+#include "../../test_common/harness/kernelHelpers.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/testHarness.h"
+
+
+// Cap on the number of memory objects used for one allocation attempt
+// (presumably the size of the mems[] array in the tests -- confirm in main.c).
+#define MAX_NUMBER_TO_ALLOCATE 100
+
+// Status codes shared by the allocation helpers and the test driver. The driver
+// treats SUCCEEDED as a pass, retries with a smaller size on FAILED_TOO_BIG and
+// counts FAILED_ABORT as a failure.
+// NOTE(review): FAILED_CORRUPTED_QUEUE presumably reports an unusable command queue -- confirm.
+#define FAILED_CORRUPTED_QUEUE -2
+#define FAILED_ABORT -1
+#define FAILED_TOO_BIG 1
+// On Windows macro `SUCCEEDED' is defined in `WinError.h'. It causes compiler warnings. Let us avoid them.
+#if defined( _WIN32 ) && defined( SUCCEEDED )
+    #undef SUCCEEDED
+#endif
+#define SUCCEEDED 0
+
+// Identifiers for the kind of memory object / transfer mode a test exercises.
+// The values are distinct powers of two (presumably so they can be combined as
+// a bit mask -- confirm against the callers).
+#define BUFFER 1
+#define IMAGE_READ 2
+#define IMAGE_WRITE 4
+#define BUFFER_NON_BLOCKING 8
+#define IMAGE_READ_NON_BLOCKING 16
+#define IMAGE_WRITE_NON_BLOCKING 32
+
+#define test_error_abort(errCode,msg)    test_error_ret_abort(errCode,msg,errCode)
+#define test_error_ret_abort(errCode,msg,retValue)    { if( errCode != CL_SUCCESS ) { print_error( errCode, msg ); return FAILED_ABORT ; } }
+
+
+#endif // _testBase_h
+
+
+
--- a/test_conformance/api/CMakeLists.txt
+++ b/test_conformance/api/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Build description for the "api" conformance test executable.
+# MODULE_NAME is consumed by ../CMakeCommon.txt, which lower-cases it to form
+# the executable name and reads the source list from ${MODULE_NAME}_SOURCES.
+set(MODULE_NAME API)
+
+# Test translation units first, followed by the shared harness sources that are
+# compiled directly into this executable.
+set(${MODULE_NAME}_SOURCES
+         main.c
+         test_bool.c
+         test_retain.cpp
+         test_retain_program.c
+         test_queries.cpp
+         test_create_kernels.c
+         test_kernels.c
+         test_api_min_max.c
+         test_kernel_arg_changes.cpp
+         test_kernel_arg_multi_setup.cpp
+         test_binary.cpp
+         test_native_kernel.cpp
+         test_mem_objects.cpp
+         test_create_context_from_type.cpp
+         test_device_min_data_type_align_size_alignment.cpp
+         test_platform.cpp
+         test_kernel_arg_info.c
+         test_null_buffer_arg.c
+         test_mem_object_info.cpp
+         test_queue_hint.cpp
+         test_sub_group_dispatch.cpp
+         test_clone_kernel.cpp
+         test_zero_sized_enqueue.cpp
+         ../../test_common/harness/errorHelpers.c
+         ../../test_common/harness/threadTesting.c
+         ../../test_common/harness/testHarness.c
+         ../../test_common/harness/kernelHelpers.c
+         ../../test_common/harness/typeWrappers.cpp
+         ../../test_common/harness/conversions.c
+         ../../test_common/harness/mt19937.c
+         ../../test_common/harness/msvc9.c
+         ../../test_common/harness/imageHelpers.cpp
+         ../../test_common/harness/parseParameters.cpp
+)
+
+# Shared rules: creates the executable target, marks all sources as C++ and
+# links ${CLConform_LIBRARIES}.
+include(../CMakeCommon.txt)
--- a/test_conformance/api/Jamfile
+++ b/test_conformance/api/Jamfile
@@ -0,0 +1,27 @@
+# Project-wide requirements: compile every source as C++, including the *.c
+# files (-xc++ for gcc, /TP for msvc).
+project
+    : requirements
+      <toolset>gcc:<cflags>-xc++
+      <toolset>msvc:<cflags>"/TP"
+    ;
+
+
+# Builds the test_api executable. Every test registered in main.c must have its
+# translation unit listed here, otherwise the link fails with undefined
+# references; keep this list in sync with CMakeLists.txt.
+exe test_api
+    : main.c
+      test_api_min_max.c
+      test_binary.cpp
+      test_bool.c
+      test_clone_kernel.cpp
+      test_create_kernels.c
+      test_create_context_from_type.cpp
+      test_device_min_data_type_align_size_alignment.cpp
+      test_kernel_arg_changes.cpp
+      test_kernel_arg_info.c
+      test_kernel_arg_multi_setup.cpp
+      test_kernels.c
+      test_mem_object_info.cpp
+      test_mem_objects.cpp
+      test_native_kernel.cpp
+      test_null_buffer_arg.c
+      test_platform.cpp
+      test_queries.cpp
+      test_queue_hint.cpp
+      test_retain.cpp
+      test_retain_program.c
+      test_sub_group_dispatch.cpp
+      test_zero_sized_enqueue.cpp
+    ;
+
+# Copies the built test_api executable into the per-variant distribution tree
+# under $(DIST).
+install dist
+    : test_api #test.lst
+    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance/api
+      <variant>release:<location>$(DIST)/release/tests/test_conformance/api
+    ;
--- a/test_conformance/api/Makefile
+++ b/test_conformance/api/Makefile
@@ -0,0 +1,61 @@
+# Optional ATF support: when BUILD_WITH_ATF is defined, link the ATF framework
+# and pass -DUSE_ATF to the compiler.
+ifdef BUILD_WITH_ATF
+ATF = -framework ATF
+USE_ATF = -DUSE_ATF
+endif
+
+# Test sources followed by the shared harness sources. Every test registered in
+# main.c must have its translation unit listed here, otherwise the link step
+# fails with undefined references.
+SRCS = main.c \
+	test_retain_program.c \
+	test_queries.cpp \
+	test_create_kernels.c \
+	test_kernels.c \
+	test_kernel_arg_info.c \
+	test_api_min_max.c \
+	test_kernel_arg_changes.cpp \
+	test_kernel_arg_multi_setup.cpp \
+	test_binary.cpp \
+	test_native_kernel.cpp \
+	test_create_context_from_type.cpp \
+	test_platform.cpp \
+	test_retain.cpp \
+	test_device_min_data_type_align_size_alignment.cpp \
+	test_mem_objects.cpp \
+	test_bool.c \
+	test_null_buffer_arg.c \
+	test_mem_object_info.cpp \
+	test_queue_hint.cpp \
+	test_sub_group_dispatch.cpp \
+	test_clone_kernel.cpp \
+	test_zero_sized_enqueue.cpp \
+	../../test_common/harness/errorHelpers.c \
+	../../test_common/harness/threadTesting.c \
+	../../test_common/harness/testHarness.c \
+	../../test_common/harness/imageHelpers.cpp \
+	../../test_common/harness/kernelHelpers.c \
+	../../test_common/harness/typeWrappers.cpp \
+	../../test_common/harness/mt19937.c \
+	../../test_common/harness/conversions.c
+
+# Each entry is turned into a -D<name> compiler flag below.
+DEFINES = DONT_TEST_GARBAGE_POINTERS
+
+SOURCES = $(abspath $(SRCS))
+LIBPATH += -L/System/Library/Frameworks/OpenCL.framework/Libraries
+LIBPATH += -L.
+HEADERS = 
+TARGET = test_api
+INCLUDE = 
+COMPILERFLAGS = -c -Wall -g -Wshorten-64-to-32
+# The C++ driver is used for both .c and .cpp sources.
+CC = c++
+CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+CXXFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+LIBRARIES = -framework OpenCL -framework OpenGL -framework GLUT -framework AppKit ${ATF}
+
+# Map every source (.c, then .cpp) to its object file.
+OBJECTS := ${SOURCES:.c=.o}
+OBJECTS := ${OBJECTS:.cpp=.o}
+
+TARGETOBJECT =
+
+# 'all' and 'clean' are commands, not files; declaring them phony keeps them
+# working even if a file with the same name exists.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(OBJECTS)
+	$(CC) $(RC_CFLAGS) $(OBJECTS) -o $@ $(LIBPATH) $(LIBRARIES)
+
+clean:
+	rm -f $(TARGET) $(OBJECTS)
+
+# Fallback for any target that has no rule.
+.DEFAULT:
+	@echo The target \"$@\" does not exist in Makefile.
--- a/test_conformance/api/main.c
+++ b/test_conformance/api/main.c
@@ -0,0 +1,223 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string.h>
+#include "procs.h"
+#include "../../test_common/harness/testHarness.h"
+
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+// FIXME: To use certain functions in ../../test_common/harness/imageHelpers.h
+// (for example, generate_random_image_data()), the tests are required to declare
+// the following variables (<rdar://problem/11111245>):
+// They are defined here once for the whole test executable.
+cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
+bool gTestRounding = false;
+
+// Table of test entry points handed to runTestHarness().
+// Entry i here corresponds to entry i of basefn_names below, so the two tables
+// must be kept in the same order; the ct_assert after them only checks that
+// the entry counts match, not the ordering.
+basefn    basefn_list[] = {
+    test_get_platform_info,
+    test_get_sampler_info,
+    test_get_command_queue_info,
+    test_get_context_info,
+    test_get_device_info,
+    test_enqueue_task,
+    test_binary_get,
+    test_program_binary_create,
+    test_kernel_required_group_size,
+
+    test_release_kernel_order,
+    test_release_during_execute,
+
+    test_load_single_kernel,
+    test_load_two_kernels,
+    test_load_two_kernels_in_one,
+    test_load_two_kernels_manually,
+    test_get_program_info_kernel_names,
+    test_get_kernel_arg_info,
+    test_create_kernels_in_program,
+    test_get_kernel_info,
+    test_execute_kernel_local_sizes,
+    test_set_kernel_arg_by_index,
+    test_set_kernel_arg_constant,
+    test_set_kernel_arg_struct_array,
+    test_kernel_global_constant,
+
+    test_min_max_thread_dimensions,
+    test_min_max_work_items_sizes,
+    test_min_max_work_group_size,
+    test_min_max_read_image_args,
+    test_min_max_write_image_args,
+    test_min_max_mem_alloc_size,
+    test_min_max_image_2d_width,
+    test_min_max_image_2d_height,
+    test_min_max_image_3d_width,
+    test_min_max_image_3d_height,
+    test_min_max_image_3d_depth,
+    test_min_max_image_array_size,
+    test_min_max_image_buffer_size,
+    test_min_max_parameter_size,
+    test_min_max_samplers,
+    test_min_max_constant_buffer_size,
+    test_min_max_constant_args,
+    test_min_max_compute_units,
+    test_min_max_address_bits,
+    test_min_max_single_fp_config,
+    test_min_max_double_fp_config,
+    test_min_max_local_mem_size,
+    test_min_max_kernel_preferred_work_group_size_multiple,
+    test_min_max_execution_capabilities,
+    test_min_max_queue_properties,
+    test_min_max_device_version,
+    test_min_max_language_version,
+
+    test_kernel_arg_changes,
+    test_kernel_arg_multi_setup_random,
+
+    test_native_kernel,
+
+    test_create_context_from_type,
+
+    test_platform_extensions,
+    test_get_platform_ids,
+    test_for_bool_type,
+
+    test_repeated_setup_cleanup,
+
+    test_retain_queue_single,
+    test_retain_queue_multiple,
+    test_retain_mem_object_single,
+    test_retain_mem_object_multiple,
+    test_min_data_type_align_size_alignment,
+
+    test_mem_object_destructor_callback,
+    test_null_buffer_arg,
+    test_get_buffer_info,
+    test_get_image2d_info,
+    test_get_image3d_info,
+    test_get_image1d_info,
+    test_get_image1d_array_info,
+    test_get_image2d_array_info,
+    test_queue_hint,
+    test_sub_group_dispatch,
+    test_clone_kernel,
+    test_zero_sized_enqueue
+};
+
+
+// Names of the tests, passed to runTestHarness() alongside basefn_list.
+// Entry i names the function at basefn_list[i]; note that a few names differ
+// from the function name minus its "test_" prefix (e.g. "binary_create" for
+// test_program_binary_create, "bool_type" for test_for_bool_type).
+const char    *basefn_names[] = {
+    "get_platform_info",
+    "get_sampler_info",
+    "get_command_queue_info",
+    "get_context_info",
+    "get_device_info",
+    "enqueue_task",
+    "binary_get",
+    "binary_create",
+    "kernel_required_group_size",
+
+    "release_kernel_order",
+    "release_during_execute",
+
+    "load_single_kernel",
+    "load_two_kernels",
+    "load_two_kernels_in_one",
+    "load_two_kernels_manually",
+    "get_program_info_kernel_names",
+    "get_kernel_arg_info",
+    "create_kernels_in_program",
+    "get_kernel_info",
+    "execute_kernel_local_sizes",
+    "set_kernel_arg_by_index",
+    "set_kernel_arg_constant",
+    "set_kernel_arg_struct_array",
+    "kernel_global_constant",
+
+    "min_max_thread_dimensions",
+    "min_max_work_items_sizes",
+    "min_max_work_group_size",
+    "min_max_read_image_args",
+    "min_max_write_image_args",
+    "min_max_mem_alloc_size",
+    "min_max_image_2d_width",
+    "min_max_image_2d_height",
+    "min_max_image_3d_width",
+    "min_max_image_3d_height",
+    "min_max_image_3d_depth",
+    "min_max_image_array_size",
+    "min_max_image_buffer_size",
+    "min_max_parameter_size",
+    "min_max_samplers",
+    "min_max_constant_buffer_size",
+    "min_max_constant_args",
+    "min_max_compute_units",
+    "min_max_address_bits",
+    "min_max_single_fp_config",
+    "min_max_double_fp_config",
+    "min_max_local_mem_size",
+    "min_max_kernel_preferred_work_group_size_multiple",
+    "min_max_execution_capabilities",
+    "min_max_queue_properties",
+    "min_max_device_version",
+    "min_max_language_version",
+
+    "kernel_arg_changes",
+    "kernel_arg_multi_setup_random",
+
+    "native_kernel",
+
+    "create_context_from_type",
+    "platform_extensions",
+
+    "get_platform_ids",
+    "bool_type",
+
+    "repeated_setup_cleanup",
+
+    "retain_queue_single",
+    "retain_queue_multiple",
+    "retain_mem_object_single",
+    "retain_mem_object_multiple",
+
+    "min_data_type_align_size_alignment",
+
+    "mem_object_destructor_callback",
+    "null_buffer_arg",
+    "get_buffer_info",
+    "get_image2d_info",
+    "get_image3d_info",
+    "get_image1d_info",
+    "get_image1d_array_info",
+    "get_image2d_array_info",
+    "queue_hint",
+    "sub_group_dispatch",
+    "clone_kernel",
+    "zero_sized_enqueue",
+};
+
+// Compile-time guard: the function table and the name table must hold the same
+// number of entries.
+ct_assert((sizeof(basefn_list) / sizeof(basefn_list[0])) == (sizeof(basefn_names) / sizeof(basefn_names[0])));
+
+// Number of registered tests, derived from the name table.
+int    num_fns = sizeof(basefn_names) / sizeof(basefn_names[0]);
+
+// Program entry point: forwards the command line and both test tables to the
+// shared harness and returns its result as the process exit status.
+int main(int argc, const char *argv[])
+{
+    const int harnessResult = runTestHarness( argc, argv, num_fns, basefn_list, basefn_names, false, false, 0 );
+    return harnessResult;
+}
+
+
--- a/test_conformance/api/procs.h
+++ b/test_conformance/api/procs.h
@@ -0,0 +1,111 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/errorHelpers.h"
+#include "../../test_common/harness/kernelHelpers.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/clImageHelper.h"
+#include "../../test_common/harness/imageHelpers.h"
+extern float    calculate_ulperror(float a, float b);
+
+extern int        test_load_single_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_load_two_kernels(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_load_two_kernels_in_one(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_load_two_kernels_manually(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_get_program_info_kernel_names( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_create_kernels_in_program(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_enqueue_task(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_repeated_setup_cleanup(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_for_bool_type(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_platform_extensions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_get_platform_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_get_sampler_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_get_command_queue_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_get_context_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_get_device_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_kernel_required_group_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_binary_get(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_program_binary_create(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_release_kernel_order(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_release_during_execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_get_kernel_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_execute_kernel_local_sizes(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_set_kernel_arg_by_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_set_kernel_arg_struct(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_set_kernel_arg_constant(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_set_kernel_arg_struct_array(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_kernel_global_constant(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_work_items_sizes(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_work_group_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_image_3d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_min_max_image_array_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_samplers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_constant_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_compute_units(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_address_bits(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_single_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_min_max_double_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_kernel_preferred_work_group_size_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_execution_capabilities(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_queue_properties(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_device_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_min_max_language_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_native_kernel(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+
+extern int      test_create_context_from_type(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_get_platform_ids(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_kernel_arg_changes(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_kernel_arg_multi_setup_random(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_retain_queue_single(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_retain_queue_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_retain_mem_object_single(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_retain_mem_object_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_min_data_type_align_size_alignment(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+
+extern int        test_mem_object_destructor_callback(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_null_buffer_arg( cl_device_id device_id, cl_context context, cl_command_queue queue, int num_elements );
+extern int      test_get_buffer_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
+extern int      test_get_image2d_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
+extern int      test_get_image3d_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
+extern int      test_get_image1d_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
+extern int      test_get_image1d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
+extern int      test_get_image2d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
+extern int      test_get_kernel_arg_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
+extern int      test_queue_hint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_sub_group_dispatch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_clone_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_zero_sized_enqueue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
--- a/test_conformance/api/testBase.h
+++ b/test_conformance/api/testBase.h
@@ -0,0 +1,31 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef _testBase_h
+#define _testBase_h
+
+#include "../../test_common/harness/compat.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "procs.h"
+
+#endif // _testBase_h
+
+
+
--- a/test_conformance/api/test_api_min_max.c
+++ b/test_conformance/api/test_api_min_max.c
--- a/test_conformance/api/test_binary.cpp
+++ b/test_conformance/api/test_binary.cpp
@@ -0,0 +1,218 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+// OpenCL C source shared by both binary tests: each work-item converts
+// src[tid] to int and stores that value plus one in dst[tid].
+static const char *sample_binary_kernel_source[] = {
+"__kernel void sample_test(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid] + 1;\n"
+"\n"
+"}\n" };
+
+
+// Builds the sample program from source and checks that its device binary can
+// be queried: CL_PROGRAM_BINARY_SIZES must report a non-zero size,
+// CL_PROGRAM_BINARIES must report an array of exactly one pointer, and the
+// binary itself must be retrievable. The contents of the binary are not
+// inspected.
+// Returns 0 on success, -1 or an OpenCL error code on failure.
+// deviceID, queue and num_elements are unused.
+int test_binary_get(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // Owns a malloc'ed host allocation and frees it on every exit path,
+    // including the early returns hidden inside test_error().
+    struct HostBuffer
+    {
+        void *ptr;
+        explicit HostBuffer( size_t bytes ) : ptr( malloc( bytes ? bytes : 1 ) ) {}
+        ~HostBuffer() { free( ptr ); }
+    };
+
+    int error;
+    clProgramWrapper program;
+    size_t            binarySize;
+
+
+    error = create_single_kernel_helper(context, &program, NULL, 1, sample_binary_kernel_source, NULL);
+    test_error( error, "Unable to build test program" );
+
+    // Get the size of the resulting binary (only one device)
+    error = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+    test_error( error, "Unable to get binary size" );
+
+    // Sanity check
+    if( binarySize == 0 )
+    {
+        log_error( "ERROR: Binary size of program is zero\n" );
+        return -1;
+    }
+
+    // Create a buffer to receive the actual binary
+    HostBuffer binary( sizeof(unsigned char)*binarySize );
+    if( binary.ptr == NULL )
+    {
+        log_error( "ERROR: Unable to allocate %lu bytes for the program binary\n", (unsigned long)binarySize );
+        return -1;
+    }
+    unsigned char *buffers[ 1 ] = { (unsigned char*)binary.ptr };
+
+    // Do another sanity check here first: the query must report one pointer per device
+    size_t size;
+    error = clGetProgramInfo( program, CL_PROGRAM_BINARIES, 0, NULL, &size );
+    test_error( error, "Unable to get expected size of binaries array" );
+    if( size != sizeof( buffers ) )
+    {
+        log_error( "ERROR: Expected size of binaries array in clGetProgramInfo is incorrect (should be %d, got %d)\n", (int)sizeof( buffers ), (int)size );
+        return -1;
+    }
+
+    error = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
+    test_error( error, "Unable to get program binary" );
+
+    // No way to verify the binary is correct, so just be good with that
+    return 0;
+}
+
+
+// Round-trips a program through its device binary:
+//   1. build the sample program from source and fetch its binary,
+//   2. create and build a second program from that binary (with binary_status),
+//   3. create and build a third program from the second program's binary,
+//      this time passing NULL for binary_status, and fetch its binary too,
+//   4. run the source-built kernel and the binary-built kernel on the same
+//      input and require bit-identical output.
+// Returns 0 on success, -1 or an OpenCL error code on failure.
+// num_elements is unused.
+int test_program_binary_create(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    /* To test this in a self-contained fashion, we have to create a program with
+   source, then get the binary, then use that binary to reload the program, and then verify */
+
+    // Owns a malloc'ed host allocation and frees it on every exit path,
+    // including the early returns hidden inside test_error().
+    struct HostBuffer
+    {
+        void *ptr;
+        explicit HostBuffer( size_t bytes ) : ptr( malloc( bytes ? bytes : 1 ) ) {}
+        ~HostBuffer() { free( ptr ); }
+    };
+
+    int error;
+    clProgramWrapper program, program_from_binary;
+    size_t            binarySize;
+
+
+    error = create_single_kernel_helper(context, &program, NULL, 1, sample_binary_kernel_source, NULL);
+    test_error( error, "Unable to build test program" );
+
+    // Get the size of the resulting binary (only one device)
+    error = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+    test_error( error, "Unable to get binary size" );
+
+    // Sanity check
+    if( binarySize == 0 )
+    {
+        log_error( "ERROR: Binary size of program is zero\n" );
+        return -1;
+    }
+
+    // Create a buffer and get the actual binary
+    HostBuffer binary( binarySize );
+    if( binary.ptr == NULL )
+    {
+        log_error( "ERROR: Unable to allocate %lu bytes for the program binary\n", (unsigned long)binarySize );
+        return -1;
+    }
+    const unsigned char *buffers[ 1 ] = { (const unsigned char*)binary.ptr };
+
+    error = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
+    test_error( error, "Unable to get program binary" );
+
+    cl_int loadErrors[ 1 ];
+    program_from_binary = clCreateProgramWithBinary( context, 1, &deviceID, &binarySize, buffers, loadErrors, &error );
+    test_error( error, "Unable to load valid program binary" );
+    test_error( loadErrors[ 0 ], "Unable to load valid device binary into program" );
+
+    error = clBuildProgram( program_from_binary, 1, &deviceID, NULL, NULL, NULL );
+    test_error( error, "Unable to build binary program" );
+
+    // Get the size of the binary built from the first binary
+    size_t binary2Size;
+    error = clGetProgramInfo( program_from_binary, CL_PROGRAM_BINARY_SIZES, sizeof( binary2Size ), &binary2Size, NULL );
+    test_error( error, "Unable to get size for the binary program" );
+
+    // Now get the binary one more time; it is only used as input for the next
+    // program, its bytes are not compared against the first binary
+    HostBuffer binary2( binary2Size );
+    if( binary2.ptr == NULL )
+    {
+        log_error( "ERROR: Unable to allocate %lu bytes for the second program binary\n", (unsigned long)binary2Size );
+        return -1;
+    }
+    buffers[ 0 ] = (const unsigned char*)binary2.ptr;
+    error = clGetProgramInfo( program_from_binary, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
+    test_error( error, "Unable to get program binary second time" );
+
+    // Try again, this time without passing the status ptr in, to make sure we still
+    // get a valid binary
+    clProgramWrapper programWithoutStatus = clCreateProgramWithBinary( context, 1, &deviceID, &binary2Size, buffers, NULL, &error );
+    test_error( error, "Unable to load valid program binary when binary_status pointer is NULL" );
+
+    error = clBuildProgram( programWithoutStatus, 1, &deviceID, NULL, NULL, NULL );
+    test_error( error, "Unable to build binary program created without binary_status" );
+
+    // Get the size of the binary created without passing binary_status
+    size_t binary3Size;
+    error = clGetProgramInfo( programWithoutStatus, CL_PROGRAM_BINARY_SIZES, sizeof( binary3Size ), &binary3Size, NULL );
+    test_error( error, "Unable to get size for the binary program created without binary_status" );
+
+    // Now get the binary one more time
+    HostBuffer binary3( binary3Size );
+    if( binary3.ptr == NULL )
+    {
+        log_error( "ERROR: Unable to allocate %lu bytes for the third program binary\n", (unsigned long)binary3Size );
+        return -1;
+    }
+    buffers[ 0 ] = (const unsigned char*)binary3.ptr;
+    error = clGetProgramInfo( programWithoutStatus, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
+    test_error( error, "Unable to get program binary from the program created without binary_status" );
+
+    // Now execute the source-built and the binary-built program to see that
+    // they both do the same thing.
+    clMemWrapper in, out, out_binary;
+    clKernelWrapper kernel, kernel_binary;
+    cl_int *out_data, *out_data_binary;
+    cl_float *in_data;
+    size_t size_to_run = 1000;
+
+    // Allocate some data
+    HostBuffer inStorage( sizeof(cl_float)*size_to_run );
+    HostBuffer outStorage( sizeof(cl_int)*size_to_run );
+    HostBuffer outBinaryStorage( sizeof(cl_int)*size_to_run );
+    if( inStorage.ptr == NULL || outStorage.ptr == NULL || outBinaryStorage.ptr == NULL )
+    {
+        log_error( "ERROR: Unable to allocate host buffers for kernel execution\n" );
+        return -1;
+    }
+    in_data = (cl_float*)inStorage.ptr;
+    out_data = (cl_int*)outStorage.ptr;
+    out_data_binary = (cl_int*)outBinaryStorage.ptr;
+    memset(out_data, 0, sizeof(cl_int)*size_to_run);
+    memset(out_data_binary, 0, sizeof(cl_int)*size_to_run);
+    for (size_t i=0; i<size_to_run; i++)
+        in_data[i] = (cl_float)i;
+
+    // Create the buffers
+    in = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_float)*size_to_run, in_data, &error);
+    test_error( error, "clCreateBuffer failed");
+    out = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_int)*size_to_run, out_data, &error);
+    test_error( error, "clCreateBuffer failed");
+    out_binary = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_int)*size_to_run, out_data_binary, &error);
+    test_error( error, "clCreateBuffer failed");
+
+    // Create the kernels
+    kernel = clCreateKernel(program, "sample_test", &error);
+    test_error( error, "clCreateKernel failed");
+    kernel_binary = clCreateKernel(program_from_binary, "sample_test", &error);
+    test_error( error, "clCreateKernel from binary failed");
+
+    // Set the arguments
+    error = clSetKernelArg(kernel, 0, sizeof(in), &in);
+    test_error( error, "clSetKernelArg failed");
+    error = clSetKernelArg(kernel, 1, sizeof(out), &out);
+    test_error( error, "clSetKernelArg failed");
+    error = clSetKernelArg(kernel_binary, 0, sizeof(in), &in);
+    test_error( error, "clSetKernelArg failed");
+    error = clSetKernelArg(kernel_binary, 1, sizeof(out_binary), &out_binary);
+    test_error( error, "clSetKernelArg failed");
+
+    // Execute the kernels
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &size_to_run, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed");
+    error = clEnqueueNDRangeKernel(queue, kernel_binary, 1, NULL, &size_to_run, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel for binary kernel failed");
+
+    // Finish up
+    error = clFinish(queue);
+    test_error( error, "clFinish failed");
+
+    // Get the results back
+    error = clEnqueueReadBuffer(queue, out, CL_TRUE, 0, sizeof(cl_int)*size_to_run, out_data, 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed");
+    error = clEnqueueReadBuffer(queue, out_binary, CL_TRUE, 0, sizeof(cl_int)*size_to_run, out_data_binary, 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed");
+
+    // Compare the results
+    if( memcmp( out_data, out_data_binary, sizeof(cl_int)*size_to_run ) != 0 )
+    {
+        log_error( "ERROR: Results from executing binary and regular kernel differ.\n" );
+        return -1;
+    }
+
+    // All done! The HostBuffer destructors release every host allocation.
+    return 0;
+}
+
+
--- a/test_conformance/api/test_bool.c
+++ b/test_conformance/api/test_bool.c
@@ -0,0 +1,52 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/testHarness.h"
+
+
+// OpenCL C source (one concatenated string) for a kernel that declares a local
+// `bool`, assigns it the result of a logical AND of two float comparisons and
+// branches on it. test_for_bool_type only builds this source.
+const char *kernel_with_bool[] = {
+    "__kernel void kernel_with_bool(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    bool myBool = (src[tid] < 0.5f) && (src[tid] > -0.5f);\n"
+    "    if(myBool)\n"
+    "    {\n"
+    "        dst[tid] = (int)src[tid];\n"
+    "    }\n"
+    "    else\n"
+    "    {\n"
+    "        dst[tid] = 0;\n"
+    "    }\n"
+    "\n"
+    "}\n"
+};
+
+// Checks that a kernel using the `bool` type can be built and a kernel object
+// created for it. The kernel is never enqueued.
+// Returns the result of create_single_kernel_helper unchanged.
+int test_for_bool_type(cl_device_id deviceID, cl_context context,
+               cl_command_queue queue, int num_elements)
+{
+    // Start from NULL so that only handles the helper actually produced are
+    // released below.
+    cl_program program = NULL;
+    cl_kernel kernel = NULL;
+
+    int err = create_single_kernel_helper(context,
+                      &program,
+                      &kernel,
+                      1, kernel_with_bool,
+                      "kernel_with_bool" );
+
+    // The handles were previously leaked; drop our references before returning.
+    if (kernel != NULL)
+        clReleaseKernel(kernel);
+    if (program != NULL)
+        clReleaseProgram(program);
+
+    return err;
+}
+
--- a/test_conformance/api/test_clone_kernel.cpp
+++ b/test_conformance/api/test_clone_kernel.cpp
@@ -0,0 +1,411 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/conversions.h"
+#include <cmath>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+// OpenCL C source containing two kernels used by test_image_arg_shallow_clone:
+//   img_read_kernel  - reads pixel (0,0) through the sampler and stores its
+//                      x/y/z/w components in outbuf[7..10].
+//   img_write_kernel - writes `color` to pixel (0,0) of the image.
+const char *clone_kernel_test_img[] =
+{
+    "__kernel void img_read_kernel(read_only image2d_t img, sampler_t sampler, __global int* outbuf)\n"
+    "{\n"
+    "    uint4 color;\n"
+    "\n"
+    "    color = read_imageui(img, sampler, (int2)(0,0));\n"
+    "    \n"
+    "    // 7, 8, 9, 10th DWORD\n"
+    "    outbuf[7] = color.x;\n"
+    "    outbuf[8] = color.y;\n"
+    "    outbuf[9] = color.z;\n"
+    "    outbuf[10] = color.w;\n"
+    "}\n"
+    "\n"
+    "__kernel void img_write_kernel(write_only image2d_t img, uint4 color)\n"
+    "{\n"
+    "    write_imageui (img, (int2)(0, 0), color);\n"
+    "}\n"
+
+};
+
+// OpenCL C source for the double-precision test: enables cl_khr_fp64 and
+// stores the by-value double argument in element 2 of outbuf (viewed as
+// double*). Used by test_double_arg_clone.
+const char *clone_kernel_test_double[] =
+{
+    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+    "__kernel void clone_kernel_test1(double d, __global double* outbuf)\n"
+    "{\n"
+    "    // use the same outbuf as rest of the tests\n"
+    "    outbuf[2] = d;\n"
+    "}\n"
+};
+
+// OpenCL C source containing the three kernels used by test_clone_kernel:
+//   clone_kernel_test0 - copies its int, float and struct arguments into
+//                        outbuf[0..3] (elements 2 and 3 are written as float).
+//   buf_read_kernel    - copies buf[0] into outbuf[6].
+//   buf_write_kernel   - stores write_val in buf[0].
+// The structArg typedef here must stay in sync with the host-side structArg.
+const char *clone_kernel_test_kernel[] = {
+"typedef struct\n"
+"{\n"
+"    int i;\n"
+"    float f;\n"
+"} structArg;\n"
+"\n"
+"// value type test\n"
+"__kernel void clone_kernel_test0(int iarg, float farg, structArg sarg, __local int* localbuf, __global int* outbuf)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    outbuf[0] = iarg;\n"
+"    outbuf[1] = sarg.i;\n"
+"    \n"
+"    ((__global float*)outbuf)[2] = farg;\n"
+"    ((__global float*)outbuf)[3] = sarg.f;\n"
+"}\n"
+"\n"
+"__kernel void buf_read_kernel(__global int* buf, __global int* outbuf)\n"
+"{\n"
+"    // 6th DWORD\n"
+"    outbuf[6] = buf[0];\n"
+"}\n"
+"\n"
+"__kernel void buf_write_kernel(__global int* buf, int write_val)\n"
+"{\n"
+"    buf[0] = write_val;\n"
+"}\n"
+
+ };
+
+// Size in bytes of the device buffers and host scratch buffers used by
+// test_clone_kernel and its helpers.
+const int BUF_SIZE = 128;
+
+// Host-side mirror of the `structArg` typedef declared in
+// clone_kernel_test_kernel; passed by value as kernel argument 2.
+struct structArg
+{
+    int i;
+    float f;
+};
+
+// Allocates a buffer of w*h*4 bytes (four 8-bit channels per pixel) and fills
+// each byte with successive genrand_int32(d) values truncated to
+// unsigned char.
+// Returns a malloc'd buffer that the caller must free(), or NULL when a
+// dimension is negative or the allocation fails.
+static unsigned char *
+generate_8888_image(int w, int h, MTdata d)
+{
+    if (w < 0 || h < 0)
+        return NULL;
+
+    // Do the size arithmetic in size_t so large dimensions cannot overflow int.
+    const size_t num_bytes = (size_t)w * (size_t)h * 4;
+
+    unsigned char *ptr = (unsigned char*)malloc(num_bytes);
+    if (ptr == NULL)
+        return NULL;
+
+    for (size_t i = 0; i < num_bytes; i++)
+        ptr[i] = (unsigned char)genrand_int32( d);
+
+    return ptr;
+}
+
+// Verifies that clCloneKernel carries image and sampler arguments over to the
+// clone.
+//
+// Steps: write a known colour to pixel (0,0) of a 512x512
+// CL_RGBA / CL_UNSIGNED_INT8 image with img_write_kernel; set image, sampler
+// and bufOut on img_read_kernel; clone that kernel; run the clone; read bufOut
+// back and compare DWORDs 7..10 against the colour.
+//
+//   pbufRes - host memory of at least BUF_SIZE bytes that receives bufOut.
+//   bufOut  - device buffer of at least BUF_SIZE bytes.
+//
+// Returns 0 on success and non-zero on any failure.
+int test_image_arg_shallow_clone(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, void* pbufRes, clMemWrapper& bufOut)
+{
+    int error;
+
+    cl_image_format img_format;
+    img_format.image_channel_order = CL_RGBA;
+    img_format.image_channel_data_type = CL_UNSIGNED_INT8;
+
+    cl_image_desc imageDesc;
+    memset(&imageDesc, 0x0, sizeof(cl_image_desc));
+    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
+    imageDesc.image_width = 512;
+    imageDesc.image_height = 512;
+
+    const cl_uint color[4] = {1,3,5,7};
+    const size_t ndrange1 = 1;
+
+    clProgramWrapper program;
+    clKernelWrapper kernel_read;
+    clKernelWrapper kernel_write;
+    clKernelWrapper kernel_cloned;
+    clSamplerWrapper sampler;
+    clMemWrapper img;
+
+    if( create_single_kernel_helper( context, &program, &kernel_read, 1, clone_kernel_test_img, "img_read_kernel" ) != 0 )
+    {
+        return -1;
+    }
+
+    // Both kernels come from the same source, so take the second one from the
+    // program that was just built instead of building again into `program`.
+    // NOTE(review): rebuilding through &program looked like it overwrote the
+    // first program handle without releasing it - confirm against
+    // clProgramWrapper::operator&.
+    kernel_write = clCreateKernel(program, "img_write_kernel", &error);
+    test_error( error, "clCreateKernel failed." );
+
+    img = clCreateImage(context, CL_MEM_READ_WRITE, &img_format, &imageDesc, NULL, &error);
+    test_error( error, "clCreateImage failed." );
+
+    cl_sampler_properties properties[] = {
+        CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
+        CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_CLAMP_TO_EDGE,
+        CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
+        0 };
+    sampler = clCreateSamplerWithProperties(context, properties, &error);
+    test_error( error, "clCreateSamplerWithProperties failed." );
+
+    // Check every clSetKernelArg on its own so the reported code is the real
+    // one rather than a sum of several codes.
+    error = clSetKernelArg(kernel_write, 0, sizeof(cl_mem), &img);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel_write, 1, sizeof(color), color);
+    test_error( error, "clSetKernelArg failed." );
+
+    error = clEnqueueNDRangeKernel(queue, kernel_write, 1, NULL, &ndrange1, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+
+    error = clSetKernelArg(kernel_read, 0, sizeof(cl_mem), &img);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel_read, 1, sizeof(cl_sampler), &sampler);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel_read, 2, sizeof(cl_mem), &bufOut);
+    test_error( error, "clSetKernelArg failed." );
+
+    // clone the kernel; only the clone is enqueued, so the results below
+    // depend on the arguments having been copied.
+    kernel_cloned = clCloneKernel(kernel_read, &error);
+    test_error( error, "clCloneKernel failed." );
+    error = clEnqueueNDRangeKernel(queue, kernel_cloned, 1, NULL, &ndrange1, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+
+    // read result back
+    error = clEnqueueReadBuffer(queue, bufOut, CL_TRUE, 0, BUF_SIZE, pbufRes, 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed." );
+
+    // `error` is CL_SUCCESS here, so a mismatch has to be reported with
+    // log_error; test_error would stay silent.
+    const cl_uint *result = (const cl_uint*)pbufRes;
+    for (int i = 0; i < 4; i++)
+    {
+        if (result[7 + i] != color[i])
+        {
+            log_error( "clCloneKernel test failed. Image argument not cloned: outbuf[%d] is %u, expected %u.\n",
+                       7 + i, result[7 + i], color[i] );
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+// Verifies that clCloneKernel carries a by-value double argument over to the
+// clone. Only called when the device reports cl_khr_fp64.
+//
+//   pbufRes - host memory of at least BUF_SIZE bytes that receives bufOut.
+//   bufOut  - device buffer of at least BUF_SIZE bytes; the kernel writes
+//             element 2 when viewed as double*.
+//
+// Returns 0 on success and non-zero on any failure.
+int test_double_arg_clone(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, void* pbufRes, clMemWrapper& bufOut)
+{
+    int error = 0;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clKernelWrapper kernel_cloned;
+    const size_t ndrange1 = 1;
+
+    if( create_single_kernel_helper( context, &program, &kernel, 1, clone_kernel_test_double, "clone_kernel_test1" ) != 0 )
+    {
+        return -1;
+    }
+
+    cl_double d = 1.23;
+    error = clSetKernelArg(kernel, 0, sizeof(cl_double), &d);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufOut);
+    test_error( error, "clSetKernelArg failed." );
+
+    kernel_cloned = clCloneKernel(kernel, &error);
+    test_error( error, "clCloneKernel failed." );
+
+    error = clEnqueueNDRangeKernel(queue, kernel_cloned, 1, NULL, &ndrange1, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+
+    // read result back
+    error = clEnqueueReadBuffer(queue, bufOut, CL_TRUE, 0, BUF_SIZE, pbufRes, 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed." );
+
+    // fabs is used explicitly so the comparison cannot resolve to the integer
+    // abs() overload. `error` is CL_SUCCESS here, so log_error is needed to
+    // actually report the mismatch.
+    const cl_double result = ((cl_double*)pbufRes)[2];
+    if (fabs(result - d) > 0.0000001)
+    {
+        log_error( "clCloneKernel test failed. Failed to clone double type argument (got %f, expected %f).\n", result, d );
+        return -1;
+    }
+
+    return 0;
+}
+
+// Conformance test for clCloneKernel.
+//
+//  1. Value arguments: sets the int, float, struct and __local arguments on
+//     clone_kernel_test0, clones it, sets the output buffer on the clone only
+//     and runs the clone.
+//  2. cl_mem argument: writes write_val into `buf`, sets buf/bufOut on
+//     buf_read_kernel, clones it and runs the clone.
+//  3. Reads bufOut back and checks DWORDs 0..3 and 6.
+//  4. Runs the image sub-test when CL_DEVICE_IMAGE_SUPPORT is true and the
+//     double sub-test when the extension string contains cl_khr_fp64.
+//
+// Returns 0 on success and non-zero on any failure.
+int test_clone_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    const size_t ndrange1 = 1;
+    int write_val = 123;
+
+    // Host storage is declared before every OpenCL wrapper so that it is
+    // destroyed after them: `buf` is created with CL_MEM_USE_HOST_PTR on pbuf
+    // and must not outlive that memory. Using containers also means the many
+    // early returns below no longer leak.
+    std::vector<char> pbuf(BUF_SIZE, 0);
+    std::vector<char> pbufRes(BUF_SIZE, 0);
+
+    // test image support
+    cl_bool bimg = CL_FALSE;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &bimg, NULL);
+    test_error( error, "clGetDeviceInfo failed." );
+
+    // test double support: look for cl_khr_fp64 as a whole token in the
+    // space-separated extension string.
+    size_t ext_str_size = 0;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+    test_error( error, "clGetDeviceInfo failed." );
+
+    // One extra byte guarantees NUL termination whatever the query returns.
+    std::string ext_str(ext_str_size + 1, '\0');
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_EXTENSIONS, ext_str_size, &ext_str[0], NULL);
+    test_error( error, "clGetDeviceInfo failed." );
+
+    cl_bool bdouble = CL_FALSE;
+    std::istringstream ss(ext_str.c_str());
+    std::string token;
+    while (ss >> token)
+    {
+        if (token == "cl_khr_fp64")
+        {
+            bdouble = CL_TRUE;
+            break;
+        }
+    }
+
+    /* Create kernels to test with */
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clKernelWrapper kernel_buf_read;
+    clKernelWrapper kernel_buf_write;
+    clKernelWrapper kernel_buf_read_cloned;
+
+    if( create_single_kernel_helper( context, &program, &kernel, 1, clone_kernel_test_kernel, "clone_kernel_test0" ) != 0 )
+    {
+        return -1;
+    }
+
+    // All three kernels are in the same source; build once and create the
+    // remaining kernels from that program instead of rebuilding into the same
+    // wrapper.
+    kernel_buf_read = clCreateKernel(program, "buf_read_kernel", &error);
+    test_error( error, "clCreateKernel failed." );
+
+    kernel_buf_write = clCreateKernel(program, "buf_write_kernel", &error);
+    test_error( error, "clCreateKernel failed." );
+
+    // Kernel args
+    // Value type
+    int intarg = 0;
+    float farg = 1.0;
+    structArg sa = { 1, 1.0f };
+
+    // cl_mem
+    clMemWrapper buf, bufOut;
+
+    buf = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, BUF_SIZE, &pbuf[0], &error);
+    test_error( error, "clCreateBuffer failed." );
+
+    bufOut = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUF_SIZE, NULL, &error);
+    test_error( error, "clCreateBuffer failed." );
+
+    // The pipe is only created, never bound to a kernel in this test.
+    clMemWrapper pipe = clCreatePipe(context, CL_MEM_HOST_NO_ACCESS, sizeof(int), 16, NULL, &error);
+    test_error( error, "clCreatePipe failed." );
+
+    error = clSetKernelArg(kernel, 0, sizeof(int), &intarg);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel, 1, sizeof(float), &farg);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel, 2, sizeof(structArg), &sa);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel, 3, 128, NULL);    // local mem
+    test_error( error, "clSetKernelArg failed." );
+
+    // clone the kernel
+    clKernelWrapper clonek = clCloneKernel(kernel, &error);
+    test_error( error, "clCloneKernel failed." );
+
+    // set the last arg on the clone only and enqueue it
+    error = clSetKernelArg(clonek, 4, sizeof(cl_mem), &bufOut);
+    test_error( error, "clSetKernelArg failed." );
+    error = clEnqueueNDRangeKernel(queue, clonek, 1, NULL, &ndrange1, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+
+    // shallow clone test for a buffer argument
+    error = clSetKernelArg(kernel_buf_write, 0, sizeof(cl_mem), &buf);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel_buf_write, 1, sizeof(int), &write_val);
+    test_error( error, "clSetKernelArg failed." );
+    error = clEnqueueNDRangeKernel(queue, kernel_buf_write, 1, NULL, &ndrange1, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+
+    error = clSetKernelArg(kernel_buf_read, 0, sizeof(cl_mem), &buf);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel_buf_read, 1, sizeof(cl_mem), &bufOut);
+    test_error( error, "clSetKernelArg failed." );
+
+    // clone the kernel
+    kernel_buf_read_cloned = clCloneKernel(kernel_buf_read, &error);
+    test_error( error, "clCloneKernel API call failed." );
+    error = clEnqueueNDRangeKernel(queue, kernel_buf_read_cloned, 1, NULL, &ndrange1, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+
+    // read result back
+    error = clEnqueueReadBuffer(queue, bufOut, CL_TRUE, 0, BUF_SIZE, &pbufRes[0], 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed." );
+
+    // Compare the results. `error` is CL_SUCCESS at this point, so mismatches
+    // are reported with log_error; test_error would print nothing.
+    const int *int_res = (const int*)&pbufRes[0];
+    const float *float_res = (const float*)&pbufRes[0];
+
+    if (int_res[0] != intarg)
+    {
+        log_error( "clCloneKernel test failed. Failed to clone integer type argument.\n" );
+        return -1;
+    }
+
+    if (int_res[1] != sa.i)
+    {
+        log_error( "clCloneKernel test failed. Failed to clone structure type argument.\n" );
+        return -1;
+    }
+
+    if (float_res[2] != farg)
+    {
+        log_error( "clCloneKernel test failed. Failed to clone float type argument.\n" );
+        return -1;
+    }
+
+    if (float_res[3] != sa.f)
+    {
+        log_error( "clCloneKernel test failed. Failed to clone structure type argument.\n" );
+        return -1;
+    }
+
+    if (int_res[6] != write_val)
+    {
+        log_error( "clCloneKernel test failed.  Failed to clone cl_mem argument.\n" );
+        return -1;
+    }
+
+    // The sub-tests may return -1, which is not an OpenCL status; report
+    // their failure with a plain message instead of decoding it as one.
+    if (bimg)
+    {
+        error = test_image_arg_shallow_clone(deviceID, context, queue, num_elements, &pbufRes[0], bufOut);
+        if (error != 0)
+        {
+            log_error( "image arg shallow clone test failed.\n" );
+            return error;
+        }
+    }
+
+    if (bdouble)
+    {
+        error = test_double_arg_clone(deviceID, context, queue, num_elements, &pbufRes[0], bufOut);
+        if (error != 0)
+        {
+            log_error( "double arg clone test failed.\n" );
+            return error;
+        }
+    }
+
+    return 0;
+}
+
--- a/test_conformance/api/test_create_context_from_type.cpp
+++ b/test_conformance/api/test_create_context_from_type.cpp
@@ -0,0 +1,130 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/testHarness.h"
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include "../../test_common/harness/conversions.h"
+
+extern cl_uint gRandomSeed;
+
+// Creates a context with clCreateContextFromType for the platform and device
+// type of `deviceID`, creates a command queue on it, and runs a float->int
+// conversion kernel over 10 elements to prove the context is usable.
+// The `context` and `queue` parameters are not used; the test works on the
+// objects it creates itself.
+// Returns 0 on success and non-zero on any failure.
+int test_create_context_from_type(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper    streams[2];
+    clContextWrapper context_to_test;
+    clCommandQueueWrapper queue_to_test;
+    size_t    threads[1], localThreads[1];
+    cl_float inputData[10];
+    cl_int outputData[10];
+    int i;
+    RandomSeed seed( gRandomSeed );
+
+    const char *sample_single_test_kernel[] = {
+    "__kernel void sample_test(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = (int)src[tid];\n"
+    "\n"
+    "}\n" };
+
+    cl_device_type type;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_TYPE failed\n");
+
+    cl_platform_id platform;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed\n");
+
+    // cl_context_properties is an integer type and the list is terminated by
+    // the integer 0; NULL is a pointer constant and need not convert.
+    cl_context_properties properties[3] = {
+      (cl_context_properties)CL_CONTEXT_PLATFORM,
+      (cl_context_properties)platform,
+      0
+    };
+
+    context_to_test = clCreateContextFromType(properties, type, notify_callback, NULL, &error);
+    test_error(error, "clCreateContextFromType failed");
+    if (context_to_test == NULL) {
+        log_error("clCreateContextFromType returned NULL, but error was CL_SUCCESS.\n");
+        return -1;
+    }
+
+    queue_to_test = clCreateCommandQueueWithProperties(context_to_test, deviceID, NULL, &error);
+    test_error(error, "clCreateCommandQueueWithProperties failed");
+    if (queue_to_test == NULL) {
+        log_error("clCreateCommandQueueWithProperties returned NULL, but error was CL_SUCCESS.\n");
+        return -1;
+    }
+
+    /* Create a kernel to test with */
+    if( create_single_kernel_helper( context_to_test, &program, &kernel, 1, sample_single_test_kernel, "sample_test" ) != 0 )
+    {
+        return -1;
+    }
+
+    /* Create some I/O streams */
+    streams[0] = clCreateBuffer(context_to_test, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_float) * 10, NULL, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context_to_test, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * 10, NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    /* Write some test data */
+    memset( outputData, 0, sizeof( outputData ) );
+
+    for (i=0; i<10; i++)
+        inputData[i] = get_random_float(-(float) 0x7fffffff, (float) 0x7fffffff, seed);
+
+    error = clEnqueueWriteBuffer(queue_to_test, streams[0], CL_TRUE, 0, sizeof(cl_float)*10, (void *)inputData, 0, NULL, NULL);
+    test_error( error, "Unable to set testing kernel data" );
+
+    /* Test setting the arguments by index manually (deliberately out of order) */
+    error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+
+
+    /* Test running the kernel and verifying it */
+    threads[0] = (size_t)10;
+
+    error = get_max_common_work_group_size( context_to_test, kernel, threads[0], &localThreads[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = clEnqueueNDRangeKernel( queue_to_test, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Kernel execution failed" );
+
+    error = clEnqueueReadBuffer( queue_to_test, streams[1], CL_TRUE, 0, sizeof(cl_int)*10, (void *)outputData, 0, NULL, NULL );
+    test_error( error, "Unable to get result data" );
+
+    /* The device result must match the host float->int conversion. */
+    for (i=0; i<10; i++)
+    {
+        if (outputData[i] != (int)inputData[i])
+        {
+            log_error( "ERROR: Data did not verify on first pass!\n" );
+            return -1;
+        }
+    }
+
+  return 0;
+}
+
+
--- a/test_conformance/api/test_create_kernels.c
+++ b/test_conformance/api/test_create_kernels.c
@@ -0,0 +1,595 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/testHarness.h"
+
+
+// One source string containing the single kernel `sample_test`
+// (float -> int conversion, two arguments).
+const char *sample_single_kernel[] = {
+    "__kernel void sample_test(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = (int)src[tid];\n"
+    "\n"
+    "}\n" };
+
+// NOTE(review): not referenced by the tests visible in this part of the file.
+size_t sample_single_kernel_lengths[1];
+
+// Two separate source strings (note the comma between them): `sample_test`
+// in the first and `sample_test2` in the second.
+const char *sample_two_kernels[] = {
+    "__kernel void sample_test(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = (int)src[tid];\n"
+    "\n"
+    "}\n",
+    "__kernel void sample_test2(__global int *src, __global float *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = (float)src[tid];\n"
+    "\n"
+    "}\n" };
+
+// NOTE(review): not referenced by the tests visible in this part of the file.
+size_t sample_two_kernel_lengths[2];
+
+// The same two kernels as sample_two_kernels, but concatenated into a single
+// source string (no comma between the literals).
+const char *sample_two_kernels_in_1[] = {
+    "__kernel void sample_test(__global float *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = (int)src[tid];\n"
+    "\n"
+    "}\n"
+    "__kernel void sample_test2(__global int *src, __global float *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    dst[tid] = (float)src[tid];\n"
+    "\n"
+    "}\n" };
+
+// NOTE(review): not referenced by the tests visible in this part of the file.
+size_t sample_two_kernels_in_1_lengths[1];
+
+
+// Kernel that copies src to dst adding 1 to each element.
+// NOTE(review): its user is outside the part of the file reviewed here.
+const char *repeate_test_kernel =
+"__kernel void test_kernel(__global int *src, __global int *dst)\n"
+"{\n"
+" dst[get_global_id(0)] = src[get_global_id(0)]+1;\n"
+"}\n";
+
+
+
+// Builds a program containing one kernel, creates it through
+// clCreateKernelsInProgram and validates clGetKernelInfo for
+// CL_KERNEL_PROGRAM, CL_KERNEL_CONTEXT, CL_KERNEL_NUM_ARGS and
+// CL_KERNEL_FUNCTION_NAME (values and returned sizes).
+// Returns 0 on success and non-zero on any failure.
+int test_load_single_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    clProgramWrapper program;
+    cl_program testProgram;
+    clKernelWrapper kernel;
+    cl_context testContext;
+    unsigned int numKernels;
+    cl_char testName[512];
+    cl_uint testArgCount;
+    size_t realSize;
+
+
+    error = create_single_kernel_helper(context, &program, NULL, 1, sample_single_kernel, NULL);
+    test_error( error, "Unable to build test program" );
+
+    error = clCreateKernelsInProgram(program, 1, &kernel, &numKernels);
+    test_error( error, "Unable to create single kernel program" );
+
+    /* The source defines exactly one kernel (same check the two-kernel tests do) */
+    if( numKernels != 1 )
+    {
+        log_error( "ERROR: wrong # of kernels! (%d)\n", numKernels );
+        return -1;
+    }
+
+    /* Check program and context pointers */
+    error = clGetKernelInfo( kernel, CL_KERNEL_PROGRAM, sizeof( cl_program ), &testProgram, &realSize );
+    test_error( error, "Unable to get kernel's program" );
+    if( (cl_program)testProgram != (cl_program)program )
+    {
+        log_error( "ERROR: Returned kernel's program does not match program used to create it! (Got %p, expected %p)\n", (cl_program)testProgram, (cl_program)program );
+        return -1;
+    }
+    if( realSize != sizeof( cl_program ) )
+    {
+        log_error( "ERROR: Returned size of kernel's program does not match expected size (expected %d, got %d)\n", (int)sizeof( cl_program ), (int)realSize );
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_CONTEXT, sizeof( cl_context ), &testContext, &realSize );
+    test_error( error, "Unable to get kernel's context" );
+    if( (cl_context)testContext != (cl_context)context )
+    {
+        log_error( "ERROR: Returned kernel's context does not match context used to create it! (Got %p, expected %p)\n", (cl_context)testContext, (cl_context)context );
+        return -1;
+    }
+    if( realSize != sizeof( cl_context ) )
+    {
+        log_error( "ERROR: Returned size of kernel's context does not match expected size (expected %d, got %d)\n", (int)sizeof( cl_context ), (int)realSize );
+        return -1;
+    }
+
+    /* Test arg count: first the size-only query, then the value */
+    error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, 0, NULL, &realSize );
+    test_error( error, "Unable to get size of arg count info from kernel" );
+
+    if( realSize != sizeof( testArgCount ) )
+    {
+        log_error( "ERROR: size of arg count not valid! %d\n", (int)realSize );
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, sizeof( testArgCount ), &testArgCount, NULL );
+    test_error( error, "Unable to get arg count from kernel" );
+
+    if( testArgCount != 2 )
+    {
+        log_error( "ERROR: Kernel arg count does not match!\n" );
+        return -1;
+    }
+
+
+    /* Test function name; the returned size must include the terminating NUL */
+    error = clGetKernelInfo( kernel, CL_KERNEL_FUNCTION_NAME, sizeof( testName ), testName, &realSize );
+    test_error( error, "Unable to get name from kernel" );
+
+    if( strcmp( (char *)testName, "sample_test" ) != 0 )
+    {
+        log_error( "ERROR: Kernel names do not match!\n" );
+        return -1;
+    }
+    if( realSize != strlen( (char *)testName ) + 1 )
+    {
+        log_error( "ERROR: Length of kernel name returned does not validate (expected %d, got %d)\n", (int)strlen( (char *)testName ) + 1, (int)realSize );
+        return -1;
+    }
+
+    /* All done */
+
+    return 0;
+}
+
+// Builds a program from two separate source strings, creates both kernels via
+// clCreateKernelsInProgram and checks that the two returned kernels are
+// exactly "sample_test" and "sample_test2" (in either order) and that the
+// first returned kernel reports two arguments.
+// Returns 0 on success and non-zero on any failure.
+int test_load_two_kernels(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel[2];
+    unsigned int numKernels;
+    cl_char testName[ 512 ];
+    cl_uint testArgCount;
+
+
+    error = create_single_kernel_helper(context, &program, NULL, 2, sample_two_kernels, NULL);
+    test_error( error, "Unable to build test program" );
+
+    error = clCreateKernelsInProgram(program, 2, &kernel[0], &numKernels);
+    test_error( error, "Unable to create dual kernel program" );
+
+    if( numKernels != 2 )
+    {
+        log_error( "ERROR: wrong # of kernels! (%d)\n", numKernels );
+        return -1;
+    }
+
+    /* Check first kernel */
+    error = clGetKernelInfo( kernel[0], CL_KERNEL_FUNCTION_NAME, sizeof( testName ), testName, NULL );
+    test_error( error, "Unable to get function name from kernel" );
+
+    int found_kernel1 = 0, found_kernel2 = 0;
+
+    if( strcmp( (char *)testName, "sample_test" ) == 0 ) {
+        found_kernel1 = 1;
+    } else if( strcmp( (char *)testName, "sample_test2" ) == 0 ) {
+        found_kernel2 = 1;
+    } else {
+        log_error( "ERROR: Invalid kernel name returned: \"%s\" expected \"%s\" or \"%s\".\n", (char *)testName, "sample_test", "sample_test2");
+        return -1;
+    }
+
+    /* Check second kernel: it must be the one not seen yet */
+    error = clGetKernelInfo( kernel[1], CL_KERNEL_FUNCTION_NAME, sizeof( testName ), testName, NULL );
+    test_error( error, "Unable to get function name from second kernel" );
+
+    if( strcmp( (char *)testName, "sample_test" ) == 0 ) {
+        if (found_kernel1) {
+            log_error("Kernel \"%s\" returned twice.\n", (char *)testName);
+            return -1;
+        }
+        found_kernel1 = 1;
+    } else if( strcmp( (char *)testName, "sample_test2" ) == 0 ) {
+        if (found_kernel2) {
+            log_error("Kernel \"%s\" returned twice.\n", (char *)testName);
+            return -1;
+        }
+        found_kernel2 = 1;
+    } else {
+        log_error( "ERROR: Invalid kernel name returned: \"%s\" expected \"%s\" or \"%s\".\n", (char *)testName, "sample_test", "sample_test2");
+        return -1;
+    }
+
+    if( !found_kernel1 || !found_kernel2 )
+    {
+        log_error( "ERROR: Kernel names do not match.\n" );
+        if (!found_kernel1)
+            log_error("Kernel \"%s\" not returned.\n", "sample_test");
+        /* Previously reported "sample_test" here as well (copy-paste slip). */
+        if (!found_kernel2)
+            log_error("Kernel \"%s\" not returned.\n", "sample_test2");
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel[0], CL_KERNEL_NUM_ARGS, sizeof( testArgCount ), &testArgCount, NULL );
+    test_error( error, "Unable to get arg count from kernel" );
+
+    if( testArgCount != 2 )
+    {
+        log_error( "ERROR: wrong # of args for kernel\n" );
+        return -1;
+    }
+
+    /* All done */
+    return 0;
+}
+
+// Same as test_load_two_kernels, but both kernels are defined in a single
+// source string. Checks that clCreateKernelsInProgram returns exactly
+// "sample_test" and "sample_test2" (in either order) and that the first
+// returned kernel reports two arguments.
+// Returns 0 on success and non-zero on any failure.
+int test_load_two_kernels_in_one(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel[2];
+    unsigned int numKernels;
+    cl_char testName[512];
+    cl_uint testArgCount;
+
+
+    error = create_single_kernel_helper(context, &program, NULL, 1, sample_two_kernels_in_1, NULL);
+    test_error( error, "Unable to build test program" );
+
+    error = clCreateKernelsInProgram(program, 2, &kernel[0], &numKernels);
+    test_error( error, "Unable to create dual kernel program" );
+
+    if( numKernels != 2 )
+    {
+        log_error( "ERROR: wrong # of kernels! (%d)\n", numKernels );
+        return -1;
+    }
+
+    /* Check first kernel */
+    error = clGetKernelInfo( kernel[0], CL_KERNEL_FUNCTION_NAME, sizeof( testName ), testName, NULL );
+    test_error( error, "Unable to get function name from kernel" );
+
+    int found_kernel1 = 0, found_kernel2 = 0;
+
+    if( strcmp( (char *)testName, "sample_test" ) == 0 ) {
+        found_kernel1 = 1;
+    } else if( strcmp( (char *)testName, "sample_test2" ) == 0 ) {
+        found_kernel2 = 1;
+    } else {
+        log_error( "ERROR: Invalid kernel name returned: \"%s\" expected \"%s\" or \"%s\".\n", (char *)testName, "sample_test", "sample_test2");
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel[0], CL_KERNEL_NUM_ARGS, sizeof( testArgCount ), &testArgCount, NULL );
+    test_error( error, "Unable to get arg count from kernel" );
+
+    if( testArgCount != 2 )
+    {
+        log_error( "ERROR: wrong # of args for kernel\n" );
+        return -1;
+    }
+
+    /* Check second kernel: it must be the one not seen yet */
+    error = clGetKernelInfo( kernel[1], CL_KERNEL_FUNCTION_NAME, sizeof( testName ), testName, NULL );
+    test_error( error, "Unable to get function name from kernel" );
+
+    if( strcmp( (char *)testName, "sample_test" ) == 0 ) {
+        if (found_kernel1) {
+            log_error("Kernel \"%s\" returned twice.\n", (char *)testName);
+            return -1;
+        }
+        found_kernel1 = 1;
+    } else if( strcmp( (char *)testName, "sample_test2" ) == 0 ) {
+        if (found_kernel2) {
+            log_error("Kernel \"%s\" returned twice.\n", (char *)testName);
+            return -1;
+        }
+        found_kernel2 = 1;
+    } else {
+        log_error( "ERROR: Invalid kernel name returned: \"%s\" expected \"%s\" or \"%s\".\n", (char *)testName, "sample_test", "sample_test2");
+        return -1;
+    }
+
+    if( !found_kernel1 || !found_kernel2 )
+    {
+        log_error( "ERROR: Kernel names do not match.\n" );
+        if (!found_kernel1)
+            log_error("Kernel \"%s\" not returned.\n", "sample_test");
+        /* Previously reported "sample_test" here as well (copy-paste slip). */
+        if (!found_kernel2)
+            log_error("Kernel \"%s\" not returned.\n", "sample_test2");
+        return -1;
+    }
+
+    /* All done */
+    return 0;
+}
+
+// Builds the two-kernels-in-one-source program and creates each kernel by
+// name with clCreateKernel, in the reverse of their definition order.
+// Returns 0 when both kernels are created, non-zero otherwise.
+int test_load_two_kernels_manually( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel1, kernel2;
+    int error;
+
+
+    /* Now create a test program */
+    error = create_single_kernel_helper(context, &program, NULL, 1, sample_two_kernels_in_1, NULL);
+    test_error( error, "Unable to build test program" );
+
+    /* Try manually creating kernels (backwards just in case) */
+    kernel1 = clCreateKernel( program, "sample_test2", &error );
+
+    if( kernel1 == NULL || error != CL_SUCCESS )
+    {
+        print_error( error, "Could not get kernel 1" );
+        return -1;
+    }
+
+    kernel2 = clCreateKernel( program, "sample_test", &error );
+
+    /* Check the error code as well as the handle, as is done for kernel 1 */
+    if( kernel2 == NULL || error != CL_SUCCESS )
+    {
+        print_error( error, "Could not get kernel 2" );
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Builds the program from sample_two_kernels_in_1 and checks what
+ * clGetProgramInfo reports about it:
+ *   - CL_PROGRAM_NUM_KERNELS must be 2;
+ *   - CL_PROGRAM_KERNEL_NAMES must be a null-terminated string equal to
+ *     "sample_test;sample_test2" or "sample_test2;sample_test".
+ * Afterwards both kernels are created by name with clCreateKernel.
+ *
+ * Only `context` is used; the remaining parameters belong to the common test
+ * signature and are ignored.
+ *
+ * Returns 0 on success, non-zero on any failure.
+ */
+int test_get_program_info_kernel_names( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel1, kernel2;
+    int error;
+    size_t i;
+
+    /* Now create a test program */
+    error = create_single_kernel_helper(context, &program, NULL, 1, sample_two_kernels_in_1, NULL);
+    test_error( error, "Unable to build test program" );
+
+    /* Lookup the number of kernels in the program. */
+    size_t total_kernels = 0;
+    error = clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t),&total_kernels,NULL);
+    test_error( error, "Unable to get program info num kernels");
+
+    if (total_kernels != 2)
+    {
+        print_error( error, "Program did not contain two kernels" );
+        return -1;
+    }
+
+    /* Lookup the kernel names. Either ordering of the two names is accepted. */
+    const char* actual_names[] = { "sample_test;sample_test2", "sample_test2;sample_test"} ;
+    const size_t num_names = sizeof( actual_names ) / sizeof( actual_names[0] );
+
+    size_t kernel_names_len = 0;
+    error = clGetProgramInfo(program,CL_PROGRAM_KERNEL_NAMES,0,NULL,&kernel_names_len);
+    test_error( error, "Unable to get length of kernel names list." );
+
+    /* Both accepted strings have the same length, so comparing against the first is enough. */
+    if (kernel_names_len != (strlen(actual_names[0])+1))
+    {
+        print_error( error, "Kernel names length did not match");
+        return -1;
+    }
+
+    const size_t len = (kernel_names_len+1)*sizeof(char);
+    char* kernel_names = (char*)malloc(len);
+    if (kernel_names == NULL)
+    {
+        log_error( "ERROR: Unable to allocate %d bytes for the kernel names list.\n", (int)len );
+        return -1;
+    }
+
+    error = clGetProgramInfo(program,CL_PROGRAM_KERNEL_NAMES,len,kernel_names,&kernel_names_len);
+    if (error != CL_SUCCESS)
+    {
+        /* Free the buffer before leaving instead of leaking it on this path. */
+        free(kernel_names);
+        print_error( error, "Unable to get kernel names list." );
+        return -1;
+    }
+
+    /* Check to see if the kernel name array is null terminated.
+       The second query rewrites kernel_names_len, so guard against 0 before indexing. */
+    if (kernel_names_len == 0 || kernel_names[kernel_names_len-1] != '\0')
+    {
+        free(kernel_names);
+        print_error( error, "Kernel name list was not null terminated");
+        return -1;
+    }
+
+    /* Check to see if the correct kernel name string was returned. */
+    for( i = 0; i < num_names; i++ )
+        if( 0 == strcmp(actual_names[i],kernel_names) )
+            break;
+
+    if (i == num_names)
+    {
+        /* Log while the buffer is still alive, and list every accepted string. */
+        log_error( "Kernel names \"%s\" did not match:\n", kernel_names );
+        for( i = 0; i < num_names; i++ )
+            log_error( "\t\t\"%s\"\n", actual_names[i] );
+        free(kernel_names);
+        return -1;
+    }
+    free(kernel_names);
+
+    /* Finally make sure both kernels can be created by name */
+    kernel1 = clCreateKernel( program, "sample_test", &error );
+    if( kernel1 == NULL || error != CL_SUCCESS )
+    {
+        print_error( error, "Could not get kernel 1" );
+        return -1;
+    }
+
+    kernel2 = clCreateKernel( program, "sample_test2", &error );
+    if( kernel2 == NULL || error != CL_SUCCESS )
+    {
+        print_error( error, "Could not get kernel 2" );
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Kernel source used by test_enqueue_task: the work-item writes
+ * dst[i] = get_global_id(0) + i for every i in [0, count).
+ */
+static const char *single_task_kernel[] = {
+    "__kernel void sample_test(__global int *dst, int count)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "\n"
+    "    for( int i = 0; i < count; i++ )\n"
+    "        dst[i] = tid + i;\n"
+    "\n"
+    "}\n" };
+
+/*
+ * Builds single_task_kernel, runs it once through clEnqueueTask on `queue`
+ * with a 100-element output buffer, then reads the buffer back and checks
+ * that element i holds the value i (i.e. get_global_id(0) was 0 in the task).
+ *
+ * `deviceID` and `num_elements` are not used.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int test_enqueue_task(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper output;
+    cl_int count;
+    cl_int *results;
+
+
+    if( create_single_kernel_helper( context, &program, &kernel, 1, single_task_kernel, "sample_test" ) )
+        return -1;
+
+    // Create args
+    count = 100;
+    output = clCreateBuffer( context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof( cl_int ) * count, NULL, &error );
+    test_error( error, "Unable to create output buffer" );
+
+    error = clSetKernelArg( kernel, 0, sizeof( cl_mem ), &output );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 1, sizeof( cl_int ), &count );
+    test_error( error, "Unable to set kernel argument" );
+
+    // Run task
+    error = clEnqueueTask( queue, kernel, 0, NULL, NULL );
+    test_error( error, "Unable to run task" );
+
+    // Read results
+    results = (cl_int*)malloc(sizeof(cl_int)*count);
+    if( results == NULL )
+    {
+        log_error( "ERROR: Unable to allocate host memory for task results\n" );
+        return -1;
+    }
+
+    error = clEnqueueReadBuffer( queue, output, CL_TRUE, 0, sizeof( cl_int ) * count, results, 0, NULL, NULL );
+    if( error != CL_SUCCESS )
+    {
+        // Free the host buffer on this path instead of leaking it
+        print_error( error, "Unable to read results" );
+        free(results);
+        return -1;
+    }
+
+    // Validate
+    for( cl_int i = 0; i < count; i++ )
+    {
+        if( results[ i ] != i )
+        {
+            log_error( "ERROR: Task result value %d did not validate! Expected %d, got %d\n", (int)i, (int)i, (int)results[ i ] );
+            free(results);
+            return -1;
+        }
+    }
+
+    /* All done */
+    free(results);
+    return 0;
+}
+
+
+
+#define TEST_SIZE 1000
+
+/*
+ * Raw OpenCL handles owned by one iteration of test_repeated_setup_cleanup.
+ * A NULL member means the object has not been created (or was already released).
+ */
+typedef struct
+{
+    cl_context       context;
+    cl_command_queue queue;
+    cl_program       program;
+    cl_kernel        kernel;
+    cl_mem           mem_in;
+    cl_mem           mem_out;
+    cl_event         event;
+} repeated_setup_objects;
+
+/* Releases every non-NULL handle in `objs` (event, buffers, kernel, program, queue, context) and resets it to NULL. */
+static void release_repeated_setup_objects( repeated_setup_objects *objs )
+{
+    if( objs->event )   { clReleaseEvent( objs->event );          objs->event = NULL; }
+    if( objs->mem_in )  { clReleaseMemObject( objs->mem_in );     objs->mem_in = NULL; }
+    if( objs->mem_out ) { clReleaseMemObject( objs->mem_out );    objs->mem_out = NULL; }
+    if( objs->kernel )  { clReleaseKernel( objs->kernel );        objs->kernel = NULL; }
+    if( objs->program ) { clReleaseProgram( objs->program );      objs->program = NULL; }
+    if( objs->queue )   { clReleaseCommandQueue( objs->queue );   objs->queue = NULL; }
+    if( objs->context ) { clReleaseContext( objs->context );      objs->context = NULL; }
+}
+
+/*
+ * Performs one full setup + execution pass: creates a context, queue, program,
+ * kernel and two buffers, uploads inData, runs "test_kernel" over TEST_SIZE
+ * work-items and reads the output buffer back into outData.
+ *
+ * Handles are stored in `objs` as they are created and are NOT released here,
+ * so the caller can release them whether or not this function succeeds.
+ *
+ * Returns 0 on success, non-zero on the first failing step.
+ */
+static int run_repeated_setup_iteration( cl_device_id deviceID, repeated_setup_objects *objs,
+                                         const cl_int *inData, cl_int *outData )
+{
+    size_t global_dim[3] = { TEST_SIZE, 1, 1 };
+    cl_int status;
+    int error;
+
+    objs->context = NULL;
+    objs->queue = NULL;
+    objs->program = NULL;
+    objs->kernel = NULL;
+    objs->mem_in = NULL;
+    objs->mem_out = NULL;
+    objs->event = NULL;
+
+    objs->context = clCreateContext(NULL, 1, &deviceID, notify_callback, NULL, &error);
+    test_error( error, "clCreateContext failed");
+
+    objs->queue = clCreateCommandQueueWithProperties(objs->context, deviceID, 0, &error);
+    test_error( error, "clCreateCommandQueue failed");
+
+    error = create_single_kernel_helper(objs->context, &objs->program, NULL, 1, &repeate_test_kernel, NULL);
+    test_error( error, "Unable to build test program" );
+
+    objs->kernel = clCreateKernel(objs->program, "test_kernel", &error);
+    test_error( error, "clCreateKernel failed");
+
+    objs->mem_in = clCreateBuffer(objs->context, CL_MEM_READ_ONLY, TEST_SIZE*sizeof(cl_int), NULL, &error);
+    test_error( error, "clCreateBuffer failed");
+
+    objs->mem_out = clCreateBuffer(objs->context, CL_MEM_WRITE_ONLY, TEST_SIZE*sizeof(cl_int), NULL, &error);
+    test_error( error, "clCreateBuffer failed");
+
+    error = clEnqueueWriteBuffer(objs->queue, objs->mem_in, CL_TRUE, 0, TEST_SIZE*sizeof(cl_int), inData, 0, NULL, NULL);
+    test_error( error, "clEnqueueWriteBuffer failed");
+
+    /* The caller zeroes outData, so this clears the output buffer before the kernel runs */
+    error = clEnqueueWriteBuffer(objs->queue, objs->mem_out, CL_TRUE, 0, TEST_SIZE*sizeof(cl_int), outData, 0, NULL, NULL);
+    test_error( error, "clEnqueueWriteBuffer failed");
+
+    error = clSetKernelArg(objs->kernel, 0, sizeof(objs->mem_in), &objs->mem_in);
+    test_error( error, "clSetKernelArg failed");
+
+    error = clSetKernelArg(objs->kernel, 1, sizeof(objs->mem_out), &objs->mem_out);
+    test_error( error, "clSetKernelArg failed");
+
+    error = clEnqueueNDRangeKernel(objs->queue, objs->kernel, 1, NULL, global_dim, NULL, 0, NULL, &objs->event);
+    test_error( error, "clEnqueueNDRangeKernel failed");
+
+    error = clWaitForEvents(1, &objs->event);
+    test_error( error, "clWaitForEvents failed");
+
+    error = clGetEventInfo(objs->event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    test_error( error, "clGetEventInfo failed");
+
+    if (status != CL_COMPLETE) {
+        log_error( "Kernel execution not complete: status %d.\n", status);
+        return -1;
+    }
+
+    error = clEnqueueReadBuffer(objs->queue, objs->mem_out, CL_TRUE, 0, TEST_SIZE*sizeof(cl_int), outData, 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed");
+
+    return 0;
+}
+
+/*
+ * Repeats a complete create / run / release cycle 100 times on `deviceID`,
+ * each time with a brand new context, queue, program, kernel and buffers, and
+ * checks after every cycle that each output element equals its input + 1.
+ *
+ * The `context`, `queue` and `num_elements` parameters are not used.
+ * All host memory and OpenCL objects are released on every exit path.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int test_repeated_setup_cleanup(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    repeated_setup_objects objs;
+    cl_int *inData, *outData;
+    int i, j, error;
+    int result = 0;
+
+    inData = (cl_int*)malloc(sizeof(cl_int)*TEST_SIZE);
+    outData = (cl_int*)malloc(sizeof(cl_int)*TEST_SIZE);
+    if (inData == NULL || outData == NULL) {
+        log_error( "ERROR: Unable to allocate host memory for test data\n" );
+        free(inData);
+        free(outData);
+        return -1;
+    }
+
+    for (i=0; i<TEST_SIZE; i++) {
+        inData[i] = i;
+    }
+
+    for (i=0; i<100 && result == 0; i++) {
+        memset(outData, 0, sizeof(cl_int)*TEST_SIZE);
+
+        error = run_repeated_setup_iteration( deviceID, &objs, inData, outData );
+
+        /* Release whatever was created, whether or not the pass succeeded */
+        release_repeated_setup_objects( &objs );
+
+        if (error != 0) {
+            result = error;
+            break;
+        }
+
+        for (j=0; j<TEST_SIZE; j++) {
+            if (outData[j] != inData[j] + 1) {
+                log_error("Results failed to validate at iteration %d. %d != %d.\n", i, outData[j], inData[j] + 1);
+                result = -1;
+                break;
+            }
+        }
+    }
+
+    free(inData);
+    free(outData);
+
+    return result;
+}
+
+
+
--- a/test_conformance/api/test_device_min_data_type_align_size_alignment.cpp
+++ b/test_conformance/api/test_device_min_data_type_align_size_alignment.cpp
@@ -0,0 +1,60 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/testHarness.h"
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+// Returns non-zero when x is an exact power of two and 0 otherwise.
+// Zero is rejected explicitly: x & (x-1) alone is also 0 for x == 0.
+int IsAPowerOfTwo( unsigned long x )
+{
+  return x != 0 && 0 == (x & (x-1));
+}
+
+
+// Queries CL_DEVICE_MEM_BASE_ADDR_ALIGN (compared and logged here in bits) and checks that it is
+//   - at least the size of 16 cl_long values (16 cl_int values when gHasLong is false), and
+//   - a power of two.
+// Only `device` is used; the other parameters belong to the common test signature.
+// Returns 0 on success, non-zero on failure.
+int test_min_data_type_align_size_alignment(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
+{
+  // Minimum required alignment in bytes; converted to bits (*8) where it is compared below
+  cl_uint min_alignment;
+
+  if (gHasLong)
+    min_alignment = sizeof(cl_long)*16;
+  else
+    min_alignment = sizeof(cl_int)*16;
+
+  int error = 0;
+  cl_uint alignment;
+
+  error = clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(alignment), &alignment, NULL);
+  test_error(error, "clGetDeviceInfo for CL_DEVICE_MEM_BASE_ADDR_ALIGN failed");
+  log_info("Device reported CL_DEVICE_MEM_BASE_ADDR_ALIGN = %lu bits.\n", (unsigned long)alignment);
+
+  // Verify the size is large enough
+  if (alignment < min_alignment*8) {
+    // Terminate the message with a newline like every other log line in this test
+    log_error("ERROR: alignment too small. Minimum alignment for %s16 is %lu bits, device reported %lu bits.\n",
+              (gHasLong) ? "long" : "int",
+              (unsigned long)(min_alignment*8), (unsigned long)alignment);
+    return -1;
+  }
+
+  // Verify the size is a power of two
+  if (!IsAPowerOfTwo((unsigned long)alignment)) {
+    log_error("ERROR: alignment is not a power of two.\n");
+    return -1;
+  }
+
+  return 0;
+
+}
--- a/test_conformance/api/test_kernel_arg_changes.cpp
+++ b/test_conformance/api/test_kernel_arg_changes.cpp
@@ -0,0 +1,141 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+extern "C" { extern cl_uint gRandomSeed;}
+
+// This test is designed to stress changing kernel arguments between execute calls (that are asynchronous and thus
+// potentially overlapping) to make sure each kernel gets the right arguments
+
+// Kernel source: work-item `tid` stores get_image_width(src) * tid in outDimensions[tid * 2] and
+// get_image_height(src) * tid in outDimensions[tid * 2 + 1].
+// Note: put a delay loop in the kernel to make sure we have time to queue the next kernel before this one finishes
+const char *inspect_image_kernel_source[] = {
+"__kernel void sample_test(read_only image2d_t src, __global int *outDimensions )\n"
+"{\n"
+"    int tid = get_global_id(0), i;\n"
+"     for( i = 0; i < 100000; i++ ); \n"
+"    outDimensions[tid * 2] = get_image_width(src) * tid;\n"
+"    outDimensions[tid * 2 + 1] = get_image_height(src) * tid;\n"
+"\n"
+"}\n" };
+
+// Number of (image, result buffer) pairs, i.e. how many times the kernel is enqueued
+#define NUM_TRIES    100
+// Global work size of every enqueue; each work-item writes two cl_int results
+#define NUM_THREADS 2048
+
+// Creates NUM_TRIES 2D images of random size plus NUM_TRIES result buffers, then enqueues the same kernel
+// NUM_TRIES times on `queue`, re-setting both kernel arguments before each enqueue and without waiting in
+// between. Finally every result buffer is read back and checked against the dimensions of the image that
+// was bound when the corresponding enqueue was issued.
+//
+// `num_elements` is not used. PASSIVE_REQUIRE_IMAGE_SUPPORT presumably exits early when the device has no
+// image support -- confirm against the macro's definition.
+//
+// Returns 0 on success, non-zero on failure.
+int test_kernel_arg_changes(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    int error, i;
+    clMemWrapper images[ NUM_TRIES ];
+    size_t         sizes[ NUM_TRIES ][ 2 ];
+    clMemWrapper results[ NUM_TRIES ];
+    cl_image_format    imageFormat;
+    size_t maxWidth, maxHeight;
+    size_t widthRange, heightRange;
+    size_t threads[1], localThreads[1];
+    cl_int resultArray[ NUM_THREADS * 2 ];
+    char errStr[ 128 ];
+    RandomSeed seed( gRandomSeed );
+
+
+    PASSIVE_REQUIRE_IMAGE_SUPPORT( device )
+
+    // Just get any ol format to test with
+    error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_WRITE, 0, &imageFormat );
+    test_error( error, "Unable to obtain suitable image format to test with!" );
+
+    // Create our testing kernel
+    error = create_single_kernel_helper( context, &program, &kernel, 1, inspect_image_kernel_source, "sample_test" );
+    test_error( error, "Unable to create testing kernel" );
+
+    // Get max dimensions for each of our images
+    error = clGetDeviceInfo( device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( maxWidth ), &maxWidth, NULL );
+    error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( maxHeight ), &maxHeight, NULL );
+    test_error( error, "Unable to get max image dimensions for device" );
+
+    // Random sizes are drawn from [1, max/32]. Keep the range at least 1 so the modulo below can never
+    // divide by zero if a device reports a maximum smaller than 32.
+    widthRange = maxWidth / 32;
+    heightRange = maxHeight / 32;
+    if( widthRange == 0 )
+        widthRange = 1;
+    if( heightRange == 0 )
+        heightRange = 1;
+
+    // Get the number of threads we'll be able to run
+    threads[0] = NUM_THREADS;
+    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
+    test_error( error, "Unable to get work group size for kernel" );
+
+    // Create a variety of images and output arrays
+    for( i = 0; i < NUM_TRIES; i++ )
+    {
+        sizes[ i ][ 0 ] = genrand_int32(seed) % widthRange + 1;
+        sizes[ i ][ 1 ] = genrand_int32(seed) % heightRange + 1;
+
+        images[ i ] = create_image_2d( context, (cl_mem_flags)(CL_MEM_READ_ONLY),
+                                     &imageFormat, sizes[ i ][ 0], sizes[ i ][ 1 ], 0, NULL, &error );
+        if( images[i] == NULL || error != CL_SUCCESS )
+        {
+            log_error("Failed to create image %d of size %d x %d (%s).\n", i, (int)sizes[i][0], (int)sizes[i][1], IGetErrorString( error ));
+            return -1;
+        }
+        results[ i ] = clCreateBuffer( context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof( cl_int ) * threads[0] * 2, NULL, &error );
+        if( results[i] == NULL || error != CL_SUCCESS )
+        {
+            log_error("Failed to create array %d of size %d (%s).\n", i, (int)threads[0]*2, IGetErrorString( error ));
+            return -1;
+        }
+    }
+
+    // Start setting arguments and executing kernels
+    for( i = 0; i < NUM_TRIES; i++ )
+    {
+        // Set the arguments for this try
+        error = clSetKernelArg( kernel, 0, sizeof( cl_mem ), &images[ i ] );
+        sprintf( errStr, "Unable to set argument 0 for kernel try %d", i );
+        test_error( error, errStr );
+
+        error = clSetKernelArg( kernel, 1, sizeof( cl_mem ), &results[ i ] );
+        sprintf( errStr, "Unable to set argument 1 for kernel try %d", i );
+        test_error( error, errStr );
+
+        // Queue up execution
+        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+        sprintf( errStr, "Unable to execute kernel try %d", i );
+        test_error( error, errStr );
+    }
+
+    // Read the results back out, one at a time, and verify
+    for( i = 0; i < NUM_TRIES; i++ )
+    {
+        error = clEnqueueReadBuffer( queue, results[ i ], CL_TRUE, 0, sizeof( cl_int ) * threads[0] * 2, resultArray, 0, NULL, NULL );
+        sprintf( errStr, "Unable to read results for kernel try %d", i );
+        test_error( error, errStr );
+
+        // Verify. Each entry should be n * the (width/height) of image i
+        for( int j = 0; j < NUM_THREADS; j++ )
+        {
+            if( resultArray[ j * 2 + 0 ] != (int)sizes[ i ][ 0 ] * j )
+            {
+                log_error( "ERROR: Verficiation for kernel try %d, sample %d FAILED, expected a width of %d, got %d\n",
+                          i, j, (int)sizes[ i ][ 0 ] * j, resultArray[ j * 2 + 0 ] );
+                return -1;
+            }
+            if( resultArray[ j * 2 + 1 ] != (int)sizes[ i ][ 1 ] * j )
+            {
+                log_error( "ERROR: Verficiation for kernel try %d, sample %d FAILED, expected a height of %d, got %d\n",
+                          i, j, (int)sizes[ i ][ 1 ] * j, resultArray[ j * 2 + 1 ] );
+                return -1;
+            }
+        }
+    }
+
+    // If we got here, everything verified successfully
+    return 0;
+}
+
+
--- a/test_conformance/api/test_kernel_arg_info.c
+++ b/test_conformance/api/test_kernel_arg_info.c
--- a/test_conformance/api/test_kernel_arg_multi_setup.cpp
+++ b/test_conformance/api/test_kernel_arg_multi_setup.cpp
@@ -0,0 +1,277 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/conversions.h"
+
+// This test is designed to stress passing multiple vector parameters to kernels and verifying access between them all
+
+// printf-style template for the test kernel. It takes six %s type names, in the order src1, src2, src3,
+// dst1, dst2, dst3; the generated kernel copies srcN[tid] to dstN[tid] for each of the three streams.
+const char *multi_arg_kernel_source_pattern =
+"__kernel void sample_test(__global %s *src1, __global %s *src2, __global %s *src3, __global %s *dst1, __global %s *dst2, __global %s *dst3 )\n"
+"{\n"
+"    int tid = get_global_id(0);\n"
+"    dst1[tid] = src1[tid];\n"
+"    dst2[tid] = src2[tid];\n"
+"    dst3[tid] = src3[tid];\n"
+"}\n";
+
+extern cl_uint gRandomSeed;
+
+#define MAX_ERROR_TOLERANCE 0.0005f
+
+// Compares `count` vectors of `vecSize` components of `type` between `expected` and `actual`, one
+// component at a time. Logs the first mismatch (naming `streamIndex`) and returns -1; returns 0 when
+// everything matches.
+static int verify_multi_arg_stream( const void *expected, const void *actual,
+                                    ExplicitType type, int vecSize, size_t count, int streamIndex )
+{
+    const char *ptr1 = (const char *)expected, *ptr2 = (const char *)actual;
+    size_t span = get_explicit_type_size( type );
+    int i, j;
+
+    for( i = 0; i < (int)count; i++ )
+    {
+        for( j = 0; j < vecSize; j++ )
+        {
+            if( memcmp( ptr1 + span * j , ptr2 + span * j, span ) != 0 )
+            {
+                log_error( "ERROR: Value did not validate for component %d of item %d of stream %d!\n", j, i, streamIndex );
+                return -1;
+            }
+        }
+        ptr1 += span * vecSize;
+        ptr2 += span * vecSize;
+    }
+    return 0;
+}
+
+// Does the actual work of test_multi_arg_set. Host allocations are stored in `initData` / `resultData`
+// (three entries each, NULL on entry) and are deliberately NOT freed here: the caller frees them, so the
+// early returns taken by test_error cannot leak them.
+static int run_multi_arg_set(cl_device_id device, cl_context context, cl_command_queue queue,
+                             ExplicitType vec1Type, int vec1Size,
+                             ExplicitType vec2Type, int vec2Size,
+                             ExplicitType vec3Type, int vec3Size, MTdata d,
+                             void *initData[3], void *resultData[3])
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    int error, i;
+    clMemWrapper streams[ 6 ];
+    size_t threads[1], localThreads[1];
+    char programSrc[ 10248 ], vec1Name[ 64 ], vec2Name[ 64 ], vec3Name[ 64 ];
+    // Suffix appended to the scalar type name, indexed by (vector size - 1)
+    char sizeNames[][ 4 ] = { "", "2", "3", "4", "", "", "", "8" };
+    const char *ptr;
+
+
+    // Create the program source
+    sprintf( vec1Name, "%s%s", get_explicit_type_name( vec1Type ), sizeNames[ vec1Size - 1 ] );
+    sprintf( vec2Name, "%s%s", get_explicit_type_name( vec2Type ), sizeNames[ vec2Size - 1 ] );
+    sprintf( vec3Name, "%s%s", get_explicit_type_name( vec3Type ), sizeNames[ vec3Size - 1 ] );
+
+    // The pattern has exactly six %s conversions: three source types, then the same three for the destinations
+    sprintf( programSrc, multi_arg_kernel_source_pattern,
+            vec1Name, vec2Name, vec3Name, vec1Name, vec2Name, vec3Name );
+    ptr = programSrc;
+
+    // Create our testing kernel
+    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "sample_test" );
+    test_error( error, "Unable to create testing kernel" );
+
+    // Get thread dimensions
+    threads[0] = 1024;
+    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
+    test_error( error, "Unable to get work group size for kernel" );
+
+    // Create input streams
+    initData[ 0 ] = create_random_data( vec1Type, d, (unsigned int)threads[ 0 ] * vec1Size );
+    streams[ 0 ] = clCreateBuffer( context, (cl_mem_flags)( CL_MEM_COPY_HOST_PTR ), get_explicit_type_size( vec1Type ) * threads[0] * vec1Size, initData[ 0 ], &error );
+    test_error( error, "Unable to create testing stream" );
+
+    initData[ 1 ] = create_random_data( vec2Type, d, (unsigned int)threads[ 0 ] * vec2Size );
+    streams[ 1 ] = clCreateBuffer( context, (cl_mem_flags)( CL_MEM_COPY_HOST_PTR ), get_explicit_type_size( vec2Type ) * threads[0] * vec2Size, initData[ 1 ], &error );
+    test_error( error, "Unable to create testing stream" );
+
+    initData[ 2 ] = create_random_data( vec3Type, d, (unsigned int)threads[ 0 ] * vec3Size );
+    streams[ 2 ] = clCreateBuffer( context, (cl_mem_flags)( CL_MEM_COPY_HOST_PTR ), get_explicit_type_size( vec3Type ) * threads[0] * vec3Size, initData[ 2 ], &error );
+    test_error( error, "Unable to create testing stream" );
+
+    // Create output streams
+    streams[ 3 ] = clCreateBuffer( context, (cl_mem_flags)(CL_MEM_READ_WRITE),  get_explicit_type_size( vec1Type ) * threads[0] * vec1Size, NULL, &error );
+    test_error( error, "Unable to create testing stream" );
+
+    streams[ 4 ] = clCreateBuffer( context, (cl_mem_flags)(CL_MEM_READ_WRITE),  get_explicit_type_size( vec2Type ) * threads[0] * vec2Size, NULL, &error );
+    test_error( error, "Unable to create testing stream" );
+
+    streams[ 5 ] = clCreateBuffer( context, (cl_mem_flags)(CL_MEM_READ_WRITE),  get_explicit_type_size( vec3Type ) * threads[0] * vec3Size, NULL, &error );
+    test_error( error, "Unable to create testing stream" );
+
+    // Set the arguments
+    error = 0;
+    for( i = 0; i < 6; i++ )
+        error |= clSetKernelArg( kernel, i, sizeof( cl_mem ), &streams[ i ] );
+    test_error( error, "Unable to set arguments for kernel" );
+
+    // Execute!
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Unable to execute kernel" );
+
+    // Read results
+    resultData[0] = malloc( get_explicit_type_size( vec1Type ) * vec1Size * threads[0] );
+    resultData[1] = malloc( get_explicit_type_size( vec2Type ) * vec2Size * threads[0] );
+    resultData[2] = malloc( get_explicit_type_size( vec3Type ) * vec3Size * threads[0] );
+    if( resultData[0] == NULL || resultData[1] == NULL || resultData[2] == NULL )
+    {
+        log_error( "ERROR: Unable to allocate host memory for result streams\n" );
+        return -1;
+    }
+    error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, get_explicit_type_size( vec1Type ) * vec1Size * threads[ 0 ], resultData[0], 0, NULL, NULL );
+    error |= clEnqueueReadBuffer( queue, streams[ 4 ], CL_TRUE, 0, get_explicit_type_size( vec2Type ) * vec2Size * threads[ 0 ], resultData[1], 0, NULL, NULL );
+    error |= clEnqueueReadBuffer( queue, streams[ 5 ], CL_TRUE, 0, get_explicit_type_size( vec3Type ) * vec3Size * threads[ 0 ], resultData[2], 0, NULL, NULL );
+    test_error( error, "Unable to read result stream" );
+
+    // Verify: every destination stream must be a byte-exact copy of its source stream
+    if( verify_multi_arg_stream( initData[ 0 ], resultData[ 0 ], vec1Type, vec1Size, threads[0], 0 ) )
+        return -1;
+    if( verify_multi_arg_stream( initData[ 1 ], resultData[ 1 ], vec2Type, vec2Size, threads[0], 1 ) )
+        return -1;
+    if( verify_multi_arg_stream( initData[ 2 ], resultData[ 2 ], vec3Type, vec3Size, threads[0], 2 ) )
+        return -1;
+
+    // If we got here, everything verified successfully
+    return 0;
+}
+
+// Builds a copy kernel from multi_arg_kernel_source_pattern for the three given vector types
+// (element type + vector size each), fills three input buffers with random data, runs the kernel over
+// 1024 work-items and checks that each output buffer is a byte-exact copy of its input.
+//
+// `d` is the random generator state handed to create_random_data; `device` is not used.
+// Every host buffer is freed on all exit paths.
+//
+// Returns 0 on success, non-zero on failure.
+int test_multi_arg_set(cl_device_id device, cl_context context, cl_command_queue queue,
+                       ExplicitType vec1Type, int vec1Size,
+                       ExplicitType vec2Type, int vec2Size,
+                       ExplicitType vec3Type, int vec3Size, MTdata d)
+{
+    void *initData[3] = { NULL, NULL, NULL }, *resultData[3] = { NULL, NULL, NULL };
+    int result, k;
+
+    result = run_multi_arg_set( device, context, queue,
+                                vec1Type, vec1Size,
+                                vec2Type, vec2Size,
+                                vec3Type, vec3Size, d,
+                                initData, resultData );
+
+    // free(NULL) is a no-op, so this is safe however far the run got
+    for( k = 0; k < 3; k++ )
+    {
+        free( initData[ k ] );
+        free( resultData[ k ] );
+    }
+
+    return result;
+}
+
+// Runs test_multi_arg_set for every ordered triple of element types drawn from
+// { char, short, int, float } combined with every ordered triple of vector sizes drawn from { 2, 4, 8 }.
+// Stops at the first failing combination.
+//
+// `num_elements` is not used. Returns 0 when every combination passes, -1 otherwise.
+int test_kernel_arg_multi_setup_exhaustive(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    static const ExplicitType kTypes[] = { kChar, kShort, kInt, kFloat };
+    static const int kWidths[] = { 2, 4, 8 };
+    const int typeCount = (int)( sizeof( kTypes ) / sizeof( kTypes[ 0 ] ) );
+    const int widthCount = (int)( sizeof( kWidths ) / sizeof( kWidths[ 0 ] ) );
+    const int typeCombos = typeCount * typeCount * typeCount;
+    const int widthCombos = widthCount * widthCount * widthCount;
+    int typeCombo, widthCombo;
+    RandomSeed seed( gRandomSeed );
+
+    log_info( "\n" ); // for formatting
+
+    // Walk every combination of input and output types
+    for( typeCombo = 0; typeCombo < typeCombos; typeCombo++ )
+    {
+        // Decode the flat index into three type slots; the third slot varies fastest
+        const ExplicitType firstType = kTypes[ typeCombo / ( typeCount * typeCount ) ];
+        const ExplicitType secondType = kTypes[ ( typeCombo / typeCount ) % typeCount ];
+        const ExplicitType thirdType = kTypes[ typeCombo % typeCount ];
+
+        log_info( "\n\ttesting %s, %s, %s...", get_explicit_type_name( firstType ), get_explicit_type_name( secondType ), get_explicit_type_name( thirdType ) );
+
+        // Walk every combination of vector size, again with the third slot varying fastest
+        for( widthCombo = 0; widthCombo < widthCombos; widthCombo++ )
+        {
+            const int firstWidth = kWidths[ widthCombo / ( widthCount * widthCount ) ];
+            const int secondWidth = kWidths[ ( widthCombo / widthCount ) % widthCount ];
+            const int thirdWidth = kWidths[ widthCombo % widthCount ];
+
+            log_info(".");
+            fflush( stdout);
+
+            if( test_multi_arg_set( device, context, queue,
+                                   firstType, firstWidth,
+                                   secondType, secondWidth,
+                                   thirdType, thirdWidth, seed ) != 0 )
+            {
+                return -1;
+            }
+        }
+    }
+
+    log_info( "\n" );
+    return 0;
+}
+
+// Draws an index into the four real entries (0..3) of the `types` table used below.
+// NOTE(review): if get_random_float can return its upper bound exactly, the cast yields 4, which would
+// select the kNumExplicitTypes terminator -- confirm against the helper. The clamp makes that harmless.
+static int pick_random_type_index( MTdata d )
+{
+    int index = (int)get_random_float(0,4, d);
+    return ( index > 3 ) ? 3 : index;
+}
+
+// Runs test_multi_arg_set for every ordered triple of vector sizes drawn from { 2, 4, 8 }, each with four
+// randomly chosen triples of element types from { char, short, int, float }.
+// Stops at the first failing configuration.
+//
+// The incoming `num_elements` value is ignored; the parameter is only reused to log the configuration count.
+// Returns 0 when every configuration passes, -1 otherwise.
+int test_kernel_arg_multi_setup_random(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // Loop through a selection of combinations
+    ExplicitType types[] = { kChar, kShort, kInt, kFloat, kNumExplicitTypes };
+    int type1, type2, type3;
+    int size1, size2, size3;
+    RandomSeed seed( gRandomSeed );
+
+    num_elements = 3*3*3*4;
+    log_info( "Testing %d random configurations\n", num_elements );
+
+    // Loop through every combination of vector size
+    for( size1 = 2; size1 <= 8; size1 <<= 1 )
+    {
+        for( size2 = 2; size2 <= 8; size2 <<= 1 )
+        {
+            for( size3 = 2; size3 <= 8; size3 <<= 1 )
+            {
+                // Loop through 4 type combinations for each size combination
+                int n;
+                for (n=0; n<4; n++) {
+                    type1 = pick_random_type_index( seed );
+                    type2 = pick_random_type_index( seed );
+                    type3 = pick_random_type_index( seed );
+
+
+                    log_info( "\ttesting %s%d, %s%d, %s%d...\n",
+                             get_explicit_type_name( types[ type1 ] ), size1,
+                             get_explicit_type_name( types[ type2 ] ), size2,
+                             get_explicit_type_name( types[ type3 ] ), size3 );
+
+                    if( test_multi_arg_set( device, context, queue,
+                                           types[ type1 ], size1,
+                                           types[ type2 ], size2,
+                                           types[ type3 ], size3, seed ) )
+                        return -1;
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+
+
+
--- a/test_conformance/api/test_kernels.c
+++ b/test_conformance/api/test_kernels.c
@@ -0,0 +1,695 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/conversions.h"
+
+// Seed used by the tests below to initialise their random number generators;
+// defined outside this file.
+extern cl_uint gRandomSeed;
+
+// One-kernel program: sample_test converts each float in src to int and stores
+// it at the same index of dst.
+const char *sample_single_test_kernel[] = {
+"__kernel void sample_test(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid];\n"
+"\n"
+"}\n" };
+
+// One-kernel program whose first argument points at a struct holding two
+// __global int pointers; sample_test writes A[tid] + B[tid] to dst[tid].
+const char *sample_struct_test_kernel[] = {
+"typedef struct {\n"
+"__global int *A;\n"
+"__global int *B;\n"
+"} input_pair_t;\n"
+"\n"
+"__kernel void sample_test(__global input_pair_t *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = src->A[tid] + src->B[tid];\n"
+"\n"
+"}\n" };
+
+// One-kernel program operating on an array of {int A; int B;} structs;
+// sample_test writes src[tid].A + src[tid].B to dst[tid].
+const char *sample_struct_array_test_kernel[] = {
+"typedef struct {\n"
+"int A;\n"
+"int B;\n"
+"} input_pair_t;\n"
+"\n"
+"__kernel void sample_test(__global input_pair_t *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = src[tid].A + src[tid].B;\n"
+"\n"
+"}\n" };
+
+// One-kernel program reading two __constant int buffers; sample_test writes
+// src1[tid] + src2[tid] to dst[tid].
+const char *sample_const_test_kernel[] = {
+"__kernel void sample_test(__constant int *src1, __constant int *src2, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = src1[tid] + src2[tid];\n"
+"\n"
+"}\n" };
+
+// One-kernel program with a program-scope __constant (addFactor = 1024);
+// sample_test writes src1[tid] + addFactor to dst[tid].
+const char *sample_const_global_test_kernel[] = {
+"__constant int addFactor = 1024;\n"
+"__kernel void sample_test(__global int *src1, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid];\n"
+"\n"
+"}\n",
+"__kernel void sample_test2(__global int *src, __global float *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (float)src[tid];\n"
+"\n"
+"}\n" };
+
+
+
+
+// Checks what clGetKernelInfo() reports for a kernel built from
+// sample_single_test_kernel:
+//   - CL_KERNEL_FUNCTION_NAME   size and value ("sample_test")
+//   - CL_KERNEL_NUM_ARGS        size and value (2)
+//   - CL_KERNEL_REFERENCE_COUNT size only (the count itself is not validated)
+//   - CL_KERNEL_PROGRAM         size and value (the program the kernel came from)
+//   - CL_KERNEL_CONTEXT         value (the context passed in)
+//
+// deviceID, queue and num_elements are unused.
+// Returns 0 on success; on failure returns -1 or the OpenCL error code that
+// test_error() propagates.
+int test_get_kernel_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    // Wrapper objects (as used by the other tests in this file) instead of raw
+    // handles, so the program and kernel are not leaked on the early returns.
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    cl_program testProgram;
+    cl_context testContext;
+    cl_char name[ 512 ];
+    cl_uint numArgs, numInstances;
+    size_t paramSize;
+
+
+    /* Create reference */
+    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_single_test_kernel, "sample_test" ) != 0 )
+    {
+        return -1;
+    }
+
+    /* Size-only query: param_value_size is 0 and param_value is NULL */
+    error = clGetKernelInfo( kernel, CL_KERNEL_FUNCTION_NAME, 0, NULL, &paramSize );
+    test_error( error, "Unable to get kernel function name param size" );
+    if( paramSize != strlen( "sample_test" ) + 1 )
+    {
+        log_error( "ERROR: Kernel function name param returns invalid size (expected %d, got %d)\n", (int)strlen( "sample_test" ) + 1, (int)paramSize );
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_FUNCTION_NAME, sizeof( name ), name, NULL );
+    test_error( error, "Unable to get kernel function name" );
+    if( strcmp( (char *)name, "sample_test" ) != 0 )
+    {
+        log_error( "ERROR: Kernel function name returned invalid value (expected sample_test, got %s)\n", (char *)name );
+        return -1;
+    }
+
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, 0, NULL, &paramSize );
+    test_error( error, "Unable to get kernel arg count param size" );
+    if( paramSize != sizeof( numArgs ) )
+    {
+        log_error( "ERROR: Kernel arg count param returns invalid size (expected %d, got %d)\n", (int)sizeof( numArgs ), (int)paramSize );
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, sizeof( numArgs ), &numArgs, NULL );
+    test_error( error, "Unable to get kernel arg count" );
+    if( numArgs != 2 )
+    {
+        log_error( "ERROR: Kernel arg count returned invalid value (expected %d, got %d)\n", 2, numArgs );
+        return -1;
+    }
+
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_REFERENCE_COUNT, 0, NULL, &paramSize );
+    test_error( error, "Unable to get kernel reference count param size" );
+    if( paramSize != sizeof( numInstances ) )
+    {
+        log_error( "ERROR: Kernel reference count param returns invalid size (expected %d, got %d)\n", (int)sizeof( numInstances ), (int)paramSize );
+        return -1;
+    }
+
+    /* Only the success of the query is checked; the returned count is not compared */
+    error = clGetKernelInfo( kernel, CL_KERNEL_REFERENCE_COUNT, sizeof( numInstances ), &numInstances, NULL );
+    test_error( error, "Unable to get kernel reference count" );
+
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_PROGRAM, 0, NULL, &paramSize );
+    test_error( error, "Unable to get kernel program param size" );
+    if( paramSize != sizeof( testProgram ) )
+    {
+        log_error( "ERROR: Kernel program param returns invalid size (expected %d, got %d)\n", (int)sizeof( testProgram ), (int)paramSize );
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_PROGRAM, sizeof( testProgram ), &testProgram, NULL );
+    test_error( error, "Unable to get kernel program" );
+    if( testProgram != (cl_program)program )
+    {
+        /* Cast to the raw handle: a wrapper object must not be passed through varargs */
+        log_error( "ERROR: Kernel program returned invalid value (expected %p, got %p)\n", (void *)(cl_program)program, (void *)testProgram );
+        return -1;
+    }
+
+    error = clGetKernelInfo( kernel, CL_KERNEL_CONTEXT, sizeof( testContext ), &testContext, NULL );
+    test_error( error, "Unable to get kernel context" );
+    if( testContext != context )
+    {
+        log_error( "ERROR: Kernel context returned invalid value (expected %p, got %p)\n", context, testContext );
+        return -1;
+    }
+
+    /* Kernel and program are released by their wrappers */
+    return 0;
+}
+
+// Helper for test_execute_kernel_local_sizes(): enqueues `kernel` once with the
+// given global/local sizes, reads `count` cl_int results back from
+// `resultStream` into outputData (blocking read) and compares each one with
+// the host-side (int) conversion of inputData.
+// passName ("first", "second", ...) is only used in the failure message.
+// Returns 0 on success, -1 on a data mismatch, or the OpenCL error code
+// propagated by test_error().
+static int verify_local_size_pass( cl_command_queue queue, cl_kernel kernel, cl_mem resultStream,
+                                   const size_t *threads, const size_t *localThreads,
+                                   const cl_float *inputData, cl_int *outputData, size_t count,
+                                   const char *passName )
+{
+    int error;
+    size_t i;
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Kernel execution failed" );
+
+    error = clEnqueueReadBuffer( queue, resultStream, CL_TRUE, 0, sizeof(cl_int) * count, (void *)outputData, 0, NULL, NULL );
+    test_error( error, "Unable to get result data" );
+
+    for( i = 0; i < count; i++ )
+    {
+        if( outputData[i] != (int)inputData[i] )
+        {
+            log_error( "ERROR: Data did not verify on %s pass!\n", passName );
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+// Runs the float->int sample kernel over 100 elements four times, each time
+// with a different local work size:
+//   1. the size returned by get_max_common_work_group_size()
+//   2. half of that, decremented until it divides the global size
+//   3. half again, adjusted the same way
+//   4. a local size of 1
+// and verifies the output after every run.
+//
+// deviceID and num_elements are unused.
+// Returns 0 on success; -1 or an OpenCL error code on failure.
+int test_execute_kernel_local_sizes(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper            streams[2];
+    size_t    threads[1], localThreads[1];
+    cl_float inputData[100];
+    cl_int outputData[100];
+    RandomSeed seed( gRandomSeed );
+    int i, pass;
+    /* Names of the two "halved" passes, used only in failure messages */
+    const char *halvedPassNames[2] = { "second", "third" };
+
+    /* Create a kernel to test with */
+    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_single_test_kernel, "sample_test" ) != 0 )
+    {
+        return -1;
+    }
+
+    /* Create some I/O streams */
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_float) * 100, NULL, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * 100, NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    /* Write some test data */
+    memset( outputData, 0, sizeof( outputData ) );
+
+    for (i=0; i<100; i++)
+        inputData[i] = get_random_float(-(float) 0x7fffffff, (float) 0x7fffffff, seed);
+
+    error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*100, (void *)inputData, 0, NULL, NULL);
+    test_error( error, "Unable to set testing kernel data" );
+
+    /* Set the arguments */
+    error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
+    test_error( error, "Unable to set kernel arguments" );
+    error = clSetKernelArg( kernel, 1, sizeof( streams[1] ), &streams[1] );
+    test_error( error, "Unable to set kernel arguments" );
+
+    /* First pass: the work group size suggested by the harness */
+    threads[0] = (size_t)100;
+    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = verify_local_size_pass( queue, kernel, streams[1], threads, localThreads,
+                                    inputData, outputData, 100, "first" );
+    if( error != 0 )
+        return error;
+
+    /* Second and third pass: halve the local size, then shrink it until it
+       evenly divides the global size */
+    for( pass = 0; pass < 2; pass++ )
+    {
+        if( localThreads[0] > 1 )
+            localThreads[0] /= 2;
+        while( localThreads[0] > 1 && 0 != threads[0] % localThreads[0] )
+            localThreads[0]--;
+
+        error = verify_local_size_pass( queue, kernel, streams[1], threads, localThreads,
+                                        inputData, outputData, 100, halvedPassNames[pass] );
+        if( error != 0 )
+            return error;
+    }
+
+    /* Fourth pass: one work-item per work-group */
+    localThreads[0] = (unsigned int)1;
+    error = verify_local_size_pass( queue, kernel, streams[1], threads, localThreads,
+                                    inputData, outputData, 100, "fourth" );
+    if( error != 0 )
+        return error;
+
+    return 0;
+}
+
+// Sets the two arguments of the float->int sample kernel out of order
+// (index 1 first, then index 0), runs it over 10 elements and checks that
+// every output equals the host-side (int) conversion of its input.
+//
+// deviceID and num_elements are unused.
+// Returns 0 on success; -1 or an OpenCL error code on failure.
+int test_set_kernel_arg_by_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    enum { kElementCount = 10 };
+
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper    streams[2];
+    size_t    globalSize[1], localSize[1];
+    cl_float srcValues[kElementCount];
+    cl_int dstValues[kElementCount];
+    RandomSeed seed( gRandomSeed );
+    int idx;
+
+    /* Build the program and fetch the kernel under test */
+    if( 0 != create_single_kernel_helper( context, &program, &kernel, 1, sample_single_test_kernel, "sample_test" ) )
+        return -1;
+
+    /* Device buffers: [0] is the float input, [1] the int output */
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof( srcValues ), NULL, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof( dstValues ), NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    /* Clear the host output and generate random input */
+    memset( dstValues, 0, sizeof( dstValues ) );
+
+    idx = 0;
+    while( idx < kElementCount )
+    {
+        srcValues[idx] = get_random_float(-(float) 0x7fffffff, (float) 0x7fffffff, seed);
+        idx++;
+    }
+
+    error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof( srcValues ), (void *)srcValues, 0, NULL, NULL);
+    test_error( error, "Unable to set testing kernel data" );
+
+    /* The point of the test: arguments are set in reverse index order */
+    error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+    error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+
+    /* Launch, read back and compare */
+    globalSize[0] = (size_t)kElementCount;
+
+    error = get_max_common_work_group_size( context, kernel, globalSize[0], &localSize[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, NULL );
+    test_error( error, "Kernel execution failed" );
+
+    error = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, sizeof( dstValues ), (void *)dstValues, 0, NULL, NULL );
+    test_error( error, "Unable to get result data" );
+
+    for( idx = 0; idx < kElementCount; ++idx )
+    {
+        if( dstValues[idx] == (int)srcValues[idx] )
+            continue;
+
+        log_error( "ERROR: Data did not verify on first pass!\n" );
+        return -1;
+    }
+
+    return 0;
+}
+
+// Passes a host struct holding two cl_mem handles as argument 0 of the
+// sample_struct_test_kernel kernel, an output buffer as argument 1, runs the
+// kernel over 10 elements and checks that each output is the sum of the two
+// corresponding inputs.
+//
+// NOTE(review): the kernel declares argument 0 as `__global input_pair_t *`
+// while the host passes the struct itself by value -- confirm this is the
+// behaviour the test intends to exercise.
+//
+// deviceID and num_elements are unused.
+// Returns 0 on success; -1 or an OpenCL error code on failure.
+int test_set_kernel_arg_struct(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    // Wrapper objects (as used by the other tests in this file) own the
+    // handles, so nothing is leaked on the early returns below.
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper    streamA, streamB, outStream;
+    size_t    threads[1], localThreads[1];
+    cl_int outputData[10];
+    int i;
+    cl_int randomTestDataA[10], randomTestDataB[10];
+    MTdata  d;
+
+    // Plain-handle struct whose bytes are handed to clSetKernelArg(); it only
+    // borrows the handles owned by streamA / streamB.
+    struct img_pair_t
+    {
+        cl_mem streamA;
+        cl_mem streamB;
+    } image_pair;
+
+
+    /* Create a kernel to test with */
+    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_struct_test_kernel, "sample_test" ) != 0 )
+    {
+        return -1;
+    }
+
+    /* Create some I/O streams */
+    d = init_genrand( gRandomSeed );
+    for( i = 0; i < 10; i++ )
+    {
+        randomTestDataA[i] = (cl_int)genrand_int32(d);
+        randomTestDataB[i] = (cl_int)genrand_int32(d);
+    }
+    free_mtdata(d); d = NULL;
+
+    streamA = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_int) * 10, randomTestDataA, &error);
+    test_error( error, "Creating test array failed" );
+    streamB = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(cl_int) * 10, randomTestDataB, &error);
+    test_error( error, "Creating test array failed" );
+    outStream = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * 10, NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    image_pair.streamA = (cl_mem)streamA;
+    image_pair.streamB = (cl_mem)streamB;
+
+    /* Set the arguments */
+    error = clSetKernelArg(kernel, 0, sizeof( image_pair ), &image_pair);
+    test_error( error, "Unable to set indexed kernel arguments" );
+    error = clSetKernelArg(kernel, 1, sizeof( cl_mem ), &outStream);
+    test_error( error, "Unable to set indexed kernel arguments" );
+
+    /* Test running the kernel and verifying it */
+    threads[0] = (size_t)10;
+
+    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Kernel execution failed" );
+
+    error = clEnqueueReadBuffer( queue, outStream, CL_TRUE, 0, sizeof(cl_int)*10, (void *)outputData, 0, NULL, NULL );
+    test_error( error, "Unable to get result data" );
+
+    for (i=0; i<10; i++)
+    {
+        /* The inputs span the full 32-bit range, so form the reference sum in
+           unsigned arithmetic: it wraps instead of overflowing a signed int */
+        cl_int expected = (cl_int)( (cl_uint)randomTestDataA[i] + (cl_uint)randomTestDataB[i] );
+        if (outputData[i] != expected)
+        {
+            log_error( "ERROR: Data did not verify!\n" );
+            return -1;
+        }
+    }
+
+    /* Buffers, kernel and program are released by their wrappers */
+    return 0;
+}
+
+// Feeds two 10-element __constant int buffers to sample_const_test_kernel and
+// checks that every output element equals the sum of the matching inputs.
+// Fails up front if the device reports a maximum constant buffer size smaller
+// than one of those buffers.
+//
+// queue is used for the launch and read-back; num_elements is unused.
+// Returns 0 on success; -1 or an OpenCL error code on failure.
+int test_set_kernel_arg_constant(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    enum { kElementCount = 10 };
+
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper            streams[3];
+    size_t    globalSize[1], localSize[1];
+    cl_int results[kElementCount];
+    cl_int operandA[kElementCount], operandB[kElementCount];
+    cl_ulong maxConstantBytes;
+    MTdata rng;
+    int idx;
+
+    /* The constant buffers must fit within the device limit */
+    error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxConstantBytes ), &maxConstantBytes, NULL );
+    test_error( error, "Unable to get max constant buffer size" );
+    if( maxConstantBytes < sizeof( cl_int ) * kElementCount )
+    {
+        log_error( "ERROR: Unable to test constant argument to kernel: max size of constant buffer is reported as %d!\n", (int)maxConstantBytes );
+        return -1;
+    }
+
+    /* Build the program and fetch the kernel under test */
+    if( 0 != create_single_kernel_helper( context, &program, &kernel, 1, sample_const_test_kernel, "sample_test" ) )
+        return -1;
+
+    /* Random operands, masked to 24 bits so the host-side sum cannot overflow */
+    rng = init_genrand( gRandomSeed );
+    for( idx = 0; idx < kElementCount; ++idx )
+    {
+        operandA[idx] = (cl_int)genrand_int32(rng) & 0xffffff;
+        operandB[idx] = (cl_int)genrand_int32(rng) & 0xffffff;
+    }
+    free_mtdata(rng);
+    rng = NULL;
+
+    /* Two inputs initialised from host memory, one output */
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof( operandA ), operandA, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof( operandB ), operandB, &error);
+    test_error( error, "Creating test array failed" );
+    streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof( results ), NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    /* Bind the buffers in argument order */
+    for( idx = 0; idx < 3; ++idx )
+    {
+        error = clSetKernelArg(kernel, idx, sizeof( streams[idx] ), &streams[idx]);
+        test_error( error, "Unable to set indexed kernel arguments" );
+    }
+
+    /* Launch, read back and compare */
+    globalSize[0] = (size_t)kElementCount;
+
+    error = get_max_common_work_group_size( context, kernel, globalSize[0], &localSize[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, NULL );
+    test_error( error, "Kernel execution failed" );
+
+    error = clEnqueueReadBuffer( queue, streams[2], CL_TRUE, 0, sizeof( results ), (void *)results, 0, NULL, NULL );
+    test_error( error, "Unable to get result data" );
+
+    for( idx = 0; idx < kElementCount; ++idx )
+    {
+        const cl_int expectedSum = operandA[idx] + operandB[idx];
+        if( results[idx] == expectedSum )
+            continue;
+
+        log_error( "ERROR: Data sample %d did not verify! %d does not match %d + %d (%d)\n", idx, results[idx], operandA[idx], operandB[idx], expectedSum );
+        return -1;
+    }
+
+    return 0;
+}
+
+// Uploads an array of ten {A, B} integer pairs, runs
+// sample_struct_array_test_kernel over it and checks that each output element
+// equals A + B of the corresponding pair.
+//
+// deviceID and num_elements are unused.
+// Returns 0 on success; -1 or an OpenCL error code on failure.
+int test_set_kernel_arg_struct_array(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper            streams[2];
+    size_t    threads[1], localThreads[1];
+    cl_int outputData[10];
+    int i;
+    MTdata d;
+
+    // Host mirror of the kernel's input_pair_t
+    typedef struct img_pair_type
+    {
+        int A;
+        int B;
+    } image_pair_t;
+
+    image_pair_t image_pair[ 10 ];
+
+
+    /* Create a kernel to test with */
+    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_struct_array_test_kernel, "sample_test" ) != 0 )
+    {
+        return -1;
+    }
+
+    /* Create some I/O streams */
+    d = init_genrand( gRandomSeed );
+    for( i = 0; i < 10; i++ )
+    {
+        /* Fill both members; B was previously left uninitialised because A
+           was assigned twice */
+        image_pair[i].A = (cl_int)genrand_int32(d);
+        image_pair[i].B = (cl_int)genrand_int32(d);
+    }
+    free_mtdata(d); d = NULL;
+
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof(image_pair_t) * 10, (void *)image_pair, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * 10, NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    /* Set the arguments */
+    error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+    error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+
+    /* Test running the kernel and verifying it */
+    threads[0] = (size_t)10;
+
+    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Kernel execution failed" );
+
+    error = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, sizeof(cl_int)*10, (void *)outputData, 0, NULL, NULL );
+    test_error( error, "Unable to get result data" );
+
+    for (i=0; i<10; i++)
+    {
+        /* The inputs span the full 32-bit range, so form the reference sum in
+           unsigned arithmetic: it wraps instead of overflowing a signed int */
+        cl_int expected = (cl_int)( (cl_uint)image_pair[i].A + (cl_uint)image_pair[i].B );
+        if (outputData[i] != expected)
+        {
+            log_error( "ERROR: Data did not verify!\n" );
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+// Builds the two-kernel sample program and exercises
+// clCreateKernelsInProgram() twice: first as a count-only query (expects 2),
+// then to actually create both kernel objects.
+//
+// deviceID, queue and num_elements are unused.
+// Returns 0 on success; -1 or an OpenCL error code on failure.
+int test_create_kernels_in_program(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    // Wrapper object (as used by the other tests in this file) so the program
+    // is not leaked on the early returns below.
+    clProgramWrapper program;
+    cl_kernel  kernel[2];
+    unsigned int kernelCount;
+
+    /* No kernel name/handle is requested here; only the built program is needed */
+    error = create_single_kernel_helper(context, &program, NULL, 2, sample_two_kernel_program, NULL);
+    test_error(error, "Unable to build test program");
+
+    /* Try getting the kernel count */
+    error = clCreateKernelsInProgram( program, 0, NULL, &kernelCount );
+    test_error( error, "Unable to get kernel count for built program" );
+    if( kernelCount != 2 )
+    {
+        log_error( "ERROR: Returned kernel count from clCreateKernelsInProgram is incorrect! (got %u, expected 2)\n", kernelCount );
+        return -1;
+    }
+
+    /* Try actually getting the kernels */
+    error = clCreateKernelsInProgram( program, 2, kernel, NULL );
+    test_error( error, "Unable to get kernels for built program" );
+    clReleaseKernel( kernel[0] );
+    clReleaseKernel( kernel[1] );
+
+    /* The program is released by its wrapper */
+    return 0;
+}
+
+// Runs sample_const_global_test_kernel, which adds a program-scope __constant
+// (1024) to each input element, over 10 random values and checks every
+// output against input + 1024 computed on the host.
+//
+// deviceID and num_elements are unused.
+// Returns 0 on success; -1 or an OpenCL error code on failure.
+int test_kernel_global_constant(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    enum { kElementCount = 10 };
+
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper            streams[2];
+    size_t    globalSize[1], localSize[1];
+    cl_int results[kElementCount];
+    cl_int inputValues[kElementCount];
+    MTdata rng;
+    int idx;
+
+
+    /* Build the program and fetch the kernel under test */
+    if( 0 != create_single_kernel_helper( context, &program, &kernel, 1, sample_const_global_test_kernel, "sample_test" ) )
+        return -1;
+
+    /* Random inputs, masked to 16 bits so they are positive and small and the
+       host-side addition below cannot overflow */
+    rng = init_genrand( gRandomSeed );
+    idx = 0;
+    while( idx < kElementCount )
+    {
+        inputValues[idx] = (cl_int)genrand_int32(rng) & 0xffff;
+        idx++;
+    }
+    free_mtdata(rng);
+    rng = NULL;
+
+    /* Input initialised from host memory, plus an output buffer */
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeof( inputValues ), inputValues, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof( results ), NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    /* Bind the buffers in argument order */
+    for( idx = 0; idx < 2; ++idx )
+    {
+        error = clSetKernelArg(kernel, idx, sizeof( streams[idx] ), &streams[idx]);
+        test_error( error, "Unable to set indexed kernel arguments" );
+    }
+
+
+    /* Launch, read back and compare */
+    globalSize[0] = (size_t)kElementCount;
+
+    error = get_max_common_work_group_size( context, kernel, globalSize[0], &localSize[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalSize, localSize, 0, NULL, NULL );
+    test_error( error, "Kernel execution failed" );
+
+    error = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, sizeof( results ), (void *)results, 0, NULL, NULL );
+    test_error( error, "Unable to get result data" );
+
+    for( idx = 0; idx < kElementCount; ++idx )
+    {
+        const cl_int expectedValue = inputValues[idx] + 1024;
+        if( results[idx] == expectedValue )
+            continue;
+
+        log_error( "ERROR: Data sample %d did not verify! %d does not match %d + 1024 (%d)\n", idx, results[idx], inputValues[idx], expectedValue );
+        return -1;
+    }
+
+    return 0;
+}
+
+
+
--- a/test_conformance/api/test_mem_object_info.cpp
+++ b/test_conformance/api/test_mem_object_info.cpp
@@ -0,0 +1,756 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/testHarness.h"
+
+extern cl_uint gRandomSeed;
+
+
+// Queries one clGetMemObjectInfo parameter and validates both the returned value
+// and the returned size.
+//   mem       - memory object to query
+//   paramName - cl_mem_info enumerant to request
+//   val       - caller-declared variable that receives the queried value
+//   expected  - value that val must compare equal to
+//   name      - string literal naming the parameter in the log messages
+//   type      - printf conversion (string literal) used to print both values
+//   cast      - type both values are converted to before being printed
+// Uses variables named `error` and `size` from the enclosing scope and makes the
+// enclosing function return -1 when either check fails (test_error presumably
+// returns as well when the query itself fails -- see the harness).
+// The macro expands to several statements and is invoked without a trailing
+// semicolon, so it must not be used as the unbraced body of an if/else/loop.
+// Arguments are parenthesized, and `expected` is cast like `val`, so that the
+// variadic log_error call always receives arguments matching the `type` format.
+#define TEST_MEM_OBJECT_PARAM( mem, paramName, val, expected, name, type, cast )    \
+error = clGetMemObjectInfo( mem, paramName, sizeof( val ), &(val), &size );   \
+test_error( error, "Unable to get mem object " name );  \
+if( (val) != (expected) )   \
+{   \
+log_error( "ERROR: Mem object " name " did not validate! (expected " type ", got " type " from %s:%d)\n",   \
+(cast)(expected), (cast)(val), __FILE__, __LINE__ );   \
+return -1;  \
+}   \
+if( size != sizeof( val ) ) \
+{   \
+log_error( "ERROR: Returned size of mem object " name " does not validate! (expected %d, got %d from %s:%d)\n", \
+(int)sizeof( val ), (int)size , __FILE__, __LINE__ );   \
+return -1;  \
+}
+
+// Mem-object destructor callback used by the tests in this file: hands the
+// user-data pointer (the host allocation backing the object) back to free().
+// The cl_mem argument is not used.
+static void CL_CALLBACK mem_obj_destructor_callback( cl_mem /* memobj */, void * data )
+{
+    free( data );
+}
+
+// Draws random numbers from the generator *d until one is non-zero modulo
+// `mod`, and returns that remainder (a value in [1, mod - 1]).
+// Precondition: mod must be greater than 1. mod == 0 divides by zero and
+// mod == 1 never yields a non-zero remainder, so the loop would not terminate.
+static unsigned int
+get_image_dim(MTdata *d, unsigned int mod)
+{
+    for (;;)
+    {
+        const unsigned int candidate = (unsigned int)genrand_int32(*d) % mod;
+        if (candidate != 0)
+            return candidate;
+    }
+}
+
+
+// Exercises clGetMemObjectInfo on buffers and sub-buffers.
+//
+// For every flag combination in bufferFlags a buffer of 4 * addressAlign bytes
+// is created and its CL_MEM_TYPE, CL_MEM_FLAGS, CL_MEM_SIZE, CL_MEM_MAP_COUNT,
+// CL_MEM_REFERENCE_COUNT, CL_MEM_CONTEXT, CL_MEM_ASSOCIATED_MEMOBJECT and
+// CL_MEM_OFFSET queries are checked (plus CL_MEM_HOST_PTR when the buffer uses
+// CL_MEM_USE_HOST_PTR). For map count and reference count only the returned
+// size is validated, not the value. The same queries are then repeated for every
+// compatible sub-buffer flag combination, using a sub-buffer that covers
+// [addressAlign, 2 * addressAlign) of the parent.
+//
+// ignoreQueue and num_elements are not used.
+// Returns CL_SUCCESS when every query validates; a failed check returns -1 and a
+// failed CL call returns through test_error.
+int test_get_buffer_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements )
+{
+    int error;
+    size_t size;
+    void * buffer = NULL;
+
+    clMemWrapper bufferObject;
+    clMemWrapper subBufferObject;
+
+    cl_mem_flags bufferFlags[] = {
+        CL_MEM_READ_WRITE,
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_READ_ONLY,
+        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_WRITE_ONLY,
+        CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+    };
+
+    cl_mem_flags subBufferFlags[] = {
+        CL_MEM_READ_WRITE,
+        CL_MEM_READ_ONLY,
+        CL_MEM_WRITE_ONLY,
+        0,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_READ_ONLY | 0,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_WRITE_ONLY | 0,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_NO_ACCESS | 0,
+    };
+
+
+    // Get the address alignment, so we can make sure the sub-buffer test later works properly.
+    cl_uint addressAlignBits;
+    error = clGetDeviceInfo( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(addressAlignBits), &addressAlignBits, NULL );
+    // Without this check a failed query would leave addressAlignBits uninitialized.
+    test_error( error, "Unable to get device memory base address alignment" );
+
+    // The device reports the alignment in bits; convert to bytes and use at least 128.
+    size_t addressAlign = addressAlignBits/8;
+    if ( addressAlign < 128 )
+    {
+        addressAlign = 128;
+    }
+
+    for ( unsigned int i = 0; i < sizeof(bufferFlags) / sizeof(cl_mem_flags); ++i )
+    {
+        if ( bufferFlags[ i ] & CL_MEM_USE_HOST_PTR )
+        {
+            // Create a buffer object to test against.
+            buffer = malloc( addressAlign * 4 );
+            bufferObject = clCreateBuffer( context, bufferFlags[ i ], addressAlign * 4, buffer, &error );
+            if ( error )
+            {
+                free( buffer );
+                test_error( error, "Unable to create buffer (CL_MEM_USE_HOST_PTR) to test with" );
+            }
+
+            // Make sure buffer is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( bufferObject, mem_obj_destructor_callback, buffer );
+            test_error( error, "Unable to set mem object destructor callback" );
+
+            void * ptr;
+            TEST_MEM_OBJECT_PARAM( bufferObject, CL_MEM_HOST_PTR, ptr, buffer, "host pointer", "%p", void * )
+        }
+        else if ( (bufferFlags[ i ] & CL_MEM_ALLOC_HOST_PTR) && (bufferFlags[ i ] & CL_MEM_COPY_HOST_PTR) )
+        {
+            // Create a buffer object to test against.
+            buffer = malloc( addressAlign * 4 );
+            bufferObject = clCreateBuffer( context, bufferFlags[ i ], addressAlign * 4, buffer, &error );
+            if ( error )
+            {
+                free( buffer );
+                test_error( error, "Unable to create buffer (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR) to test with" );
+            }
+
+            // Make sure buffer is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( bufferObject, mem_obj_destructor_callback, buffer );
+            test_error( error, "Unable to set mem object destructor callback" );
+        }
+        else if ( bufferFlags[ i ] & CL_MEM_ALLOC_HOST_PTR )
+        {
+            // Create a buffer object to test against.
+            bufferObject = clCreateBuffer( context, bufferFlags[ i ], addressAlign * 4, NULL, &error );
+            test_error( error, "Unable to create buffer (CL_MEM_ALLOC_HOST_PTR) to test with" );
+        }
+        else if ( bufferFlags[ i ] & CL_MEM_COPY_HOST_PTR )
+        {
+            // Create a buffer object to test against.
+            buffer = malloc( addressAlign * 4 );
+            bufferObject = clCreateBuffer( context, bufferFlags[ i ], addressAlign * 4, buffer, &error );
+            if ( error )
+            {
+                free( buffer );
+                test_error( error, "Unable to create buffer (CL_MEM_COPY_HOST_PTR) to test with" );
+            }
+
+            // Make sure buffer is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( bufferObject, mem_obj_destructor_callback, buffer );
+            test_error( error, "Unable to set mem object destructor callback" );
+        }
+        else
+        {
+            // Create a buffer object to test against.
+            bufferObject = clCreateBuffer( context, bufferFlags[ i ], addressAlign * 4, NULL, &error );
+            test_error( error, "Unable to create buffer to test with" );
+        }
+
+        // Perform buffer object queries.
+        cl_mem_object_type type;
+        TEST_MEM_OBJECT_PARAM( bufferObject, CL_MEM_TYPE, type, CL_MEM_OBJECT_BUFFER, "type", "%d", int )
+
+        cl_mem_flags flags;
+        TEST_MEM_OBJECT_PARAM( bufferObject, CL_MEM_FLAGS, flags, (unsigned int)bufferFlags[ i ], "flags", "%d", unsigned int )
+
+        size_t sz;
+        TEST_MEM_OBJECT_PARAM( bufferObject, CL_MEM_SIZE, sz, (size_t)( addressAlign * 4 ), "size", "%ld", size_t )
+
+        // Map count: only the size of the returned data is validated, not its value.
+        cl_uint mapCount;
+        error = clGetMemObjectInfo( bufferObject, CL_MEM_MAP_COUNT, sizeof( mapCount ), &mapCount, &size );
+        test_error( error, "Unable to get mem object map count" );
+        if( size != sizeof( mapCount ) )
+        {
+            log_error( "ERROR: Returned size of mem object map count does not validate! (expected %d, got %d from %s:%d)\n",
+                      (int)sizeof( mapCount ), (int)size, __FILE__, __LINE__ );
+            return -1;
+        }
+
+        // Reference count: only the size of the returned data is validated, not its value.
+        cl_uint refCount;
+        error = clGetMemObjectInfo( bufferObject, CL_MEM_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
+        test_error( error, "Unable to get mem object reference count" );
+        if( size != sizeof( refCount ) )
+        {
+            log_error( "ERROR: Returned size of mem object reference count does not validate! (expected %d, got %d from %s:%d)\n",
+                      (int)sizeof( refCount ), (int)size, __FILE__, __LINE__ );
+            return -1;
+        }
+
+        cl_context otherCtx;
+        TEST_MEM_OBJECT_PARAM( bufferObject, CL_MEM_CONTEXT, otherCtx, context, "context", "%p", cl_context )
+
+        cl_mem origObj;
+        TEST_MEM_OBJECT_PARAM( bufferObject, CL_MEM_ASSOCIATED_MEMOBJECT, origObj, (void *)NULL, "associated mem object", "%p", void * )
+
+        size_t offset;
+        TEST_MEM_OBJECT_PARAM( bufferObject, CL_MEM_OFFSET, offset, 0L, "offset", "%ld", size_t )
+
+        // Sub-buffer region: the second addressAlign-sized slice of the parent buffer.
+        cl_buffer_region region;
+        region.origin = addressAlign;
+        region.size = addressAlign;
+
+        // Loop over possible sub-buffer objects to create.
+        for ( unsigned int j = 0; j < sizeof(subBufferFlags) / sizeof(cl_mem_flags); ++j )
+        {
+            if ( subBufferFlags[ j ] & CL_MEM_READ_WRITE )
+            {
+                if ( !(bufferFlags[ i ] & CL_MEM_READ_WRITE) )
+                    continue; // Buffer must be read_write for sub-buffer to be read_write.
+            }
+            if ( subBufferFlags[ j ] & CL_MEM_READ_ONLY )
+            {
+                if ( !(bufferFlags[ i ] & CL_MEM_READ_WRITE) && !(bufferFlags[ i ] & CL_MEM_READ_ONLY) )
+                    continue; // Buffer must be read_write or read_only for sub-buffer to be read_only
+            }
+            if ( subBufferFlags[ j ] & CL_MEM_WRITE_ONLY )
+            {
+                if ( !(bufferFlags[ i ] & CL_MEM_READ_WRITE) && !(bufferFlags[ i ] & CL_MEM_WRITE_ONLY) )
+                    continue; // Buffer must be read_write or write_only for sub-buffer to be write_only
+            }
+            if ( subBufferFlags[ j ] & CL_MEM_HOST_READ_ONLY )
+            {
+                if ( (bufferFlags[ i ] & CL_MEM_HOST_NO_ACCESS) || (bufferFlags[ i ] & CL_MEM_HOST_WRITE_ONLY) )
+                    continue; // Buffer must be host all access or host read_only for sub-buffer to be host read_only
+            }
+            if ( subBufferFlags[ j ] & CL_MEM_HOST_WRITE_ONLY )
+            {
+                if ( (bufferFlags[ i ] & CL_MEM_HOST_NO_ACCESS) || (bufferFlags[ i ] & CL_MEM_HOST_READ_ONLY) )
+                    continue; // Buffer must be host all access or host write_only for sub-buffer to be host write_only
+            }
+
+            subBufferObject = clCreateSubBuffer( bufferObject, subBufferFlags[ j ], CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+            test_error( error, "Unable to create sub-buffer to test against" );
+
+            // Perform sub-buffer object queries.
+            cl_mem_object_type subType;
+            TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_TYPE, subType, CL_MEM_OBJECT_BUFFER, "type", "%d", int )
+
+            // Build the flags the sub-buffer is expected to report: its own flags, plus
+            // whatever it inherits from the parent for each category it left unspecified.
+            cl_mem_flags subFlags;
+            cl_mem_flags inheritedFlags = subBufferFlags[ j ];
+            if ( (subBufferFlags[ j ] & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)) == 0 )
+            {
+              inheritedFlags |= bufferFlags[ i ] & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY);
+            }
+            inheritedFlags |= bufferFlags[ i ] & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR);
+            if ( (subBufferFlags[ j ] & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0)
+            {
+              inheritedFlags |= bufferFlags[ i ] & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS);
+            }
+            TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_FLAGS, subFlags, (unsigned int)inheritedFlags, "flags", "%d", unsigned int )
+
+            TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_SIZE, sz, (size_t)( addressAlign ), "size", "%ld", size_t )
+
+            if ( bufferFlags[ i ] & CL_MEM_USE_HOST_PTR )
+            {
+                // The sub-buffer's host pointer is the parent's pointer advanced by the region origin.
+                void * ptr;
+                void * offsetInBuffer = (char *)buffer + addressAlign;
+
+                TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_HOST_PTR, ptr, offsetInBuffer, "host pointer", "%p", void * )
+            }
+
+            cl_uint subMapCount;
+            error = clGetMemObjectInfo( subBufferObject, CL_MEM_MAP_COUNT, sizeof( subMapCount ), &subMapCount, &size );
+            test_error( error, "Unable to get mem object map count" );
+            if( size != sizeof( subMapCount ) )
+            {
+                log_error( "ERROR: Returned size of mem object map count does not validate! (expected %d, got %d from %s:%d)\n",
+                          (int)sizeof( subMapCount ), (int)size, __FILE__, __LINE__ );
+                return -1;
+            }
+
+            cl_uint subRefCount;
+            error = clGetMemObjectInfo( subBufferObject, CL_MEM_REFERENCE_COUNT, sizeof( subRefCount ), &subRefCount, &size );
+            test_error( error, "Unable to get mem object reference count" );
+            if( size != sizeof( subRefCount ) )
+            {
+                log_error( "ERROR: Returned size of mem object reference count does not validate! (expected %d, got %d from %s:%d)\n",
+                          (int)sizeof( subRefCount ), (int)size, __FILE__, __LINE__ );
+                return -1;
+            }
+
+            cl_context subCtx;
+            TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_CONTEXT, subCtx, context, "context", "%p", cl_context )
+
+            TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_ASSOCIATED_MEMOBJECT, origObj, (cl_mem)bufferObject, "associated mem object", "%p", void * )
+
+            TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_OFFSET, offset, (size_t)( addressAlign ), "offset", "%ld", size_t )
+
+            clReleaseMemObject( subBufferObject );
+            subBufferObject = NULL;
+
+        }
+
+        clReleaseMemObject( bufferObject );
+        bufferObject = NULL;
+    }
+
+    return CL_SUCCESS;
+}
+
+
+// Validates the clGetMemObjectInfo queries that are common to every image object.
+//   image       - pointer to the image object to query
+//   objectFlags - flags the image was created with; CL_MEM_FLAGS must report them
+//   imageInfo   - image descriptor; its image_type is the expected CL_MEM_TYPE
+//   imageFormat - not used by this function
+//   pixelSize   - not used by this function
+//   context     - context the image is expected to report for CL_MEM_CONTEXT
+// CL_MEM_SIZE is only checked for a successful query, and for CL_MEM_MAP_COUNT and
+// CL_MEM_REFERENCE_COUNT only the size of the returned data is validated.
+// Returns CL_SUCCESS when everything validates; a failed check returns -1 and a
+// failed CL call returns through test_error.
+int test_get_imageObject_info( cl_mem * image, cl_mem_flags objectFlags, cl_image_desc *imageInfo, cl_image_format *imageFormat, size_t pixelSize, cl_context context )
+{
+    int error;
+    size_t size;
+    cl_mem_object_type type;
+    cl_mem_flags flags;
+    cl_uint mapCount;
+    cl_uint refCount;
+    cl_context otherCtx;
+    size_t offset;
+    size_t sz;
+
+    TEST_MEM_OBJECT_PARAM( *image, CL_MEM_TYPE, type, imageInfo->image_type, "type", "%d", int )
+
+    TEST_MEM_OBJECT_PARAM( *image, CL_MEM_FLAGS, flags, (unsigned int)objectFlags, "flags", "%d", unsigned int )
+
+    error = clGetMemObjectInfo( *image, CL_MEM_SIZE, sizeof( sz ), &sz, NULL );
+    test_error( error, "Unable to get mem size" );
+
+    // The size returned is not constrained by the spec.
+
+    error = clGetMemObjectInfo( *image, CL_MEM_MAP_COUNT, sizeof( mapCount ), &mapCount, &size );
+    test_error( error, "Unable to get mem object map count" );
+    if( size != sizeof( mapCount ) )
+    {
+        log_error( "ERROR: Returned size of mem object map count does not validate! (expected %d, got %d from %s:%d)\n",
+                  (int)sizeof( mapCount ), (int)size, __FILE__, __LINE__ );
+        return -1;
+    }
+
+    error = clGetMemObjectInfo( *image, CL_MEM_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
+    test_error( error, "Unable to get mem object reference count" );
+    if( size != sizeof( refCount ) )
+    {
+        log_error( "ERROR: Returned size of mem object reference count does not validate! (expected %d, got %d from %s:%d)\n",
+                  (int)sizeof( refCount ), (int)size, __FILE__, __LINE__ );
+        return -1;
+    }
+
+    TEST_MEM_OBJECT_PARAM( *image, CL_MEM_CONTEXT, otherCtx, context, "context", "%p", cl_context )
+
+    TEST_MEM_OBJECT_PARAM( *image, CL_MEM_OFFSET, offset, 0L, "offset", "%ld", size_t )
+
+    return CL_SUCCESS;
+}
+
+
+int test_get_image_info( cl_device_id deviceID, cl_context context, cl_mem_object_type type )
+{
+    int error;
+    size_t size;
+    void * image = NULL;
+
+    cl_mem imageObject;
+    cl_image_desc imageInfo;
+
+    cl_mem_flags imageFlags[] = {
+        CL_MEM_READ_WRITE,
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_READ_ONLY,
+        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_WRITE_ONLY,
+        CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_READ_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_WRITE_ONLY | CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+        CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+    };
+    MTdata d;
+
+    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+
+    cl_image_format imageFormat;
+    size_t pixelSize = 4;
+
+    imageFormat.image_channel_order = CL_RGBA;
+    imageFormat.image_channel_data_type = CL_UNORM_INT8;
+
+    imageInfo.image_width = imageInfo.image_height = imageInfo.image_depth = 1;
+    imageInfo.image_array_size = 0;
+    imageInfo.num_mip_levels = imageInfo.num_samples = 0;
+    imageInfo.mem_object = NULL;
+
+    d = init_genrand( gRandomSeed );
+
+    for ( unsigned int i = 0; i < sizeof(imageFlags) / sizeof(cl_mem_flags); ++i )
+    {
+        imageInfo.image_row_pitch = 0;
+        imageInfo.image_slice_pitch = 0;
+
+        switch (type)
+        {
+            case CL_MEM_OBJECT_IMAGE1D:
+                imageInfo.image_width = get_image_dim(&d, 1023);
+                imageInfo.image_type = CL_MEM_OBJECT_IMAGE1D;
+                break;
+
+            case CL_MEM_OBJECT_IMAGE2D:
+                imageInfo.image_width = get_image_dim(&d, 1023);
+                imageInfo.image_height = get_image_dim(&d, 1023);
+                imageInfo.image_type = CL_MEM_OBJECT_IMAGE2D;
+                break;
+
+            case CL_MEM_OBJECT_IMAGE3D:
+                error = checkFor3DImageSupport(deviceID);
+                if (error == CL_IMAGE_FORMAT_NOT_SUPPORTED)
+                {
+                    log_info("Device doesn't support 3D images. Skipping test.\n");
+                    return CL_SUCCESS;
+                }
+                imageInfo.image_width = get_image_dim(&d, 127);
+                imageInfo.image_height = get_image_dim(&d, 127);
+                imageInfo.image_depth = get_image_dim(&d, 127);
+                imageInfo.image_type = CL_MEM_OBJECT_IMAGE3D;
+                break;
+
+            case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+                imageInfo.image_width = get_image_dim(&d, 1023);
+                imageInfo.image_array_size = get_image_dim(&d, 1023);
+                imageInfo.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+                break;
+
+            case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+                imageInfo.image_width = get_image_dim(&d, 255);
+                imageInfo.image_height = get_image_dim(&d, 255);
+                imageInfo.image_array_size = get_image_dim(&d, 255);
+                imageInfo.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+                break;
+        }
+
+        if ( imageFlags[i] & CL_MEM_USE_HOST_PTR )
+        {
+            // Create an image object to test against.
+            image = malloc( imageInfo.image_width * imageInfo.image_height * imageInfo.image_depth * pixelSize *
+                           ((imageInfo.image_array_size == 0) ? 1 : imageInfo.image_array_size) );
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, image, &error );
+            if ( error )
+            {
+                free( image );
+                test_error( error, "Unable to create image with (CL_MEM_USE_HOST_PTR) to test with" );
+            }
+
+            // Make sure image is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( imageObject, mem_obj_destructor_callback, image );
+            test_error( error, "Unable to set mem object destructor callback" );
+
+            void * ptr;
+            TEST_MEM_OBJECT_PARAM( imageObject, CL_MEM_HOST_PTR, ptr, image, "host pointer", "%p", void * )
+            int ret = test_get_imageObject_info( &imageObject, imageFlags[i], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+            // release image object
+            clReleaseMemObject(imageObject);
+
+            // Try again with non-zero rowPitch.
+            imageInfo.image_row_pitch = imageInfo.image_width * pixelSize;
+            switch (type)
+            {
+                case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+                case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+                case CL_MEM_OBJECT_IMAGE3D:
+                    imageInfo.image_slice_pitch = imageInfo.image_row_pitch * imageInfo.image_height;
+                    break;
+            }
+
+            image = malloc( imageInfo.image_width * imageInfo.image_height * imageInfo.image_depth * pixelSize *
+                           ((imageInfo.image_array_size == 0) ? 1 : imageInfo.image_array_size) );
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, image, &error );
+            if ( error )
+            {
+                free( image );
+                test_error( error, "Unable to create image2d (CL_MEM_USE_HOST_PTR) to test with" );
+            }
+
+            // Make sure image2d is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( imageObject, mem_obj_destructor_callback, image );
+            test_error( error, "Unable to set mem object destructor callback" );
+
+            TEST_MEM_OBJECT_PARAM( imageObject, CL_MEM_HOST_PTR, ptr, image, "host pointer", "%p", void * )
+            ret = test_get_imageObject_info( &imageObject, imageFlags[i], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+        }
+        else if ( (imageFlags[i] & CL_MEM_ALLOC_HOST_PTR) && (imageFlags[i] & CL_MEM_COPY_HOST_PTR) )
+        {
+            // Create an image object to test against.
+            image = malloc( imageInfo.image_width * imageInfo.image_height * imageInfo.image_depth * pixelSize *
+                           ((imageInfo.image_array_size == 0) ? 1 : imageInfo.image_array_size) );
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, image, &error );
+            if ( error )
+            {
+                free( image );
+                test_error( error, "Unable to create image with (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR) to test with" );
+            }
+
+            // Make sure image is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( imageObject, mem_obj_destructor_callback, image );
+            test_error( error, "Unable to set mem object destructor callback" );
+            int ret = test_get_imageObject_info( &imageObject, imageFlags[ i ], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+            // release image object
+            clReleaseMemObject(imageObject);
+
+            // Try again with non-zero rowPitch.
+            imageInfo.image_row_pitch = imageInfo.image_width * pixelSize;
+            switch (type)
+            {
+                case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+                case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+                case CL_MEM_OBJECT_IMAGE3D:
+                    imageInfo.image_slice_pitch = imageInfo.image_row_pitch * imageInfo.image_height;
+                    break;
+            }
+
+            image = malloc( imageInfo.image_width * imageInfo.image_height * imageInfo.image_depth * pixelSize *
+                           ((imageInfo.image_array_size == 0) ? 1 : imageInfo.image_array_size) );
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, image, &error );
+            if ( error )
+            {
+                free( image );
+                test_error( error, "Unable to create image with (CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR) to test with" );
+            }
+
+            // Make sure image is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( imageObject, mem_obj_destructor_callback, image );
+            test_error( error, "Unable to set mem object destructor callback" );
+            ret = test_get_imageObject_info( &imageObject, imageFlags[i], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+        }
+        else if ( imageFlags[i] & CL_MEM_ALLOC_HOST_PTR )
+        {
+            // Create an image object to test against.
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, NULL, &error );
+            test_error( error, "Unable to create image with (CL_MEM_ALLOC_HOST_PTR) to test with" );
+            int ret = test_get_imageObject_info( &imageObject, imageFlags[i], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+        }
+        else if ( imageFlags[i] & CL_MEM_COPY_HOST_PTR )
+        {
+            // Create an image object to test against.
+            image = malloc( imageInfo.image_width * imageInfo.image_height * imageInfo.image_depth * pixelSize *
+                           ((imageInfo.image_array_size == 0) ? 1 : imageInfo.image_array_size) );
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, image, &error );
+            if ( error )
+            {
+                free( image );
+                test_error( error, "Unable to create image with (CL_MEM_COPY_HOST_PTR) to test with" );
+            }
+
+            // Make sure image is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( imageObject, mem_obj_destructor_callback, image );
+            test_error( error, "Unable to set mem object destructor callback" );
+            int ret = test_get_imageObject_info( &imageObject, imageFlags[i], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+            clReleaseMemObject(imageObject);
+
+            // Try again with non-zero rowPitch.
+            imageInfo.image_row_pitch = imageInfo.image_width * pixelSize;
+            switch (type)
+            {
+                case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+                case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+                case CL_MEM_OBJECT_IMAGE3D:
+                    imageInfo.image_slice_pitch = imageInfo.image_row_pitch * imageInfo.image_height;
+                    break;
+            }
+
+            image = malloc( imageInfo.image_width * imageInfo.image_height * imageInfo.image_depth * pixelSize *
+                           ((imageInfo.image_array_size == 0) ? 1 : imageInfo.image_array_size) );
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, image, &error );
+            if ( error )
+            {
+                free( image );
+                test_error( error, "Unable to create image with (CL_MEM_COPY_HOST_PTR) to test with" );
+            }
+
+            // Make sure image is cleaned up appropriately if we encounter an error in the rest of the calls.
+            error = clSetMemObjectDestructorCallback( imageObject, mem_obj_destructor_callback, image );
+            test_error( error, "Unable to set mem object destructor callback" );
+            ret = test_get_imageObject_info( &imageObject, imageFlags[i], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+        }
+        else
+        {
+            // Create an image object to test against.
+            imageObject = clCreateImage( context, imageFlags[i], &imageFormat, &imageInfo, NULL, &error );
+            test_error( error, "Unable to create image to test with" );
+            int ret = test_get_imageObject_info( &imageObject, imageFlags[i], &imageInfo, &imageFormat, pixelSize, context );
+            if (ret)
+                return ret;
+
+        }
+
+        clReleaseMemObject( imageObject );
+    }
+
+    return CL_SUCCESS;
+}
+
+
+// Runs test_get_image_info for CL_MEM_OBJECT_IMAGE2D.
+// The command queue and element count are not used.
+int test_get_image2d_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements )
+{
+    (void)ignoreQueue;
+    (void)num_elements;
+
+    return test_get_image_info( deviceID, context, CL_MEM_OBJECT_IMAGE2D );
+}
+
+// Runs test_get_image_info for CL_MEM_OBJECT_IMAGE3D.
+// The command queue and element count are not used.
+int test_get_image3d_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements )
+{
+    (void)ignoreQueue;
+    (void)num_elements;
+
+    return test_get_image_info( deviceID, context, CL_MEM_OBJECT_IMAGE3D );
+}
+
+// Runs test_get_image_info for CL_MEM_OBJECT_IMAGE1D.
+// The command queue and element count are not used.
+int test_get_image1d_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements )
+{
+    (void)ignoreQueue;
+    (void)num_elements;
+
+    return test_get_image_info( deviceID, context, CL_MEM_OBJECT_IMAGE1D );
+}
+
+// Runs test_get_image_info for CL_MEM_OBJECT_IMAGE1D_ARRAY.
+// The command queue and element count are not used.
+int test_get_image1d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements )
+{
+    (void)ignoreQueue;
+    (void)num_elements;
+
+    return test_get_image_info( deviceID, context, CL_MEM_OBJECT_IMAGE1D_ARRAY );
+}
+
+// Runs test_get_image_info for CL_MEM_OBJECT_IMAGE2D_ARRAY.
+// The command queue and element count are not used.
+int test_get_image2d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements )
+{
+    (void)ignoreQueue;
+    (void)num_elements;
+
+    return test_get_image_info( deviceID, context, CL_MEM_OBJECT_IMAGE2D_ARRAY );
+}
+
+
--- a/test_conformance/api/test_mem_objects.cpp
+++ b/test_conformance/api/test_mem_objects.cpp
@@ -0,0 +1,108 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+// Running count of destructor-callback invocations.
+static volatile cl_int sDestructorIndex;
+
+// Destructor callback: bumps the shared counter and stores the new value in
+// the int that userData points to, recording when this callback ran relative
+// to the others.
+void CL_CALLBACK mem_destructor_callback( cl_mem memObject, void * userData )
+{
+    // Callback ordering is guaranteed, so a plain (non-atomic) increment is enough here.
+    sDestructorIndex = sDestructorIndex + 1;
+
+    int * orderSlot = (int *)userData;
+    *orderSlot = sDestructorIndex;
+}
+
+#ifndef ABS
+// Absolute value of x. The argument is fully parenthesized so that compound
+// expressions such as ABS( a - b ) expand correctly. The argument is still
+// evaluated more than once, so do not pass expressions with side effects.
+#define ABS( x ) ( ( ( x ) < 0 ) ? -( x ) : ( x ) )
+#endif
+
+// Registers three destructor callbacks on memObject, releases the object, and
+// checks that the callbacks ran in reverse order of registration (the first
+// registered callback must report order 3, the last registered order 1).
+//
+// Returns 0 on success, -1 if any callback ran in the wrong order, or the
+// OpenCL error code if registering a callback or releasing the object fails.
+// On return from a successful release, memObject has been reset to NULL.
+int test_mem_object_destructor_callback_single( clMemWrapper &memObject )
+{
+    cl_int error;
+    int i;
+
+    // Each slot receives the 1-based position at which its callback was invoked
+    volatile int callbackOrders[ 3 ] = { 0, 0, 0 };
+    sDestructorIndex = 0;
+
+    // Set up the callbacks
+    error = clSetMemObjectDestructorCallback( memObject, mem_destructor_callback, (void*) &callbackOrders[ 0 ] );
+    test_error( error, "Unable to set destructor callback" );
+
+    error = clSetMemObjectDestructorCallback( memObject, mem_destructor_callback, (void*) &callbackOrders[ 1 ] );
+    test_error( error, "Unable to set destructor callback" );
+
+    error = clSetMemObjectDestructorCallback( memObject, mem_destructor_callback, (void*) &callbackOrders[ 2 ] );
+    test_error( error, "Unable to set destructor callback" );
+
+    // Now release the buffer, which SHOULD call the callbacks
+    error = clReleaseMemObject( memObject );
+    test_error( error, "Unable to release test buffer" );
+
+    // Note: since we manually released the mem wrapper, we need to set it to NULL to prevent a double-release
+    memObject = NULL;
+
+    // At this point, all three callbacks should have already been called
+    int numErrors = 0;
+    for(  i = 0; i < 3; i++ )
+    {
+        // Spin waiting for the release to finish.  If you don't call the mem_destructor_callback, you will not
+        // pass the test.  bugzilla 6316
+        while( 0 == callbackOrders[i] )
+        {}
+
+        // Callbacks fire in reverse registration order, so callback i must have run at position 3-i.
+        const int expectedOrder = 3 - i;
+        const int actualOrder = ABS( callbackOrders[ i ] );
+        if( actualOrder != expectedOrder )
+        {
+            log_error( "\tERROR: Callback %d was called in the wrong order! (Was called order %d, should have been order %d)\n",
+                      i+1, actualOrder, expectedOrder );
+            numErrors++;
+        }
+    }
+
+    return ( numErrors > 0 ) ? -1 : 0;
+}
+
+// Runs the destructor-callback ordering check against a buffer and, when the
+// device supports images, against a 2D image as well.
+// Returns 0 on success, -1 when a callback check fails, or the OpenCL error
+// code when creating a test object fails.
+int test_mem_object_destructor_callback(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    clMemWrapper testBuffer, testImage;
+    cl_int error;
+
+    // --- Buffer object ---
+    testBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE, 1024, NULL, &error );
+    test_error( error, "Unable to create testing buffer" );
+
+    const int bufferResult = test_mem_object_destructor_callback_single( testBuffer );
+    if( bufferResult != 0 )
+    {
+        log_error( "ERROR: Destructor callbacks for buffer object FAILED\n" );
+        return -1;
+    }
+
+    // --- Image object (only when the device supports images) ---
+    if( checkForImageSupport( deviceID ) != 0 )
+        return 0;
+
+    cl_image_format rgbaInt8Format = { CL_RGBA, CL_SIGNED_INT8 };
+    testImage = create_image_2d( context, CL_MEM_READ_ONLY, &rgbaInt8Format, 16, 16, 0, NULL, &error );
+    test_error( error, "Unable to create testing image" );
+
+    const int imageResult = test_mem_object_destructor_callback_single( testImage );
+    if( imageResult != 0 )
+    {
+        log_error( "ERROR: Destructor callbacks for image object FAILED\n" );
+        return -1;
+    }
+
+    return 0;
+}
--- a/test_conformance/api/test_native_kernel.cpp
+++ b/test_conformance/api/test_native_kernel.cpp
@@ -0,0 +1,121 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include "../../test_common/harness/conversions.h"
+
+extern cl_uint gRandomSeed;
+
+// Native kernel body: copies `count` cl_int values from `source` to `dest`.
+// userData is interpreted as a { source pointer, dest pointer, count } record.
+static void CL_CALLBACK test_native_kernel_fn( void *userData )
+{
+    struct arg_struct {
+        cl_int * source;
+        cl_int * dest;
+        cl_int count;
+    };
+
+    arg_struct *args = (arg_struct *)userData;
+
+    cl_int index = 0;
+    while( index < args->count )
+    {
+        args->dest[ index ] = args->source[ index ];
+        index++;
+    }
+}
+
+// Creates the input/output buffers, enqueues test_native_kernel_fn to copy
+// between them, reads the output back into outBuffer and compares it with
+// inBuffer.
+// Returns 0 on success, 1 on a data mismatch, or the OpenCL error code of the
+// call that failed.
+static int run_native_kernel_copy( cl_context context, cl_command_queue queue, int n_elems,
+                                   cl_int *inBuffer, cl_int *outBuffer )
+{
+    int error;
+    clMemWrapper streams[ 2 ];
+    clEventWrapper finishEvent;
+
+    struct arg_struct
+    {
+        cl_mem inputStream;
+        cl_mem outputStream;
+        cl_int count;
+    } args;
+
+    // Create I/O streams
+    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, n_elems * sizeof(cl_int), inBuffer, &error );
+    test_error( error, "Unable to create I/O stream" );
+    streams[ 1 ] = clCreateBuffer( context, 0, n_elems * sizeof(cl_int), NULL, &error );
+    test_error( error, "Unable to create I/O stream" );
+
+    // Set up the arrays to call with
+    args.inputStream = streams[ 0 ];
+    args.outputStream = streams[ 1 ];
+    args.count = n_elems;
+
+    // Locations inside args where the mem objects are stored
+    void * memLocs[ 2 ] = { &args.inputStream, &args.outputStream };
+
+    // Run the kernel
+    error = clEnqueueNativeKernel( queue, test_native_kernel_fn,
+                                      &args, sizeof( args ),
+                                      2, &streams[ 0 ],
+                                      (const void **)memLocs,
+                                      0, NULL, &finishEvent );
+    test_error( error, "Unable to queue native kernel" );
+
+    // Finish and wait for the kernel to complete
+    error = clFinish( queue );
+    test_error(error, "clFinish failed");
+
+    error = clWaitForEvents( 1, &finishEvent );
+    test_error(error, "clWaitForEvents failed");
+
+    // Now read the results and verify
+    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, n_elems * sizeof(cl_int), outBuffer, 0, NULL, NULL );
+    test_error( error, "Unable to read results" );
+
+    for( int i = 0; i < n_elems; i++ )
+    {
+        if( inBuffer[ i ] != outBuffer[ i ] )
+        {
+            log_error( "ERROR: Data sample %d for native kernel did not validate (expected %d, got %d)\n",
+                      i, (int)inBuffer[ i ], (int)outBuffer[ i ] );
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+// Tests clEnqueueNativeKernel by copying n_elems random cl_int values through
+// a native kernel and verifying the result.
+// Returns 0 when the test passes or the device does not report
+// CL_EXEC_NATIVE_KERNEL, non-zero on failure.
+int test_native_kernel(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
+{
+    int error;
+    RandomSeed seed( gRandomSeed );
+
+    // Check if we support native kernels. The query result must be checked:
+    // on failure `capabilities` would otherwise be read without a valid value.
+    cl_device_exec_capabilities capabilities = 0;
+    error = clGetDeviceInfo(device, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof(capabilities), &capabilities, NULL);
+    test_error( error, "Unable to query CL_DEVICE_EXECUTION_CAPABILITIES" );
+    if (!(capabilities & CL_EXEC_NATIVE_KERNEL)) {
+        log_info("Device does not support CL_EXEC_NATIVE_KERNEL.\n");
+        return 0;
+    }
+
+#if !(defined (_WIN32) && defined (_MSC_VER))
+    cl_int inBuffer[ n_elems ], outBuffer[ n_elems ];
+#else
+    cl_int* inBuffer  = (cl_int *)_malloca( n_elems * sizeof(cl_int) );
+    cl_int* outBuffer = (cl_int *)_malloca( n_elems * sizeof(cl_int) );
+    if( inBuffer == NULL || outBuffer == NULL )
+    {
+        log_error( "ERROR: Unable to allocate host buffers for native kernel test\n" );
+        _freea( inBuffer );
+        _freea( outBuffer );
+        return -1;
+    }
+#endif
+
+    // Create some input values
+    generate_random_data( kInt, n_elems, seed, inBuffer );
+
+    // All OpenCL work happens in the helper so the host buffers are released on every exit path.
+    error = run_native_kernel_copy( context, queue, n_elems, inBuffer, outBuffer );
+
+#if defined (_WIN32) && defined (_MSC_VER)
+    // Memory obtained from _malloca must be handed back with _freea.
+    _freea( inBuffer );
+    _freea( outBuffer );
+#endif
+
+    return error;
+}
+
+
+
+
+
--- a/test_conformance/api/test_null_buffer_arg.c
+++ b/test_conformance/api/test_null_buffer_arg.c
@@ -0,0 +1,206 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include <stdio.h>
+#if defined(__APPLE__)
+#include <OpenCL/opencl.h>
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/opencl.h>
+#include <CL/cl_platform.h>
+#endif
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/testHarness.h"
+#include "procs.h"
+
+
+// Result codes used by the tests in this file (SUCCESS is 0).
+enum { SUCCESS, FAILURE };
+// How the source-buffer kernel argument is supplied to clSetKernelArg:
+// a real buffer, a pointer to a NULL cl_mem, or a NULL argument pointer.
+typedef enum { NON_NULL_PATH, ADDROF_NULL_PATH, NULL_PATH } test_type;
+
+// Number of work-items launched and of elements in the result buffer.
+#define NITEMS 4096
+
+/* places the comparison result of value of the src ptr against 0 into each element of the output
+ * array, to allow testing that the kernel actually _gets_ the NULL value */
+const char *kernel_string_long =
+"kernel void test_kernel(global float *src, global long *dst)\n"
+"{\n"
+"    uint tid = get_global_id(0);\n"
+"    dst[tid] = (long)(src != 0);\n"
+"}\n";
+
+// Variant selected when gIsEmbedded is set: identical to the kernel above,
+// but writes int results instead of long.
+const char *kernel_string =
+"kernel void test_kernel(global float *src, global int *dst)\n"
+"{\n"
+"    uint tid = get_global_id(0);\n"
+"    dst[tid] = (int)(src != 0);\n"
+"}\n";
+
+
+/*
+ * The guts of the test:
+ * call setKernelArgs with a regular buffer, &NULL, or NULL depending on
+ * the value of 'type', run the kernel over NITEMS work-items, and check that
+ * every element of the result buffer is non-zero (NON_NULL_PATH) or zero
+ * (ADDROF_NULL_PATH / NULL_PATH).
+ *
+ * The result elements are read as cl_int when gIsEmbedded is set and as
+ * cl_long otherwise, matching the kernel variant that was built.
+ *
+ * Returns SUCCESS, FAILURE, or the OpenCL error code when enqueueing the
+ * kernel or reading the result buffer fails.
+ */
+static int test_setargs_and_execution(cl_command_queue queue, cl_kernel kernel,
+    cl_mem test_buf, cl_mem result_buf, test_type type)
+{
+    unsigned int test_success = SUCCESS;
+    int i;
+    cl_int status;
+    const char *typestr;
+
+    switch (type) {
+    case NON_NULL_PATH:
+        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buf);
+        typestr = "non-NULL";
+        break;
+    case ADDROF_NULL_PATH:
+        test_buf = NULL;
+        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &test_buf);
+        typestr = "&NULL";
+        break;
+    case NULL_PATH:
+        status = clSetKernelArg(kernel, 0, sizeof(cl_mem), NULL);
+        typestr = "NULL";
+        break;
+    default:
+        // Previously status and typestr were left uninitialized for an unknown type.
+        log_error("Unknown test type: %d\n", (int)type);
+        return FAILURE;
+    }
+
+    log_info("Testing setKernelArgs with %s buffer.\n", typestr);
+
+    if (status != CL_SUCCESS) {
+        log_error("clSetKernelArg failed with status: %d\n", status);
+        return FAILURE; // no point in continuing *this* test
+    }
+
+    size_t global = NITEMS;
+    status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global,
+        NULL, 0, NULL, NULL);
+    test_error(status, "NDRangeKernel failed.");
+
+    // Element width depends on which kernel variant was built.
+    const size_t item_size = gIsEmbedded ? sizeof(cl_int) : sizeof(cl_long);
+    void *host_result = malloc(NITEMS * item_size);
+    if (host_result == NULL) {
+        log_error("Unable to allocate host result buffer.\n");
+        return FAILURE;
+    }
+
+    status = clEnqueueReadBuffer(queue, result_buf, CL_TRUE, 0,
+        NITEMS * item_size, host_result, 0, NULL, NULL);
+    if (status != CL_SUCCESS) {
+        // Release the host buffer before test_error returns.
+        free(host_result);
+        test_error(status, "ReadBuffer failed.");
+    }
+
+    // in the non-null case, we expect NONZERO values; otherwise all zeros.
+    const int expect_nonzero = (type == NON_NULL_PATH);
+    for (i = 0; i < NITEMS; i++) {
+        const int is_nonzero = gIsEmbedded
+            ? (((const cl_int *)host_result)[i] != 0)
+            : (((const cl_long *)host_result)[i] != 0);
+
+        if (is_nonzero != expect_nonzero) {
+            if (expect_nonzero) {
+                log_error("failure: item %d in the result buffer was unexpectedly NULL.\n", i);
+            } else {
+                log_error("failure: item %d in the result buffer was unexpectedly non-NULL.\n", i);
+            }
+            test_success = FAILURE;
+            break;
+        }
+    }
+    free(host_result);
+
+    if (test_success == SUCCESS) {
+        log_info("\t%s ok.\n", typestr);
+    }
+
+    return test_success;
+}
+
+/* Releases whichever of the given OpenCL objects are non-NULL. */
+static void release_null_buffer_arg_objects(cl_program program, cl_kernel kernel,
+    cl_mem src, cl_mem dst)
+{
+    if (src) clReleaseMemObject(src);
+    if (dst) clReleaseMemObject(dst);
+    if (kernel) clReleaseKernel(kernel);
+    if (program) clReleaseProgram(program);
+}
+
+/*
+ * Builds the test kernel (the int variant when gIsEmbedded is set, the long
+ * variant otherwise), creates the source and destination buffers, and runs
+ * test_setargs_and_execution for the non-NULL and &NULL cases (plus the NULL
+ * case on Apple platforms).
+ *
+ * Returns 0 when every case passes, non-zero otherwise; a failing setup call
+ * returns its OpenCL error code.
+ */
+int test_null_buffer_arg(cl_device_id device, cl_context context,
+    cl_command_queue queue, int num_elements)
+{
+    unsigned int test_success = 0;
+    size_t buffer_size;
+    cl_int status;
+    cl_program program = NULL;
+    cl_kernel kernel = NULL;
+    cl_mem dev_src = NULL;
+    cl_mem dev_dst = NULL;
+
+    // prep kernel:
+    if (gIsEmbedded)
+        status = create_single_kernel_helper(context, &program, NULL, 1, &kernel_string, NULL);
+    else
+        status = create_single_kernel_helper(context, &program, NULL, 1, &kernel_string_long, NULL);
+
+    test_error(status, "Unable to build test program");
+
+    kernel = clCreateKernel(program, "test_kernel", &status);
+    if (status != CL_SUCCESS) {
+        release_null_buffer_arg_objects(program, kernel, dev_src, dev_dst);
+        test_error(status, "CreateKernel failed.");
+    }
+
+    // Buffer creation errors were previously ignored; check them so a failed
+    // allocation is reported as such instead of as a bogus NULL-argument failure.
+    dev_src = clCreateBuffer(context, CL_MEM_READ_ONLY, NITEMS*sizeof(cl_float),
+        NULL, &status);
+    if (status != CL_SUCCESS) {
+        release_null_buffer_arg_objects(program, kernel, dev_src, dev_dst);
+        test_error(status, "Unable to create source buffer.");
+    }
+
+    if (gIsEmbedded)
+        buffer_size = NITEMS*sizeof(cl_int);
+    else
+        buffer_size = NITEMS*sizeof(cl_long);
+
+    dev_dst = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size,
+        NULL, &status);
+    if (status != CL_SUCCESS) {
+        release_null_buffer_arg_objects(program, kernel, dev_src, dev_dst);
+        test_error(status, "Unable to create destination buffer.");
+    }
+
+    // set the destination buffer normally:
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &dev_dst);
+    if (status != CL_SUCCESS) {
+        release_null_buffer_arg_objects(program, kernel, dev_src, dev_dst);
+        test_error(status, "SetKernelArg failed.");
+    }
+
+    //
+    // we test three cases:
+    //
+    // - typical case, used everyday: non-null buffer
+    // - the case of src as &NULL (the spec-compliance test)
+    // - the case of src as NULL (the backwards-compatibility test, Apple only)
+    //
+
+    test_success  = test_setargs_and_execution(queue, kernel, dev_src, dev_dst, NON_NULL_PATH);
+    test_success |= test_setargs_and_execution(queue, kernel, dev_src, dev_dst, ADDROF_NULL_PATH);
+
+#ifdef __APPLE__
+    test_success |= test_setargs_and_execution(queue, kernel, dev_src, dev_dst, NULL_PATH);
+#endif
+
+    // clean up:
+    release_null_buffer_arg_objects(program, kernel, dev_src, dev_dst);
+
+    return test_success;
+}
--- a/test_conformance/api/test_platform.cpp
+++ b/test_conformance/api/test_platform.cpp
@@ -0,0 +1,289 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+#include <string.h>
+
+#define EXTENSION_NAME_BUF_SIZE 4096
+
+#define PRINT_EXTENSION_INFO 0
+
+// Returns true when `name` appears in `list` as a complete space-delimited
+// token. A plain strstr() would also match names that are merely a prefix or
+// substring of a longer extension name.
+static bool extension_name_listed( const char *list, const char *name )
+{
+    const size_t nameLength = strlen( name );
+
+    for( const char *hit = strstr( list, name ); hit != NULL; hit = strstr( hit + 1, name ) )
+    {
+        const bool startsToken = ( hit == list ) || ( hit[ -1 ] == ' ' );
+        const bool endsToken = ( hit[ nameLength ] == '\0' ) || ( hit[ nameLength ] == ' ' );
+        if( startsToken && endsToken )
+            return true;
+    }
+
+    return false;
+}
+
+// Checks that every extension from a fixed list that the platform of deviceID
+// reports in CL_PLATFORM_EXTENSIONS is also reported by the device in
+// CL_DEVICE_EXTENSIONS.
+// Returns 0 on success, -1 if a query fails or the device lacks an extension
+// that its platform reports.
+int test_platform_extensions(cl_device_id deviceID, cl_context context,
+                 cl_command_queue queue, int num_elements)
+{
+    // NULL-terminated list of the extensions this test looks for.
+    const char * extensions[] = {
+        "cl_khr_byte_addressable_store",
+        "cl_khr_global_int32_base_atomics",
+        "cl_khr_global_int32_extended_atomics",
+        "cl_khr_local_int32_base_atomics",
+        "cl_khr_local_int32_extended_atomics",
+        "cl_khr_int64_base_atomics",
+        "cl_khr_int64_extended_atomics",
+        "cl_khr_3d_image_writes",
+        "cl_khr_fp16",
+        "cl_khr_fp64",
+        NULL
+    };
+
+    // One flag per entry of extensions[]; sized from that array so the two
+    // cannot drift apart. Set to true when the platform reports the extension.
+    bool extensionsSupported[ sizeof( extensions ) / sizeof( extensions[ 0 ] ) ] = { false };
+
+    int extensionIndex;
+
+    cl_platform_id platformID;
+    cl_int err;
+
+    char platform_extensions[EXTENSION_NAME_BUF_SIZE];
+    char device_extensions[EXTENSION_NAME_BUF_SIZE];
+
+    // Okay, so what we're going to do is just check the device indicated by
+    // deviceID against the platform that includes this device
+
+    // pass CL_DEVICE_PLATFORM to clGetDeviceInfo
+    // to get a result of type cl_platform_id
+    err = clGetDeviceInfo(deviceID,
+              CL_DEVICE_PLATFORM,
+              sizeof(cl_platform_id),
+              (void *)(&platformID),
+              NULL);
+
+    if(err != CL_SUCCESS)
+    {
+        vlog_error("test_platform_extensions : could not get platformID from device\n");
+        return -1;
+    }
+
+    // now we grab the set of extensions specified by the platform
+    err = clGetPlatformInfo(platformID,
+                CL_PLATFORM_EXTENSIONS,
+                sizeof(platform_extensions),
+                (void *)(&platform_extensions[0]),
+                NULL);
+    if(err != CL_SUCCESS)
+    {
+        vlog_error("test_platform_extensions : could not get extension string from platform\n");
+        return -1;
+    }
+
+#if PRINT_EXTENSION_INFO
+    log_info("Platform extensions include \"%s\"\n\n", platform_extensions);
+#endif
+
+    // here we parse the platform extensions, to look for the "important" ones
+    for(extensionIndex=0; extensions[extensionIndex] != NULL; ++extensionIndex)
+    {
+        if(extension_name_listed(platform_extensions, extensions[extensionIndex]))
+        {
+            // we found it
+#if PRINT_EXTENSION_INFO
+            log_info("Found \"%s\" in platform extensions\n",
+                extensions[extensionIndex]);
+#endif
+            extensionsSupported[extensionIndex] = true;
+        }
+    }
+
+    // and then we grab the set of extensions specified by the device
+    // (this can be turned into a "loop over all devices in this platform")
+    err = clGetDeviceInfo(deviceID,
+              CL_DEVICE_EXTENSIONS,
+              sizeof(device_extensions),
+              (void *)(&device_extensions[0]),
+              NULL);
+    if(err != CL_SUCCESS)
+    {
+        vlog_error("test_platform_extensions : could not get extension string from device\n");
+        return -1;
+    }
+
+#if PRINT_EXTENSION_INFO
+    log_info("Device extensions include \"%s\"\n\n", device_extensions);
+#endif
+
+    for(extensionIndex=0; extensions[extensionIndex] != NULL; ++extensionIndex)
+    {
+        if(extensionsSupported[extensionIndex] == false)
+        {
+            continue; // platform does not report it; nothing to verify
+        }
+
+        if(!extension_name_listed(device_extensions, extensions[extensionIndex]))
+        {
+            // device does not support it
+            vlog_error("Platform supports extension \"%s\" but device does not\n",
+                   extensions[extensionIndex]);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+// Queries one string-valued platform parameter into `buffer`, logs it, and
+// checks that the size reported by clGetPlatformInfo equals strlen + 1.
+// A size mismatch increments *errorCount. Returns CL_SUCCESS or the error
+// code from clGetPlatformInfo.
+static int check_platform_string_param(cl_platform_id platform, cl_platform_info param, const char *paramName,
+                                       char *buffer, size_t bufferSize, int *errorCount) {
+  size_t size = 0;
+
+  memset(buffer, 0, bufferSize);
+  int err = clGetPlatformInfo(platform, param, bufferSize, buffer, &size);
+  if (err != CL_SUCCESS) {
+    log_error("ERROR: clGetPlatformInfo for %s failed (error %d)\n", paramName, err);
+    return err;
+  }
+
+  log_info("\t%s: %s\n", paramName, buffer);
+  if (strlen(buffer)+1 != size) {
+    // size_t values are cast to long to match the %ld conversions.
+    log_error("Returned string length %ld does not equal reported one %ld.\n", (long)(strlen(buffer)+1), (long)size);
+    (*errorCount)++;
+  }
+
+  return CL_SUCCESS;
+}
+
+// Checks one device of `platform`: queries CL_DEVICE_PLATFORM and
+// CL_DEVICE_NAME, creates a context with a CL_CONTEXT_PLATFORM property, and
+// verifies that CL_CONTEXT_PROPERTIES returns the same property list.
+// Mismatches increment *errorCount. Returns CL_SUCCESS or the error code of
+// the failing OpenCL call; the context is released on every path.
+static int check_platform_device(cl_platform_id platform, cl_device_id device, int deviceIndex,
+                                 char *buffer, size_t bufferSize, int *errorCount) {
+  size_t returned_size = 0;
+  cl_platform_id returned_platform = NULL;
+  cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
+  int err;
+
+  err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &returned_platform, &returned_size);
+  test_error(err, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM\n");
+  if (returned_size != sizeof(cl_platform_id)) {
+    log_error("Reported return size (%ld) does not match expected size (%ld).\n", (long)returned_size, (long)sizeof(cl_platform_id));
+    (*errorCount)++;
+  }
+
+  memset(buffer, 0, bufferSize);
+  err = clGetDeviceInfo(device, CL_DEVICE_NAME, bufferSize, buffer, NULL);
+  test_error(err, "clGetDeviceInfo failed for CL_DEVICE_NAME\n");
+
+  log_info("\t\tPlatform for device %d (%s) is %p.\n", deviceIndex, buffer, returned_platform);
+
+  log_info("\t\t\tTesting clCreateContext for the platform/device...\n");
+  // Try creating a context for the platform
+  cl_context platform_context = clCreateContext(properties, 1, &device, NULL, NULL, &err);
+  test_error(err, "\t\tclCreateContext failed for device with platform properties\n");
+
+  // Clear the array so the query below must actually fill it in.
+  memset(properties, 0, sizeof(properties));
+
+  err = clGetContextInfo(platform_context, CL_CONTEXT_PROPERTIES, sizeof(properties), properties, &returned_size);
+  if (err != CL_SUCCESS) {
+    clReleaseContext(platform_context);
+    test_error(err, "clGetContextInfo for CL_CONTEXT_PROPERTIES failed");
+  }
+  if (returned_size != sizeof(properties)) {
+    log_error("Invalid size returned from clGetContextInfo for CL_CONTEXT_PROPERTIES. Got %ld, expected %ld.\n",
+              (long)returned_size, (long)sizeof(properties));
+    (*errorCount)++;
+  }
+
+  if (properties[0] != (cl_context_properties)CL_CONTEXT_PLATFORM || properties[1] != (cl_context_properties)platform) {
+    log_error("Wrong properties returned. Expected: [%p %p], got [%p %p]\n",
+              (void*)CL_CONTEXT_PLATFORM, platform, (void*)properties[0], (void*)properties[1]);
+    (*errorCount)++;
+  }
+
+  err = clReleaseContext(platform_context);
+  test_error(err, "clReleaseContext failed");
+
+  return CL_SUCCESS;
+}
+
+// Enumerates up to 16 platforms and, for each one, checks the string-valued
+// platform queries and every device the platform exposes.
+// Returns the number of mismatches found (0 on success), -1 when no platform
+// is found or a host allocation fails, or the OpenCL error code of a failing call.
+int test_get_platform_ids(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) {
+  const cl_uint max_platforms = 16;
+  cl_platform_id platforms[16];
+  cl_uint num_platforms = 0;
+  char string_returned[8192];
+
+  static const struct {
+    cl_platform_info param;
+    const char *name;
+  } string_params[] = {
+    { CL_PLATFORM_PROFILE,    "CL_PLATFORM_PROFILE" },
+    { CL_PLATFORM_VERSION,    "CL_PLATFORM_VERSION" },
+    { CL_PLATFORM_NAME,       "CL_PLATFORM_NAME" },
+    { CL_PLATFORM_VENDOR,     "CL_PLATFORM_VENDOR" },
+    { CL_PLATFORM_EXTENSIONS, "CL_PLATFORM_EXTENSIONS" },
+  };
+  const size_t num_string_params = sizeof(string_params) / sizeof(string_params[0]);
+
+  int total_errors = 0;
+  int err = CL_SUCCESS;
+
+  err = clGetPlatformIDs(max_platforms, platforms, &num_platforms);
+  test_error(err, "clGetPlatformIDs failed");
+
+  if (num_platforms < 1) {
+    log_error("Found 0 platforms.\n");
+    return -1;
+  }
+
+  if (num_platforms <= max_platforms) {
+    // Try with NULL
+    err = clGetPlatformIDs(num_platforms, platforms, NULL);
+    test_error(err, "clGetPlatformIDs failed with NULL for return size");
+  }
+  log_info("Found %d platforms.\n", num_platforms);
+
+  // Only the first max_platforms entries of platforms[] were filled in, so
+  // never index past them even if the implementation reports more.
+  const cl_uint platforms_to_check = (num_platforms > max_platforms) ? max_platforms : num_platforms;
+  if (platforms_to_check < num_platforms) {
+    log_info("Only the first %d platforms will be checked.\n", (int)platforms_to_check);
+  }
+
+  for (cl_uint p=0; p<platforms_to_check; p++) {
+    cl_device_id *devices;
+    cl_uint num_devices = 0;
+
+    log_info("Platform %d (%p):\n", (int)p, platforms[p]);
+
+    for (size_t s=0; s<num_string_params; s++) {
+      err = check_platform_string_param(platforms[p], string_params[s].param, string_params[s].name,
+                                        string_returned, sizeof(string_returned), &total_errors);
+      if (err != CL_SUCCESS)
+        return err;
+    }
+
+    err = clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+    test_error(err, "clGetDeviceIDs size failed.\n");
+
+    devices = (cl_device_id *)malloc(num_devices*sizeof(cl_device_id));
+    if (devices == NULL) {
+      log_error("Unable to allocate storage for %d device IDs.\n", (int)num_devices);
+      return -1;
+    }
+    memset(devices, 0, sizeof(cl_device_id)*num_devices);
+
+    err = clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+    if (err != CL_SUCCESS) {
+      free(devices);
+      test_error(err, "clGetDeviceIDs failed.\n");
+    }
+
+    log_info("\tPlatform has %d devices.\n", (int)num_devices);
+    for (cl_uint d=0; d<num_devices; d++) {
+      err = check_platform_device(platforms[p], devices[d], (int)d,
+                                  string_returned, sizeof(string_returned), &total_errors);
+      if (err != CL_SUCCESS)
+        break;
+    }
+
+    // Release the device list before propagating any error from the loop above.
+    free(devices);
+    if (err != CL_SUCCESS)
+      return err;
+  }
+
+  return total_errors;
+}
--- a/test_conformance/api/test_queries.cpp
+++ b/test_conformance/api/test_queries.cpp
@@ -0,0 +1,643 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/imageHelpers.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+// Checks the CL_PLATFORM_PROFILE and CL_PLATFORM_VERSION strings of the first
+// platform returned by clGetPlatformIDs.
+//
+// Verified properties:
+//   - the profile is exactly "FULL_PROFILE" or "EMBEDDED_PROFILE";
+//   - the version has the form "OpenCL <major>.<minor> ..." and is >= 1.2;
+//   - the version string contains a NUL within the reported length;
+//   - for both strings, the size reported by clGetPlatformInfo (with a buffer
+//     and with a size-only query) equals strlen() + 1.
+//
+// The deviceID, context, queue and num_elements arguments are not used.
+// Returns 0 when every check passes and -1 when one fails; CL API failures are
+// handed to the test_error macro (defined outside this file).
+int test_get_platform_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_platform_id platform;
+    cl_int error;
+    char buffer[ 16384 ];
+    size_t length;
+
+    // Get the platform to use
+    error = clGetPlatformIDs(1, &platform, NULL);
+    test_error( error, "Unable to get platform" );
+
+    // Platform profile should either be FULL_PROFILE or EMBEDDED_PROFILE
+    error = clGetPlatformInfo(platform,  CL_PLATFORM_PROFILE, sizeof( buffer ), buffer, &length );
+    test_error( error, "Unable to get platform profile string" );
+
+    log_info("Returned CL_PLATFORM_PROFILE %s.\n", buffer);
+
+    if( strcmp( buffer, "FULL_PROFILE" ) != 0 && strcmp( buffer, "EMBEDDED_PROFILE" ) != 0 )
+    {
+        log_error( "ERROR: Returned platform profile string is not a valid string by OpenCL 1.2! (Returned: %s)\n", buffer );
+        return -1;
+    }
+    if( strlen( buffer )+1 != length )
+    {
+        log_error( "ERROR: Returned length of profile string is incorrect (actual length: %d, returned length: %d)\n",
+                  (int)strlen( buffer )+1, (int)length );
+        return -1;
+    }
+
+    // Check just length return
+    error = clGetPlatformInfo(platform,  CL_PLATFORM_PROFILE, 0, NULL, &length );
+    test_error( error, "Unable to get platform profile length" );
+    if( strlen( buffer )+1 != length )
+    {
+        log_error( "ERROR: Returned length of profile string is incorrect (actual length: %d, returned length: %d)\n",
+                  (int)strlen( buffer )+1, (int)length );
+        return -1;
+    }
+
+
+    // Platform version should fit the regex "OpenCL *[0-9]+\.[0-9]+"
+    error = clGetPlatformInfo(platform,  CL_PLATFORM_VERSION, sizeof( buffer ), buffer, &length );
+    test_error( error, "Unable to get platform version string" );
+
+    log_info("Returned CL_PLATFORM_VERSION %s.\n", buffer);
+
+    if( memcmp( buffer, "OpenCL ", strlen( "OpenCL " ) ) != 0 )
+    {
+        log_error( "ERROR: Initial part of platform version string does not match required format! (returned: %s)\n", buffer );
+        return -1;
+    }
+
+    // p1 -> first digit of the major version (extra spaces are skipped).
+    char *p1 = buffer + strlen( "OpenCL " );
+    while( *p1 == ' ' )
+        p1++;
+
+    // p2 -> the '.' that separates major and minor.
+    // isdigit() requires a value representable as unsigned char (or EOF), so
+    // the plain char is cast before the call.
+    char *p2 = p1;
+    while( isdigit( (unsigned char)*p2 ) )
+        p2++;
+    if( *p2 != '.' )
+    {
+        log_error( "ERROR: Numeric part of platform version string does not match required format! (returned: %s)\n", buffer );
+        return -1;
+    }
+
+    // p3 -> the character that follows the minor version digits.
+    char *p3 = p2 + 1;
+    while( isdigit( (unsigned char)*p3 ) )
+        p3++;
+    if( *p3 != ' ' )
+    {
+        log_error( "ERROR: space expected after minor version number! (returned: %s)\n", buffer );
+        return -1;
+    }
+    *p2 = ' '; // Put in a space for atoi below.
+    p2++;      // p2 now points at the first digit of the minor version.
+
+    // make sure it is null terminated
+    for( ; p3 != buffer + length; p3++ )
+        if( *p3 == '\0' )
+            break;
+    if( p3 == buffer + length )
+    {
+        log_error( "ERROR: platform version string is not NUL terminated!\n" );
+        return -1;
+    }
+
+    int major = atoi( p1 );
+    int minor = atoi( p2 );
+    int minor_revision = 2;
+    // Encode the version as major*10 + minor and require at least 1.<minor_revision>.
+    if( major * 10 + minor < 10 + minor_revision )
+    {
+        log_error( "ERROR: OpenCL profile version returned is less than 1.%d!\n", minor_revision );
+        return -1;
+    }
+
+    // Sanity checks on the returned values
+    if( length != strlen( buffer ) + 1)
+    {
+        log_error( "ERROR: Returned length of version string does not match actual length (actual: %d, returned: %d)\n", (int)strlen( buffer )+1, (int)length );
+        return -1;
+    }
+
+    // Check just length
+    error = clGetPlatformInfo(platform,  CL_PLATFORM_VERSION, 0, NULL, &length );
+    test_error( error, "Unable to get platform version length" );
+    if( length != strlen( buffer )+1 )
+    {
+        log_error( "ERROR: Returned length of version string does not match actual length (actual: %d, returned: %d)\n", (int)strlen( buffer )+1, (int)length );
+        return -1;
+    }
+
+    return 0;
+}
+
+// Creates a sampler with known properties (normalized coords, CL_ADDRESS_CLAMP,
+// CL_FILTER_LINEAR) and checks that clGetSamplerInfo reports the same values
+// and the expected result size for each query.
+//
+// deviceID is only handed to PASSIVE_REQUIRE_IMAGE_SUPPORT; queue and
+// num_elements are not used.
+// Returns 0 when every check passes and -1 when one fails; CL API failures are
+// handed to the test_error macro (defined outside this file).
+int test_get_sampler_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    size_t size;
+
+    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+
+    cl_sampler_properties properties[] = {
+        CL_SAMPLER_NORMALIZED_COORDS, CL_TRUE,
+        CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_CLAMP,
+        CL_SAMPLER_FILTER_MODE, CL_FILTER_LINEAR,
+        0 };
+    clSamplerWrapper sampler = clCreateSamplerWithProperties(context, properties, &error);
+    test_error( error, "Unable to create sampler to test with" );
+
+    // Reference count: only the returned size is validated, not the value.
+    cl_uint refCount;
+    error = clGetSamplerInfo( sampler, CL_SAMPLER_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
+    test_error( error, "Unable to get sampler ref count" );
+    if( size != sizeof( refCount ) )
+    {
+        log_error( "ERROR: Returned size of sampler refcount does not validate! (expected %d, got %d)\n", (int)sizeof( refCount ), (int)size );
+        return -1;
+    }
+
+    cl_context otherCtx;
+    error = clGetSamplerInfo( sampler, CL_SAMPLER_CONTEXT, sizeof( otherCtx ), &otherCtx, &size );
+    test_error( error, "Unable to get sampler context" );
+    if( otherCtx != context )
+    {
+        log_error( "ERROR: Sampler context does not validate! (expected %p, got %p)\n", context, otherCtx );
+        return -1;
+    }
+    if( size != sizeof( otherCtx ) )
+    {
+        log_error( "ERROR: Returned size of sampler context does not validate! (expected %d, got %d)\n", (int)sizeof( otherCtx ), (int)size );
+        return -1;
+    }
+
+    cl_addressing_mode mode;
+    error = clGetSamplerInfo( sampler, CL_SAMPLER_ADDRESSING_MODE, sizeof( mode ), &mode, &size );
+    test_error( error, "Unable to get sampler addressing mode" );
+    if( mode != CL_ADDRESS_CLAMP )
+    {
+        log_error( "ERROR: Sampler addressing mode does not validate! (expected %d, got %d)\n", (int)CL_ADDRESS_CLAMP, (int)mode );
+        return -1;
+    }
+    if( size != sizeof( mode ) )
+    {
+        log_error( "ERROR: Returned size of sampler addressing mode does not validate! (expected %d, got %d)\n", (int)sizeof( mode ), (int)size );
+        return -1;
+    }
+
+    cl_filter_mode fmode;
+    error = clGetSamplerInfo( sampler, CL_SAMPLER_FILTER_MODE, sizeof( fmode ), &fmode, &size );
+    test_error( error, "Unable to get sampler filter mode" );
+    if( fmode != CL_FILTER_LINEAR )
+    {
+        log_error( "ERROR: Sampler filter mode does not validate! (expected %d, got %d)\n", (int)CL_FILTER_LINEAR, (int)fmode );
+        return -1;
+    }
+    if( size != sizeof( fmode ) )
+    {
+        log_error( "ERROR: Returned size of sampler filter mode does not validate! (expected %d, got %d)\n", (int)sizeof( fmode ), (int)size );
+        return -1;
+    }
+
+    // CL_SAMPLER_NORMALIZED_COORDS is specified to return a cl_bool.
+    cl_bool norm;
+    error = clGetSamplerInfo( sampler, CL_SAMPLER_NORMALIZED_COORDS, sizeof( norm ), &norm, &size );
+    test_error( error, "Unable to get sampler normalized flag" );
+    if( norm != CL_TRUE )
+    {
+        log_error( "ERROR: Sampler normalized flag does not validate! (expected %d, got %d)\n", (int)CL_TRUE, (int)norm );
+        return -1;
+    }
+    if( size != sizeof( norm ) )
+    {
+        log_error( "ERROR: Returned size of sampler normalized flag does not validate! (expected %d, got %d)\n", (int)sizeof( norm ), (int)size );
+        return -1;
+    }
+
+    return 0;
+}
+
+// Queries `paramName` of `queue` into `val` with clGetCommandQueueInfo and
+// validates the result. The expansion makes the enclosing function return -1
+// when `val` differs from `expected` or when the reported size is not
+// sizeof(val); API failures go through test_error.
+//   name - text inserted into the log messages
+//   type - printf conversion used to print the expected/actual values
+//   cast - type both values are cast to before printing
+// Expects variables named `error` and `size` to exist in the enclosing scope.
+#define TEST_COMMAND_QUEUE_PARAM( queue, paramName, val, expected, name, type, cast )    \
+error = clGetCommandQueueInfo( queue, paramName, sizeof( val ), &val, &size );        \
+test_error( error, "Unable to get command queue " name );                            \
+if( val != expected )                                                                \
+{                                                                                    \
+log_error( "ERROR: Command queue " name " did not validate! (expected " type ", got " type ")\n", (cast)expected, (cast)val );    \
+return -1;                                                                        \
+}            \
+if( size != sizeof( val ) )                \
+{                                        \
+log_error( "ERROR: Returned size of command queue " name " does not validate! (expected %d, got %d)\n", (int)sizeof( val ), (int)size );    \
+return -1;    \
+}
+
+// Creates a command queue using the device's CL_DEVICE_QUEUE_ON_HOST_PROPERTIES
+// and checks what clGetCommandQueueInfo reports for it: the size of the
+// reference count, the context, the device (compared through
+// CL_DEVICE_VENDOR_ID) and the queue properties.
+//
+// ignoreQueue and num_elements are not used.
+// Returns 0 when every check passes and -1 when one fails; CL API failures are
+// handed to the test_error macro (defined outside this file).
+int test_get_command_queue_info(cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements)
+{
+    int error;
+    size_t size;
+
+    cl_queue_properties device_props = 0;
+    cl_queue_properties queue_props[] = {CL_QUEUE_PROPERTIES,0,0};
+
+    // The result of this query seeds the queue properties below, so a failure
+    // must not be ignored (device_props would otherwise be meaningless).
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, sizeof(device_props), &device_props, NULL);
+    test_error( error, "Unable to get device CL_DEVICE_QUEUE_ON_HOST_PROPERTIES" );
+    log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n", (int)device_props);
+
+    queue_props[1] = device_props;
+    clCommandQueueWrapper queue = clCreateCommandQueueWithProperties( context, deviceID, &queue_props[0], &error );
+    test_error( error, "Unable to create command queue to test with" );
+
+    // Reference count: only the returned size is validated, not the value.
+    cl_uint refCount;
+    error = clGetCommandQueueInfo( queue, CL_QUEUE_REFERENCE_COUNT, sizeof( refCount ), &refCount, &size );
+    test_error( error, "Unable to get command queue reference count" );
+    if( size != sizeof( refCount ) )
+    {
+        log_error( "ERROR: Returned size of command queue reference count does not validate! (expected %d, got %d)\n", (int)sizeof( refCount ), (int)size );
+        return -1;
+    }
+
+    cl_context otherCtx;
+    TEST_COMMAND_QUEUE_PARAM( queue, CL_QUEUE_CONTEXT, otherCtx, context, "context", "%p", cl_context )
+
+    cl_device_id otherDevice;
+    error = clGetCommandQueueInfo( queue, CL_QUEUE_DEVICE, sizeof(otherDevice), &otherDevice, &size);
+    test_error(error, "clGetCommandQueue failed.");
+
+    if (size != sizeof(cl_device_id)) {
+        log_error( " ERROR: Returned size of command queue CL_QUEUE_DEVICE does not validate! (expected %d, got %d)\n", (int)sizeof( otherDevice ), (int)size );
+        return -1;
+    }
+
+    /* Since the device IDs are opaque types we check the CL_DEVICE_VENDOR_ID which is unique for identical hardware. */
+    cl_uint otherDevice_vid, deviceID_vid;
+    error = clGetDeviceInfo(otherDevice, CL_DEVICE_VENDOR_ID, sizeof(otherDevice_vid), &otherDevice_vid, NULL );
+    test_error( error, "Unable to get device CL_DEVICE_VENDOR_ID" );
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_VENDOR_ID, sizeof(deviceID_vid), &deviceID_vid, NULL );
+    test_error( error, "Unable to get device CL_DEVICE_VENDOR_ID" );
+
+    if( otherDevice_vid != deviceID_vid )
+    {
+        log_error( "ERROR: Incorrect device returned for queue! (Expected vendor ID 0x%x, got 0x%x)\n", deviceID_vid, otherDevice_vid );
+        return -1;
+    }
+
+    // Compare the full-width property bitfield; the cast to unsigned int is
+    // applied by the macro only when the values are printed.
+    cl_command_queue_properties props;
+    TEST_COMMAND_QUEUE_PARAM( queue, CL_QUEUE_PROPERTIES, props, device_props, "properties", "%d", unsigned int )
+
+    return 0;
+}
+
+// Queries CL_CONTEXT_PROPERTIES of `context` into a single
+// cl_context_properties slot and accepts exactly two outcomes: a returned size
+// of 0, or a returned size of one cl_context_properties whose value is 0 (the
+// list terminator). Any other size is reported as an error.
+//
+// deviceID, ignoreQueue and num_elements are not used.
+// Returns 0 on success and -1 on a failed check; CL API failures are handed to
+// the test_error macro (defined outside this file).
+int test_get_context_info(cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements)
+{
+    int error;
+    size_t size;
+    cl_context_properties props;
+
+    error = clGetContextInfo( context, CL_CONTEXT_PROPERTIES, sizeof( props ), &props, &size );
+    test_error( error, "Unable to get context props" );
+
+    if (size == 0) {
+        // Valid size
+        return 0;
+    } else if (size == sizeof(cl_context_properties)) {
+        // Data must be NULL
+        if (props != 0) {
+            log_error("ERROR: Returned properties is not NULL.\n");
+            return -1;
+        }
+        // Valid data and size
+        return 0;
+    }
+    // Size was neither 0 nor sizeof(cl_context_properties)
+    log_error( "ERROR: Returned size of context props is not valid! (expected 0 or %d, got %d)\n",
+              (int)sizeof(cl_context_properties), (int)size );
+    return -1;
+}
+
+// Queries `paramName` of `mem` into `val` with clGetMemObjectInfo and validates
+// the result. The expansion makes the enclosing function return -1 when `val`
+// differs from `expected` or when the reported size is not sizeof(val); API
+// failures go through test_error.
+//   name - text inserted into the log messages
+//   type - printf conversion used to print the expected/actual values
+//   cast - type both values are cast to before printing
+// Expects variables named `error` and `size` to exist in the enclosing scope.
+#define TEST_MEM_OBJECT_PARAM( mem, paramName, val, expected, name, type, cast )    \
+error = clGetMemObjectInfo( mem, paramName, sizeof( val ), &val, &size );        \
+test_error( error, "Unable to get mem object " name );                            \
+if( val != expected )                                                                \
+{                                                                                    \
+log_error( "ERROR: Mem object " name " did not validate! (expected " type ", got " type ")\n", (cast)(expected), (cast)val );    \
+return -1;                                                                        \
+}            \
+if( size != sizeof( val ) )                \
+{                                        \
+log_error( "ERROR: Returned size of mem object " name " does not validate! (expected %d, got %d)\n", (int)sizeof( val ), (int)size );    \
+return -1;    \
+}
+
+// Callback with the CL_CALLBACK calling convention that releases the
+// heap block passed as its user-data pointer. The cl_mem argument is ignored.
+void CL_CALLBACK mem_obj_destructor_callback( cl_mem /* memobj */, void *userData )
+{
+    free( userData );
+}
+
+// Table of 16 cl_mem_flags values: 0, the three access modes on their own
+// (READ_WRITE, READ_ONLY, WRITE_ONLY), and each access mode combined with
+// COPY_HOST_PTR, ALLOC_HOST_PTR, ALLOC_HOST_PTR | COPY_HOST_PTR and
+// USE_HOST_PTR.
+static cl_mem_flags all_flags[16] = {
+  0,
+  CL_MEM_READ_WRITE,
+  CL_MEM_READ_ONLY,
+  CL_MEM_WRITE_ONLY,
+  CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+  CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+  CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+  CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+  CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+  CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+  CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+  CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+  CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+  CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+  CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+  CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+};
+
+// Queries `paramName` of `device` into `val` with clGetDeviceInfo, makes the
+// enclosing function return -1 when the reported size is not sizeof(val), and
+// otherwise logs the value. The value itself is not validated.
+//   name - text inserted into the log messages
+//   type - printf conversion used to print the value
+//   cast - type the value is cast to before printing
+// Expects variables named `error` and `size` to exist in the enclosing scope.
+#define TEST_DEVICE_PARAM( device, paramName, val, name, type, cast )    \
+error = clGetDeviceInfo( device, paramName, sizeof( val ), &val, &size );        \
+test_error( error, "Unable to get device " name );                            \
+if( size != sizeof( val ) )                \
+{                                        \
+log_error( "ERROR: Returned size of device " name " does not validate! (expected %d, got %d)\n", (int)sizeof( val ), (int)size );    \
+return -1;    \
+}                \
+log_info( "\tReported device " name " : " type "\n", (cast)val );
+
+// Variant of TEST_DEVICE_PARAM for memory sizes: same query and size check, but
+// the logged value is `val / div` cast to int (e.g. div = 1024 to print KB).
+//   name - text inserted into the log messages
+//   type - printf conversion used to print the scaled value (receives an int)
+//   div  - divisor applied to `val` before printing
+// Expects variables named `error` and `size` to exist in the enclosing scope.
+#define TEST_DEVICE_PARAM_MEM( device, paramName, val, name, type, div )    \
+error = clGetDeviceInfo( device, paramName, sizeof( val ), &val, &size );        \
+test_error( error, "Unable to get device " name );                            \
+if( size != sizeof( val ) )                \
+{                                        \
+log_error( "ERROR: Returned size of device " name " does not validate! (expected %d, got %d)\n", (int)sizeof( val ), (int)size );    \
+return -1;    \
+}                \
+log_info( "\tReported device " name " : " type "\n", (int)( val / div ) );
+
+// Queries a series of clGetDeviceInfo parameters for deviceID, checks that the
+// size reported for each one matches its destination (or strlen() + 1 for the
+// string parameters), logs the reported values, and requires the device
+// profile to be "FULL_PROFILE" or "EMBEDDED_PROFILE".
+//
+// context, ignoreQueue and num_elements are not used.
+// Returns 0 when every check passes and -1 when one fails; CL API failures are
+// handed to the test_error macro (defined outside this file).
+int test_get_device_info(cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements)
+{
+    int error;
+    size_t size;
+
+    cl_uint vendorID;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_VENDOR_ID, vendorID, "vendor ID", "0x%08x", int )
+
+    char extensions[ 10240 ];
+    error = clGetDeviceInfo( deviceID, CL_DEVICE_EXTENSIONS, sizeof( extensions ), extensions, &size );
+    test_error( error, "Unable to get device extensions" );
+    if( size != strlen( extensions ) + 1 )
+    {
+        log_error( "ERROR: Returned size of device extensions does not validate! (expected %d, got %d)\n", (int)( strlen( extensions ) + 1 ), (int)size );
+        return -1;
+    }
+    log_info( "\tReported device extensions: %s \n", extensions );
+
+    cl_uint preferred;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, preferred, "preferred vector char width", "%d", int )
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, preferred, "preferred vector short width", "%d", int )
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, preferred, "preferred vector int width", "%d", int )
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, preferred, "preferred vector long width", "%d", int )
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, preferred, "preferred vector float width", "%d", int )
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, preferred, "preferred vector double width", "%d", int )
+
+    // Note that even without cl_khr_fp64, the preferred width for double can be non-zero.  For example, vendor
+    // extensions can support double but may not support cl_khr_fp64, which implies math library support.
+
+    cl_uint baseAddrAlign;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, baseAddrAlign, "base address alignment", "%d bytes", int )
+
+    cl_uint maxDataAlign;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, maxDataAlign, "min data type alignment", "%d bytes", int )
+
+    cl_device_mem_cache_type cacheType;
+    error = clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof( cacheType ), &cacheType, &size );
+    test_error( error, "Unable to get device global mem cache type" );
+    if( size != sizeof( cacheType ) )
+    {
+        log_error( "ERROR: Returned size of device global mem cache type does not validate! (expected %d, got %d)\n", (int)sizeof( cacheType ), (int)size );
+        return -1;
+    }
+    const char *cacheTypeName = ( cacheType == CL_NONE ) ? "CL_NONE" : ( cacheType == CL_READ_ONLY_CACHE ) ? "CL_READ_ONLY_CACHE" : ( cacheType == CL_READ_WRITE_CACHE ) ? "CL_READ_WRITE_CACHE" : "<unknown>";
+    log_info( "\tReported device global mem cache type: %s \n", cacheTypeName );
+
+    cl_uint cachelineSize;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cachelineSize, "global mem cacheline size", "%d bytes", int )
+
+    cl_ulong cacheSize;
+    TEST_DEVICE_PARAM_MEM( deviceID, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cacheSize, "global mem cache size", "%d KB", 1024 )
+
+    cl_ulong memSize;
+    TEST_DEVICE_PARAM_MEM( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, memSize, "global mem size", "%d MB", ( 1024 * 1024 ) )
+
+    cl_device_local_mem_type localMemType;
+    error = clGetDeviceInfo( deviceID, CL_DEVICE_LOCAL_MEM_TYPE, sizeof( localMemType ), &localMemType, &size );
+    test_error( error, "Unable to get device local mem type" );
+    // Validate against the variable that was actually queried.
+    if( size != sizeof( localMemType ) )
+    {
+        log_error( "ERROR: Returned size of device local mem type does not validate! (expected %d, got %d)\n", (int)sizeof( localMemType ), (int)size );
+        return -1;
+    }
+    // Both comparisons must inspect localMemType; cacheType belongs to the
+    // unrelated global-mem-cache query above.
+    const char *localMemTypeName = ( localMemType == CL_LOCAL ) ? "CL_LOCAL" : ( localMemType == CL_GLOBAL ) ? "CL_GLOBAL" : "<unknown>";
+    log_info( "\tReported device local mem type: %s \n", localMemTypeName );
+
+
+    cl_bool errSupport;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_ERROR_CORRECTION_SUPPORT, errSupport, "error correction support", "%d", int )
+
+    size_t timerResolution;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_PROFILING_TIMER_RESOLUTION, timerResolution, "profiling timer resolution", "%ld nanoseconds", long )
+
+    cl_bool endian;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_ENDIAN_LITTLE, endian, "little endian flag", "%d", int )
+
+    cl_bool avail;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_AVAILABLE, avail, "available flag", "%d", int )
+
+    cl_bool compilerAvail;
+    TEST_DEVICE_PARAM( deviceID, CL_DEVICE_COMPILER_AVAILABLE, compilerAvail, "compiler available flag", "%d", int )
+
+    char profile[ 1024 ];
+    error = clGetDeviceInfo( deviceID, CL_DEVICE_PROFILE, sizeof( profile ), profile, &size );
+    test_error( error, "Unable to get device profile" );
+    if( size != strlen( profile ) + 1 )
+    {
+        log_error( "ERROR: Returned size of device profile does not validate! (expected %d, got %d)\n", (int)( strlen( profile ) + 1 ), (int)size );
+        return -1;
+    }
+    if( strcmp( profile, "FULL_PROFILE" ) != 0 && strcmp( profile, "EMBEDDED_PROFILE" ) != 0 )
+    {
+        log_error( "ERROR: Returned profile of device not FULL or EMBEDDED as required by OpenCL 1.2! (Returned %s)\n", profile );
+        return -1;
+    }
+    log_info( "\tReported device profile: %s \n", profile );
+
+
+    return 0;
+}
+
+
+
+
+// OpenCL C sources for a kernel that copies src[tid] to dst[tid].
+//   [0] - plain kernel with no required work-group size.
+//   [1] - printf-style template: the three %d fields fill in the
+//         reqd_work_group_size attribute and must be formatted before use.
+static const char *sample_compile_size[2] = {
+    "__kernel void sample_test(__global int *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "     dst[tid] = src[tid];\n"
+    "\n"
+    "}\n",
+    "__kernel __attribute__((reqd_work_group_size(%d,%d,%d))) void sample_test(__global int *src, __global int *dst)\n"
+    "{\n"
+    "    int  tid = get_global_id(0);\n"
+    "     dst[tid] = src[tid];\n"
+    "\n"
+    "}\n" };
+
+// Verifies CL_KERNEL_COMPILE_WORK_GROUP_SIZE and the enforcement of
+// reqd_work_group_size:
+//   1. For a kernel without the attribute, the compile work-group size must be
+//      reported as (0,0,0). That kernel is also used to pick a local size for
+//      the fixed global size {64,14,10}.
+//   2. For a kernel built with reqd_work_group_size set to that local size,
+//      the query must return the same size, an enqueue with it must succeed,
+//      and an enqueue with any one dimension increased by one must fail with
+//      CL_INVALID_WORK_GROUP_SIZE. Only the first max_dimensions dimensions
+//      are perturbed.
+//
+// num_elements is not used.
+// Returns 0 when every check passes; -1 or an error code otherwise (CL API
+// failures are handed to the test_error macro, defined outside this file).
+int test_kernel_required_group_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    size_t realSize;
+    size_t kernel_max_workgroup_size;
+    size_t global[] = {64,14,10};
+    size_t local[] = {0,0,0};
+
+    cl_uint max_dimensions;
+
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_dimensions), &max_dimensions, NULL);
+    test_error(error,  "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS");
+    log_info("Device reported CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS = %d.\n", (int)max_dimensions);
+
+    // Part 1: kernel without a required work-group size.
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+
+        error = create_single_kernel_helper( context, &program, &kernel, 1, &sample_compile_size[ 0 ], "sample_test" );
+        if( error != 0 )
+            return error;
+
+        error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernel_max_workgroup_size), &kernel_max_workgroup_size, NULL);
+        test_error( error, "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE");
+        log_info("The CL_KERNEL_WORK_GROUP_SIZE for the kernel is %d.\n", (int)kernel_max_workgroup_size);
+
+        size_t size[ 3 ];
+        error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof( size ), size, &realSize );
+        test_error( error, "Unable to get work group info" );
+
+        if( size[ 0 ] != 0 || size[ 1 ] != 0 || size[ 2 ] != 0 )
+        {
+            log_error( "ERROR: Nonzero compile work group size returned for nonspecified size! (returned %d,%d,%d)\n", (int)size[0], (int)size[1], (int)size[2] );
+            return -1;
+        }
+
+        if( realSize != sizeof( size ) )
+        {
+            log_error( "ERROR: Returned size of compile work group size not valid! (Expected %d, got %d)\n", (int)sizeof( size ), (int)realSize );
+            return -1;
+        }
+
+        // Determine some local dimensions to use for the test.
+        if (max_dimensions == 1) {
+            error = get_max_common_work_group_size(context, kernel, global[0], &local[0]);
+            test_error( error, "get_max_common_work_group_size failed");
+            log_info("For global dimension %d, kernel will require local dimension %d.\n", (int)global[0], (int)local[0]);
+        } else if (max_dimensions == 2) {
+            error = get_max_common_2D_work_group_size(context, kernel, global, local);
+            test_error( error, "get_max_common_2D_work_group_size failed");
+            log_info("For global dimension %d x %d, kernel will require local dimension %d x %d.\n", (int)global[0], (int)global[1], (int)local[0], (int)local[1]);
+        } else {
+            error = get_max_common_3D_work_group_size(context, kernel, global, local);
+            test_error( error, "get_max_common_3D_work_group_size failed");
+            log_info("For global dimension %d x %d x %d, kernel will require local dimension %d x %d x %d.\n",
+                     (int)global[0], (int)global[1], (int)global[2], (int)local[0], (int)local[1], (int)local[2]);
+        }
+    }
+
+
+    // Part 2: kernel compiled with reqd_work_group_size(local).
+    {
+        clProgramWrapper program;
+        clKernelWrapper kernel;
+        clMemWrapper in, out;
+
+        // Automatic storage, so none of the early returns below can leak it.
+        char source[1024];
+        const char *sourcePtr = source;
+
+        // local[] holds size_t values; cast them to match the %d conversions
+        // in the kernel template.
+        sprintf(source, sample_compile_size[1], (int)local[0], (int)local[1], (int)local[2]);
+
+        error = create_single_kernel_helper( context, &program, &kernel, 1, &sourcePtr, "sample_test" );
+        if( error != 0 )
+            return error;
+
+        size_t size[ 3 ];
+        error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof( size ), size, &realSize );
+        test_error( error, "Unable to get work group info" );
+
+        if( size[ 0 ] != local[0] || size[ 1 ] != local[1] || size[ 2 ] != local[2] )
+        {
+            log_error( "ERROR: Incorrect compile work group size returned for specified size! (returned %d,%d,%d, expected %d,%d,%d)\n",
+                      (int)size[0], (int)size[1], (int)size[2], (int)local[0], (int)local[1], (int)local[2]);
+            return -1;
+        }
+
+        // Verify that the kernel will only execute with that size.
+        in = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int)*global[0], NULL, &error);
+        test_error(error, "clCreateBuffer failed");
+        out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int)*global[0], NULL, &error);
+        test_error(error, "clCreateBuffer failed");
+
+        error = clSetKernelArg(kernel, 0, sizeof(in), &in);
+        test_error(error, "clSetKernelArg failed");
+        error = clSetKernelArg(kernel, 1, sizeof(out), &out);
+        test_error(error, "clSetKernelArg failed");
+
+        error = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, local, 0, NULL, NULL);
+        test_error(error, "clEnqueueNDRangeKernel failed");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+
+        log_info("kernel_required_group_size may report spurious ERRORS in the conformance log.\n");
+
+        // Increase one dimension at a time by one; every such launch must be
+        // rejected. `local` stays untouched so the log always shows the size
+        // that was really required.
+        for( cl_uint dim = 0; dim < 3; dim++ )
+        {
+            size_t wrong[ 3 ] = { local[0], local[1], local[2] };
+            wrong[ dim ]++;
+
+            error = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, wrong, 0, NULL, NULL);
+            if (error != CL_INVALID_WORK_GROUP_SIZE) {
+                log_error("Incorrect error returned for executing a kernel with the wrong required local work group size. (used %d,%d,%d, required %d,%d,%d)\n",
+                          (int)wrong[0], (int)wrong[1], (int)wrong[2], (int)local[0], (int)local[1], (int)local[2] );
+                print_error(error, "Expected: CL_INVALID_WORK_GROUP_SIZE.");
+                return -1;
+            }
+
+            error = clFinish(queue);
+            test_error(error, "clFinish failed");
+
+            // Stop once every dimension the device supports has been tested.
+            if( max_dimensions == dim + 1 )
+                break;
+        }
+    }
+
+    return 0;
+}
+
+
--- a/test_conformance/api/test_queue_hint.cpp
+++ b/test_conformance/api/test_queue_hint.cpp
@@ -0,0 +1,191 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/conversions.h"
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+/*
+
+*/
+
+// OpenCL C source of the `vec_cpy` kernel: each work-item copies src[tid] to
+// dst[tid], where tid is its global id in dimension 0.
+const char *queue_hint_test_kernel[] = {
+"__kernel void vec_cpy(__global int *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = src[tid];\n"
+"\n"
+"}\n" };
+
+// Runs `kernel` (expected to copy argument 0 to argument 1) over num_elements
+// ints on `queue` and verifies the copy.
+//
+// The source buffer is created with CL_MEM_USE_HOST_PTR over a host array
+// holding 0..num_elements-1; the destination buffer is read back with a
+// blocking read into a separate host array and compared element by element.
+//
+// Returns 0 on success and -1 on a mismatch; CL API failures are handed to the
+// test_error macro (defined outside this file).
+int test_enqueue(cl_context context, clCommandQueueWrapper& queue, clKernelWrapper& kernel, size_t num_elements)
+{
+    // Host storage is declared before the buffer wrappers so that it is
+    // destroyed after them: memory handed to CL_MEM_USE_HOST_PTR has to stay
+    // valid for as long as the buffer exists. (Assumes clMemWrapper releases
+    // its cl_mem on destruction - confirm in typeWrappers.h.) Using vectors
+    // also frees the storage on every early return.
+    std::vector<int> input(num_elements);
+    std::vector<int> result(num_elements, 0);
+    clMemWrapper            streams[2];
+    int error;
+
+    for (size_t i = 0; i < num_elements; ++i)
+    {
+        input[i] = static_cast<int>(i);
+    }
+
+    // &v[0] on an empty vector is not valid; pass NULL in that case and let
+    // clCreateBuffer report the error.
+    int *inputPtr = input.empty() ? NULL : &input[0];
+    int *resultPtr = result.empty() ? NULL : &result[0];
+
+    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, num_elements * sizeof(int), inputPtr, &error);
+    test_error( error, "clCreateBuffer failed." );
+    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, num_elements * sizeof(int), NULL, &error);
+    test_error( error, "clCreateBuffer failed." );
+
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error( error, "clSetKernelArg failed." );
+
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error( error, "clSetKernelArg failed." );
+
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &num_elements, NULL, 0, NULL, NULL);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+
+    // Blocking read into a separate array, so the memory backing streams[0] is
+    // not overwritten while that buffer still exists.
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, num_elements * sizeof(int), resultPtr, 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed." );
+
+    for (size_t i = 0; i < num_elements; ++i)
+    {
+        if (result[i] != static_cast<int>(i))
+        {
+            log_error("ERROR: Incorrect vector copy result.\n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+
+
+
+// Exercises the cl_khr_priority_hints and cl_khr_throttle_hints extensions.
+//
+// For each extension named in the platform's CL_PLATFORM_EXTENSIONS string, a
+// command queue is created for every hint level (HIGH, MED, LOW) and the
+// vec_cpy kernel is run on it through test_enqueue. An extension that is not
+// reported is logged and skipped.
+//
+// num_elements: number of ints to copy; values <= 0 are replaced by 128.
+// `queue` is only forwarded to the kernel build helper.
+// Returns 0 on success, or the first non-zero error encountered.
+int test_queue_hint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    if (num_elements <= 0)
+    {
+        num_elements = 128;
+    }
+
+    int err = 0;
+
+    // Query extension
+    cl_platform_id platform;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+
+    // Automatic storage, so none of the early returns below can leak it.
+    char string_returned[8192];
+
+    err = clGetDeviceInfo(deviceID, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+    test_error(err, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
+
+    err = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, queue_hint_test_kernel, "vec_cpy", NULL);
+    if (err != 0)
+    {
+        return err;
+    }
+
+    memset(string_returned, 0, sizeof(string_returned));
+    err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, sizeof(string_returned), string_returned, NULL);
+    test_error(err, "clGetPlatformInfo for CL_PLATFORM_EXTENSIONS failed");
+    log_info("\tCL_PLATFORM_EXTENSIONS: %s\n", string_returned);
+    string strExt = string_returned;
+    if (strExt.find("cl_khr_priority_hints") != string::npos)
+    {
+        log_info("Testing cl_khr_priority_hints...\n");
+
+        // One zero-terminated property list per priority level.
+        cl_queue_properties queue_prop[][3] =
+        {
+            {
+                CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_HIGH_KHR,
+                0
+            },
+            {
+                CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_MED_KHR,
+                0
+            },
+            {
+                CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR,
+                0
+            }
+        };
+
+        for (int i = 0; i < 3; ++i)
+        {
+            clCommandQueueWrapper q = clCreateCommandQueueWithProperties(context, deviceID, queue_prop[i], &err);
+            test_error(err, "clCreateCommandQueueWithProperties failed");
+
+            err = test_enqueue(context, q, kernel, (size_t)num_elements);
+            if (err != 0)
+            {
+                return err;
+            }
+        }
+    }
+    else
+    {
+        log_info("cl_khr_priority_hints is not supported.\n");
+    }
+
+    if (strExt.find("cl_khr_throttle_hints") != string::npos)
+    {
+        // One zero-terminated property list per throttle level.
+        cl_queue_properties queue_prop[][3] =
+        {
+            {
+                CL_QUEUE_THROTTLE_KHR, CL_QUEUE_THROTTLE_HIGH_KHR,
+                0
+            },
+            {
+                CL_QUEUE_THROTTLE_KHR, CL_QUEUE_THROTTLE_MED_KHR,
+                0
+            },
+            {
+                CL_QUEUE_THROTTLE_KHR, CL_QUEUE_THROTTLE_LOW_KHR,
+                0
+            }
+        };
+
+        for (int i = 0; i < 3; ++i)
+        {
+            clCommandQueueWrapper q = clCreateCommandQueueWithProperties(context, deviceID, queue_prop[i], &err);
+            test_error(err, "clCreateCommandQueueWithProperties failed");
+
+            err = test_enqueue(context, q, kernel, (size_t)num_elements);
+            if (err != 0)
+            {
+                return err;
+            }
+        }
+
+    }
+    else
+    {
+        log_info("cl_khr_throttle_hints is not supported.\n");
+    }
+
+    return 0;
+}
+
--- a/test_conformance/api/test_retain.cpp
+++ b/test_conformance/api/test_retain.cpp
@@ -0,0 +1,234 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif // !_WIN32
+
+// Note: According to spec, the various functions to get instance counts should return an error when passed in an object
+// that has already been released. However, the spec is out of date. If it gets re-updated to allow such action, re-enable
+// this define.
+//#define VERIFY_AFTER_RELEASE    1
+
+#define GET_QUEUE_INSTANCE_COUNT(p) numInstances = ( (err = clGetCommandQueueInfo(p, CL_QUEUE_REFERENCE_COUNT, sizeof( numInstances ), &numInstances, NULL)) == CL_SUCCESS ? numInstances : 0 )
+#define GET_MEM_INSTANCE_COUNT(p) numInstances = ( (err = clGetMemObjectInfo(p, CL_MEM_REFERENCE_COUNT, sizeof( numInstances ), &numInstances, NULL)) == CL_SUCCESS ? numInstances : 0 )
+
+#define VERIFY_INSTANCE_COUNT(c,rightValue) if( c != rightValue ) { \
+log_error( "ERROR: Instance count for test object is not valid! (should be %d, really is %d)\n", rightValue, c ); \
+return -1;    }
+
+int test_retain_queue_single(cl_device_id deviceID, cl_context context, cl_command_queue queueNotUsed, int num_elements)
+{
+    /* A newly created command queue must report a reference count of 1 and release without error; queueNotUsed is ignored. */
+    cl_command_queue queue;
+    cl_uint numInstances;
+    int err;
+
+    /* Create a test queue */
+    queue = clCreateCommandQueueWithProperties( context, deviceID, 0, &err );
+    test_error( err, "Unable to create command queue to test with" );
+
+    /* Test the instance count */
+    GET_QUEUE_INSTANCE_COUNT( queue );
+    test_error( err, "Unable to get queue instance count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 1 );
+
+    /* Now release the queue; releasing a valid queue has to succeed */
+    err = clReleaseCommandQueue( queue );
+    test_error( err, "Unable to release command queue" );
+#ifdef VERIFY_AFTER_RELEASE
+    /* We're not allowed to get the instance count after the object has been completely released. But that's
+     exactly how we can tell the release worked--by making sure getting the instance count fails! */
+    GET_QUEUE_INSTANCE_COUNT( queue );
+    if( err != CL_INVALID_COMMAND_QUEUE )
+    {
+        print_error( err, "Command queue was not properly released" );
+        return -1;
+    }
+#endif
+    return 0;
+}
+
+int test_retain_queue_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queueNotUsed, int num_elements)
+{
+    /* Walks a queue's reference count through 10, 5, 8 and 1 with retain/release calls, checking every call and count. */
+    cl_command_queue queue;
+    unsigned int numInstances, i;
+    int err;
+
+    /* Create a test queue */
+    queue = clCreateCommandQueueWithProperties( context, deviceID, 0, &err );
+    test_error( err, "Unable to create command queue to test with" );
+
+    /* Increment 9 times, which should bring the count to 10 */
+    for( i = 0; i < 9; i++ )
+    {
+        err = clRetainCommandQueue( queue );
+        test_error( err, "Unable to retain command queue" );
+    }
+    /* Test the instance count */
+    GET_QUEUE_INSTANCE_COUNT( queue );
+    test_error( err, "Unable to get queue instance count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 10 );
+
+    /* Now release 5 times, which should take us to 5 */
+    for( i = 0; i < 5; i++ )
+    {
+        err = clReleaseCommandQueue( queue );
+        test_error( err, "Unable to release command queue" );
+    }
+    GET_QUEUE_INSTANCE_COUNT( queue );
+    test_error( err, "Unable to get queue instance count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 5 );
+
+    /* Retain again three times, which should take us to 8 */
+    for( i = 0; i < 3; i++ )
+    {
+        err = clRetainCommandQueue( queue );
+        test_error( err, "Unable to retain command queue" );
+    }
+    GET_QUEUE_INSTANCE_COUNT( queue );
+    test_error( err, "Unable to get queue instance count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 8 );
+
+    /* Release 7 times, which should take it to 1 */
+    for( i = 0; i < 7; i++ )
+    {
+        err = clReleaseCommandQueue( queue );
+        test_error( err, "Unable to release command queue" );
+    }
+    GET_QUEUE_INSTANCE_COUNT( queue );
+    test_error( err, "Unable to get queue instance count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 1 );
+
+    /* And one last one, which drops the final reference */
+    err = clReleaseCommandQueue( queue );
+    test_error( err, "Unable to release command queue" );
+#ifdef VERIFY_AFTER_RELEASE
+    /* We're not allowed to get the instance count after the object has been completely released. But that's
+     exactly how we can tell the release worked--by making sure getting the instance count fails! */
+    GET_QUEUE_INSTANCE_COUNT( queue );
+    if( err != CL_INVALID_COMMAND_QUEUE )
+    {
+        print_error( err, "Command queue was not properly released" );
+        return -1;
+    }
+#endif
+
+    return 0;
+}
+
+int test_retain_mem_object_single(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    /* A newly created buffer must report a reference count of 1 and release without error. */
+    cl_mem object;
+    cl_uint numInstances;
+    int err;
+
+    /* Create a test object */
+    object = clCreateBuffer( context, CL_MEM_READ_ONLY, 32, NULL, &err );
+    test_error( err, "Unable to create buffer to test with" );
+
+    /* Test the instance count */
+    GET_MEM_INSTANCE_COUNT( object );
+    test_error( err, "Unable to get mem object count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 1 );
+
+    /* Now release the buffer; releasing a valid mem object has to succeed */
+    err = clReleaseMemObject( object );
+    test_error( err, "Unable to release mem object" );
+#ifdef VERIFY_AFTER_RELEASE
+    /* We're not allowed to get the instance count after the object has been completely released. But that's
+     exactly how we can tell the release worked--by making sure getting the instance count fails! */
+    GET_MEM_INSTANCE_COUNT( object );
+    if( err != CL_INVALID_MEM_OBJECT )
+    {
+        print_error( err, "Mem object was not properly released" );
+        return -1;
+    }
+#endif
+    return 0;
+}
+
+int test_retain_mem_object_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    /* Walks a buffer's reference count through 10, 5, 8 and 1 with retain/release calls, checking every call and count. */
+    cl_mem object;
+    unsigned int numInstances, i;
+    int err;
+
+    /* Create a test object */
+    object = clCreateBuffer( context, CL_MEM_READ_ONLY, 32, NULL, &err );
+    test_error( err, "Unable to create buffer to test with" );
+
+    /* Increment 9 times, which should bring the count to 10 */
+    for( i = 0; i < 9; i++ )
+    {
+        err = clRetainMemObject( object );
+        test_error( err, "Unable to retain mem object" );
+    }
+    /* Test the instance count */
+    GET_MEM_INSTANCE_COUNT( object );
+    test_error( err, "Unable to get mem object count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 10 );
+
+    /* Now release 5 times, which should take us to 5 */
+    for( i = 0; i < 5; i++ )
+    {
+        err = clReleaseMemObject( object );
+        test_error( err, "Unable to release mem object" );
+    }
+    GET_MEM_INSTANCE_COUNT( object );
+    test_error( err, "Unable to get mem object count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 5 );
+
+    /* Retain again three times, which should take us to 8 */
+    for( i = 0; i < 3; i++ )
+    {
+        err = clRetainMemObject( object );
+        test_error( err, "Unable to retain mem object" );
+    }
+    GET_MEM_INSTANCE_COUNT( object );
+    test_error( err, "Unable to get mem object count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 8 );
+
+    /* Release 7 times, which should take it to 1 */
+    for( i = 0; i < 7; i++ )
+    {
+        err = clReleaseMemObject( object );
+        test_error( err, "Unable to release mem object" );
+    }
+    GET_MEM_INSTANCE_COUNT( object );
+    test_error( err, "Unable to get mem object count" );
+    VERIFY_INSTANCE_COUNT( numInstances, 1 );
+
+    /* And one last one, which drops the final reference */
+    err = clReleaseMemObject( object );
+    test_error( err, "Unable to release mem object" );
+#ifdef VERIFY_AFTER_RELEASE
+    /* We're not allowed to get the instance count after the object has been completely released. But that's
+     exactly how we can tell the release worked--by making sure getting the instance count fails! */
+    GET_MEM_INSTANCE_COUNT( object );
+    if( err != CL_INVALID_MEM_OBJECT )
+    {
+        print_error( err, "Mem object was not properly released" );
+        return -1;
+    }
+#endif
+
+    return 0;
+}
+
--- a/test_conformance/api/test_retain_program.c
+++ b/test_conformance/api/test_retain_program.c
@@ -0,0 +1,105 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+#include "../../test_common/harness/compat.h"
+
+int test_release_kernel_order(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    /* Releases a program before the kernel created from it; both releases must succeed if refcounting is right. */
+    cl_program program;
+    cl_kernel kernel;
+    int error;
+    const char *testProgram[] = { "__kernel void sample_test(__global int *data){}" };
+    /* Create a test program */
+    error = create_single_kernel_helper(context, &program, NULL, 1, testProgram, NULL);
+    test_error( error, "Unable to build sample program to test with" );
+
+    /* And create a kernel from it */
+    kernel = clCreateKernel( program, "sample_test", &error );
+    test_error( error, "Unable to create kernel" );
+
+    /* Now try freeing the program first, then the kernel. If refcounts are right, this should work just fine */
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program" );
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel" );
+    return 0;
+}
+
+const char *sample_delay_kernel[] = { // OpenCL C source built by test_release_during_execute
+"__kernel void sample_test(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"    for( int i = 0; i < 1000000; i++ ); \n" // empty loop, presumably to keep the kernel busy; NOTE(review): a compiler may remove it
+"    dst[tid] = (int)src[tid];\n"
+"\n"
+"}\n" };
+
+int test_release_during_execute( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    /* Enqueues a kernel, then releases its buffers, kernel and program without waiting; every release must succeed. */
+    int error;
+    cl_program program;
+    cl_kernel kernel;
+    cl_mem streams[2];
+    size_t threads[1] = { 10 }, localThreadSize;
+    /* Build the delay kernel that is enqueued before the objects are released */
+    if( create_single_kernel_helper( context, &program, &kernel, 1, sample_delay_kernel, "sample_test" ) )
+    {
+        return -1;
+    }
+
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_float) * 10, NULL, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * 10, NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    /* Set the arguments */
+    error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[ 0 ]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+    error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[ 1 ]);
+    test_error( error, "Unable to set indexed kernel arguments" );
+
+    error = get_max_common_work_group_size( context, kernel, threads[0], &localThreadSize );
+    test_error( error, "Unable to calc local thread size" );
+
+    /* Execute the kernel */
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, &localThreadSize, 0, NULL, NULL );
+    test_error( error, "Unable to execute test kernel" );
+
+    /* The kernel should still be executing, but we should still be able to release it. It's not terribly
+       useful, but we should be able to do it, if the internal refcounting is indeed correct. */
+    error = clReleaseMemObject( streams[ 1 ] );
+    test_error( error, "Unable to release mem object" );
+    error = clReleaseMemObject( streams[ 0 ] );
+    test_error( error, "Unable to release mem object" );
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel" );
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program" );
+
+    /* Now make sure we're really finished before we go on. */
+    error = clFinish(queue);
+    test_error( error, "Unable to finish context.");
+    return 0;
+}
+
+
--- a/test_conformance/api/test_sub_group_dispatch.cpp
+++ b/test_conformance/api/test_sub_group_dispatch.cpp
@@ -0,0 +1,218 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/conversions.h"
+
+const char *subgroup_dispatch_kernel[] = { // OpenCL C source built by test_sub_group_dispatch
+"__kernel void subgroup_dispatch_kernel(__global int *output)\n"
+"{\n"
+"    size_t size = get_num_sub_groups ();\n"
+"\n"
+"    output[0] = size;\n" // every work-item stores the sub-group count into the same int slot
+"\n"
+"}\n" };
+
+size_t flatten_ndrange(size_t* ndrange, size_t dim)
+{
+    // Product of the first dim (1..3) entries of ndrange; any other dim is logged and yields 0.
+    if (dim < 1 || dim > 3)
+    {
+        log_error("ERROR: bad ndrange value");
+        return 0;
+    }
+    size_t total = 1;
+    for (size_t d = 0; d < dim; ++d)
+    {
+        total *= ndrange[d];
+    }
+    return total;
+}
+
+cl_int get_sub_group_num(cl_command_queue queue, cl_kernel kernel, clMemWrapper& out, size_t& size, size_t local_size, size_t dim)
+{   // Runs kernel as one work-group of local_size items and returns through size the value it stored in out.
+    size_t ndrange[3] = {local_size, 1, 1}; // used as both the global and the local size
+    cl_int count = 0; // the kernel stores an int, so read back a cl_int instead of 4 raw bytes into a size_t
+    cl_int error = clSetKernelArg(kernel, 0, sizeof(out), &out);
+    if (error == CL_SUCCESS) error = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, ndrange, ndrange, 0, NULL, NULL);
+    if (error == CL_SUCCESS) error = clEnqueueReadBuffer(queue, out, CL_TRUE, 0, sizeof(count), &count, 0, NULL, NULL);
+    size = (size_t)count; // remains 0 if any step failed
+    return error; // status of the first failing call instead of a sum of codes, or CL_SUCCESS
+}
+
+int test_sub_group_dispatch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // Checks clGetKernelSubGroupInfo: for every sub-group count from the maximum down to 1, the 1D/2D/3D local size
+    // reported for that count must produce exactly that many sub-groups when the kernel is actually run.
+    int error;
+    size_t realSize;
+    size_t kernel_max_subgroup_size, kernel_subgroup_count;
+    size_t max_local;
+
+    cl_platform_id platform;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper out;
+
+    size_t ret_ndrange1d;
+    size_t ret_ndrange2d[2];
+    size_t ret_ndrange3d[3];
+
+    size_t ret_ndrange2d_flattened;
+    size_t ret_ndrange3d_flattened;
+
+    error = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, subgroup_dispatch_kernel, "subgroup_dispatch_kernel", "-cl-std=CL2.0");
+    if (error != 0)
+        return error;
+
+    out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(size_t), NULL, &error);
+    test_error(error, "clCreateBuffer failed");
+
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_local, NULL);
+    test_error(error, "clGetDeviceInfo failed");
+
+    // The platform value itself is not used below; only the query's success is checked.
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_PLATFORM, sizeof(platform), (void *)&platform, NULL);
+    test_error(error, "clDeviceInfo failed for CL_DEVICE_PLATFORM");
+
+    // Get the max subgroup size
+    error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
+            sizeof(max_local), &max_local, sizeof(kernel_max_subgroup_size), (void *)&kernel_max_subgroup_size, &realSize);
+    test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE");
+    log_info("The CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE for the kernel is %d.\n", (int)kernel_max_subgroup_size);
+
+    if (realSize != sizeof(kernel_max_subgroup_size)) {
+        log_error( "ERROR: Returned size of max sub group size not valid! (Expected %d, got %d)\n", (int)sizeof(kernel_max_subgroup_size), (int)realSize );
+        return -1;
+    }
+
+    // Get the number of subgroup for max local size
+    error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE,
+            sizeof(max_local), &max_local, sizeof(kernel_subgroup_count), (void *)&kernel_subgroup_count, &realSize);
+    test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE");
+    log_info("The CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE for the kernel is %d.\n", (int)kernel_subgroup_count);
+
+    if (realSize != sizeof(kernel_subgroup_count)) {
+        log_error( "ERROR: Returned size of sub group count not valid! (Expected %d, got %d)\n", (int)sizeof(kernel_subgroup_count), (int)realSize );
+        return -1;
+    }
+
+    // test CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
+    for (size_t i = kernel_subgroup_count; i > 0; --i)
+    {
+        // test all 3 different dimensions of requested local size
+        size_t expect_size = kernel_max_subgroup_size * i;
+        size_t kernel_ret_size = 0;
+        error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, sizeof(i), &i, sizeof(ret_ndrange1d), &ret_ndrange1d, &realSize);
+        test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT");
+        if (realSize != sizeof(ret_ndrange1d)) {
+            log_error( "ERROR: Returned size of 1D local size not valid! (Expected %d, got %d)\n", (int)sizeof(ret_ndrange1d), (int)realSize );
+            return -1;
+        }
+
+        if (ret_ndrange1d != expect_size)
+        {
+            log_error( "ERROR: Incorrect value returned for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT! (Expected %d, got %d)\n", (int)expect_size, (int)ret_ndrange1d );
+            return -1;
+        }
+
+        error = get_sub_group_num(queue, kernel, out, kernel_ret_size, ret_ndrange1d, 1);
+        test_error(error, "Failed to query number of subgroups from kernel");
+        if (i != kernel_ret_size)
+        {
+            log_error( "ERROR: Mismatch between requested number of subgroups and what get_num_sub_groups() in kernel returned! (Expected %d, got %d)\n", (int)i, (int)kernel_ret_size );
+            return -1;
+        }
+
+        error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, sizeof(i), &i, sizeof(ret_ndrange2d), ret_ndrange2d, &realSize);
+        test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT");
+        if (realSize != sizeof(ret_ndrange2d)) {
+            log_error( "ERROR: Returned size of 2D local size not valid! (Expected %d, got %d)\n", (int)sizeof(ret_ndrange2d), (int)realSize );
+            return -1;
+        }
+
+        ret_ndrange2d_flattened = flatten_ndrange(ret_ndrange2d, 2);
+        if (ret_ndrange2d_flattened != expect_size ||
+            ret_ndrange2d[1] != 1)
+        {
+            log_error( "ERROR: Incorrect value returned for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT! (Expected %d, got %d)\n", (int)expect_size, (int)ret_ndrange2d_flattened );
+            return -1;
+        }
+
+        error = get_sub_group_num(queue, kernel, out, kernel_ret_size, ret_ndrange2d_flattened, 2);
+        test_error(error, "Failed to query number of subgroups from kernel");
+        if (i != kernel_ret_size)
+        {
+            log_error( "ERROR: Mismatch between requested number of subgroups and what get_num_sub_groups() in kernel returned! (Expected %d, got %d)\n", (int)i, (int)kernel_ret_size );
+            return -1;
+        }
+
+        error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, sizeof(i), &i, sizeof(ret_ndrange3d), ret_ndrange3d, &realSize);
+        test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT");
+        if (realSize != sizeof(ret_ndrange3d)) {
+            log_error( "ERROR: Returned size of 3D local size not valid! (Expected %d, got %d)\n", (int)sizeof(ret_ndrange3d), (int)realSize );
+            return -1;
+        }
+
+        ret_ndrange3d_flattened = flatten_ndrange(ret_ndrange3d, 3);
+        if (ret_ndrange3d_flattened != expect_size ||
+            ret_ndrange3d[1] != 1 ||
+            ret_ndrange3d[2] != 1)
+        {
+            log_error( "ERROR: Incorrect value returned for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT! (Expected %d, got %d)\n", (int)expect_size, (int)ret_ndrange3d_flattened );
+            return -1;
+        }
+
+        error = get_sub_group_num(queue, kernel, out, kernel_ret_size, ret_ndrange3d_flattened, 3);
+        test_error(error, "Failed to query number of subgroups from kernel");
+        if (i != kernel_ret_size)
+        {
+            log_error( "ERROR: Mismatch between requested number of subgroups and what get_num_sub_groups() in kernel returned! (Expected %d, got %d)\n", (int)i, (int)kernel_ret_size );
+            return -1;
+        }
+    }
+
+    // test when input subgroup count exceeds max wg size: every returned dimension is expected to be 0
+    size_t large_sg_size = kernel_subgroup_count + 1;
+    error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, sizeof(size_t), &large_sg_size, sizeof(ret_ndrange1d), &ret_ndrange1d, &realSize);
+    test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT");
+    if (ret_ndrange1d != 0)
+    {
+        log_error( "ERROR: Incorrect value returned for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT! (Expected %d, got %d)\n", 0, (int)ret_ndrange1d );
+        return -1;
+    }
+
+    error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, sizeof(size_t), &large_sg_size, sizeof(ret_ndrange2d), ret_ndrange2d, &realSize);
+    test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT");
+    if (ret_ndrange2d[0] != 0 ||
+        ret_ndrange2d[1] != 0)
+    {
+        log_error( "ERROR: Incorrect value returned for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT!\n" );
+        return -1;
+    }
+
+    error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, sizeof(size_t), &large_sg_size, sizeof(ret_ndrange3d), ret_ndrange3d, &realSize);
+    test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT");
+    if (ret_ndrange3d[0] != 0 ||
+        ret_ndrange3d[1] != 0 ||
+        ret_ndrange3d[2] != 0)
+    {
+        log_error( "ERROR: Incorrect value returned for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT!\n" );
+        return -1;
+    }
+
+    return 0;
+}
--- a/test_conformance/api/test_zero_sized_enqueue.cpp
+++ b/test_conformance/api/test_zero_sized_enqueue.cpp
@@ -0,0 +1,209 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/conversions.h"
+
+const char *zero_sized_enqueue_test_kernel[] = { // OpenCL C source built by test_zero_sized_enqueue_helper
+"__kernel void foo_kernel(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid];\n" // any work-item that runs writes dst; the tests expect dst to stay untouched
+"\n"
+"}\n" };
+
+const int bufSize = 128; // number of int elements in each test buffer
+
+cl_int test_zero_sized_enqueue_and_test_output_buffer(cl_command_queue queue, clKernelWrapper& kernel, clMemWrapper& buf, size_t dim, size_t ndrange[])
+{
+    // Enqueues kernel over the given dim-dimensional ND range, waits for it, and checks that the first bufSize
+    // ints of buf are still 0. Returns the failing CL status, or CL_INVALID_OPERATION if the buffer changed.
+    cl_int error = clEnqueueNDRangeKernel(queue, kernel, dim, NULL, ndrange, NULL, 0, NULL, NULL);
+    if (error != CL_SUCCESS) return error;
+
+    error = clFinish(queue);
+    if (error != CL_SUCCESS) return error;
+
+    // check output buffer has not changed.
+    int* output = reinterpret_cast<int*>(clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ, 0, sizeof(int) * bufSize, 0, NULL, NULL, &error));
+    if (error != CL_SUCCESS) return error;
+
+    bool changed = false;
+    for (int i = 0; i < bufSize && !changed; ++i)
+    {
+        changed = (output[i] != 0);
+    }
+    if (changed)
+    {
+        log_error( "ERROR: output buffer value has changed.\n" );
+    }
+
+    // Unmap on every path so a mismatch does not leak the mapping.
+    error = clEnqueueUnmapMemObject(queue, buf, output, 0, NULL, NULL);
+    return changed ? CL_INVALID_OPERATION : error;
+}
+
+int test_zero_sized_enqueue_helper(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // Enqueues the test kernel over every 1D/2D/3D ND range that has a zero dimension and checks that the output
+    // buffer stays zeroed, then checks that a zero-sized enqueue still honours its event wait list.
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper            streams[2];
+    size_t    ndrange1 = 0;
+    size_t    ndrange20[2] = {0, 0};
+    size_t    ndrange21[2] = {1, 0};
+    size_t    ndrange22[2] = {0, 1};
+
+    size_t    ndrange30[3] = {0, 0, 0};
+    size_t    ndrange31[3] = {1, 0, 0};
+    size_t    ndrange32[3] = {0, 1, 0};
+    size_t    ndrange33[3] = {0, 0, 1};
+    size_t    ndrange34[3] = {0, 1, 1};
+    size_t    ndrange35[3] = {1, 0, 1};
+    size_t    ndrange36[3] = {1, 1, 0};
+
+    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufSize * sizeof(int), NULL, &error);
+    test_error( error, "Creating test array failed" );
+    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufSize * sizeof(int), NULL, &error);
+    test_error( error, "Creating test array failed" );
+
+    // zero the output buffer; the write is blocking, so a stack array suffices and nothing leaks on early return
+    int zeros[bufSize] = { 0 };
+    error = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, sizeof(int) * bufSize, zeros, 0, NULL, NULL);
+    test_error( error, "clEnqueueWriteBuffer failed." );
+
+    /* Create a kernel to test with */
+    if( create_single_kernel_helper( context, &program, &kernel, 1, zero_sized_enqueue_test_kernel, "foo_kernel" ) != 0 )
+    {
+        return -1;
+    }
+
+    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &streams[0]);
+    test_error( error, "clSetKernelArg failed." );
+    error = clSetKernelArg(kernel, 1, sizeof(cl_mem), &streams[1]);
+    test_error( error, "clSetKernelArg failed." );
+
+    // Simple API return code tests for 1D, 2D and 3D zero sized ND range.
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 1, &ndrange1);
+    test_error( error, "1D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 2, ndrange20);
+    test_error( error, "2D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 2, ndrange21);
+    test_error( error, "2D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 2, ndrange22);
+    test_error( error, "2D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 3, ndrange30);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 3, ndrange31);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 3, ndrange32);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 3, ndrange33);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 3, ndrange34);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 3, ndrange35);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+
+    error = test_zero_sized_enqueue_and_test_output_buffer(queue, kernel, streams[1], 3, ndrange36);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+
+    // Verify zero-sized ND range kernel still satisfy event wait list and correct event object
+    // is returned
+    cl_event ev = NULL;
+    clEventWrapper user_ev = clCreateUserEvent(context, &error);
+    test_error( error, "user event creation failed." );
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, ndrange30, NULL, 1, &user_ev, &ev);
+    test_error( error, "3D zero sized kernel enqueue failed." );
+    if (ev == NULL)
+    {
+        log_error( "ERROR: failed to create an event object\n" );
+        return -1;
+    }
+    clEventWrapper ev_owner = ev; // hand the returned event to a wrapper so it is released on every exit path
+
+    cl_int sta;
+    error = clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &sta, NULL);
+    test_error( error, "Failed to get event status.");
+    if (sta != CL_QUEUED)
+    {
+        log_error( "ERROR: incorrect zero sized kernel enqueue event status.\n" );
+        clSetUserEventStatus(user_ev, CL_COMPLETE); // do not leave the enqueued command blocked on the user event
+        return -1;
+    }
+
+    // now unblock zero-sized enqueue
+    error = clSetUserEventStatus(user_ev, CL_COMPLETE);
+    test_error( error, "Failed to set user event status.");
+
+    error = clFinish(queue);
+    test_error( error, "clFinish failed.");
+
+    // now check zero sized enqueue event status
+    error = clGetEventInfo(ev, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &sta, NULL);
+    test_error( error, "Failed to get event status.");
+    if (sta != CL_COMPLETE)
+    {
+        log_error( "ERROR: incorrect zero sized kernel enqueue event status.\n" );
+        return -1;
+    }
+
+    return 0;
+}
+
+
+int test_zero_sized_enqueue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // Runs the zero-sized enqueue checks on the supplied queue and, if the device supports it, on an out-of-order queue.
+    int res = test_zero_sized_enqueue_helper(deviceID, context, queue, num_elements);
+    if (res != 0)
+    {
+        return res;
+    }
+
+    // now test out of order queue
+    cl_command_queue_properties props;
+    cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL);
+    test_error( error, "clGetDeviceInfo failed.");
+
+    if (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    {
+        // the capability bit is tested with '&'; '|' was always true and forced this path on in-order-only devices
+        cl_queue_properties queue_prop_def[] =
+        {
+            CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
+            0
+        };
+
+        clCommandQueueWrapper ooqueue = clCreateCommandQueueWithProperties(context, deviceID, queue_prop_def, &error);
+        test_error( error, "clCreateCommandQueueWithProperties failed.");
+        res = test_zero_sized_enqueue_helper(deviceID, context, ooqueue, num_elements);
+    }
+
+    return res;
+}
--- a/test_conformance/atomics/CMakeLists.txt
+++ b/test_conformance/atomics/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(MODULE_NAME ATOMICS) # CMakeCommon.txt derives the executable name and the source-list variable from this
+# Test sources of this module followed by the shared harness sources compiled into it.
+set(${MODULE_NAME}_SOURCES
+        main.c
+        test_atomics.cpp
+        test_indexed_cases.c
+        ../../test_common/harness/errorHelpers.c
+        ../../test_common/harness/threadTesting.c
+        ../../test_common/harness/testHarness.c
+        ../../test_common/harness/kernelHelpers.c
+        ../../test_common/harness/mt19937.c
+        ../../test_common/harness/conversions.c
+        ../../test_common/harness/msvc9.c
+        ../../test_common/harness/parseParameters.cpp
+)
+# Shared rules: add the executable from ${MODULE_NAME}_SOURCES, compile it as C++ and link ${CLConform_LIBRARIES}.
+include(../CMakeCommon.txt)
+
--- a/test_conformance/atomics/Jamfile
+++ b/test_conformance/atomics/Jamfile
@@ -0,0 +1,17 @@
+project
+    : requirements
+      <toolset>gcc:<cflags>-xc++
+      <toolset>msvc:<cflags>"/TP"
+    ;
+# The test source is test_atomics.cpp, as listed in this directory's CMakeLists.txt and Makefile.
+exe test_atomics
+    : main.c
+      test_atomics.cpp
+      test_indexed_cases.c
+    ;
+
+install dist
+    : test_atomics
+    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance/atomics
+      <variant>release:<location>$(DIST)/release/tests/test_conformance/atomics
+    ;
--- a/test_conformance/atomics/Makefile
+++ b/test_conformance/atomics/Makefile
@@ -0,0 +1,44 @@
+# Optional ATF support: define BUILD_WITH_ATF to link the ATF framework and
+# compile with -DUSE_ATF.
+ifdef BUILD_WITH_ATF
+ATF = -framework ATF
+USE_ATF = -DUSE_ATF
+endif
+
+# Test sources followed by the shared harness sources.
+SRCS = main.c \
+	test_atomics.cpp \
+	test_indexed_cases.c \
+	../../test_common/harness/errorHelpers.c \
+	../../test_common/harness/threadTesting.c \
+	../../test_common/harness/testHarness.c \
+	../../test_common/harness/mt19937.c \
+	../../test_common/harness/conversions.c \
+	../../test_common/harness/kernelHelpers.c
+
+DEFINES =
+
+SOURCES = $(abspath $(SRCS))
+LIBPATH += -L/System/Library/Frameworks/OpenCL.framework/Libraries
+LIBPATH += -L.
+FRAMEWORK = $(SOURCES)
+HEADERS =
+TARGET = test_atomics
+INCLUDE =
+# -c is part of COMPILERFLAGS, so CFLAGS/CXXFLAGS are only usable for
+# compiling; the link recipe below passes RC_CFLAGS on its own.
+COMPILERFLAGS = -c -Wall -g -Wshorten-64-to-32
+CC = c++
+CFLAGS = $(COMPILERFLAGS) $(RC_CFLAGS) ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+CXXFLAGS = $(COMPILERFLAGS) $(RC_CFLAGS) ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+LIBRARIES = -framework OpenCL -framework OpenGL -framework GLUT -framework AppKit ${ATF}
+
+# Map every .c and .cpp source to its object file.
+OBJECTS := ${SOURCES:.c=.o}
+OBJECTS := ${OBJECTS:.cpp=.o}
+
+TARGETOBJECT =
+
+# all and clean do not produce files of those names; declaring them phony
+# keeps a stray file called "all" or "clean" from suppressing the rule.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(OBJECTS)
+	$(CC) $(RC_CFLAGS) $(OBJECTS) -o $@ $(LIBPATH) $(LIBRARIES)
+
+clean:
+	rm -f $(TARGET) $(OBJECTS)
+
+.DEFAULT:
+	@echo The target \"$@\" does not exist in Makefile.
--- a/test_conformance/atomics/main.c
+++ b/test_conformance/atomics/main.c
@@ -0,0 +1,71 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "procs.h"
+#include "../../test_common/harness/testHarness.h"
+
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+
+// Test entry points handed to runTestHarness(). basefn_names must hold the
+// matching display name at the same index.
+basefn basefn_list[] = {
+    test_atomic_add,
+    test_atomic_sub,
+    test_atomic_xchg,
+    test_atomic_min,
+    test_atomic_max,
+    test_atomic_inc,
+    test_atomic_dec,
+    test_atomic_cmpxchg,
+    test_atomic_and,
+    test_atomic_or,
+    test_atomic_xor,
+
+    test_atomic_add_index,
+    test_atomic_add_index_bin
+};
+
+// Names of the tests, parallel to basefn_list.
+const char *basefn_names[] = {
+    "atomic_add",
+    "atomic_sub",
+    "atomic_xchg",
+    "atomic_min",
+    "atomic_max",
+    "atomic_inc",
+    "atomic_dec",
+    "atomic_cmpxchg",
+    "atomic_and",
+    "atomic_or",
+    "atomic_xor",
+
+    "atomic_add_index",
+    "atomic_add_index_bin",
+};
+
+// Both tables must have the same number of entries.
+ct_assert((sizeof(basefn_names) / sizeof(basefn_names[0])) == (sizeof(basefn_list) / sizeof(basefn_list[0])));
+
+// Number of registered tests.
+int num_fns = sizeof(basefn_names) / sizeof(basefn_names[0]);
+
+// Forwards the command line and the test tables to the shared harness and
+// returns its result as the process exit status.
+int main(int argc, const char *argv[])
+{
+    int status = runTestHarness( argc, argv, num_fns, basefn_list, basefn_names, false, false, 0 );
+    return status;
+}
+
+
--- a/test_conformance/atomics/procs.h
+++ b/test_conformance/atomics/procs.h
@@ -0,0 +1,39 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Declarations shared by the atomics test sources.
+// Guarded against multiple inclusion, following the guard naming used by
+// testBase.h in this directory.
+#ifndef _procs_h
+#define _procs_h
+
+#include "../../test_common/harness/errorHelpers.h"
+#include "../../test_common/harness/kernelHelpers.h"
+#include "../../test_common/harness/threadTesting.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+extern int      create_program_and_kernel(const char *source, const char *kernel_name, cl_program *program_ret, cl_kernel *kernel_ret);
+
+// Test entry points registered in main.c (basefn_list).
+extern int        test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int        test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+#endif // _procs_h
+
+
+
--- a/test_conformance/atomics/testBase.h
+++ b/test_conformance/atomics/testBase.h
@@ -0,0 +1,31 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Common include set for the atomics test sources: the harness compat.h,
+// the C standard and sys headers listed below, and this directory's procs.h.
+#ifndef _testBase_h
+#define _testBase_h
+
+#include "../../test_common/harness/compat.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+// Declarations of the atomics test entry points.
+#include "procs.h"
+
+#endif // _testBase_h
+
+
+
--- a/test_conformance/atomics/test_atomics.cpp
+++ b/test_conformance/atomics/test_atomics.cpp
--- a/test_conformance/atomics/test_indexed_cases.c
+++ b/test_conformance/atomics/test_indexed_cases.c
@@ -0,0 +1,380 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "testBase.h"
+#include "../../test_common/harness/conversions.h"
+
+// Seed passed to init_genrand() by test_atomic_add_index_bin; defined outside
+// this file.
+extern cl_uint gRandomSeed;
+
+// OpenCL C source of the add_index_test kernel, built by test_atomic_add_index.
+// Arguments: a single int counter and an output array with one slot per
+// work-item.
+const char * atomic_index_source =
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"// Counter keeps track of which index in counts we are using.\n"
+"// We get that value, increment it, and then set that index in counts to our thread ID.\n"
+"// At the end of this we should have all thread IDs in some random location in counts\n"
+"// exactly once. If atom_add failed then we will write over various thread IDs and we\n"
+"// will be missing some.\n"
+"\n"
+"__kernel void add_index_test(__global int *counter, __global int *counts) {\n"
+"    int tid = get_global_id(0);\n"
+"    \n"
+"    int counter_to_use = atom_add(counter, 1);\n"
+"    counts[counter_to_use] = tid;\n"
+"}";
+
+// Runs the add_index_test kernel with 2048 work-items: each work-item
+// atomically increments a shared counter and stores its global ID in the slot
+// it obtained. The test passes when every global ID shows up exactly once in
+// the output buffer.
+//
+// Returns 0 on success or when cl_khr_global_int32_base_atomics is not
+// available (test skipped), -1 when the kernel, work-group size or device
+// buffers cannot be set up, and 1 for any later failure.
+// num_elements is not used.
+int test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper counter, counters;
+    size_t numGlobalThreads, numLocalThreads;
+    size_t index, looking_for;
+    cl_int *values = NULL;
+    cl_int init = 0;
+    cl_int err = CL_SUCCESS;
+    int fail = 0;
+
+    /* Check if atomics are supported. */
+    if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
+        log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
+        return 0;
+    }
+
+    //===== add_index test
+    // The index test replicates what particles does.
+    // It uses one memory location to keep track of the current index and then each thread
+    // does an atomic add to it to get its new location. The threads then write to their
+    // assigned location. At the end we check to make sure that each thread's ID shows up
+    // exactly once in the output.
+
+    numGlobalThreads = 2048;
+
+    if( create_single_kernel_helper( context, &program, &kernel, 1, &atomic_index_source, "add_index_test" ) )
+        return -1;
+
+    if( get_max_common_work_group_size( context, kernel, numGlobalThreads, &numLocalThreads ) )
+        return -1;
+
+    log_info("Execute global_threads:%d local_threads:%d\n",
+             (int)numGlobalThreads, (int)numLocalThreads);
+
+    // Create the counter that will keep track of where each thread writes.
+    counter = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),
+                             sizeof(cl_int) * 1, NULL, &err);
+    if (err != CL_SUCCESS) {
+        log_error("add_index_test FAILED to create the counter buffer: %d\n", err);
+        return -1;
+    }
+    // Create the counters that will hold the results of each thread writing
+    // its ID into a (hopefully) unique location.
+    counters = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),
+                              sizeof(cl_int) * numGlobalThreads, NULL, &err);
+    if (err != CL_SUCCESS) {
+        log_error("add_index_test FAILED to create the counters buffer: %d\n", err);
+        return -1;
+    }
+
+    // Reset all those locations to -1 to indicate they have not been used.
+    values = (cl_int*) malloc(sizeof(cl_int)*numGlobalThreads);
+    if (values == NULL) {
+        log_error("add_index_test FAILED to allocate memory for initial values.\n");
+        return 1;
+    }
+    for (index=0; index<numGlobalThreads; index++)
+        values[index] = -1;
+
+    // Both writes are blocking, so the host data may be reused afterwards.
+    err = clEnqueueWriteBuffer(queue, counters, CL_TRUE, 0, numGlobalThreads*sizeof(cl_int), values, 0, NULL, NULL);
+    err |= clEnqueueWriteBuffer(queue, counter, CL_TRUE, 0, 1*sizeof(cl_int), &init, 0, NULL, NULL);
+    if (err) {
+        log_error("add_index_test FAILED to write initial values to arrays: %d\n", err);
+        fail = 1;
+    }
+
+    if (!fail) {
+        err = clSetKernelArg(kernel, 0, sizeof(counter), &counter);
+        err |= clSetKernelArg(kernel, 1, sizeof(counters), &counters);
+        if (err) {
+            log_error("add_index_test FAILED to set kernel arguments: %d\n", err);
+            fail = 1;
+        }
+    }
+
+    if (!fail) {
+        err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numGlobalThreads, &numLocalThreads, 0, NULL, NULL );
+        if (err) {
+            log_error("add_index_test FAILED to execute kernel: %d\n", err);
+            fail = 1;
+        }
+    }
+
+    if (!fail) {
+        // Blocking read of the slots written by the kernel.
+        err = clEnqueueReadBuffer( queue, counters, CL_TRUE, 0, sizeof(cl_int)*numGlobalThreads, values, 0, NULL, NULL );
+        if (err) {
+            log_error("add_index_test FAILED to read back results: %d\n", err);
+            fail = 1;
+        }
+    }
+
+    if (!fail) {
+        // Every global ID must occur exactly once; report each one that does not.
+        for (looking_for=0; looking_for<numGlobalThreads; looking_for++) {
+            int instances_found=0;
+            for (index=0; index<numGlobalThreads; index++) {
+                if (values[index]==(cl_int)looking_for)
+                    instances_found++;
+            }
+            if (instances_found != 1) {
+                log_error("add_index_test FAILED: wrong number of instances (%d!=1) for counter %d.\n", instances_found, (int)looking_for);
+                fail = 1;
+            }
+        }
+    }
+
+    if (!fail) {
+        log_info("add_index_test passed. Each thread used exactly one index.\n");
+    }
+    free(values);
+    return fail;
+}
+
+// OpenCL C source of the add_index_bin_test kernel. Kept as a one-element
+// array so it can be passed directly to create_single_kernel_helper by
+// add_index_bin_test.
+const char *add_index_bin_kernel[] = {
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel\n"
+"// using an atomic add to keep track of the current location to write into in each bin.\n"
+"// This is the same as the memory update for the particles demo.\n"
+"\n"
+"__kernel void add_index_bin_test(__global int *bin_counters, __global int *bins, __global int *bin_assignments, int max_counts_per_bin) {\n"
+"    int tid = get_global_id(0);\n"
+"\n"
+"    int location = bin_assignments[tid];\n"
+"    int counter = atom_add(&bin_counters[location], 1);\n"
+"    bins[location*max_counts_per_bin + counter] = tid;\n"
+"}" };
+
+// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel
+// using an atomic add to keep track of the current location to write into in each bin.
+// This is the same as the memory update for the particles demo.
+//
+// global_threads[0] is both the number of items and the global work size; d is
+// the random generator state used to choose a bin for every item.
+// Returns 0 when every item was found in its assigned bin and all bin counts
+// match, -1 otherwise (including any setup, transfer or execution failure).
+int add_index_bin_test(size_t *global_threads, cl_command_queue queue, cl_context context, MTdata d)
+{
+    int number_of_items = (int)global_threads[0];
+    size_t local_threads[1];
+    int divisor = 12;
+    int number_of_bins = number_of_items/divisor;
+    int max_counts_per_bin = divisor*2;
+    int number_of_slots = number_of_bins*max_counts_per_bin;
+
+    // Everything is declared up front so that "goto cleanup" never jumps over
+    // an initialization.
+    int result = -1;
+    int errors = 0;
+    int i, current_bin, search, index;
+    cl_int err;
+
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    // Wrappers are used as in test_atomic_add_index, so the buffers need no
+    // explicit clReleaseMemObject on the many exit paths.
+    // NOTE(review): relies on clMemWrapper releasing its object when it goes
+    // out of scope - confirm in typeWrappers.h.
+    clMemWrapper bin_counters, bins, bin_assignments;
+
+    cl_int *l_bin_counts = NULL;          // expected number of items per bin
+    cl_int *l_bin_assignments = NULL;     // bin chosen for each item
+    cl_int *final_bin_assignments = NULL; // initial (-1) and final contents of bins
+    cl_int *final_bin_counts = NULL;      // per-bin counters read back from the device
+
+    // With fewer than `divisor` items there would be no bin to assign to.
+    if (number_of_bins < 1) {
+        log_error("add_index_bin_test needs at least %d items to form a bin (got %d).\n", divisor, number_of_items);
+        return -1;
+    }
+
+    //===== add_index_bin test
+    // The index test replicates what particles does.
+    err = create_single_kernel_helper(context, &program, &kernel, 1, add_index_bin_kernel, "add_index_bin_test" );
+    test_error( err, "Unable to create testing kernel" );
+
+    if( get_max_common_work_group_size( context, kernel, global_threads[0], &local_threads[0] ) )
+        return -1;
+
+    log_info("Execute global_threads:%d local_threads:%d\n",
+             (int)global_threads[0], (int)local_threads[0]);
+
+    // Allocate our device storage
+    bin_counters = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),
+                                  sizeof(cl_int) * number_of_bins, NULL, &err);
+    if (err != CL_SUCCESS) {
+        log_error("add_index_bin_test FAILED to allocate bin_counters.\n");
+        return -1;
+    }
+    bins = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),
+                          sizeof(cl_int) * number_of_slots, NULL, &err);
+    if (err != CL_SUCCESS) {
+        log_error("add_index_bin_test FAILED to allocate bins.\n");
+        return -1;
+    }
+    bin_assignments = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_ONLY),
+                                     sizeof(cl_int) * number_of_items, NULL, &err);
+    if (err != CL_SUCCESS) {
+        log_error("add_index_bin_test FAILED to allocate bin_assignments.\n");
+        return -1;
+    }
+
+    // Allocate our host storage; calloc leaves every bin count at zero.
+    l_bin_counts = (cl_int*)calloc(number_of_bins, sizeof(cl_int));
+    if (!l_bin_counts) {
+        log_error("add_index_bin_test FAILED to allocate initial values for bin_counters.\n");
+        goto cleanup;
+    }
+    final_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_slots);
+    if (!final_bin_assignments) {
+        log_error("add_index_bin_test FAILED to allocate initial values for bins.\n");
+        goto cleanup;
+    }
+    l_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_items);
+    if (!l_bin_assignments) {
+        log_error("add_index_bin_test FAILED to allocate initial values for l_bin_assignments.\n");
+        goto cleanup;
+    }
+    final_bin_counts = (cl_int*)malloc(sizeof(cl_int)*number_of_bins);
+    if (!final_bin_counts) {
+        log_error("add_index_bin_test FAILED to allocate initial values for final_bin_counts.\n");
+        goto cleanup;
+    }
+
+    // Initialize the device counters to zero (blocking write).
+    err = clEnqueueWriteBuffer(queue, bin_counters, CL_TRUE, 0, sizeof(cl_int)*number_of_bins, l_bin_counts, 0, NULL, NULL);
+    if (err) {
+        log_error("add_index_bin_test FAILED to set initial values for bin_counters: %d\n", err);
+        goto cleanup;
+    }
+
+    // Mark every slot of every bin as unused (-1). The write is blocking, so
+    // the same host buffer can later receive the read-back bin contents.
+    for (i=0; i<number_of_slots; i++)
+        final_bin_assignments[i] = -1;
+    err = clEnqueueWriteBuffer(queue, bins, CL_TRUE, 0, sizeof(cl_int)*number_of_slots, final_bin_assignments, 0, NULL, NULL);
+    if (err) {
+        log_error("add_index_bin_test FAILED to set initial values for bins: %d\n", err);
+        goto cleanup;
+    }
+
+    // Assign every item to a random bin that still has room, recording the
+    // expected per-bin counts in l_bin_counts.
+    for (i=0; i<number_of_items; i++) {
+        int bin = random_in_range(0, number_of_bins-1, d);
+        while (l_bin_counts[bin] >= max_counts_per_bin) {
+            bin = random_in_range(0, number_of_bins-1, d);
+        }
+        if (bin >= number_of_bins)
+            log_error("add_index_bin_test internal error generating bin assignments: bin %d >= number_of_bins %d.\n", bin, number_of_bins);
+        if (l_bin_counts[bin]+1 > max_counts_per_bin)
+            log_error("add_index_bin_test internal error generating bin assignments: bin %d has more entries (%d) than max_counts_per_bin (%d).\n", bin, l_bin_counts[bin], max_counts_per_bin);
+        l_bin_counts[bin]++;
+        l_bin_assignments[i] = bin;
+    }
+    err = clEnqueueWriteBuffer(queue, bin_assignments, CL_TRUE, 0, sizeof(cl_int)*number_of_items, l_bin_assignments, 0, NULL, NULL);
+    if (err) {
+        log_error("add_index_bin_test FAILED to set initial values for bin_assignments: %d\n", err);
+        goto cleanup;
+    }
+
+    // Setup the kernel
+    err = clSetKernelArg(kernel, 0, sizeof(bin_counters), &bin_counters);
+    err |= clSetKernelArg(kernel, 1, sizeof(bins), &bins);
+    err |= clSetKernelArg(kernel, 2, sizeof(bin_assignments), &bin_assignments);
+    err |= clSetKernelArg(kernel, 3, sizeof(max_counts_per_bin), &max_counts_per_bin);
+    if (err) {
+        log_error("add_index_bin_test FAILED to set kernel arguments: %d\n", err);
+        goto cleanup;
+    }
+
+    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
+    if (err) {
+        log_error("add_index_bin_test FAILED to execute kernel: %d\n", err);
+        goto cleanup;
+    }
+
+    // Read back the results. A failed read leaves the host buffers without
+    // valid data, so verification is skipped in that case.
+    err = clEnqueueReadBuffer( queue, bins, CL_TRUE, 0, sizeof(cl_int)*number_of_slots, final_bin_assignments, 0, NULL, NULL );
+    if (err) {
+        log_error("add_index_bin_test FAILED to read back bins: %d\n", err);
+        goto cleanup;
+    }
+    err = clEnqueueReadBuffer( queue, bin_counters, CL_TRUE, 0, sizeof(cl_int)*number_of_bins, final_bin_counts, 0, NULL, NULL );
+    if (err) {
+        log_error("add_index_bin_test FAILED to read back bin_counters: %d\n", err);
+        goto cleanup;
+    }
+
+    // Verification.
+    // First verify that there are the correct number in each bin.
+    for (current_bin=0; current_bin<number_of_bins; current_bin++) {
+        int expected_number = l_bin_counts[current_bin];
+        int actual_number = final_bin_counts[current_bin];
+        if (expected_number != actual_number) {
+            log_error("add_index_bin_test FAILED: bin %d reported %d entries when %d were expected.\n", current_bin, actual_number, expected_number);
+            errors++;
+        }
+        // The first expected_number slots must be filled...
+        for (search=0; search<expected_number; search++) {
+            if (final_bin_assignments[current_bin*max_counts_per_bin+search] == -1) {
+                log_error("add_index_bin_test FAILED: bin %d had no entry at position %d when it should have had %d entries.\n", current_bin, search, expected_number);
+                errors++;
+            }
+        }
+        // ...and the remaining ones must still hold the initial -1.
+        for (search=expected_number; search<max_counts_per_bin; search++) {
+            if (final_bin_assignments[current_bin*max_counts_per_bin+search] != -1) {
+                log_error("add_index_bin_test FAILED: bin %d had an extra entry at position %d when it should have had only %d entries.\n", current_bin, search, expected_number);
+                errors++;
+            }
+        }
+    }
+    // Now verify that the correct ones are in each bin
+    for (index=0; index<number_of_items; index++) {
+        int expected_bin = l_bin_assignments[index];
+        int found_it = 0;
+        for (search=0; search<l_bin_counts[expected_bin]; search++) {
+            if (final_bin_assignments[expected_bin*max_counts_per_bin+search] == index) {
+                found_it = 1;
+            }
+        }
+        if (found_it == 0) {
+            log_error("add_index_bin_test FAILED: did not find item %d in bin %d.\n", index, expected_bin);
+            errors++;
+        }
+    }
+
+    if (errors == 0) {
+        log_info("add_index_bin_test passed. Each item was put in the correct bin in parallel.\n");
+        result = 0;
+    } else {
+        log_error("add_index_bin_test FAILED: %d errors.\n", errors);
+    }
+
+cleanup:
+    // free(NULL) is a no-op, so this is safe on every path that reaches it.
+    free(l_bin_counts);
+    free(l_bin_assignments);
+    free(final_bin_assignments);
+    free(final_bin_counts);
+    return result;
+}
+
+// Runs add_index_bin_test up to ten times, starting with 2048 items and
+// doubling the count after every successful run; stops at the first failure.
+// Returns 0 when all runs pass or when cl_khr_global_int32_base_atomics is
+// not available (test skipped), 1 otherwise. num_elements is not used.
+int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    //===== add_index_bin test
+    const int maxRuns = 10;
+    size_t itemCount = 2048;
+    int run = 0;
+    int failures = 0;
+    MTdata rng = init_genrand( gRandomSeed );
+
+    /* Check if atomics are supported. */
+    if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) {
+        log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n");
+        free_mtdata( rng );
+        return 0;
+    }
+
+    while (run < maxRuns && failures == 0) {
+        log_info("add_index_bin_test with %d elements:\n", (int)itemCount);
+        if (add_index_bin_test(&itemCount, queue, context, rng)) {
+            failures = 1;
+        } else {
+            itemCount *= 2;
+            run++;
+        }
+    }
+
+    free_mtdata( rng );
+    return failures;
+}
+
+
--- a/test_conformance/basic/CMakeLists.txt
+++ b/test_conformance/basic/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Conformance test module: basic.
+# ../CMakeCommon.txt reads MODULE_NAME and ${MODULE_NAME}_SOURCES to name and
+# create the test executable, so both must be set before the include below.
+set(MODULE_NAME BASIC)
+
+# Every source is listed exactly once: tests local to this directory first,
+# then the shared harness sources.
+set(${MODULE_NAME}_SOURCES
+    main.c
+    test_fpmath_float.c test_fpmath_float2.c test_fpmath_float4.c
+    test_intmath_int.c test_intmath_int2.c test_intmath_int4.c
+    test_intmath_long.c test_intmath_long2.c test_intmath_long4.c
+    test_hiloeo.c test_local.c test_pointercast.c
+    test_if.c test_loop.c
+    test_readimage.c test_readimage_int16.c test_readimage_fp32.c
+    test_readimage3d.c test_readimage3d_int16.c test_readimage3d_fp32.c
+    test_writeimage.c test_writeimage_int16.c test_writeimage_fp32.c
+    test_multireadimageonefmt.c test_multireadimagemultifmt.c
+    test_imagedim.c
+    test_vloadstore.c
+    test_int2float.c test_float2int.c
+    test_createkernelsinprogram.c
+    test_hostptr.c
+    test_explicit_s2v.cpp
+    test_constant.c
+    test_image_multipass.c
+    test_imagereadwrite.c test_imagereadwrite3d.c
+    test_image_param.c
+    test_imagenpot.c
+    test_image_r8.c
+    test_barrier.c
+    test_basic_parameter_types.c
+    test_arrayreadwrite.c
+    test_arraycopy.c
+    test_imagearraycopy.c
+    test_imagearraycopy3d.c
+    test_imagecopy.c
+    test_imagerandomcopy.c
+    test_arrayimagecopy.c
+    test_arrayimagecopy3d.c
+    test_imagecopy3d.c
+    test_enqueue_map.cpp
+    test_work_item_functions.cpp
+    test_astype.cpp
+    test_async_copy.cpp
+    test_sizeof.c
+    test_vector_creation.cpp
+    test_vec_type_hint.c
+    test_numeric_constants.cpp
+    test_constant_source.cpp
+    test_bufferreadwriterect.c
+    test_async_strided_copy.cpp
+    test_preprocessors.cpp
+    test_kernel_memory_alignment.cpp
+    test_global_work_offsets.cpp
+    test_kernel_call_kernel_function.cpp
+    test_local_kernel_scope.cpp
+    test_progvar.cpp
+    test_wg_barrier.c
+    test_global_linear_id.c
+    test_local_linear_id.c
+    test_enqueued_local_size.c
+    test_simple_image_pitch.c
+    test_get_linear_ids.cpp
+    test_rw_image_access_qualifier.c
+    ../../test_common/harness/errorHelpers.c
+    ../../test_common/harness/threadTesting.c
+    ../../test_common/harness/testHarness.c
+    ../../test_common/harness/kernelHelpers.c
+    ../../test_common/harness/typeWrappers.cpp
+    ../../test_common/harness/imageHelpers.cpp
+    ../../test_common/harness/mt19937.c
+    ../../test_common/harness/conversions.c
+    ../../test_common/harness/rounding_mode.c
+    ../../test_common/harness/msvc9.c
+    ../../test_common/harness/parseParameters.cpp
+)
+
+# The queue priority test is only built on Apple platforms.
+if(APPLE)
+    list(APPEND ${MODULE_NAME}_SOURCES test_queue_priority.c)
+endif()
+
+include(../CMakeCommon.txt)
--- a/test_conformance/basic/Jamfile
+++ b/test_conformance/basic/Jamfile
@@ -0,0 +1,74 @@
+# Boost.Build description of the basic conformance test.
+# The project requirements pass -xc++ (gcc) or /TP (msvc) to the compiler.
+project
+    : requirements
+      <toolset>gcc:<cflags>-xc++
+      <toolset>msvc:<cflags>"/TP"
+    ;
+
+# The source list mirrors the CMakeLists.txt in this directory, including the
+# tests that were previously missing here (test_wg_barrier.c,
+# test_global_linear_id.c, test_local_linear_id.c, test_enqueued_local_size.c,
+# test_simple_image_pitch.c, test_progvar.cpp,
+# test_rw_image_access_qualifier.c). main.c references their entry points.
+# msvc9.c is only added when targeting Windows.
+exe test_basic
+    : main.c
+      test_fpmath_float.c test_fpmath_float2.c test_fpmath_float4.c
+      test_intmath_int.c test_intmath_int2.c test_intmath_int4.c
+      test_intmath_long.c test_intmath_long2.c test_intmath_long4.c
+      test_hiloeo.c test_local.c test_pointercast.c
+      test_if.c test_sizeof.c test_loop.c
+      test_readimage.c test_readimage_int16.c test_readimage_fp32.c
+      test_readimage3d.c test_readimage3d_int16.c test_readimage3d_fp32.c
+      test_writeimage.c test_writeimage_int16.c test_writeimage_fp32.c
+      test_multireadimageonefmt.c test_multireadimagemultifmt.c
+      test_imagedim.c
+      test_vloadstore.c
+      test_int2float.c test_float2int.c
+      test_createkernelsinprogram.c
+      test_hostptr.c
+      test_explicit_s2v.cpp
+      test_constant.c
+      test_constant_source.cpp
+      test_image_multipass.c
+      test_imagereadwrite.c test_imagereadwrite3d.c
+      test_bufferreadwriterect.c
+      test_image_param.c
+      test_imagenpot.c
+      test_image_r8.c
+      test_barrier.c
+      test_wg_barrier.c
+      test_arrayreadwrite.c
+      test_arraycopy.c
+      test_imagearraycopy.c
+      test_imagearraycopy3d.c
+      test_imagecopy.c
+      test_imagerandomcopy.c
+      test_arrayimagecopy.c
+      test_arrayimagecopy3d.c
+      test_imagecopy3d.c
+      test_enqueue_map.cpp
+      test_work_item_functions.cpp
+      test_astype.cpp
+      test_async_copy.cpp
+      test_async_strided_copy.cpp
+      test_numeric_constants.cpp
+      test_kernel_call_kernel_function.cpp
+      test_basic_parameter_types.c
+      test_vector_creation.cpp
+      test_vec_type_hint.c
+      test_preprocessors.cpp
+      test_kernel_memory_alignment.cpp
+      test_global_work_offsets.cpp
+      test_local_kernel_scope.cpp
+      test_get_linear_ids.cpp
+      test_global_linear_id.c
+      test_local_linear_id.c
+      test_enqueued_local_size.c
+      test_simple_image_pitch.c
+      test_progvar.cpp
+      test_rw_image_access_qualifier.c
+      ../../test_common/harness/errorHelpers.c
+      ../../test_common/harness/threadTesting.c
+      ../../test_common/harness/testHarness.c
+      ../../test_common/harness/rounding_mode.c
+      ../../test_common/harness/kernelHelpers.c
+      ../../test_common/harness/typeWrappers.cpp
+      ../../test_common/harness/imageHelpers.cpp
+      ../../test_common/harness/mt19937.c
+      ../../test_common/harness/conversions.c
+    : <target-os>windows:<source>../../test_common/harness/msvc9.c
+    ;
+
+# Install the executable into the per-variant distribution directory.
+install dist
+    : test_basic
+    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance/basic
+      <variant>release:<location>$(DIST)/release/tests/test_conformance/basic
+    ;
+ 
--- a/test_conformance/basic/Makefile
+++ b/test_conformance/basic/Makefile
@@ -0,0 +1,103 @@
+# Optional ATF support: define BUILD_WITH_ATF to link the ATF framework and
+# compile with -DUSE_ATF.
+ifdef BUILD_WITH_ATF
+ATF = -framework ATF
+USE_ATF = -DUSE_ATF
+endif
+
+# Test sources followed by the shared harness sources.
+# test_get_linear_ids is a .cpp file, as listed in this directory's
+# CMakeLists.txt and Jamfile.
+SRCS = main.c \
+	test_fpmath_float.c test_fpmath_float2.c test_fpmath_float4.c \
+	test_intmath_int.c test_intmath_int2.c test_intmath_int4.c \
+	test_intmath_long.c test_intmath_long2.c test_intmath_long4.c \
+	test_hiloeo.c test_local.c test_local_kernel_scope.cpp test_pointercast.c \
+	test_if.c test_sizeof.c test_loop.c \
+	test_readimage.c test_readimage_int16.c test_readimage_fp32.c \
+	test_readimage3d.c test_readimage3d_int16.c test_readimage3d_fp32.c \
+	test_writeimage.c test_writeimage_int16.c test_writeimage_fp32.c \
+	test_multireadimageonefmt.c test_multireadimagemultifmt.c \
+	test_imagedim.c \
+	test_vloadstore.c \
+	test_int2float.c test_float2int.c \
+	test_createkernelsinprogram.c \
+	test_hostptr.c \
+	test_explicit_s2v.cpp \
+	test_constant.c \
+	test_constant_source.cpp \
+	test_image_multipass.c \
+	test_imagereadwrite.c test_imagereadwrite3d.c \
+	test_bufferreadwriterect.c \
+	test_image_param.c \
+	test_imagenpot.c \
+	test_image_r8.c \
+	test_barrier.c \
+	test_wg_barrier.c \
+	test_arrayreadwrite.c \
+	test_arraycopy.c \
+	test_imagearraycopy.c \
+	test_imagearraycopy3d.c \
+	test_imagecopy.c \
+	test_imagerandomcopy.c \
+	test_arrayimagecopy.c \
+	test_arrayimagecopy3d.c \
+	test_imagecopy3d.c \
+	test_enqueue_map.cpp \
+	test_work_item_functions.cpp \
+	test_astype.cpp \
+	test_async_copy.cpp \
+	test_async_strided_copy.cpp \
+	test_numeric_constants.cpp \
+	test_kernel_call_kernel_function.cpp \
+	test_basic_parameter_types.c \
+	test_vector_creation.cpp \
+	test_vec_type_hint.c \
+	test_preprocessors.cpp \
+	test_kernel_memory_alignment.cpp \
+	test_global_work_offsets.cpp \
+	test_simple_image_pitch.c \
+	test_queue_priority.c \
+	test_global_linear_id.c \
+	test_local_linear_id.c \
+	test_enqueued_local_size.c \
+	test_get_linear_ids.cpp \
+	test_progvar.cpp \
+	test_rw_image_access_qualifier.c \
+	../../test_common/harness/errorHelpers.c \
+	../../test_common/harness/threadTesting.c \
+	../../test_common/harness/testHarness.c \
+	../../test_common/harness/rounding_mode.c \
+	../../test_common/harness/kernelHelpers.c \
+	../../test_common/harness/typeWrappers.cpp \
+	../../test_common/harness/imageHelpers.cpp \
+	../../test_common/harness/mt19937.c \
+	../../test_common/harness/conversions.c
+
+DEFINES =
+
+SOURCES = $(abspath $(SRCS))
+LIBPATH += -L/System/Library/Frameworks/OpenCL.framework/Libraries
+LIBPATH += -L.
+FRAMEWORK = $(SOURCES)
+HEADERS =
+TARGET = test_basic
+INCLUDE =
+# -c is part of COMPILERFLAGS, so CFLAGS/CXXFLAGS are only usable for
+# compiling; the link recipe below passes RC_CFLAGS on its own.
+COMPILERFLAGS = -c -Wall -g -O0 -Wshorten-64-to-32
+CC = c++
+CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+CXXFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
+LIBRARIES = -framework OpenCL -framework OpenGL -framework GLUT -framework AppKit ${ATF}
+
+# Map every .c and .cpp source to its object file.
+OBJECTS := ${SOURCES:.c=.o}
+OBJECTS := ${OBJECTS:.cpp=.o}
+
+TARGETOBJECT =
+
+# all and clean do not produce files of those names; declaring them phony
+# keeps a stray file called "all" or "clean" from suppressing the rule.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): $(OBJECTS)
+	$(CC) $(RC_CFLAGS) $(OBJECTS) -o $@ $(LIBPATH) $(LIBRARIES)
+
+clean:
+	rm -f $(TARGET) $(OBJECTS)
+
+.DEFAULT:
+	@echo The target \"$@\" does not exist in Makefile.
+
+
+
--- a/test_conformance/basic/main.c
+++ b/test_conformance/basic/main.c
@@ -0,0 +1,303 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../../test_common/harness/testHarness.h"
+#include "procs.h"
+
+// FIXME: Certain functions in ../../test_common/harness/imageHelpers.h
+// (for example, generate_random_image_data()) can only be used if the test
+// program itself defines the following variables (<rdar://problem/11111245>):
+cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
+bool gTestRounding = false;
+
+basefn    basefn_list[] = {    // test entry points; presumably paired by index with basefn_names[] by runTestHarness() - keep both tables in the same order
+    test_hostptr,
+    test_fpmath_float,
+    test_fpmath_float2,
+    test_fpmath_float4,
+    test_intmath_int,
+    test_intmath_int2,
+    test_intmath_int4,
+    test_intmath_long,
+    test_intmath_long2,
+    test_intmath_long4,
+    test_hiloeo,
+    test_if,
+    test_sizeof,
+    test_loop,
+    test_pointer_cast,
+    test_local_arg_def,
+    test_local_kernel_def,
+    test_local_kernel_scope,
+    test_constant,
+    test_constant_source,
+    test_readimage,
+    test_readimage_int16,
+    test_readimage_fp32,
+    test_writeimage,
+    test_writeimage_int16,
+    test_writeimage_fp32,
+    test_multireadimageonefmt,
+
+    test_multireadimagemultifmt,
+    test_image_r8,
+    test_barrier,
+    test_wg_barrier,
+    test_int2float,
+    test_float2int,
+    test_imagereadwrite,
+    test_imagereadwrite3d,
+    test_readimage3d,
+    test_readimage3d_int16,
+    test_readimage3d_fp32,
+    test_bufferreadwriterect,
+    test_arrayreadwrite,
+    test_arraycopy,
+    test_imagearraycopy,
+    test_imagearraycopy3d,
+    test_imagecopy,
+    test_imagecopy3d,
+    test_imagerandomcopy,
+    test_arrayimagecopy,
+    test_arrayimagecopy3d,
+    test_imagenpot,
+    // vload_* / vstore_* entries
+    test_vload_global,
+    test_vload_local,
+    test_vload_constant,
+    test_vload_private,
+    test_vstore_global,
+    test_vstore_local,
+    test_vstore_private,
+    // kernel creation, image dimension/parameter and explicit_s2v_* entries
+    test_createkernelsinprogram,
+    test_imagedim_pow2,
+    test_imagedim_non_pow2,
+    test_image_param,
+    test_image_multipass_integer_coord,
+    test_image_multipass_float_coord,
+    test_explicit_s2v_bool,
+    test_explicit_s2v_char,
+    test_explicit_s2v_uchar,
+    test_explicit_s2v_short,
+    test_explicit_s2v_ushort,
+    test_explicit_s2v_int,
+    test_explicit_s2v_uint,
+    test_explicit_s2v_long,
+    test_explicit_s2v_ulong,
+    test_explicit_s2v_float,
+    test_explicit_s2v_double,
+    // enqueue_map_* entries
+    test_enqueue_map_buffer,
+    test_enqueue_map_image,
+
+    test_work_item_functions,
+
+    test_astype,
+    // async copy and prefetch entries
+    test_async_copy_global_to_local,
+    test_async_copy_local_to_global,
+    test_async_strided_copy_global_to_local,
+    test_async_strided_copy_local_to_global,
+    test_prefetch,
+    // kernel-calls-kernel, numeric constant and preprocessor macro entries
+    test_kernel_call_kernel_function,
+    test_host_numeric_constants,
+    test_kernel_numeric_constants,
+    test_kernel_limit_constants,
+    test_kernel_preprocessor_macros,
+    // parameter type, vector creation/type-hint and kernel memory alignment entries
+    test_basic_parameter_types,
+    test_vector_creation,
+    test_vec_type_hint,
+    test_kernel_memory_alignment_local,
+    test_kernel_memory_alignment_global,
+    test_kernel_memory_alignment_constant,
+    test_kernel_memory_alignment_private,
+    // progvar_* entries
+    test_progvar_prog_scope_misc,
+    test_progvar_prog_scope_uninit,
+    test_progvar_prog_scope_init,
+    test_progvar_func_scope,
+    // global work offset entries
+    test_global_work_offsets,
+    test_get_global_offset,
+    // linear id and enqueued local size entries
+    test_global_linear_id,
+    test_local_linear_id,
+    test_enqueued_local_size,
+    // simple image pitch entries
+    test_simple_read_image_pitch,
+    test_simple_write_image_pitch,
+    // test_queue_priority is only declared on Apple (see procs.h); basefn_names[] guards its name the same way
+#if defined( __APPLE__ )
+    test_queue_priority,
+#endif
+
+    test_get_linear_ids,
+    test_rw_image_access_qualifier
+};
+
+const char    *basefn_names[] = {    // test names handed to runTestHarness(); same order and length as basefn_list[]
+    "hostptr",
+    "fpmath_float",
+    "fpmath_float2",
+    "fpmath_float4",
+    "intmath_int",
+    "intmath_int2",
+    "intmath_int4",
+    "intmath_long",
+    "intmath_long2",
+    "intmath_long4",
+    "hiloeo",
+    "if",
+    "sizeof",
+    "loop",
+    "pointer_cast",
+    "local_arg_def",
+    "local_kernel_def",
+    "local_kernel_scope",
+    "constant",
+    "constant_source",
+    "readimage",
+    "readimage_int16",
+    "readimage_fp32",
+    "writeimage",
+    "writeimage_int16",
+    "writeimage_fp32",
+    "mri_one",    // same position as test_multireadimageonefmt in basefn_list[]
+
+    "mri_multiple",    // same position as test_multireadimagemultifmt in basefn_list[]
+    "image_r8",
+    "barrier",
+    "wg_barrier",
+    "int2float",
+    "float2int",
+    "imagereadwrite",
+    "imagereadwrite3d",
+    "readimage3d",
+    "readimage3d_int16",
+    "readimage3d_fp32",
+    "bufferreadwriterect",
+    "arrayreadwrite",
+    "arraycopy",
+    "imagearraycopy",
+    "imagearraycopy3d",
+    "imagecopy",
+    "imagecopy3d",
+    "imagerandomcopy",
+    "arrayimagecopy",
+    "arrayimagecopy3d",
+    "imagenpot",
+    // vload_* / vstore_* names
+    "vload_global",
+    "vload_local",
+    "vload_constant",
+    "vload_private",
+    "vstore_global",
+    "vstore_local",
+    "vstore_private",
+    // kernel creation, image dimension/parameter and explicit_s2v_* names
+    "createkernelsinprogram",
+    "imagedim_pow2",
+    "imagedim_non_pow2",
+    "image_param",
+    "image_multipass_integer_coord",
+    "image_multipass_float_coord",
+    "explicit_s2v_bool",
+    "explicit_s2v_char",
+    "explicit_s2v_uchar",
+    "explicit_s2v_short",
+    "explicit_s2v_ushort",
+    "explicit_s2v_int",
+    "explicit_s2v_uint",
+    "explicit_s2v_long",
+    "explicit_s2v_ulong",
+    "explicit_s2v_float",
+    "explicit_s2v_double",
+    // enqueue_map_* names
+    "enqueue_map_buffer",
+    "enqueue_map_image",
+
+    "work_item_functions",
+
+    "astype",
+    // async copy and prefetch names
+    "async_copy_global_to_local",
+    "async_copy_local_to_global",
+    "async_strided_copy_global_to_local",
+    "async_strided_copy_local_to_global",
+    "prefetch",
+    // kernel-calls-kernel, numeric constant and preprocessor macro names
+    "kernel_call_kernel_function",
+    "host_numeric_constants",
+    "kernel_numeric_constants",
+    "kernel_limit_constants",
+    "kernel_preprocessor_macros",
+
+    "parameter_types",    // same position as test_basic_parameter_types in basefn_list[]
+
+    "vector_creation",
+    "vec_type_hint",
+    // kernel memory alignment names
+    "kernel_memory_alignment_local",
+    "kernel_memory_alignment_global",
+    "kernel_memory_alignment_constant",
+    "kernel_memory_alignment_private",
+    // progvar_* names
+    "progvar_prog_scope_misc",
+    "progvar_prog_scope_uninit",
+    "progvar_prog_scope_init",
+    "progvar_func_scope",
+    // global work offset names
+    "global_work_offsets",
+    "get_global_offset",
+    // linear id and enqueued local size names
+    "global_linear_id",
+    "local_linear_id",
+    "enqueued_local_size",
+    // simple image pitch names
+    "simple_read_image_pitch",
+    "simple_write_image_pitch",
+    // must stay under the same __APPLE__ guard as test_queue_priority in basefn_list[]
+#if defined( __APPLE__ )
+    "queue_priority",
+#endif
+
+    "get_linear_ids",
+    "test_rw_image_access_qualifier",    // NOTE(review): only name in this table that keeps the "test_" prefix; renaming would change the command-line test name, so confirm before normalizing
+};
+
+// ct_assert (harness macro, presumably a compile-time assertion): both tables must have the same number of entries.
+ct_assert((sizeof(basefn_list) / sizeof(basefn_list[0])) == (sizeof(basefn_names) / sizeof(basefn_names[0])));
+
+// Number of registered tests, derived from the name table.
+int    num_fns = sizeof(basefn_names) / sizeof(basefn_names[0]);
+
+
+// Program entry point: hands the command line and both test tables to the shared harness
+// and returns whatever the harness reports.
+int main(int argc, const char *argv[])
+{
+    int harness_status = runTestHarness( argc, argv, num_fns, basefn_list, basefn_names, false, false, 0 );
+
+    return harness_status;
+}
+
+
+
--- a/test_conformance/basic/procs.h
+++ b/test_conformance/basic/procs.h
@@ -0,0 +1,160 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Declarations of every test entry point in test_conformance/basic.
+// The include guard makes it safe to include this header more than once in a translation unit.
+#ifndef TEST_CONFORMANCE_BASIC_PROCS_H
+#define TEST_CONFORMANCE_BASIC_PROCS_H
+
+#include "../../test_common/harness/kernelHelpers.h"
+#include "../../test_common/harness/testHarness.h"
+#include "../../test_common/harness/errorHelpers.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/rounding_mode.h"
+
+extern void     memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
+
+// All test entry points share one signature: (device, context, queue, element count) -> int.
+extern int      test_hostptr(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_fpmath_float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_fpmath_float2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_fpmath_float4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_intmath_int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_intmath_int2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_intmath_int4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_intmath_long(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_intmath_long2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_intmath_long4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_hiloeo(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_if(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_sizeof(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_loop(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_pointer_cast(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_local_arg_def(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_local_kernel_def(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_local_kernel_scope(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_constant(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int        test_constant_source(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_readimage(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_readimage_int16(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_readimage_fp32(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_writeimage(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_writeimage_int16(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_writeimage_fp32(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_multireadimageonefmt(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_multireadimagemultifmt(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_image_r8(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_simplebarrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_barrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_wg_barrier(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_int2float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_float2int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagearraycopy(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagearraycopy3d(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagereadwrite(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagereadwrite3d(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_readimage3d(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_readimage3d_int16(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_readimage3d_fp32(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_bufferreadwriterect(cl_device_id device, cl_context context, cl_command_queue queue_, int num_elements);
+extern int      test_imagecopy(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagecopy3d(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagerandomcopy(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_arraycopy(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems);
+extern int      test_arrayimagecopy(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_arrayimagecopy3d(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagenpot(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_sampler_float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_sampler_int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_createkernelsinprogram(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_single_large_allocation(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_multiple_max_allocation(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_arrayreadwrite(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagedim_pow2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_imagedim_non_pow2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_image_param(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_image_multipass_integer_coord(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_image_multipass_float_coord(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_vload_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vload_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vload_constant(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vload_private(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vstore_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vstore_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vstore_private(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_explicit_s2v_bool(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_char(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_uchar(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_short(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_ushort(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_uint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_long(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_ulong(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_explicit_s2v_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_enqueue_map_buffer(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_enqueue_map_image(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_work_item_functions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_astype(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_native_kernel(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_async_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_async_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_async_strided_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_async_strided_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_prefetch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_host_numeric_constants(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_kernel_numeric_constants(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_kernel_limit_constants(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int    test_kernel_preprocessor_macros(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_kernel_call_kernel_function(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+extern int      test_basic_parameter_types(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vector_creation(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int      test_vec_type_hint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+
+
+extern int test_kernel_memory_alignment_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_kernel_memory_alignment_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_kernel_memory_alignment_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_kernel_memory_alignment_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+
+extern int test_progvar_prog_scope_misc(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_progvar_prog_scope_uninit(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_progvar_prog_scope_init(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_progvar_func_scope(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+
+extern int test_global_work_offsets(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_get_global_offset(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+
+extern int test_global_linear_id(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_local_linear_id(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+extern int test_enqueued_local_size(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems );
+
+extern int test_simple_read_image_pitch(cl_device_id device, cl_context cl_context_, cl_command_queue q, int num_elements);
+extern int test_simple_write_image_pitch(cl_device_id device, cl_context cl_context_, cl_command_queue q, int num_elements);
+
+// Only declared on Apple platforms; main.c registers it under the same guard.
+#if defined( __APPLE__ )
+extern int test_queue_priority(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements);
+#endif
+
+extern int test_get_linear_ids(cl_device_id device, cl_context cl_context_, cl_command_queue q, int num_elements);
+extern int test_rw_image_access_qualifier(cl_device_id device_id, cl_context context, cl_command_queue commands, int num_elements);
+
+#endif // TEST_CONFORMANCE_BASIC_PROCS_H
+
--- a/test_conformance/basic/run_array
+++ b/test_conformance/basic/run_array
@@ -0,0 +1,3 @@
+#!/bin/sh
+cd `dirname $0`
+./test_basic arrayreadwrite arraycopy bufferreadwriterect $@
--- a/test_conformance/basic/run_array_image_copy
+++ b/test_conformance/basic/run_array_image_copy
@@ -0,0 +1,3 @@
+#!/bin/sh
+cd `dirname $0`
+./test_basic arrayimagecopy arrayimagecopy3d imagearraycopy
--- a/test_conformance/basic/run_image
+++ b/test_conformance/basic/run_image
@@ -0,0 +1,17 @@
+#!/bin/sh
+cd `dirname $0`
+./test_basic  \
+imagecopy imagerandomcopy \
+imagearraycopy imagearraycopy3d \
+image_r8 \
+readimage readimage_int16 readimage_fp32 \
+writeimage writeimage_int16 writeimage_fp32 \
+imagenpot \
+image_param \
+image_multipass_integer_coord \
+readimage3d \
+readimage3d_int16 \
+readimage3d_fp32 \
+imagereadwrite3d \
+imagereadwrite \
+$@
--- a/test_conformance/basic/run_multi_read_image
+++ b/test_conformance/basic/run_multi_read_image
@@ -0,0 +1,4 @@
+#!/bin/sh
+cd `dirname $0`
+./test_basic mri_one mri_multiple
+
--- a/test_conformance/basic/test_arraycopy.c
+++ b/test_conformance/basic/test_arraycopy.c
@@ -0,0 +1,201 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+
+// OpenCL C source of the copy kernel: each work-item copies one unsigned int from src to dst.
+const char *copy_kernel_code =
+"__kernel void test_copy(__global unsigned int *src, __global unsigned int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = src[tid];\n"
+"}\n";
+
+// Fills data[0..count) with random values from generator d, masked to 31 bits.
+static void
+arraycopy_randomize(cl_uint *data, unsigned count, MTdata d)
+{
+    unsigned i;
+
+    for (i=0; i<count; i++)
+        data[i] = (cl_uint)(genrand_int32(d) & 0x7FFFFFFF);
+}
+
+// Returns non-zero when the first count elements of expected and actual differ anywhere.
+static int
+arraycopy_has_mismatch(const cl_uint *expected, const cl_uint *actual, unsigned count)
+{
+    return memcmp(expected, actual, sizeof(cl_uint) * count) != 0;
+}
+
+// Copies a 128K-element cl_uint array into a result buffer in three ways and
+// verifies each copy against the host data:
+//   1. clEnqueueCopyBuffer from a CL_MEM_USE_HOST_PTR buffer,
+//   2. clEnqueueWriteBuffer into a plain buffer followed by clEnqueueCopyBuffer,
+//   3. the test_copy kernel reading from a CL_MEM_USE_HOST_PTR buffer.
+//
+// device and n_elems are not used. Returns 0 when all three sections pass,
+// otherwise the number of sections whose data did not match (or the failing
+// OpenCL error code / -1 when a setup step fails).
+//
+// NOTE: test_error() returns from the function immediately, so objects created
+// before a failing OpenCL call are not released on that path.
+int
+test_arraycopy(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
+{
+    cl_uint    *input_ptr, *output_ptr;
+    cl_mem      host_backed_src, plain_src, kernel_src, results;
+    cl_program  program;
+    cl_kernel   kernel;
+    unsigned    num_elements = 128 * 1024;
+    cl_uint     num_copies = 1;
+    size_t      buffer_bytes = sizeof(cl_uint) * num_elements;
+    size_t      delta_offset;
+    unsigned    i;
+    cl_int      err;
+    MTdata      d;
+
+    int error_count = 0;
+
+    input_ptr = (cl_uint*)malloc(buffer_bytes);
+    output_ptr = (cl_uint*)malloc(buffer_bytes);
+    if (input_ptr == NULL || output_ptr == NULL)
+    {
+        // free(NULL) is a no-op, so this is safe whichever allocation failed.
+        log_error("Unable to allocate host memory for test_arraycopy\n");
+        free(input_ptr);
+        free(output_ptr);
+        return -1;
+    }
+
+    // results
+    results = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), buffer_bytes, NULL, &err);
+    test_error(err, "clCreateBuffer failed");
+
+/*****************************************************************************************************************************************/
+#pragma mark client backing
+
+    log_info("Testing CL_MEM_USE_HOST_PTR buffer with clEnqueueCopyBuffer\n");
+    // randomize data
+    d = init_genrand( gRandomSeed );
+    arraycopy_randomize(input_ptr, num_elements, d);
+
+    // client backing
+    host_backed_src = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), buffer_bytes, input_ptr, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    // The array is copied in num_copies equally sized slices.
+    delta_offset = buffer_bytes / num_copies;
+    for (i=0; i<num_copies; i++)
+    {
+        size_t    offset = i * delta_offset;
+        err = clEnqueueCopyBuffer(queue, host_backed_src, results, offset, offset, delta_offset, 0, NULL, NULL);
+        test_error(err, "clEnqueueCopyBuffer failed");
+    }
+
+    // Blocking read of the copied data back to the host
+    err = clEnqueueReadBuffer( queue, results, CL_TRUE, 0, buffer_bytes, output_ptr, 0, NULL, NULL );
+    test_error(err, "clEnqueueReadBuffer failed");
+
+    if (arraycopy_has_mismatch(input_ptr, output_ptr, num_elements))
+    {
+        error_count++;
+        log_error("\tCL_MEM_USE_HOST_PTR buffer with clEnqueueCopyBuffer FAILED\n");
+    }
+    else
+        log_info("\tCL_MEM_USE_HOST_PTR buffer with clEnqueueCopyBuffer passed\n");
+
+
+
+#pragma mark framework backing (no client data)
+
+    log_info("Testing with clEnqueueWriteBuffer and clEnqueueCopyBuffer\n");
+    // randomize data
+    arraycopy_randomize(input_ptr, num_elements, d);
+
+    // no backing
+    plain_src = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), buffer_bytes, NULL, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    // Copy the array up from host ptr; the upload does not depend on the slice, so it is done once.
+    err = clEnqueueWriteBuffer(queue, plain_src, CL_TRUE, 0, buffer_bytes, input_ptr, 0, NULL, NULL);
+    test_error(err, "clEnqueueWriteBuffer failed");
+
+    for (i=0; i<num_copies; i++)
+    {
+        size_t    offset = i * delta_offset;
+
+        err = clEnqueueCopyBuffer(queue, plain_src, results, offset, offset, delta_offset, 0, NULL, NULL);
+        test_error(err, "clEnqueueCopyBuffer failed");
+    }
+
+    err = clEnqueueReadBuffer( queue, results, CL_TRUE, 0, buffer_bytes, output_ptr, 0, NULL, NULL );
+    test_error(err, "clEnqueueReadBuffer failed");
+
+    if (arraycopy_has_mismatch(input_ptr, output_ptr, num_elements))
+    {
+        error_count++;
+        log_error("\tclEnqueueWriteBuffer and clEnqueueCopyBuffer FAILED\n");
+    }
+    else
+        log_info("\tclEnqueueWriteBuffer and clEnqueueCopyBuffer passed\n");
+
+/*****************************************************************************************************************************************/
+#pragma mark kernel copy test
+
+    log_info("Testing CL_MEM_USE_HOST_PTR buffer with kernel copy\n");
+    // randomize data
+    arraycopy_randomize(input_ptr, num_elements, d);
+    free_mtdata(d); d= NULL;
+
+    // client backing
+    kernel_src = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), buffer_bytes, input_ptr, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    err = create_single_kernel_helper(context, &program, &kernel, 1, &copy_kernel_code, "test_copy" );
+    test_error(err, "create_single_kernel_helper failed");
+
+    err = clSetKernelArg(kernel, 0, sizeof kernel_src, &kernel_src);
+    err |= clSetKernelArg(kernel, 1, sizeof results, &results);
+    test_error(err, "clSetKernelArg failed");
+
+    // One work-item per element; only the first dimension is used.
+    size_t threads[3] = {num_elements, 0, 0};
+
+    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL );
+    test_error(err, "clEnqueueNDRangeKernel failed");
+
+    err = clEnqueueReadBuffer( queue, results, CL_TRUE, 0, buffer_bytes, output_ptr, 0, NULL, NULL );
+    test_error(err, "clEnqueueReadBuffer failed");
+
+    // Report this section on its own result, so that an earlier failure is not
+    // attributed to the kernel copy.
+    if (arraycopy_has_mismatch(input_ptr, output_ptr, num_elements))
+    {
+        error_count++;
+        log_error("\tCL_MEM_USE_HOST_PTR buffer with kernel copy FAILED\n");
+    }
+    else
+        log_info("\tCL_MEM_USE_HOST_PTR buffer with kernel copy passed\n");
+
+
+    clReleaseProgram(program);
+    clReleaseKernel(kernel);
+    clReleaseMemObject(results);
+    clReleaseMemObject(host_backed_src);
+    clReleaseMemObject(plain_src);
+    clReleaseMemObject(kernel_src);
+
+    free(input_ptr);
+    free(output_ptr);
+
+    // Non-zero means failure: the number of sections whose data did not match.
+    return error_count;
+}
+
+
+
--- a/test_conformance/basic/test_arrayimagecopy.c
+++ b/test_conformance/basic/test_arrayimagecopy.c
@@ -0,0 +1,143 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "procs.h"
+
+// Writes bufptr into `buffer`, copies the buffer into `image` with clEnqueueCopyBufferToImage and
+// reads the image back into imgptr. Returns CL_SUCCESS or the first failing CL error code; the
+// host pointers stay owned by the caller, so an early return here cannot leak them.
+static int copy_array_to_image2d(cl_command_queue queue, cl_mem buffer, cl_mem image, const size_t *region,
+                                 const cl_uchar *bufptr, cl_uchar *imgptr, size_t buffer_size)
+{
+  size_t   origin[3] = {0, 0, 0};
+  cl_event copyevent = NULL;
+  cl_int   err;
+
+  err = clEnqueueWriteBuffer( queue, buffer, CL_TRUE, 0, buffer_size, bufptr, 0, NULL, NULL);
+  test_error(err, "clEnqueueWriteBuffer failed");
+  err = clEnqueueCopyBufferToImage( queue, buffer, image, 0, origin, region, 0, NULL, &copyevent );
+  test_error(err, "clEnqueueCopyBufferToImage failed");
+  // The blocking read waits on the copy event, so the event can be released as soon as it returns.
+  err = clEnqueueReadImage( queue, image, CL_TRUE, origin, region, 0, 0, imgptr, 1, &copyevent, NULL );
+  clReleaseEvent(copyevent);
+  test_error(err, "clEnqueueReadImage failed");
+  return CL_SUCCESS;
+}
+
+// Checks that clEnqueueCopyBufferToImage reproduces a random buffer exactly in a 512x512 2D image
+// of the given format. Returns 0 on success, the CL error code if an API call fails, or -1 on a
+// data mismatch or host allocation failure.
+int test_arrayimagecopy_single_format(cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format)
+{
+  clMemWrapper buffer, image;
+  const int    img_width = 512, img_height = 512;
+  size_t       elem_size, buffer_size;
+  size_t       region[3] = {(size_t)img_width, (size_t)img_height, 1};
+  cl_int       err;
+
+  log_info("Testing %s %s\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type));
+
+  image = create_image_2d(context, (cl_mem_flags)(CL_MEM_READ_WRITE), format, img_width, img_height, 0, NULL, &err);
+  test_error(err, "create_image_2d failed");
+
+  err = clGetImageInfo(image, CL_IMAGE_ELEMENT_SIZE, sizeof(size_t), &elem_size, NULL);
+  test_error(err, "clGetImageInfo failed");
+
+  buffer_size = sizeof(cl_uchar) * elem_size * img_width * img_height;
+  buffer = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  buffer_size, NULL, &err);
+  test_error(err, "clCreateBuffer failed");
+
+  // Host copies: bufptr holds the reference data, imgptr receives the image read-back.
+  cl_uchar *bufptr = (cl_uchar*)malloc(buffer_size);
+  cl_uchar *imgptr = (cl_uchar*)malloc(buffer_size);
+  if (bufptr == NULL || imgptr == NULL) {
+    log_error("ERROR: Unable to allocate host memory for the copy test\n");
+    free(bufptr); free(imgptr);
+    return -1;
+  }
+  MTdata d = init_genrand( gRandomSeed );
+  for (size_t n = 0; n < buffer_size; n++)
+    bufptr[n] = (cl_uchar)genrand_int32(d);
+  free_mtdata(d);
+
+  err = copy_array_to_image2d(queue, buffer, image, region, bufptr, imgptr, buffer_size);
+  if (err == CL_SUCCESS && memcmp(bufptr, imgptr, buffer_size) != 0) {
+    log_error( "ERROR: Results did not validate!\n" );
+    int failuresPrinted = 0;
+    // Dump the first mismatching pixels: imgptr is what the device produced, bufptr what was expected.
+    for (int i = 0; i < (int)buffer_size && failuresPrinted <= 5; i += (int)elem_size) {
+      if (memcmp(bufptr + i, imgptr + i, elem_size) == 0)
+        continue;
+      char values[4096];
+      sprintf(values, "%d(0x%x) -> actual [", i, i);
+      for (int j = 0; j < (int)elem_size; j++)
+        sprintf(values + strlen(values), "0x%02x ", imgptr[i+j]);
+      sprintf(values + strlen(values), "] != expected [");
+      for (int j = 0; j < (int)elem_size; j++)
+        sprintf(values + strlen(values), "0x%02x ", bufptr[i+j]);
+      sprintf(values + strlen(values), "]");
+      log_error("%s\n", values);
+      if (++failuresPrinted > 5)
+        log_error("Not printing further failures...\n");
+    }
+    err = -1;
+  }
+
+  free(bufptr);
+  free(imgptr);
+  if (err)
+    log_error("ARRAY to IMAGE copy test failed for image_channel_order=0x%lx and image_channel_data_type=0x%lx\n",
+              (unsigned long)format->image_channel_order, (unsigned long)format->image_channel_data_type);
+
+  return err;
+}
+
+// Runs the buffer -> 2D image copy test for every CL_MEM_READ_WRITE 2D image format the
+// context supports. Returns 0 when all formats pass, non-zero otherwise.
+int test_arrayimagecopy(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+  cl_int          err;
+  cl_uint         num_formats;
+
+  PASSIVE_REQUIRE_IMAGE_SUPPORT( device )
+
+  err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &num_formats);
+  test_error(err, "clGetSupportedImageFormats failed");
+  cl_image_format *formats = (cl_image_format *)malloc(num_formats * sizeof(cl_image_format));
+  if (formats == NULL && num_formats != 0) {
+    log_error("ERROR: Unable to allocate the image format list\n");
+    return -1;
+  }
+  err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, num_formats, formats, NULL);
+  if (err != CL_SUCCESS)
+    free(formats);  // test_error below returns early, so release the list first
+  test_error(err, "clGetSupportedImageFormats failed");
+  for (cl_uint i = 0; i < num_formats; i++)
+    err |= test_arrayimagecopy_single_format(device, context, queue, &formats[i]);
+  free(formats);
+  if (err)
+    log_error("ARRAY to IMAGE copy test failed\n");
+  else
+    log_info("ARRAY to IMAGE copy test passed\n");
+  return err;
+}
--- a/test_conformance/basic/test_arrayimagecopy3d.c
+++ b/test_conformance/basic/test_arrayimagecopy3d.c
@@ -0,0 +1,144 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "procs.h"
+
+// Writes bufptr into `buffer`, copies the buffer into `image` with clEnqueueCopyBufferToImage and
+// reads the image back into imgptr. Returns CL_SUCCESS or the first failing CL error code; the
+// host pointers stay owned by the caller, so an early return here cannot leak them.
+static int copy_array_to_image3d(cl_command_queue queue, cl_mem buffer, cl_mem image, const size_t *region,
+                                 const cl_uchar *bufptr, cl_uchar *imgptr, size_t buffer_size)
+{
+  size_t   origin[3] = {0, 0, 0};
+  cl_event copyevent = NULL;
+  cl_int   err;
+
+  err = clEnqueueWriteBuffer( queue, buffer, CL_TRUE, 0, buffer_size, bufptr, 0, NULL, NULL);
+  test_error(err, "clEnqueueWriteBuffer failed");
+
+  err = clEnqueueCopyBufferToImage( queue, buffer, image, 0, origin, region, 0, NULL, &copyevent );
+  test_error(err, "clEnqueueCopyBufferToImage failed");
+  // The blocking read waits on the copy event, so the event can be released as soon as it returns.
+  err = clEnqueueReadImage( queue, image, CL_TRUE, origin, region, 0, 0, imgptr, 1, &copyevent, NULL );
+  clReleaseEvent(copyevent);
+  test_error(err, "clEnqueueReadImage failed");
+  return CL_SUCCESS;
+}
+
+// Checks that clEnqueueCopyBufferToImage reproduces a random buffer exactly in a 128x128x32 3D
+// image of the given format. Returns 0 on success, the CL error code if an API call fails, or -1
+// on a data mismatch or host allocation failure.
+int test_arrayimagecopy3d_single_format(cl_device_id device, cl_context context, cl_command_queue queue, cl_image_format *format)
+{
+  clMemWrapper buffer, image;
+  const int    img_width = 128, img_height = 128, img_depth = 32;
+  size_t       elem_size, buffer_size;
+  size_t       region[3] = {(size_t)img_width, (size_t)img_height, (size_t)img_depth};
+  cl_int       err;
+
+  log_info("Testing %s %s\n", GetChannelOrderName(format->image_channel_order), GetChannelTypeName(format->image_channel_data_type));
+
+  image = create_image_3d(context, (cl_mem_flags)(CL_MEM_READ_WRITE), format, img_width, img_height, img_depth, 0, 0, NULL, &err);
+  test_error(err, "create_image_3d failed");
+
+  err = clGetImageInfo(image, CL_IMAGE_ELEMENT_SIZE, sizeof(size_t), &elem_size, NULL);
+  test_error(err, "clGetImageInfo failed");
+
+  buffer_size = sizeof(cl_uchar) * elem_size * img_width * img_height * img_depth;
+  buffer = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  buffer_size, NULL, &err);
+  test_error(err, "clCreateBuffer failed");
+
+  // Host copies: bufptr holds the reference data, imgptr receives the image read-back.
+  cl_uchar *bufptr = (cl_uchar*)malloc(buffer_size);
+  cl_uchar *imgptr = (cl_uchar*)malloc(buffer_size);
+  if (bufptr == NULL || imgptr == NULL) {
+    log_error("ERROR: Unable to allocate host memory for the copy test\n");
+    free(bufptr); free(imgptr);
+    return -1;
+  }
+  MTdata d = init_genrand( gRandomSeed );
+  for (size_t n = 0; n < buffer_size; n++)
+    bufptr[n] = (cl_uchar)genrand_int32(d);
+  free_mtdata(d);
+
+  err = copy_array_to_image3d(queue, buffer, image, region, bufptr, imgptr, buffer_size);
+  if (err == CL_SUCCESS && memcmp(bufptr, imgptr, buffer_size) != 0) {
+    log_error( "ERROR: Results did not validate!\n" );
+    int failuresPrinted = 0;
+    // Dump the first mismatching pixels: imgptr is what the device produced, bufptr what was expected.
+    for (int i = 0; i < (int)buffer_size && failuresPrinted <= 5; i += (int)elem_size) {
+      if (memcmp(bufptr + i, imgptr + i, elem_size) == 0)
+        continue;
+      char values[4096];
+      sprintf(values, "%d(0x%x) -> actual [", i, i);
+      for (int j = 0; j < (int)elem_size; j++)
+        sprintf(values + strlen(values), "0x%02x ", imgptr[i+j]);
+      sprintf(values + strlen(values), "] != expected [");
+      for (int j = 0; j < (int)elem_size; j++)
+        sprintf(values + strlen(values), "0x%02x ", bufptr[i+j]);
+      sprintf(values + strlen(values), "]");
+      log_error("%s\n", values);
+      if (++failuresPrinted > 5)
+        log_error("Not printing further failures...\n");
+    }
+    err = -1;
+  }
+
+  free(bufptr);
+  free(imgptr);
+  if (err)
+    log_error("ARRAY to IMAGE3D copy test failed for image_channel_order=0x%lx and image_channel_data_type=0x%lx\n",
+              (unsigned long)format->image_channel_order, (unsigned long)format->image_channel_data_type);
+
+  return err;
+}
+
+// Runs the buffer -> 3D image copy test for every CL_MEM_READ_WRITE 3D image format the
+// context supports. Returns 0 when all formats pass, non-zero otherwise.
+int test_arrayimagecopy3d(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+  cl_int          err;
+  cl_uint         num_formats;
+
+  PASSIVE_REQUIRE_3D_IMAGE_SUPPORT( device )
+
+  err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE3D, 0, NULL, &num_formats);
+  test_error(err, "clGetSupportedImageFormats failed");
+  cl_image_format *formats = (cl_image_format *)malloc(num_formats * sizeof(cl_image_format));
+  if (formats == NULL && num_formats != 0) {
+    log_error("ERROR: Unable to allocate the image format list\n");
+    return -1;
+  }
+  err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE3D, num_formats, formats, NULL);
+  if (err != CL_SUCCESS)
+    free(formats);  // test_error below returns early, so release the list first
+  test_error(err, "clGetSupportedImageFormats failed");
+  for (cl_uint i = 0; i < num_formats; i++)
+    err |= test_arrayimagecopy3d_single_format(device, context, queue, &formats[i]);
+  free(formats);
+  if (err)
+    log_error("ARRAY to IMAGE3D copy test failed\n");
+  else
+    log_info("ARRAY to IMAGE3D copy test passed\n");
+  return err;
+}
--- a/test_conformance/basic/test_arrayreadwrite.c
+++ b/test_conformance/basic/test_arrayreadwrite.c
@@ -0,0 +1,95 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+
+
+// Performs num_tries blocking write/read round trips over random sub-ranges of `stream` and checks
+// that each range reads back unchanged. Returns CL_SUCCESS, a CL error code, or -1 on mismatch.
+static int readwrite_random_ranges(cl_command_queue queue, cl_mem stream, MTdata d, int num_tries,
+                                   const cl_uint *inptr, cl_uint *outptr, int num_elements)
+{
+    for (int attempt = 0; attempt < num_tries; attempt++) {
+        int offset, cb, err;
+        // Rejection-sample an element offset in (0, num_elements).
+        do {
+            offset = (int)(genrand_int32(d) & 0x7FFFFFFF);
+        } while (offset <= 0 || offset >= num_elements);
+
+        // Clamp the element count to [1, num_elements - offset]; a zero-sized transfer is rejected by CL.
+        cb = (int)(genrand_int32(d) & 0x7FFFFFFF);
+        if (cb > (num_elements - offset))
+            cb = num_elements - offset;
+        if (cb == 0)
+            cb = 1;
+
+        err = clEnqueueWriteBuffer(queue, stream, CL_TRUE, offset*sizeof(cl_uint), sizeof(cl_uint)*cb,&inptr[offset], 0, NULL, NULL);
+        test_error(err, "clEnqueueWriteBuffer failed");
+        err = clEnqueueReadBuffer( queue, stream, CL_TRUE, offset*sizeof(cl_uint), cb*sizeof(cl_uint), &outptr[offset], 0, NULL, NULL );
+        test_error(err, "clEnqueueReadBuffer failed");
+        if (memcmp(&inptr[offset], &outptr[offset], cb * sizeof(cl_uint)) != 0) {
+            log_error("ARRAY read, write test failed\n");
+            return -1;
+        }
+    }
+    return CL_SUCCESS;
+}
+
+// Round-trips random sub-ranges of a 4M-element cl_uint buffer through clEnqueueWriteBuffer and
+// clEnqueueReadBuffer. The incoming num_elements is overridden. Returns 0 on success.
+int
+test_arrayreadwrite(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_int err;
+    num_elements = 1024 * 1024 * 4;
+    // Create the device buffer first so its test_error exit has nothing to clean up.
+    cl_mem stream = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_uint) * num_elements, NULL, &err);
+    test_error(err, "clCreateBuffer failed");
+
+    cl_uint *inptr = (cl_uint*)malloc(num_elements*sizeof(cl_uint));
+    cl_uint *outptr = (cl_uint*)malloc(num_elements*sizeof(cl_uint));
+    if (inptr == NULL || outptr == NULL) {
+        log_error("ERROR: Unable to allocate host buffers\n");
+        free(inptr); free(outptr);
+        clReleaseMemObject(stream);
+        return -1;
+    }
+
+    // randomize data
+    MTdata d = init_genrand( gRandomSeed );
+    for (int i=0; i<num_elements; i++)
+        inptr[i] = (cl_uint)(genrand_int32(d) & 0x7FFFFFFF);
+
+    err = readwrite_random_ranges(queue, stream, d, 400, inptr, outptr, num_elements);
+    free_mtdata(d);
+    clReleaseMemObject(stream);
+    free(inptr);
+    free(outptr);
+    if (!err)
+        log_info("ARRAY read, write test passed\n");
+    return err;
+}
+
+
+
--- a/test_conformance/basic/test_astype.cpp
+++ b/test_conformance/basic/test_astype.cpp
@@ -0,0 +1,288 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+
+static const char *astype_kernel_pattern = // general case: pragma line, then <type><size> pairs for src, dst, tmp and as_
+"%s\n"
+"__kernel void test_fn( __global %s%s *src, __global %s%s *dst )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"    %s%s tmp = as_%s%s( src[ tid ] );\n"
+"   dst[ tid ] = tmp;\n"
+"}\n";
+// V3srcV3dst: source and destination are both accessed with vload3/vstore3 through scalar pointers.
+static const char *astype_kernel_pattern_V3srcV3dst =
+"%s\n"
+"__kernel void test_fn( __global %s *src, __global %s *dst )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"    %s%s tmp = as_%s%s( vload3(tid,src) );\n"
+"   vstore3(tmp,tid,dst);\n"
+"}\n";
+// In the sprintf, drop the third and fifth arguments of the astype_kernel_pattern call
+// (the src and dst pointer size suffixes, each of which would be "3").
+// V3dst: only the destination is written with vstore3; the "3" suffixes are hard-coded in the pattern.
+static const char *astype_kernel_pattern_V3dst =
+"%s\n"
+"__kernel void test_fn( __global %s%s *src, __global %s *dst )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"    %s3 tmp = as_%s3( src[ tid ] );\n"
+"   vstore3(tmp,tid,dst);\n"
+"}\n";
+// In the sprintf, drop the fifth, seventh and ninth arguments of the astype_kernel_pattern call
+// (every output size suffix, each of which would be "3").
+
+// V3src: only the source is read with vload3 through a scalar pointer.
+static const char *astype_kernel_pattern_V3src =
+"%s\n"
+"__kernel void test_fn( __global %s *src, __global %s%s *dst )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"    %s%s tmp = as_%s%s( vload3(tid,src) );\n"
+"   dst[ tid ] = tmp;\n"
+"}\n";
+// In the sprintf, drop the third argument of the astype_kernel_pattern call
+// (the src pointer size suffix, which would be "3").
+
+
+// Creates the input/output buffers, runs `kernel` over numElements work-items and reads the output
+// back into outBuffer. threads/localThreads receive the launch sizes so callers can report them.
+// Returns CL_SUCCESS or the first failing CL error code; the host buffers stay owned by the caller,
+// so an early return here cannot leak them.
+static int run_astype_kernel( cl_context context, cl_command_queue queue, cl_kernel kernel, int numElements,
+                             char *inBuffer, size_t inBufferSize, char *outBuffer, size_t outBufferSize,
+                             size_t *threads, size_t *localThreads )
+{
+    clMemWrapper streams[ 2 ];
+    int error;
+
+    // Create I/O streams and set arguments
+    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, inBufferSize, inBuffer, &error );
+    test_error( error, "Unable to create I/O stream" );
+    streams[ 1 ] = clCreateBuffer( context, CL_MEM_READ_WRITE, outBufferSize, NULL, &error );
+    test_error( error, "Unable to create I/O stream" );
+
+    error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[ 1 ] );
+    test_error( error, "Unable to set kernel argument" );
+
+    // Run the kernel
+    threads[ 0 ] = numElements;
+    error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
+    test_error( error, "Unable to get group size to run with" );
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Unable to run kernel" );
+
+    // Blocking read, so outBuffer is complete once this call returns.
+    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, outBufferSize, outBuffer, 0, NULL, NULL );
+    test_error( error, "Unable to read results" );
+    return CL_SUCCESS;
+}
+
+// Builds a kernel applying as_<outVecType><outVecSize>() to <inVecType><vecSize> inputs, runs it on
+// random data and checks that each output carries the same bit pattern as its input. Size pairs whose
+// result is undefined or implementation-defined only have to build and run; they are not compared.
+// Returns 0 on success, 1 on a data mismatch, -1 on host allocation failure, or a CL error code.
+int test_astype_set( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType inVecType, ExplicitType outVecType,
+                    unsigned int vecSize, unsigned int outVecSize,
+                    int numElements )
+{
+    int error;
+
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+
+    char programSrc[ 10240 ];
+    size_t threads[ 1 ], localThreads[ 1 ];
+    size_t typeSize = get_explicit_type_size( inVecType );
+    size_t outTypeSize = get_explicit_type_size(outVecType);
+    char sizeNames[][ 3 ] = { "", "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
+    const char *pragma = (outVecType == kDouble || inVecType == kDouble) ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "";
+    const char *inName = get_explicit_type_name( inVecType );
+    const char *outName = get_explicit_type_name( outVecType );
+
+    // Create program. 3-component vectors are accessed with vload3/vstore3 through scalar pointers,
+    // so the V3 patterns take fewer size-suffix arguments than the general pattern.
+    if(outVecSize == 3 && vecSize == 3) {
+        sprintf( programSrc, astype_kernel_pattern_V3srcV3dst, pragma,
+                inName,
+                outName,
+                outName, sizeNames[ outVecSize ],
+                outName, sizeNames[ outVecSize ] );
+    } else if(outVecSize == 3) {
+        sprintf( programSrc, astype_kernel_pattern_V3dst, pragma,
+                inName, sizeNames[ vecSize ],
+                outName,
+                outName,
+                outName );
+    } else if(vecSize == 3) {
+        sprintf( programSrc, astype_kernel_pattern_V3src, pragma,
+                inName,
+                outName, sizeNames[ outVecSize ],
+                outName, sizeNames[ outVecSize ],
+                outName, sizeNames[ outVecSize ] );
+    } else {
+        sprintf( programSrc, astype_kernel_pattern, pragma,
+                inName, sizeNames[ vecSize ],
+                outName, sizeNames[ outVecSize ],
+                outName, sizeNames[ outVecSize ],
+                outName, sizeNames[ outVecSize ] );
+    }
+
+    const char *ptr = programSrc;
+    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+    test_error( error, "Unable to create testing kernel" );
+
+    // Create some input values
+    size_t inBufferSize = sizeof(char)* numElements * typeSize * vecSize;
+    size_t outBufferSize = sizeof(char)* numElements * outTypeSize *outVecSize;
+    char *inBuffer = (char*)malloc( inBufferSize );
+    char *outBuffer = (char*)malloc( outBufferSize );
+    if( inBuffer == NULL || outBuffer == NULL )
+    {
+        log_error( "ERROR: Unable to allocate host buffers for the astype test\n" );
+        free(inBuffer);
+        free(outBuffer);
+        return -1;
+    }
+
+    MTdata d = init_genrand( gRandomSeed );
+    generate_random_data( inVecType, numElements * vecSize, d, inBuffer );
+    free_mtdata(d);
+
+    error = run_astype_kernel( context, queue, kernel, numElements, inBuffer, inBufferSize,
+                              outBuffer, outBufferSize, threads, localThreads );
+
+    // as_type4(vec3) produces undefined results, and as_typen(vecm) with m != n (neither being 3)
+    // produces implementation-defined results, so those cases are run but not compared.
+    bool compare = !( outVecSize == 4 && vecSize == 3 ) &&
+                   !( outVecSize != 3 && vecSize != 3 && outVecSize != vecSize );
+
+    // astype returns the same bits as a different type, so output and input must match bit for bit
+    // over the smaller of the two element footprints.
+    size_t compSize = typeSize*vecSize;
+    if(outTypeSize*outVecSize < compSize)
+        compSize = outTypeSize*outVecSize;
+
+    char *expected = inBuffer;
+    char *actual = outBuffer;
+    for( int i = 0; error == CL_SUCCESS && compare && i < numElements; i++ )
+    {
+        if( memcmp( expected, actual, compSize ) != 0 )
+        {
+            char expectedString[ 1024 ], actualString[ 1024 ];
+            log_error( "ERROR: Data sample %d of %d for as_%s%d( %s%d ) did not validate (expected {%s}, got {%s})\n",
+                      (int)i, (int)numElements, outName, outVecSize, inName, vecSize,
+                      GetDataVectorString( expected, typeSize, vecSize, expectedString ),
+                      GetDataVectorString( actual, outTypeSize, outVecSize, actualString ) );
+            log_error("Src is :\n%s\n----\n%d threads %d localthreads\n",
+                      programSrc, (int)threads[0],(int) localThreads[0]);
+            error = 1;
+            break;
+        }
+        expected += typeSize * vecSize;
+        actual += outTypeSize * outVecSize;
+    }
+
+    free(inBuffer);
+    free(outBuffer);
+    return error;
+}
+
+// True when kernels on `device` can use `type`: kDouble requires the cl_khr_fp64 extension and the
+// 64-bit integer types require gHasLong.
+static bool astype_type_usable( cl_device_id device, ExplicitType type )
+{
+    if( type == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
+        return false;
+
+    if( ( type == kLong || type == kULong ) && !gHasLong )
+        return false;
+
+    return true;
+}
+
+// Runs test_astype_set for every ordered pair of distinct element types, over all vector-size pairs
+// whose total byte sizes match, plus the 3 <-> 4 component cases for equally sized element types.
+// Returns the sum of the test_astype_set results (0 when every case passed).
+int test_astype(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
+{
+    // Note: casting between vector element sizes that add up to the same total size (i.e. short2 -> char4)
+    // is legal in OpenCL 1.0, but the result depends on the device it runs on, so there is no way for us
+    // to verify what is "valid". The only thing we can check are types that match in size independent
+    // of the element count (char -> uchar, etc)
+    ExplicitType vecTypes[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+    unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
+    int failures = 0;
+
+    for( unsigned int inIdx = 0; vecTypes[ inIdx ] != kNumExplicitTypes; inIdx++ )
+    {
+        const ExplicitType inType = vecTypes[ inIdx ];
+        const size_t inTypeSize = get_explicit_type_size( inType );
+
+        if( !astype_type_usable( device, inType ) )
+            continue;
+
+        for( unsigned int outIdx = 0; vecTypes[ outIdx ] != kNumExplicitTypes; outIdx++ )
+        {
+            const ExplicitType outType = vecTypes[ outIdx ];
+            const size_t outTypeSize = get_explicit_type_size( outType );
+
+            // Skip unsupported output types, and never reinterpret a type as itself.
+            if( !astype_type_usable( device, outType ) || inIdx == outIdx )
+                continue;
+
+            log_info( " (%s->%s)\n", get_explicit_type_name( inType ), get_explicit_type_name( outType ) );
+            fflush( stdout );
+
+            for( unsigned int s = 0; vecSizes[ s ] != 0; s++ )
+            {
+                for( unsigned int o = 0; vecSizes[ o ] != 0; o++ )
+                {
+                    // Only size pairs with the same total byte count are tested.
+                    if( vecSizes[ s ] * inTypeSize == vecSizes[ o ] * outTypeSize )
+                        failures += test_astype_set( device, context, queue, inType, outType, vecSizes[ s ], vecSizes[ o ], n_elems );
+                }
+            }
+
+            if( inTypeSize == outTypeSize )
+            {
+                // as_type3(vec4) allowed, as_type4(vec3) not allowed
+                failures += test_astype_set( device, context, queue, inType, outType, 3, 4, n_elems );
+                failures += test_astype_set( device, context, queue, inType, outType, 4, 3, n_elems );
+            }
+        }
+    }
+
+    return failures;
+}
+
+
--- a/test_conformance/basic/test_async_copy.cpp
+++ b/test_conformance/basic/test_async_copy.cpp
@@ -0,0 +1,279 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+
+#include "procs.h"
+#include "../../test_common/harness/conversions.h"
+
+// OpenCL C source template for the global -> local async copy test.
+// This is a printf-style format string with eight %s slots that test_copy() fills in this order:
+//   1    optional "#pragma OPENCL EXTENSION ..." line (empty unless the type is double)
+//   2-4  element type of src, dst and localBuffer
+//   5-6  vector type name and scalar type name, forming the "(vecN)(scalar)0" zero value
+//   7-8  element type used in the pointer casts passed to async_work_group_copy()
+// The kernel zeroes its slice of localBuffer, lets the work-group copy copiesPerWorkgroup elements
+// from src into localBuffer with async_work_group_copy(), and then has every work-item write its
+// copiesPerWorkItem elements from localBuffer to dst so the host can compare dst against src.
+static const char *async_global_to_local_kernel =
+"%s\n" // optional pragma string
+"__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )\n"
+"{\n"
+" int i;\n"
+// Zero the local storage first, so data left over in local memory cannot pass for copied data
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"     localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (%s)(%s)0;\n"
+// Make sure every work-item in the group has finished zeroing the local buffer before we try the copy
+"    barrier( CLK_LOCAL_MEM_FENCE );\n"
+"    event_t event;\n"
+"    event = async_work_group_copy( (__local %s*)localBuffer, (__global const %s*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, 0 );\n"
+// Wait for the copy to complete, then verify by manually copying to the dest
+"    wait_group_events( 1, &event );\n"
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"  dst[ get_global_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ];\n"
+"}\n" ;
+
+// OpenCL C source template for the local -> global async copy test.
+// This is a printf-style format string with eight %s slots that test_copy() fills in this order:
+//   1    optional "#pragma OPENCL EXTENSION ..." line (empty unless the type is double)
+//   2-4  element type of src, dst and localBuffer
+//   5-6  vector type name and scalar type name, forming the "(vecN)(scalar)0" zero value
+//   7-8  element type used in the pointer casts passed to async_work_group_copy()
+// The kernel zeroes localBuffer, has every work-item stage its copiesPerWorkItem elements of src
+// into localBuffer by hand, and then lets the work-group write copiesPerWorkgroup elements from
+// localBuffer to dst with async_work_group_copy(). The host compares dst against src.
+static const char *async_local_to_global_kernel =
+"%s\n" // optional pragma string
+"__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )\n"
+"{\n"
+" int i;\n"
+// Zero the local storage first
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"  localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (%s)(%s)0;\n"
+// Make sure every work-item in the group has finished zeroing the local buffer before it is filled
+"    barrier( CLK_LOCAL_MEM_FENCE );\n"
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"  localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = src[ get_global_id( 0 )*copiesPerWorkItem+i ];\n"
+// Make sure every work-item has finished staging its data in the local buffer before we try the copy
+"    barrier( CLK_LOCAL_MEM_FENCE );\n"
+"    event_t event;\n"
+"    event = async_work_group_copy((__global %s*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const %s*)localBuffer, (size_t)copiesPerWorkgroup, 0 );\n"
+"    wait_group_events( 1, &event );\n"
+"}\n" ;
+
+
+// OpenCL C source template for the prefetch test.
+// It is formatted by the same sprintf call in test_copy() as the two async-copy templates, so it
+// must consume the same eight %s arguments:
+//   1    optional "#pragma OPENCL EXTENSION ..." line
+//   2-4  element type of src, dst and localBuffer
+//   5-7  swallowed by the "Ignore this" comment line inside the kernel (they have no use here)
+//   8    element type used in the pointer cast passed to prefetch()
+// Each work-item issues prefetch() for its copiesPerWorkItem elements of src and then copies them
+// to dst directly; localBuffer is declared but never touched.
+static const char *prefetch_kernel =
+"%s\n" // optional pragma string
+"__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )\n"
+"{\n"
+" // Ignore this: %s%s%s\n"
+" int i;\n"
+" prefetch( (const __global %s*)(src+copiesPerWorkItem*get_global_id(0)), copiesPerWorkItem);\n"
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"  dst[ get_global_id( 0 )*copiesPerWorkItem+i ] = src[ get_global_id( 0 )*copiesPerWorkItem+i ];\n"
+"}\n" ;
+
+
+
+// Helper for test_copy(): runs the already-built kernel over random input and checks the result.
+//
+// The caller owns inBuffer and outBuffer (each globalBufferSize bytes) and frees them after this
+// function returns. Keeping every early-return error path inside this helper is what lets the
+// caller release the host buffers no matter how the run ends.
+//
+//   elementSize           bytes between consecutive elements in the buffers (3-vectors use 4 lanes)
+//   localBufferSize       bytes of __local memory passed as kernel argument 2
+//   globalWorkgroupSize   global work size; localWorkgroupSize is the work-group size
+//   copiesPerWorkgroup / copiesPerWorkItemInt   values of kernel arguments 3 and 4
+//
+// Returns 0 when dst matches src, -1 when elements differ, or the failing OpenCL error code.
+static int run_copy_kernel_and_verify( cl_context context, cl_command_queue queue, cl_kernel kernel,
+                                       ExplicitType vecType, int vecSize, size_t elementSize,
+                                       size_t localBufferSize, size_t globalBufferSize,
+                                       size_t globalWorkgroupSize, size_t localWorkgroupSize,
+                                       cl_int copiesPerWorkgroup, cl_int copiesPerWorkItemInt,
+                                       void *inBuffer, void *outBuffer )
+{
+    int error;
+    clMemWrapper streams[ 2 ];
+    size_t threads[ 1 ], localThreads[ 1 ];
+    MTdata d;
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    // Random input; zeroed output so that untouched destination elements show up as mismatches.
+    d = init_genrand( gRandomSeed );
+    generate_random_data( vecType, globalBufferSize/get_explicit_type_size(vecType), d, inBuffer );
+    free_mtdata(d); d = NULL;
+    memset(outBuffer, 0, globalBufferSize);
+
+    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, inBuffer, &error );
+    test_error( error, "Unable to create input buffer" );
+    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, outBuffer, &error );
+    test_error( error, "Unable to create output buffer" );
+
+    error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[ 1 ] );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 2, localBufferSize, NULL );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 3, sizeof(copiesPerWorkgroup), &copiesPerWorkgroup );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 4, sizeof(copiesPerWorkItemInt), &copiesPerWorkItemInt );
+    test_error( error, "Unable to set kernel argument" );
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Unable to queue kernel" );
+
+    // Read (blocking, so the results are complete when the call returns)
+    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, globalBufferSize, outBuffer, 0, NULL, NULL );
+    test_error( error, "Unable to read results" );
+
+    // Verify. The whole-buffer memcmp is only a fast path: for 3-vectors the padding lane may
+    // legitimately differ, so the per-element pass compares just the typeSize meaningful bytes.
+    int failuresPrinted = 0;
+    if( memcmp( inBuffer, outBuffer, globalBufferSize ) != 0 )
+    {
+        size_t typeSize = get_explicit_type_size(vecType)* vecSize;
+        unsigned char * inchar = (unsigned char*)inBuffer;
+        unsigned char * outchar = (unsigned char*)outBuffer;
+        for (size_t i=0; i< globalBufferSize; i+=elementSize) {
+            if (memcmp( inchar+i, outchar+i, typeSize) != 0 )
+            {
+                char values[4096];
+                values[0] = 0;
+                if ( failuresPrinted == 0 ) {
+                    // Print first failure message
+                    log_error( "ERROR: Results of copy did not validate!\n" );
+                }
+                sprintf(values + strlen( values), "%d -> [", (int)i);
+                for (size_t j=0; j<elementSize; j++)
+                    sprintf(values + strlen( values), "%2x ", inchar[i+j]);
+                sprintf(values + strlen(values), "] != [");
+                for (size_t j=0; j<elementSize; j++)
+                    sprintf(values + strlen( values), "%2x ", outchar[i+j]);
+                sprintf(values + strlen(values), "]");
+                log_error("%s\n", values);
+                failuresPrinted++;
+            }
+
+            if (failuresPrinted > 5) {
+                log_error("Not printing further failures...\n");
+                break;
+            }
+        }
+    }
+
+    return failuresPrinted ? -1 : 0;
+}
+
+// Builds kernelCode for one element type and checks that the kernel copies src to dst unchanged.
+//
+//   kernelCode  one of the format-string kernel templates in this file (eight %s slots)
+//   vecType     scalar base type of the element
+//   vecSize     vector width (1 for scalars); 3-vectors are laid out with the size of a 4-vector
+//
+// The work-group size is chosen so that the __local buffer uses at most half of
+// CL_DEVICE_LOCAL_MEM_SIZE and does not exceed what the kernel and device allow.
+//
+// Returns 0 on success and a non-zero value on any failure. The host buffers are released on
+// every path that is taken after they have been allocated.
+int test_copy(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode,
+              ExplicitType vecType, int vecSize
+              )
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    void *inBuffer, *outBuffer;
+    char vecNameString[64]; vecNameString[0] = 0;
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType), vecSize);
+
+    log_info("Testing %s\n", vecNameString);
+
+    cl_long max_local_mem_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error( error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    // The value is not used below; the query itself is kept so the sequence of API calls is unchanged.
+    unsigned int num_of_compute_devices;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_of_compute_devices), &num_of_compute_devices, NULL);
+    test_error( error, "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
+
+    char programSource[4096]; programSource[0]=0;
+    char *programPtr;
+
+    // All kernel templates take the same eight string arguments (see their header comments).
+    sprintf(programSource, kernelCode,
+            vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
+            vecNameString, vecNameString, vecNameString, vecNameString, get_explicit_type_name(vecType), vecNameString, vecNameString);
+    //log_info("program: %s\n", programSource);
+    programPtr = programSource;
+
+    error = create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "test_fn" );
+    test_error( error, "Unable to create testing kernel" );
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL);
+    test_error (error, "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
+    test_error (error, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    size_t numberOfCopiesPerWorkitem = 13;
+    // A 3-component vector occupies the storage of a 4-component one.
+    size_t elementSize = get_explicit_type_size(vecType)* ((vecSize == 3) ? 4 : vecSize);
+    size_t localStorageSpacePerWorkitem = numberOfCopiesPerWorkitem*elementSize;
+    // Use at most half of the local memory; divide in size_t so a large size is not truncated to int.
+    size_t maxLocalWorkgroupSize = ((size_t)max_local_mem_size/2)/localStorageSpacePerWorkitem;
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit
+    if(maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t localBufferSize = localWorkgroupSize*elementSize*numberOfCopiesPerWorkitem;
+    size_t numberOfLocalWorkgroups = 1111;
+    size_t globalBufferSize = numberOfLocalWorkgroups*localBufferSize;
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups*localWorkgroupSize;
+
+    inBuffer = malloc(globalBufferSize);
+    outBuffer = malloc(globalBufferSize);
+    if( inBuffer == NULL || outBuffer == NULL )
+    {
+        log_error( "ERROR: Unable to allocate %d bytes of host memory for the test buffers\n", (int)globalBufferSize );
+        free(inBuffer);
+        free(outBuffer);
+        return -1;
+    }
+
+    cl_int copiesPerWorkItemInt, copiesPerWorkgroup;
+    copiesPerWorkItemInt = (int)numberOfCopiesPerWorkitem;
+    copiesPerWorkgroup = (int)(numberOfCopiesPerWorkitem*localWorkgroupSize);
+
+    log_info("Global: %d, local %d, local buffer %db, global buffer %db, each work group will copy %d elements and each work item item will copy %d elements.\n",
+             (int) globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize, (int)globalBufferSize, copiesPerWorkgroup, copiesPerWorkItemInt);
+
+    // Everything that can bail out early lives in the helper, so the frees below always run.
+    error = run_copy_kernel_and_verify( context, queue, kernel, vecType, vecSize, elementSize,
+                                        localBufferSize, globalBufferSize,
+                                        globalWorkgroupSize, localWorkgroupSize,
+                                        copiesPerWorkgroup, copiesPerWorkItemInt,
+                                        inBuffer, outBuffer );
+
+    free(inBuffer);
+    free(outBuffer);
+
+    return error;
+}
+
+// Runs test_copy() with the given kernel template for every supported scalar type and vector width.
+// double is skipped when cl_khr_fp64 is not available, long/ulong when gHasLong is not set.
+// Returns 0 if every combination passed and -1 if at least one failed.
+int test_copy_all_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode) {
+    // Both tables are sentinel-terminated (kNumExplicitTypes / 0).
+    ExplicitType typesToTest[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+    unsigned int widthsToTest[] = { 1, 2, 3, 4, 8, 16, 0 };
+
+    int failedRuns = 0;
+
+    for( const ExplicitType *type = typesToTest; *type != kNumExplicitTypes; ++type )
+    {
+        const bool isDouble = ( *type == kDouble );
+        const bool isLong = ( *type == kLong || *type == kULong );
+
+        // The extension is only queried when a double type is actually up for testing.
+        if( isDouble && !is_extension_available( deviceID, "cl_khr_fp64" ) )
+            continue;
+
+        if( isLong && !gHasLong )
+            continue;
+
+        for( const unsigned int *width = widthsToTest; *width != 0; ++width )
+        {
+            if( test_copy( deviceID, context, queue, kernelCode, *type, *width ) != 0 )
+                ++failedRuns;
+        }
+    }
+
+    return ( failedRuns != 0 ) ? -1 : 0;
+}
+
+
+
+
+// Test entry point: async_work_group_copy() from __global to __local memory, all types and widths.
+int test_async_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // num_elements is not consulted here; the kernel template is the only thing this entry point selects.
+    const char *kernelTemplate = async_global_to_local_kernel;
+    return test_copy_all_types( deviceID, context, queue, kernelTemplate );
+}
+
+// Test entry point: async_work_group_copy() from __local to __global memory, all types and widths.
+int test_async_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // num_elements is not consulted here; the kernel template is the only thing this entry point selects.
+    const char *kernelTemplate = async_local_to_global_kernel;
+    return test_copy_all_types( deviceID, context, queue, kernelTemplate );
+}
+
+// Test entry point: prefetch() followed by a plain global-to-global copy, all types and widths.
+int test_prefetch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // num_elements is not consulted here; the kernel template is the only thing this entry point selects.
+    const char *kernelTemplate = prefetch_kernel;
+    return test_copy_all_types( deviceID, context, queue, kernelTemplate );
+}
+
--- a/test_conformance/basic/test_async_strided_copy.cpp
+++ b/test_conformance/basic/test_async_strided_copy.cpp
@@ -0,0 +1,274 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+
+#include "procs.h"
+#include "../../test_common/harness/conversions.h"
+
+// OpenCL C source template for the strided global -> local async copy test.
+// This is a printf-style format string with nine %s slots that test_strided_copy() fills in this order:
+//   1    optional "#pragma OPENCL EXTENSION ..." line (empty unless the type is double)
+//   2    prefix placed directly before "__kernel" (test_strided_copy() passes an empty string)
+//   3-5  element type of src, dst and localBuffer
+//   6-7  vector type name and scalar type name, forming the "(vecN)(scalar)0" zero value
+//   8-9  element type used in the pointer casts passed to async_work_group_strided_copy()
+// The work-group gathers copiesPerWorkgroup elements, spaced `stride` elements apart in src, into
+// the densely packed localBuffer. Each work-item then scatters its copiesPerWorkItem elements back
+// to dst at the same strided positions, so only every stride-th element of dst is written.
+static const char *async_strided_global_to_local_kernel =
+"%s\n" // optional pragma string
+"%s__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
+"{\n"
+" int i;\n"
+// Zero the local storage first
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"   localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (%s)(%s)0;\n"
+// Make sure every work-item in the group has finished zeroing the local buffer before we try the copy
+" barrier( CLK_LOCAL_MEM_FENCE );\n"
+" event_t event;\n"
+" event = async_work_group_strided_copy( (__local %s*)localBuffer, (__global const %s*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, 0 );\n"
+// Wait for the copy to complete, then verify by manually copying to the dest
+" wait_group_events( 1, &event );\n"
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"   dst[ get_global_id( 0 )*copiesPerWorkItem*stride+i*stride ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ];\n"
+"}\n" ;
+
+// OpenCL C source template for the strided local -> global async copy test.
+// This is a printf-style format string with nine %s slots that test_strided_copy() fills in this order:
+//   1    optional "#pragma OPENCL EXTENSION ..." line (empty unless the type is double)
+//   2    prefix placed directly before "__kernel" (test_strided_copy() passes an empty string)
+//   3-5  element type of src, dst and localBuffer
+//   6-7  vector type name and scalar type name, forming the "(vecN)(scalar)0" zero value
+//   8-9  element type used in the pointer casts passed to async_work_group_strided_copy()
+// Each work-item gathers its copiesPerWorkItem elements, spaced `stride` elements apart in src,
+// into the densely packed localBuffer. The work-group then scatters copiesPerWorkgroup elements
+// to dst with async_work_group_strided_copy(), so only every stride-th element of dst is written.
+static const char *async_strided_local_to_global_kernel =
+"%s\n" // optional pragma string
+"%s__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
+"{\n"
+" int i;\n"
+// Zero the local storage first
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"   localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (%s)(%s)0;\n"
+// Make sure every work-item in the group has finished zeroing the local buffer before it is filled
+" barrier( CLK_LOCAL_MEM_FENCE );\n"
+" for(i=0; i<copiesPerWorkItem; i++)\n"
+"   localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = src[ get_global_id( 0 )*copiesPerWorkItem*stride+i*stride ];\n"
+// Make sure every work-item has finished staging its data in the local buffer before we try the copy
+" barrier( CLK_LOCAL_MEM_FENCE );\n"
+" event_t event;\n"
+" event = async_work_group_strided_copy((__global %s*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const %s*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, 0 );\n"
+" wait_group_events( 1, &event );\n"
+"}\n" ;
+
+
+// Helper for test_strided_copy(): runs the already-built kernel over random input and checks the
+// strided elements of the result.
+//
+// The caller owns inBuffer and outBuffer (each globalBufferSize bytes) and frees them after this
+// function returns. Keeping every early-return path inside this helper is what lets the caller
+// release the host buffers no matter how the run ends.
+//
+//   elementSize   bytes per element in the buffers (3-vectors use 4 lanes)
+//   stride        distance, in elements, between the elements that the kernel copies (must be > 0)
+//
+// Returns 0 when every stride-th element of dst matches src, -1 on the first mismatch, or the
+// failing OpenCL error code.
+static int run_strided_copy_kernel_and_verify( cl_context context, cl_command_queue queue, cl_kernel kernel,
+                                               ExplicitType vecType, int vecSize, size_t elementSize,
+                                               size_t localBufferSize, size_t globalBufferSize,
+                                               size_t globalWorkgroupSize, size_t localWorkgroupSize,
+                                               cl_int copiesPerWorkgroup, cl_int copiesPerWorkItemInt,
+                                               cl_int stride, void *inBuffer, void *outBuffer )
+{
+    int error;
+    clMemWrapper streams[ 2 ];
+    size_t threads[ 1 ], localThreads[ 1 ];
+    MTdata d;
+
+    threads[0] = globalWorkgroupSize;
+    localThreads[0] = localWorkgroupSize;
+
+    // Random input; zeroed output so that untouched destination elements show up as mismatches.
+    d = init_genrand( gRandomSeed );
+    generate_random_data( vecType, globalBufferSize/get_explicit_type_size(vecType), d, inBuffer );
+    free_mtdata(d); d = NULL;
+    memset(outBuffer, 0, globalBufferSize);
+
+    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, inBuffer, &error );
+    test_error( error, "Unable to create input buffer" );
+    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, outBuffer, &error );
+    test_error( error, "Unable to create output buffer" );
+
+    error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[ 1 ] );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 2, localBufferSize, NULL );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 3, sizeof(copiesPerWorkgroup), &copiesPerWorkgroup );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 4, sizeof(copiesPerWorkItemInt), &copiesPerWorkItemInt );
+    test_error( error, "Unable to set kernel argument" );
+    error = clSetKernelArg( kernel, 5, sizeof(stride), &stride );
+    test_error( error, "Unable to set kernel argument" );
+
+    // Enqueue
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
+    test_error( error, "Unable to queue kernel" );
+
+    // Read (blocking, so the results are complete when the call returns)
+    error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, globalBufferSize, outBuffer, 0, NULL, NULL );
+    test_error( error, "Unable to read results" );
+
+    // Verify. Only every stride-th element is copied by the kernel, and only the typeSize
+    // meaningful bytes of it are compared (a 3-vector has one padding lane).
+    size_t typeSize = get_explicit_type_size(vecType)* vecSize;
+    size_t step = elementSize*(size_t)stride;
+    for (size_t i=0; i<globalBufferSize; i+=step)
+    {
+        // Both pointers are already positioned on the element under test.
+        unsigned char * inchar = (unsigned char*)inBuffer + i;
+        unsigned char * outchar = (unsigned char*)outBuffer + i;
+
+        if (memcmp( inchar, outchar, typeSize) != 0 )
+        {
+            char values[4096];
+            values[0] = 0;
+
+            log_error( "ERROR: Results of copy did not validate!\n" );
+            sprintf(values + strlen( values), "%d -> [", (int)i);
+            // Index relative to the element start; adding the byte offset again would run
+            // far outside the buffers.
+            for (size_t j=0; j<elementSize; j++)
+                sprintf(values + strlen( values), "%2x ", inchar[j]);
+            sprintf(values + strlen(values), "] != [");
+            for (size_t j=0; j<elementSize; j++)
+                sprintf(values + strlen( values), "%2x ", outchar[j]);
+            sprintf(values + strlen(values), "]");
+            log_error("%s\n", values);
+
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+// Builds kernelCode for one element type and checks that the kernel copies every stride-th
+// element of src to the same position in dst.
+//
+//   kernelCode  one of the strided kernel templates in this file (nine %s slots)
+//   vecType     scalar base type of the element
+//   vecSize     vector width (1 for scalars); 3-vectors are laid out with the size of a 4-vector
+//   stride      distance, in elements, between copied elements; must be positive
+//
+// The work-group size is chosen so that the __local buffer uses at most half of
+// CL_DEVICE_LOCAL_MEM_SIZE, and the number of work-groups is reduced if the global buffers would
+// otherwise take more than half of CL_DEVICE_GLOBAL_MEM_SIZE.
+//
+// Returns 0 on success and a non-zero value on any failure. The host buffers are released on
+// every path that is taken after they have been allocated.
+int test_strided_copy(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode, ExplicitType vecType, int vecSize, int stride)
+{
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    void *inBuffer, *outBuffer;
+    char vecNameString[64]; vecNameString[0] = 0;
+
+    // A non-positive stride would divide by zero below and never advance the verification loop.
+    if (stride <= 0)
+    {
+        log_error( "ERROR: Invalid copy stride %d\n", stride );
+        return -1;
+    }
+
+    if (vecSize == 1)
+        sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
+    else
+        sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType), vecSize);
+
+
+    log_info("Testing %s\n", vecNameString);
+
+    cl_long max_local_mem_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(max_local_mem_size), &max_local_mem_size, NULL);
+    test_error( error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
+
+    // The value is not used below; the query itself is kept so the sequence of API calls is unchanged.
+    unsigned int num_of_compute_devices;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_of_compute_devices), &num_of_compute_devices, NULL);
+    test_error( error, "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
+
+    char programSource[4096]; programSource[0]=0;
+    char *programPtr;
+
+    // Nine string arguments: pragma, kernel prefix (unused, empty), then the seven type names.
+    sprintf(programSource, kernelCode,
+        vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
+        "",
+        vecNameString, vecNameString, vecNameString, vecNameString, get_explicit_type_name(vecType), vecNameString, vecNameString);
+    //log_info("program: %s\n", programSource);
+    programPtr = programSource;
+
+    error = create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "test_fn" );
+    test_error( error, "Unable to create testing kernel" );
+
+    size_t max_workgroup_size;
+    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL);
+    test_error (error, "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
+
+    size_t max_local_workgroup_size[3];
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
+    test_error (error, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_workgroup_size > max_local_workgroup_size[0])
+        max_workgroup_size = max_local_workgroup_size[0];
+
+    // A 3-component vector occupies the storage of a 4-component one.
+    size_t elementSize = get_explicit_type_size(vecType)* ((vecSize == 3) ? 4 : vecSize);
+
+    cl_ulong max_global_mem_size;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(max_global_mem_size), &max_global_mem_size, NULL);
+    test_error (error, "clGetDeviceInfo failed for CL_DEVICE_GLOBAL_MEM_SIZE");
+
+    if (max_global_mem_size > (cl_ulong)SIZE_MAX) {
+      max_global_mem_size = (cl_ulong)SIZE_MAX;
+    }
+
+    cl_bool unified_mem;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(unified_mem), &unified_mem, NULL);
+    test_error (error, "clGetDeviceInfo failed for CL_DEVICE_HOST_UNIFIED_MEMORY");
+
+    // NOTE(review): presumably the host copies share the device's memory when it is unified,
+    // which is why four buffers are budgeted instead of two -- confirm.
+    int number_of_global_mem_buffers = (unified_mem) ? 4 : 2;
+
+    size_t numberOfCopiesPerWorkitem = 3;
+    size_t localStorageSpacePerWorkitem = numberOfCopiesPerWorkitem*elementSize;
+    // Use at most half of the local memory; divide in size_t so a large size is not truncated to int.
+    size_t maxLocalWorkgroupSize = ((size_t)max_local_mem_size/2)/localStorageSpacePerWorkitem;
+
+    // Calculation can return 0 on embedded devices due to 1KB local mem limit. Without this clamp
+    // the local buffer size becomes 0 and the work-group limit below divides by zero.
+    if (maxLocalWorkgroupSize == 0)
+    {
+        maxLocalWorkgroupSize = 1;
+    }
+
+    size_t localWorkgroupSize = maxLocalWorkgroupSize;
+    if (maxLocalWorkgroupSize > max_workgroup_size)
+        localWorkgroupSize = max_workgroup_size;
+
+    size_t localBufferSize = localWorkgroupSize*elementSize*numberOfCopiesPerWorkitem;
+    size_t numberOfLocalWorkgroups = 579;//1111;
+
+    // Reduce the numberOfLocalWorkgroups so that no more than 1/2 of CL_DEVICE_GLOBAL_MEM_SIZE is consumed
+    // by the allocated buffer. This is done to avoid resource  errors resulting from address space fragmentation.
+    size_t numberOfLocalWorkgroupsLimit = max_global_mem_size / (2 * number_of_global_mem_buffers * localBufferSize * stride);
+    if (numberOfLocalWorkgroups > numberOfLocalWorkgroupsLimit) numberOfLocalWorkgroups = numberOfLocalWorkgroupsLimit;
+
+    if (numberOfLocalWorkgroups == 0)
+    {
+        log_error( "ERROR: Not enough global memory to run even a single work group\n" );
+        return -1;
+    }
+
+    size_t globalBufferSize = numberOfLocalWorkgroups*localBufferSize*stride;
+    size_t globalWorkgroupSize = numberOfLocalWorkgroups*localWorkgroupSize;
+
+    inBuffer = malloc(globalBufferSize);
+    outBuffer = malloc(globalBufferSize);
+    if( inBuffer == NULL || outBuffer == NULL )
+    {
+        log_error( "ERROR: Unable to allocate %d bytes of host memory for the test buffers\n", (int)globalBufferSize );
+        free(inBuffer);
+        free(outBuffer);
+        return -1;
+    }
+
+    cl_int copiesPerWorkItemInt, copiesPerWorkgroup;
+    copiesPerWorkItemInt = (int)numberOfCopiesPerWorkitem;
+    copiesPerWorkgroup = (int)(numberOfCopiesPerWorkitem*localWorkgroupSize);
+
+    log_info("Global: %d, local %d, local buffer %db, global buffer %db, copy stride %d, each work group will copy %d elements and each work item item will copy %d elements.\n",
+                (int) globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize, (int)globalBufferSize, (int)stride, copiesPerWorkgroup, copiesPerWorkItemInt);
+
+    // Everything that can bail out early lives in the helper, so the frees below always run.
+    error = run_strided_copy_kernel_and_verify( context, queue, kernel, vecType, vecSize, elementSize,
+                                                localBufferSize, globalBufferSize,
+                                                globalWorkgroupSize, localWorkgroupSize,
+                                                copiesPerWorkgroup, copiesPerWorkItemInt,
+                                                (cl_int)stride, inBuffer, outBuffer );
+
+    free(inBuffer);
+    free(outBuffer);
+
+    return error;
+}
+
+// Runs test_strided_copy() with the given kernel template for every supported scalar type,
+// vector width and stride. double is skipped when cl_khr_fp64 is not available, long/ulong when
+// gHasLong is not set. Returns 0 if every combination passed and -1 if at least one failed.
+int test_strided_copy_all_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode)
+{
+    // All three tables are sentinel-terminated (kNumExplicitTypes / 0 / 0).
+    ExplicitType typesToTest[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+    unsigned int widthsToTest[] = { 1, 2, 3, 4, 8, 16, 0 };
+    unsigned int stridesToTest[] = { 1, 3, 4, 5, 0 };
+
+    int failedRuns = 0;
+
+    for( const ExplicitType *type = typesToTest; *type != kNumExplicitTypes; ++type )
+    {
+        const bool isDouble = ( *type == kDouble );
+        const bool isLong = ( *type == kLong || *type == kULong );
+
+        // The extension is only queried when a double type is actually up for testing.
+        if( isDouble && !is_extension_available( deviceID, "cl_khr_fp64" ) )
+            continue;
+
+        if( isLong && !gHasLong )
+            continue;
+
+        for( const unsigned int *width = widthsToTest; *width != 0; ++width )
+        {
+            for( const unsigned int *step = stridesToTest; *step != 0; ++step )
+            {
+                if( test_strided_copy( deviceID, context, queue, kernelCode, *type, *width, *step ) != 0 )
+                    ++failedRuns;
+            }
+        }
+    }
+
+    return ( failedRuns != 0 ) ? -1 : 0;
+}
+
+
+
+
+// Test entry point: async_work_group_strided_copy() from __global to __local memory,
+// all types, widths and strides.
+int test_async_strided_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // num_elements is not consulted here; the kernel template is the only thing this entry point selects.
+    const char *kernelTemplate = async_strided_global_to_local_kernel;
+    return test_strided_copy_all_types( deviceID, context, queue, kernelTemplate );
+}
+
+// Test entry point: async_work_group_strided_copy() from __local to __global memory,
+// all types, widths and strides.
+int test_async_strided_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    // num_elements is not consulted here; the kernel template is the only thing this entry point selects.
+    const char *kernelTemplate = async_strided_local_to_global_kernel;
+    return test_strided_copy_all_types( deviceID, context, queue, kernelTemplate );
+}
+
--- a/test_conformance/basic/test_barrier.c
+++ b/test_conformance/basic/test_barrier.c
@@ -0,0 +1,159 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+
+// OpenCL C source of the barrier test kernel (plain string, no format placeholders).
+// A single work-group sums the n ints in `a`:
+//   1. work-item `tid` accumulates a[tid], a[tid+lsize], a[tid+2*lsize], ... into tmp_sum[tid];
+//   2. a pairwise reduction folds the upper part of tmp_sum onto the lower part, with a barrier
+//      before every step. hadd(x,1) is (x+1)>>1 per the OpenCL C built-in definition, i.e. the
+//      active size is halved rounding up, which is what makes odd work-group sizes work;
+//   3. work-item 0 stores tmp_sum[0] to *sum.
+// tmp_sum lives in __global memory and needs one int per work-item.
+const char *barrier_kernel_code =
+"__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum, __global int *sum)\n"
+"{\n"
+"    int  tid = get_local_id(0);\n"
+"    int  lsize = get_local_size(0);\n"
+"    int  i;\n"
+"\n"
+"    tmp_sum[tid] = 0;\n"
+"    for (i=tid; i<n; i+=lsize)\n"
+"        tmp_sum[tid] += a[i];\n"
+"     \n"
+"     // updated to work for any workgroup size \n"
+"    for (i=hadd(lsize,1); lsize>1; i = hadd(i,1))\n"
+"    {\n"
+"        barrier(CLK_GLOBAL_MEM_FENCE);\n"
+"        if (tid + i < lsize)\n"
+"            tmp_sum[tid] += tmp_sum[tid + i];\n"
+"         lsize = i; \n"
+"    }\n"
+"\n"
+"     //no barrier is required here because last person to write to tmp_sum[0] was tid 0 \n"
+"    if (tid == 0)\n"
+"        *sum = tmp_sum[0];\n"
+"}\n";
+
+
+// Checks the device result against a host-side sum of the input.
+//
+//   inptr   the n input values that were handed to the kernel
+//   outptr  outptr[0] is the sum computed by the device
+//   n       number of input values
+//
+// The host sum is accumulated in unsigned arithmetic: with inputs of up to +/-2^24 a large n
+// overflows a signed int, which is undefined behaviour in C. Unsigned addition wraps modulo
+// 2^32, so the comparison is well defined and checks the same low 32 bits as before.
+//
+// Returns 0 and logs a pass message on a match, otherwise logs an error and returns -1.
+static int
+verify_sum(int *inptr, int *outptr, int n)
+{
+    unsigned int    r = 0;
+    int             i;
+
+    for (i=0; i<n; i++)
+    {
+        r += (unsigned int)inptr[i];
+    }
+
+    if (r != (unsigned int)outptr[0])
+    {
+        log_error("BARRIER test failed\n");
+        return -1;
+    }
+
+    log_info("BARRIER test passed\n");
+    return 0;
+}
+
+
+// Helper for test_barrier(): picks the work-group size, creates the device buffers, runs the
+// kernel as a single work-group and verifies the sum.
+//
+// Every buffer that is created is stored in `streams` (which the caller initialises to NULL), and
+// the host arrays belong to the caller. This function can therefore return early on any error
+// and the caller is still able to release everything.
+//
+//   streams     out: [0] input values, [1] one-int result, [2] per-work-item scratch
+//   input_ptr   host array of num_elements ints, filled here with random values
+//   output_ptr  host array of one int that receives the device result
+//
+// Returns 0 on success, -1 on a wrong sum, or the failing OpenCL error code.
+static int
+run_barrier_kernel(cl_device_id device, cl_context context, cl_command_queue queue, cl_kernel kernel,
+                   int num_elements, cl_mem streams[3], cl_int *input_ptr, cl_int *output_ptr)
+{
+    size_t    global_threads[3];
+    size_t    local_threads[3];
+    int       err;
+    int       i;
+    size_t    max_local_workgroup_size[3];
+    size_t    max_threadgroup_size = 0;
+    MTdata    d;
+
+    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
+                                 sizeof(max_threadgroup_size), &max_threadgroup_size, NULL);
+    test_error(err, "clGetKernelWorkgroupInfo failed.");
+
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
+    test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
+
+    // Pick the minimum of the device and the kernel
+    if (max_threadgroup_size > max_local_workgroup_size[0])
+        max_threadgroup_size = max_local_workgroup_size[0];
+
+    // A zero size would make the modulo below divide by zero.
+    if (max_threadgroup_size == 0)
+    {
+        log_error("ERROR: The device reported a maximum work group size of 0\n");
+        return -1;
+    }
+
+    // work group size must divide evenly into the global size
+    while( num_elements % max_threadgroup_size )
+        max_threadgroup_size--;
+
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * num_elements, NULL, &err);
+    test_error(err, "clCreateBuffer failed.");
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int), NULL, &err);
+    test_error(err, "clCreateBuffer failed.");
+    streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * max_threadgroup_size, NULL, &err);
+    test_error(err, "clCreateBuffer failed.");
+
+    d = init_genrand( gRandomSeed );
+    for (i=0; i<num_elements; i++)
+        input_ptr[i] = (int)get_random_float(-0x01000000, 0x01000000, d);
+    free_mtdata(d);  d = NULL;
+
+    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)input_ptr, 0, NULL, NULL);
+    test_error(err, "clEnqueueWriteBuffer failed.");
+
+    // Kernel signature: (a, n, tmp_sum, sum) -- note that scratch (streams[2]) comes before the result.
+    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
+    err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements);
+    err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
+    err |= clSetKernelArg(kernel, 3, sizeof streams[1], &streams[1]);
+    test_error(err, "clSetKernelArg failed.");
+
+    // One work-group only: the kernel's reduction synchronises with work-group barriers.
+    global_threads[0] = max_threadgroup_size;
+    local_threads[0] = max_threadgroup_size;
+
+    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
+    test_error(err, "clEnqueueNDRangeKernel failed.");
+
+    // Blocking read; CL_TRUE is the OpenCL constant for this flag.
+    err = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, sizeof(cl_int), (void *)output_ptr, 0, NULL, NULL );
+    test_error(err, "clEnqueueReadBuffer failed.");
+
+    return verify_sum(input_ptr, output_ptr, num_elements);
+}
+
+// Barrier test: sums num_elements random ints on the device with a kernel whose reduction relies
+// on barrier(), and compares the result with a host-side sum.
+//
+// Returns 0 on success and a non-zero value on failure. Device buffers, kernel, program and host
+// arrays are released on every path that is taken once the kernel has been built.
+int
+test_barrier(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_mem            streams[3] = { NULL, NULL, NULL };
+    cl_int            *input_ptr = NULL, *output_ptr = NULL;
+    cl_program        program;
+    cl_kernel         kernel;
+    int               err;
+    int               i;
+
+    // Nothing has been acquired yet, so returning straight from test_error is fine here.
+    err = create_single_kernel_helper(context, &program, &kernel, 1, &barrier_kernel_code, "compute_sum" );
+    test_error(err, "Failed to build kernel/program.");
+
+    input_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
+    output_ptr = (cl_int*)malloc(sizeof(cl_int));
+
+    if (input_ptr == NULL || output_ptr == NULL)
+    {
+        log_error("ERROR: Unable to allocate host memory for the barrier test\n");
+        err = -1;
+    }
+    else
+    {
+        err = run_barrier_kernel(device, context, queue, kernel, num_elements, streams, input_ptr, output_ptr);
+    }
+
+    // cleanup -- buffers the helper did not get to create are still NULL and are skipped
+    for (i=0; i<3; i++)
+    {
+        if (streams[i] != NULL)
+            clReleaseMemObject(streams[i]);
+    }
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+    free(input_ptr);
+    free(output_ptr);
+
+    return err;
+}
+
+
+
+
+
--- a/test_conformance/basic/test_basic_parameter_types.c
+++ b/test_conformance/basic/test_basic_parameter_types.c
@@ -0,0 +1,303 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "procs.h"
+
+// printf-style template for the OpenCL C kernel built by test_basic_parameter_types().
+// The first eight %s receive the vector-width suffix ("", "2", "4", "8" or "16") of
+// each parameter type and of the result pointer; the remaining six receive the
+// conversion expression wrapped around the integer arguments ("convert_float<N>",
+// or a single space for scalars). The float argument is stored without conversion.
+const char *kernel_code =
+"__kernel void test_kernel(\n"
+"char%s c, uchar%s uc, short%s s, ushort%s us, int%s i, uint%s ui, float%s f,\n"
+"__global float%s *result)\n"
+"{\n"
+"  result[0] = %s(c);\n"
+"  result[1] = %s(uc);\n"
+"  result[2] = %s(s);\n"
+"  result[3] = %s(us);\n"
+"  result[4] = %s(i);\n"
+"  result[5] = %s(ui);\n"
+"  result[6] = f;\n"
+"}\n";
+
+// printf-style template for the kernel built by test_basic_parameter_types_long().
+// The first three %s receive the vector-width suffix ("", "2", "4", "8" or "16") for
+// long, ulong and the float result pointer; the last two receive the conversion
+// expression ("convert_float<N>", or a single space for scalars).
+const char *kernel_code_long =
+"__kernel void test_kernel_long(\n"
+"long%s l, ulong%s ul,\n"
+"__global float%s *result)\n"
+"{\n"
+"  result[0] = %s(l);\n"
+"  result[1] = %s(ul);\n"
+"}\n";
+
+// Builds and runs test_kernel_long once per vector width (1, 2, 4, 8, 16), passing a
+// long<N> and a ulong<N> by value, and checks that the floats written by the kernel
+// equal the host-side float conversions of the same values.
+// Widths whose arguments would exceed CL_DEVICE_MAX_PARAMETER_SIZE are skipped.
+// num_elements is unused.
+// Returns the number of mismatching elements; a failing CL call leaves early
+// through test_error.
+int
+test_basic_parameter_types_long(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+  enum { kNumArgs = 2, kMaxWidth = 16, kNumWidths = 5 };
+
+  const int sizes[kNumWidths] = {1, 2, 4, 8, 16};
+  const char* const size_strings[kNumWidths] = {"", "2", "4", "8", "16"};
+  const char* const types[kNumArgs] = { "long", "ulong" };
+
+  // We don't really care about the contents since we're just testing that the types work.
+  cl_long l[kMaxWidth]={-21,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
+  cl_ulong ul[kMaxWidth]={22,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+
+  // Per-argument lookup tables: where the data lives and how big one scalar is.
+  const void* const arg_data[kNumArgs] = { l, ul };
+  const size_t arg_elem_bytes[kNumArgs] = { sizeof(cl_long), sizeof(cl_ulong) };
+
+  // Bytes taken by one scalar of every by-value parameter.
+  const size_t parameter_size = sizeof(cl_long) + sizeof(cl_ulong);
+
+  clMemWrapper results;
+  float results_back[kNumArgs*kMaxWidth];
+  char kernel_string[8192];
+  char convert_string[1024];
+  size_t global[3] = {1, 1, 1};
+  size_t max_parameter_size;
+  int total_errors = 0;
+  int error;
+
+  // Start with empty strings.
+  kernel_string[0] = '\0';
+  convert_string[0] = '\0';
+
+  // Ask the device how many bytes of kernel arguments it accepts.
+  error = clGetDeviceInfo( device, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( max_parameter_size ), &max_parameter_size, NULL );
+  test_error( error, "Unable to get max parameter size from device" );
+
+  // One buffer, large enough for the widest case, is reused for every width.
+  results = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float)*kNumArgs*kMaxWidth, NULL, &error);
+  test_error(error, "clCreateBuffer failed");
+
+  for (int width_idx = 0; width_idx < kNumWidths; width_idx++) {
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+
+    const int vec_width = sizes[width_idx];
+    const char* const suffix = size_strings[width_idx];
+
+    // Skip widths whose by-value arguments (plus the result pointer) do not fit.
+    const size_t total_parameter_size = parameter_size*vec_width + sizeof(cl_mem);
+    if (total_parameter_size > max_parameter_size) {
+      log_info("Can not test with vector size %d because it would exceed the maximum allowed parameter size to the kernel. (%d > %d)\n",
+               (int)vec_width, (int)total_parameter_size, (int)max_parameter_size);
+      continue;
+    }
+
+    log_info("Testing vector size %d\n", vec_width);
+
+    // Vectors need an explicit convert_float<N>() call; scalars convert implicitly.
+    if (vec_width > 1)
+      sprintf(convert_string, "convert_float%s", suffix);
+    else
+      sprintf(convert_string, " ");
+
+    // Instantiate the kernel source for this width and build it.
+    sprintf(kernel_string, kernel_code_long,
+            suffix, suffix, suffix,
+            convert_string, convert_string);
+
+    char *source = kernel_string;
+    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&source, "test_kernel_long");
+    test_error(error, "create single kernel failed");
+
+    // Bind the two by-value arguments, then the result buffer.
+    for (int arg = 0; arg < kNumArgs; arg++) {
+      error = clSetKernelArg(kernel, arg, arg_elem_bytes[arg]*vec_width, arg_data[arg]);
+      if (error)
+        log_error("Setting kernel arg %d %s%s: ", arg, types[arg], suffix);
+      test_error(error, "clSetKernelArgs failed");
+    }
+    error = clSetKernelArg(kernel, kNumArgs, sizeof(cl_mem), &results);
+    test_error(error, "clSetKernelArgs failed");
+
+    // Run a single work-item.
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+
+    // Blocking read of the whole result buffer.
+    error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0, sizeof(cl_float)*kNumArgs*kMaxWidth, results_back, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    // Argument `arg` occupies floats [arg*vec_width, (arg+1)*vec_width) of the output.
+    for (int arg = 0; arg < kNumArgs; arg++) {
+      const float* const got = results_back + arg*vec_width;
+      for (int elem = 0; elem < vec_width; elem++) {
+        const float expected = (arg == 0) ? (float)l[elem] : (float)ul[elem];
+
+        if (got[elem] != expected) {
+          total_errors++;
+          log_error("Conversion from %s%s failed: index %d got %g, expected %g.\n", types[arg], suffix,
+                    elem, got[elem], expected);
+        }
+      }
+    }
+  }
+
+  return total_errors;
+}
+
+// Builds and runs test_kernel once per vector width (1, 2, 4, 8, 16), passing
+// char/uchar/short/ushort/int/uint/float vectors by value, and checks that the
+// floats written by the kernel equal the host-side float conversions.
+// Widths whose arguments would exceed CL_DEVICE_MAX_PARAMETER_SIZE are skipped.
+// When gHasLong is set the long/ulong variant is run as well.
+// num_elements is only forwarded to the long variant.
+// Returns 0 on success and non-zero on failure: the number of mismatching
+// elements, or the value test_error returns when a CL call fails.
+int
+test_basic_parameter_types(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+  clMemWrapper results;
+  int error;
+  size_t global[3] = {1, 1, 1};
+  float results_back[7*16];
+  int count, index;
+  const char* types[] = {"char", "uchar", "short", "ushort", "int", "uint", "float"};
+  char kernel_string[8192];
+  int sizes[] = {1, 2, 4, 8, 16};
+  const char* size_strings[] = {"", "2", "4", "8", "16"};
+  // Initialised so the unreachable default branch below can never leave it unset.
+  float expected = 0.0f;
+  int total_errors = 0;
+  int size_to_test;
+  char *ptr;
+  char convert_string[1024];
+  size_t max_parameter_size;
+
+  // We don't really care about the contents since we're just testing that the types work.
+  cl_char c[16]={0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
+  cl_uchar uc[16]={16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  cl_short s[16]={-17,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
+  cl_ushort us[16]={18,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  cl_int i[16]={-19,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
+  cl_uint ui[16]={20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  cl_float f[16]={-23,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15};
+
+  // Calculate how large our parameter size is to the kernel (one scalar of each type).
+  size_t parameter_size = sizeof(cl_char) + sizeof(cl_uchar) +
+  sizeof(cl_short) +sizeof(cl_ushort) +
+  sizeof(cl_int) +sizeof(cl_uint) +
+  sizeof(cl_float);
+
+  // Init our strings.
+  kernel_string[0] = '\0';
+  convert_string[0] = '\0';
+
+  // Get the maximum parameter size allowed
+  error = clGetDeviceInfo( device, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( max_parameter_size ), &max_parameter_size, NULL );
+  test_error( error, "Unable to get max parameter size from device" );
+
+  // Create the results buffer (sized for the widest case, reused for every width)
+  results = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float)*7*16, NULL, &error);
+  test_error(error, "clCreateBuffer failed");
+
+  // Go over all the vector sizes
+  for (size_to_test = 0; size_to_test < 5; size_to_test++) {
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+
+    // By-value arguments scale with the vector width; the result pointer does not.
+    size_t total_parameter_size = parameter_size*sizes[size_to_test] + sizeof(cl_mem);
+    if (total_parameter_size > max_parameter_size) {
+      log_info("Can not test with vector size %d because it would exceed the maximum allowed parameter size to the kernel. (%d > %d)\n",
+               (int)sizes[size_to_test], (int)total_parameter_size, (int)max_parameter_size);
+      continue;
+    }
+
+    log_info("Testing vector size %d\n", sizes[size_to_test]);
+
+    // If size is > 1, then we need a explicit convert call.
+    if (sizes[size_to_test] > 1) {
+      sprintf(convert_string, "convert_float%s",  size_strings[size_to_test]);
+    } else {
+      sprintf(convert_string, " ");
+    }
+
+    // Build the kernel: eight width suffixes followed by six conversion expressions.
+    sprintf(kernel_string, kernel_code,
+            size_strings[size_to_test], size_strings[size_to_test], size_strings[size_to_test],
+            size_strings[size_to_test], size_strings[size_to_test], size_strings[size_to_test],
+            size_strings[size_to_test], size_strings[size_to_test],
+            convert_string, convert_string, convert_string,
+            convert_string, convert_string, convert_string
+    );
+
+    ptr = kernel_string;
+    error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&ptr, "test_kernel");
+    test_error(error, "create single kernel failed");
+
+    // Set the arguments
+    for (count = 0; count < 7; count++) {
+      switch (count) {
+        case 0: error = clSetKernelArg(kernel, count, sizeof(cl_char)*sizes[size_to_test], &c); break;
+        case 1: error = clSetKernelArg(kernel, count, sizeof(cl_uchar)*sizes[size_to_test], &uc); break;
+        case 2: error = clSetKernelArg(kernel, count, sizeof(cl_short)*sizes[size_to_test], &s); break;
+        case 3: error = clSetKernelArg(kernel, count, sizeof(cl_ushort)*sizes[size_to_test], &us); break;
+        case 4: error = clSetKernelArg(kernel, count, sizeof(cl_int)*sizes[size_to_test], &i); break;
+        case 5: error = clSetKernelArg(kernel, count, sizeof(cl_uint)*sizes[size_to_test], &ui); break;
+        case 6: error = clSetKernelArg(kernel, count, sizeof(cl_float)*sizes[size_to_test], &f); break;
+        default: log_error("Test error"); break;
+      }
+      if (error)
+        log_error("Setting kernel arg %d %s%s: ", count, types[count], size_strings[size_to_test]);
+      test_error(error, "clSetKernelArgs failed");
+    }
+    error = clSetKernelArg(kernel, 7, sizeof(cl_mem), &results);
+    test_error(error, "clSetKernelArgs failed");
+
+    // Execute a single work-item
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
+    test_error(error, "clEnqueueNDRangeKernel failed");
+
+    // Read back the results (blocking)
+    error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0, sizeof(cl_float)*7*16, results_back, 0, NULL, NULL);
+    test_error(error, "clEnqueueReadBuffer failed");
+
+    // Verify the results: argument `count` occupies floats
+    // [count*width, (count+1)*width) of the output.
+    for (count = 0; count < 7; count++) {
+      for (index=0; index < sizes[size_to_test]; index++) {
+        switch (count) {
+          case 0: expected = (float)c[index]; break;
+          case 1: expected = (float)uc[index]; break;
+          case 2: expected = (float)s[index]; break;
+          case 3: expected = (float)us[index]; break;
+          case 4: expected = (float)i[index]; break;
+          case 5: expected = (float)ui[index]; break;
+          case 6: expected = (float)f[index]; break;
+          default: log_error("Test error"); break;
+        }
+
+        if (results_back[count*sizes[size_to_test]+index] != expected) {
+          total_errors++;
+          log_error("Conversion from %s%s failed: index %d got %g, expected %g.\n", types[count], size_strings[size_to_test],
+                    index, results_back[count*sizes[size_to_test]+index], expected);
+        }
+      }
+    }
+  }
+
+  if (gHasLong) {
+    log_info("Testing long types...\n");
+    int long_result = test_basic_parameter_types_long( device, context, queue, num_elements );
+    // A negative result is not a mismatch count (presumably a CL error code from an
+    // early test_error exit -- confirm against the harness). Count it as a single
+    // failure so it cannot cancel out mismatches already accumulated above.
+    total_errors += (long_result < 0) ? 1 : long_result;
+  }
+  else {
+    log_info("Longs unsupported, skipping.\n");
+  }
+
+  return total_errors;
+}
+
+
+
--- a/test_conformance/basic/test_bufferreadwriterect.c
+++ b/test_conformance/basic/test_bufferreadwriterect.c
@@ -0,0 +1,564 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "procs.h"
+
+// Logs a diagnostic (source file, line, then a printf-style message) when `cmd`
+// does not evaluate to CL_SUCCESS. `cmd` is evaluated exactly once.
+// NOTE: despite the name, this macro neither exits nor returns -- the abort()
+// call is commented out -- so execution continues after the failure is logged.
+// It expands to a bare { } block and relies on the `## __VA_ARGS__` extension.
+#define CL_EXIT_ERROR(cmd,format,...)                \
+{                                \
+if ((cmd) != CL_SUCCESS) {                    \
+log_error("CL ERROR: %s %u: ", __FILE__,__LINE__);    \
+log_error(format,## __VA_ARGS__ );            \
+log_error("\n");                        \
+/*abort();*/                \
+}                                \
+}
+
+// Element type of every test buffer (one byte per element).
+typedef unsigned char BufferType;
+
+// Globals for test
+// Command queue used by all helpers; set from the queue passed to the test entry point.
+cl_command_queue queue;
+
+// Width, height and depth (in elements) of each test buffer.
+enum { TotalImages = 8 };
+size_t width  [TotalImages];
+size_t height [TotalImages];
+size_t depth  [TotalImages];
+
+// cl buffer and host buffers.
+//   buffer[i]  - the CL buffer under test.
+//   verify[i]  - host-side mirror that the helpers update in step with buffer[i].
+//   backing[i] - host memory handed to clCreateBuffer with CL_MEM_USE_HOST_PTR.
+cl_mem buffer [TotalImages];
+BufferType* verify[TotalImages];
+BufferType* backing[TotalImages];
+
+// Temporary buffer used for read and write operations.
+BufferType* tmp_buffer;
+size_t tmp_buffer_size;
+
+size_t num_tries   = 50; // Number of randomly selected operations to perform.
+size_t alloc_scale = 2;   // Divisor applied to the estimated maximum buffer dimension.
+MTdata mt;               // Random number generator state shared by the helpers.
+
+// Fills the first w*h*d elements of ptr with pseudo-random bytes drawn from mt.
+// Each byte is genrand_int32(mt) % 0xff, so the generated values lie in 0..254.
+static void initialize_image(BufferType* ptr, size_t w, size_t h, size_t d, MTdata mt)
+{
+    enum { BytesPerElement = sizeof(BufferType)/sizeof(unsigned char) };
+
+    const size_t byte_count = w*h*d*BytesPerElement;
+    unsigned char* out = (unsigned char*)ptr;
+
+    // Walk the buffer front to back, consuming one generator value per byte.
+    for (size_t remaining = byte_count; remaining != 0; --remaining) {
+        *out++ = (unsigned char)(genrand_int32(mt) % 0xff);
+    }
+}
+
+// This function prints the contents of a buffer to standard error.
+void print_buffer(BufferType* buf, size_t w, size_t h, size_t d) {
+    log_error("Size = %lux%lux%lu (%lu total)\n",w,h,d,w*h*d);
+    for (unsigned k=0; k!=d;++k) {
+        log_error("Slice: %u\n",k);
+        for (unsigned j=0; j!=h;++j) {
+            for (unsigned i=0;i!=w;++i) {
+                log_error("%02x",buf[k*(w*h)+j*w+i]);
+            }
+            log_error("\n");
+        }
+        log_error("\n");
+    }
+}
+
+// Reports whether a rectangular copy of `region` elements from src_offset to
+// dst_offset, inside one buffer laid out with the given row and slice pitches,
+// would touch overlapping memory.
+// Two tests are applied: a per-axis box intersection, and -- when the boxes are
+// disjoint -- a check for regions that run past the end of a row (or of a slice)
+// and therefore wrap into the other region's linear address range.
+bool check_overlap_rect(size_t src_offset[3],
+                        size_t dst_offset[3],
+                        size_t region[3],
+                        size_t row_pitch,
+                        size_t slice_pitch)
+{
+    // Box test: the regions intersect only if their extents intersect on every axis.
+    bool overlap = true;
+    for (int axis = 0; axis < 3; ++axis)
+    {
+        const size_t src_lo = src_offset[axis];
+        const size_t src_hi = src_offset[axis] + region[axis];
+        const size_t dst_lo = dst_offset[axis];
+        const size_t dst_hi = dst_offset[axis] + region[axis];
+        if (!(src_lo < dst_hi) || !(src_hi > dst_lo))
+            overlap = false;
+    }
+    if (overlap)
+        return true;
+
+    // Linear [first, last) address ranges covered by each region.
+    const size_t span = region[2] * slice_pitch + region[1] * row_pitch + region[0];
+    const size_t src_first = src_offset[2] * slice_pitch + src_offset[1] * row_pitch + src_offset[0];
+    const size_t dst_first = dst_offset[2] * slice_pitch + dst_offset[1] * row_pitch + dst_offset[0];
+    const size_t src_last = src_first + span;
+    const size_t dst_last = dst_first + span;
+    const bool spans_touch = (src_first <= dst_first && dst_first < src_last) ||
+                             (dst_first <= src_first && src_first < dst_last);
+
+    // Row wrap: how far each region extends beyond the end of a row.
+    const size_t src_spill_x = (src_offset[0] + region[0] > row_pitch)
+                                   ? src_offset[0] + region[0] - row_pitch : 0;
+    const size_t dst_spill_x = (dst_offset[0] + region[0] > row_pitch)
+                                   ? dst_offset[0] + region[0] - row_pitch : 0;
+    if ((src_spill_x > 0 && src_spill_x > dst_offset[0]) ||
+        (dst_spill_x > 0 && dst_spill_x > src_offset[0]))
+    {
+        overlap = spans_touch;
+    }
+
+    // Slice wrap: only relevant when more than one slice is copied.
+    if (region[2] > 1)
+    {
+        const size_t rows_per_slice = slice_pitch / row_pitch;
+        const size_t src_spill_y = (src_offset[1] + region[1] > rows_per_slice)
+                                       ? src_offset[1] + region[1] - rows_per_slice : 0;
+        const size_t dst_spill_y = (dst_offset[1] + region[1] > rows_per_slice)
+                                       ? dst_offset[1] + region[1] - rows_per_slice : 0;
+        if ((src_spill_y > 0 && src_spill_y > dst_offset[1]) ||
+            (dst_spill_y > 0 && dst_spill_y > src_offset[1]))
+        {
+            overlap = overlap || spans_touch;
+        }
+    }
+
+    return overlap;
+}
+
+
+
+// Copies a region between two CL buffers with clEnqueueCopyBufferRect and then
+// mirrors the same copy on the host-side verify buffers.
+//   src, dst         - indices into the global buffer/verify/width/height tables
+//   soffset, doffset - (x, y, z) origin of the region in the source / destination
+//   sregion          - (width, height, depth) of the region, in elements
+//   dregion          - unused; the destination region always has sregion's size
+// The copy is skipped, and CL_SUCCESS returned, when check_overlap_rect reports an
+// overlap. Returns 0 on success, or the CL error code when the enqueue fails; in
+// that case the verify buffers are left untouched.
+int copy_region(size_t src, size_t soffset[3], size_t sregion[3], size_t dst, size_t doffset[3], size_t dregion[3]) {
+
+    // Copy between cl buffers.
+    // A slice pitch of 0 is passed for 1x1 buffers, non-zero otherwise.
+    size_t src_slice_pitch = (width[src]*height[src] != 1) ? width[src]*height[src] : 0;
+    size_t dst_slice_pitch = (width[dst]*height[dst] != 1) ? width[dst]*height[dst] : 0;
+    size_t src_row_pitch = width[src];
+
+    if (check_overlap_rect(soffset,doffset,sregion,src_row_pitch, src_slice_pitch)) {
+        log_info( "Copy overlap reported, skipping copy buffer rect\n" );
+        return CL_SUCCESS;
+    }
+
+    cl_int err = clEnqueueCopyBufferRect(queue,
+                                         buffer[src],buffer[dst],
+                                         soffset, doffset,
+                                         sregion,
+                                         width[src], src_slice_pitch,
+                                         width[dst], dst_slice_pitch,
+                                         0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        // CL_EXIT_ERROR only logs, so the failure has to be returned explicitly;
+        // otherwise the host mirror would be updated for a copy that never ran.
+        CL_EXIT_ERROR(err, "clEnqueueCopyBufferRect failed between %u and %u",(unsigned)src,(unsigned)dst);
+        return err;
+    }
+
+    // Copy between host buffers.
+    size_t spitch = width[src];
+    size_t sslice = width[src]*height[src];
+
+    size_t dpitch = width[dst];
+    size_t dslice = width[dst]*height[dst];
+
+    // Elements per z-slice of the region, and elements in the whole region.
+    const size_t rslice = sregion[0]*sregion[1];
+    const size_t total = rslice * sregion[2];
+
+    for (size_t i = 0; i != total; ++i) {
+
+        // Coordinates of element i within the region (identical for source and destination).
+        size_t rz = i / rslice;
+        size_t ry = (i % rslice) / sregion[0];
+        size_t rx = (i % rslice) % sregion[0];
+
+        // Element offsets within the source and destination buffers.
+        size_t s_idx = (soffset[2]+rz)*sslice + (soffset[1]+ry)*spitch + soffset[0]+rx;
+        size_t d_idx = (doffset[2]+rz)*dslice + (doffset[1]+ry)*dpitch + doffset[0]+rx;
+
+        verify[dst][d_idx] = verify[src][s_idx];
+    }
+
+    return 0;
+}
+
+// Compares the destination region of the buffer pointed to by `device` with the
+// source region of verify[src], element by element.
+//   device  - host-visible data laid out with buffer dst's width and height
+//   src     - index of the verify buffer holding the expected data
+//   soffset - (x, y, z) origin of the region in verify[src]
+//   sregion - (width, height, depth) of the region, in elements
+//   dst     - index whose width/height give the pitches of `device`
+//   doffset - (x, y, z) origin of the region in `device`
+// Returns 0 when every element matches, -1 at the first mismatch (which is logged).
+int verify_region(BufferType* device, size_t src, size_t soffset[3], size_t sregion[3], size_t dst, size_t doffset[3]) {
+
+    // Row and slice pitches (in elements) of the two layouts.
+    size_t spitch = width[src];
+    size_t sslice = width[src]*height[src];
+
+    size_t dpitch = width[dst];
+    size_t dslice = width[dst]*height[dst];
+
+    // Elements per z-slice of the region, and elements in the whole region.
+    const size_t rslice = sregion[0]*sregion[1];
+    const size_t total = rslice * sregion[2];
+    for (size_t i = 0; i != total; ++i) {
+
+        // Coordinates of element i within the region.
+        size_t sz = i / rslice;
+        size_t sy = (i % rslice) / sregion[0];
+        size_t sx = (i % rslice) % sregion[0];
+
+        // Element offsets within the expected and actual buffers.
+        size_t s_idx = (soffset[2]+sz)*sslice + (soffset[1]+sy)*spitch + soffset[0]+sx;
+        size_t d_idx = (doffset[2]+sz)*dslice + (doffset[1]+sy)*dpitch + doffset[0]+sx;
+
+        if (device[d_idx] != verify[src][s_idx]) {
+            // size_t values are cast to unsigned long to match the %lu conversions.
+            log_error("Verify failed on comparsion %lu: coordinate (%lu, %lu, %lu) of region\n",
+                      (unsigned long)i,(unsigned long)sx,(unsigned long)sy,(unsigned long)sz);
+            log_error("0x%02x != 0x%02x\n", device[d_idx], verify[src][s_idx]);
+#if 0
+            // Change the condition above to 1 to dump both buffers and abort.
+            log_error("Device (copy): [%lu]\n",(unsigned long)dst);
+            print_buffer(device,width[dst],height[dst],depth[dst]);
+            log_error("\n");
+            log_error("Verify: [%lu]\n",(unsigned long)src);
+            print_buffer(verify[src],width[src],height[src],depth[src]);
+            log_error("\n");
+            abort();
+#endif
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+
+// Reads a region of buffer[src] into tmp_buffer with a blocking
+// clEnqueueReadBufferRect, placing it at doffset using buffer dst's pitches, and
+// compares the result with the same region of verify[src].
+//   src, dst         - indices into the global buffer/verify/width/height tables
+//   soffset, doffset - (x, y, z) origin in the CL buffer / in tmp_buffer
+//   sregion          - (width, height, depth) of the region, in elements
+//   dregion          - unused
+// Returns 0 when the data matches, -1 on a mismatch, or the CL error code when the
+// read itself fails.
+int read_verify_region(size_t src, size_t soffset[3], size_t sregion[3], size_t dst, size_t doffset[3], size_t dregion[3]) {
+
+    // Pre-fill the temporary host buffer with 0xff before reading into it.
+    memset(tmp_buffer, 0xff, tmp_buffer_size);
+
+    // A slice pitch of 0 is passed for 1x1 buffers, non-zero otherwise.
+    size_t src_slice_pitch = (width[src]*height[src] != 1) ? width[src]*height[src] : 0;
+    size_t dst_slice_pitch = (width[dst]*height[dst] != 1) ? width[dst]*height[dst] : 0;
+
+    // Read the source region of the cl buffer into the destination region of the temporary buffer.
+    cl_int err = clEnqueueReadBufferRect(queue,
+                                         buffer[src],
+                                         CL_TRUE,
+                                         soffset,doffset,
+                                         sregion,
+                                         width[src], src_slice_pitch,
+                                         width[dst], dst_slice_pitch,
+                                         tmp_buffer,
+                                         0, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        // CL_EXIT_ERROR only logs; report the failure to the caller instead of
+        // verifying a buffer that was never filled.
+        CL_EXIT_ERROR(err, "clEnqueueReadBufferRect failed between %u and %u",(unsigned)src,(unsigned)dst);
+        return err;
+    }
+
+    return verify_region(tmp_buffer,src,soffset,sregion,dst,doffset);
+}
+
+// Maps the whole of buffer[src] for reading with a blocking clEnqueueMapBuffer,
+// compares it against verify[src], and unmaps it again.
+// Returns 0 when the contents match, -1 on a mismatch (or if the map yields no
+// pointer), or the CL error code when mapping or unmapping fails.
+int map_verify_region(size_t src) {
+
+    size_t size_bytes = width[src]*height[src]*depth[src]*sizeof(BufferType);
+
+    // Map the entire cl buffer into host address space.
+    cl_int err;
+    BufferType* mapped = (BufferType*)clEnqueueMapBuffer(queue,buffer[src],CL_TRUE,CL_MAP_READ,0,size_bytes,0,NULL,NULL,&err);
+    if (err != CL_SUCCESS) {
+        // CL_EXIT_ERROR only logs; bail out rather than dereference a bad pointer.
+        CL_EXIT_ERROR(err, "clEnqueueMapBuffer failed for buffer %u",(unsigned)src);
+        return err;
+    }
+    if (mapped == NULL) {
+        log_error("clEnqueueMapBuffer returned NULL for buffer %u\n",(unsigned)src);
+        return -1;
+    }
+
+    // The region to verify is the full extent of the buffer, at origin (0, 0, 0).
+    size_t soffset[] = { 0, 0, 0 };
+    size_t sregion[] = { width[src], height[src], depth[src] };
+
+    int ret = verify_region(mapped,src,soffset,sregion,src,soffset);
+
+    err = clEnqueueUnmapMemObject(queue,buffer[src],mapped,0,NULL,NULL);
+    if (err != CL_SUCCESS) {
+        CL_EXIT_ERROR(err, "clEnqueueUnmapMemObject failed for buffer %u",(unsigned)src);
+        // Keep an earlier verification failure; otherwise report the unmap error.
+        if (ret == 0)
+            ret = err;
+    }
+
+    return ret;
+}
+
+// Refills tmp_buffer with random data, writes a region of it into buffer[dst] with
+// a blocking clEnqueueWriteBufferRect, and mirrors the write into verify[dst].
+//   src     - index whose width/height give the pitches used for tmp_buffer
+//   soffset - (x, y, z) origin of the region in tmp_buffer
+//   sregion - (width, height, depth) used for the host-side mirror
+//   dst     - index of the CL buffer and verify buffer written to
+//   doffset - (x, y, z) origin of the region in the destination
+//   dregion - region passed to the CL call
+//             NOTE(review): the visible caller fills it with sregion's values; confirm for others
+// Returns 0 on success, or the CL error code when the write fails; in that case
+// verify[dst] is left untouched.
+int write_region(size_t src, size_t soffset[3], size_t sregion[3], size_t dst, size_t doffset[3], size_t dregion[3]) {
+
+    initialize_image(tmp_buffer, tmp_buffer_size, 1, 1, mt);
+
+    // A slice pitch of 0 is passed for 1x1 buffers, non-zero otherwise.
+    size_t src_slice_pitch = (width[src]*height[src] != 1) ? width[src]*height[src] : 0;
+    size_t dst_slice_pitch = (width[dst]*height[dst] != 1) ? width[dst]*height[dst] : 0;
+
+    // Write the source region of the temporary buffer to the destination region of the cl buffer.
+    cl_int err = clEnqueueWriteBufferRect(queue,
+                                          buffer[dst],
+                                          CL_TRUE,
+                                          doffset,soffset,
+                                          dregion,
+                                          width[dst], dst_slice_pitch,
+                                          width[src], src_slice_pitch,
+                                          tmp_buffer,
+                                          0, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        // CL_EXIT_ERROR only logs; return so the host mirror is not updated for a
+        // write that never reached the device.
+        CL_EXIT_ERROR(err, "clEnqueueWriteBufferRect failed between %u and %u",(unsigned)src,(unsigned)dst);
+        return err;
+    }
+
+    // Copy from the temporary buffer to the host buffer.
+    size_t spitch = width[src];
+    size_t sslice = width[src]*height[src];
+    size_t dpitch = width[dst];
+    size_t dslice = width[dst]*height[dst];
+
+    // Elements per z-slice of the region, and elements in the whole region.
+    const size_t rslice = sregion[0]*sregion[1];
+    const size_t total = rslice * sregion[2];
+    for (size_t i = 0; i != total; ++i) {
+
+        // Coordinates of element i within the region (identical for source and destination).
+        size_t rz = i / rslice;
+        size_t ry = (i % rslice) / sregion[0];
+        size_t rx = (i % rslice) % sregion[0];
+
+        // Element offsets within the temporary buffer and the destination mirror.
+        size_t s_idx = (soffset[2]+rz)*sslice + (soffset[1]+ry)*spitch + soffset[0]+rx;
+        size_t d_idx = (doffset[2]+rz)*dslice + (doffset[1]+ry)*dpitch + doffset[0]+rx;
+
+        verify[dst][d_idx] = tmp_buffer[s_idx];
+    }
+    return 0;
+}
+
+// Destructor callback registered on each test buffer with
+// clSetMemObjectDestructorCallback: frees the host allocation passed as `data`
+// (the buffer's backing store). The cl_mem argument is unused.
+void CL_CALLBACK mem_obj_destructor_callback( cl_mem, void *data )
+{
+    free( data );
+}
+
+// This is the main test function for the conformance test.
+int
+test_bufferreadwriterect(cl_device_id device, cl_context context, cl_command_queue queue_, int num_elements)
+{
+    queue = queue_;
+    cl_int err;
+
+    // Initialize the random number generator.
+    mt = init_genrand( gRandomSeed );
+
+    // Compute a maximum buffer size based on the number of test images and the device maximum.
+    cl_ulong max_mem_alloc_size = 0;
+    CL_EXIT_ERROR(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_mem_alloc_size, NULL),"Could not get device info");
+    log_info("CL_DEVICE_MAX_MEM_ALLOC_SIZE = %llu bytes.\n", max_mem_alloc_size);
+
+    // Confirm that the maximum allocation size is not zero.
+    if (max_mem_alloc_size == 0) {
+        log_error("Error: CL_DEVICE_MAX_MEM_ALLOC_SIZE is zero bytes\n");
+        return -1;
+    }
+
+    // Guess at a reasonable maximum dimension.
+    size_t max_mem_alloc_dim = (size_t)cbrt((double)(max_mem_alloc_size/sizeof(BufferType)))/alloc_scale;
+    if (max_mem_alloc_dim == 0) {
+        max_mem_alloc_dim = max_mem_alloc_size;
+    }
+
+    log_info("Using maximum dimension      = %lu.\n", max_mem_alloc_dim);
+
+    // Create pairs of cl buffers and host buffers on which operations will be mirrored.
+    log_info("Creating %u pairs of random sized host and cl buffers.\n", TotalImages);
+
+    size_t max_size = 0;
+    size_t total_bytes = 0;
+
+    for (unsigned i=0; i != TotalImages; ++i) {
+
+        // Determine a width and height for this buffer.
+        size_t size_bytes;
+        size_t tries = 0;
+        size_t max_tries = 1048576;
+        do {
+            width[i]   = get_random_size_t(1, max_mem_alloc_dim, mt);
+            height[i]  = get_random_size_t(1, max_mem_alloc_dim, mt);
+            depth[i]   = get_random_size_t(1, max_mem_alloc_dim, mt);
+            ++tries;
+        } while ((tries < max_tries) && (size_bytes = width[i]*height[i]*depth[i]*sizeof(BufferType)) > max_mem_alloc_size);
+
+        // Check to see if adequately sized buffers were found.
+        if (tries >= max_tries) {
+            log_error("Error: Could not find random buffer sized less than %llu bytes in %lu tries.\n",
+                      max_mem_alloc_size, max_tries);
+            return -1;
+        }
+
+        // Keep track of the dimensions of the largest buffer.
+        max_size = (size_bytes > max_size) ? size_bytes : max_size;
+        total_bytes += size_bytes;
+
+        log_info("Buffer[%u] is (%lu,%lu,%lu) = %lu MB (truncated)\n",i,width[i],height[i],depth[i],(size_bytes)/1048576);
+    }
+
+    log_info( "Total size: %lu MB (truncated)\n", total_bytes/1048576 );
+
+    // Allocate a temporary buffer for read and write operations.
+    tmp_buffer_size  = max_size;
+    tmp_buffer = (BufferType*)malloc(tmp_buffer_size);
+
+    // Initialize cl buffers
+    log_info( "Initializing buffers\n" );
+    for (unsigned i=0; i != TotalImages; ++i) {
+
+        size_t size_bytes = width[i]*height[i]*depth[i]*sizeof(BufferType);
+
+        // Allocate a host copy of the buffer for verification.
+        verify[i] = (BufferType*)malloc(size_bytes);
+        CL_EXIT_ERROR(verify[i] ? CL_SUCCESS : -1, "malloc of host buffer failed for buffer %u", i);
+
+        // Allocate the buffer in host memory.
+        backing[i] = (BufferType*)malloc(size_bytes);
+        CL_EXIT_ERROR(backing[i] ? CL_SUCCESS : -1, "malloc of backing buffer failed for buffer %u", i);
+
+        // Generate a random buffer.
+        log_info( "Initializing buffer %u\n", i );
+        initialize_image(verify[i], width[i], height[i], depth[i], mt);
+
+        // Copy the image into a buffer which will passed to CL.
+        memcpy(backing[i], verify[i], size_bytes);
+
+        // Create the CL buffer.
+        buffer[i] = clCreateBuffer (context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, size_bytes, backing[i], &err);
+        CL_EXIT_ERROR(err,"clCreateBuffer failed for buffer %u", i);
+
+        // Make sure buffer is cleaned up appropriately if we encounter an error in the rest of the calls.
+        err = clSetMemObjectDestructorCallback( buffer[i], mem_obj_destructor_callback, backing[i] );
+        CL_EXIT_ERROR(err, "Unable to set mem object destructor callback" );
+    }
+
+    // Main test loop, run num_tries times.
+    log_info( "Executing %u test operations selected at random.\n", (unsigned)num_tries );
+    for (size_t iter = 0; iter < num_tries; ++iter) {
+
+        // Determine a source and a destination.
+        size_t src = get_random_size_t(0,TotalImages,mt);
+        size_t dst = get_random_size_t(0,TotalImages,mt);
+
+        // Determine the minimum dimensions.
+        size_t min_width = width[src] < width[dst] ? width[src] : width[dst];
+        size_t min_height = height[src] < height[dst] ? height[src] : height[dst];
+        size_t min_depth = depth[src] < depth[dst] ? depth[src] : depth[dst];
+
+        // Generate a random source rectangle within the minimum dimensions.
+        size_t mx = get_random_size_t(0, min_width-1, mt);
+        size_t my = get_random_size_t(0, min_height-1, mt);
+        size_t mz = get_random_size_t(0, min_depth-1, mt);
+
+        size_t sw = get_random_size_t(1, (min_width - mx), mt);
+        size_t sh = get_random_size_t(1, (min_height - my), mt);
+        size_t sd = get_random_size_t(1, (min_depth - mz), mt);
+
+        size_t sx = get_random_size_t(0, width[src]-sw, mt);
+        size_t sy = get_random_size_t(0, height[src]-sh, mt);
+        size_t sz = get_random_size_t(0, depth[src]-sd, mt);
+
+        size_t soffset[] = { sx, sy, sz };
+        size_t sregion[] = { sw, sh, sd };
+
+        // Generate a destination rectangle of the same size.
+        size_t dw = sw;
+        size_t dh = sh;
+        size_t dd = sd;
+
+        // Generate a random destination offset within the buffer.
+        size_t dx = get_random_size_t(0, (width[dst] - dw), mt);
+        size_t dy = get_random_size_t(0, (height[dst] - dh), mt);
+        size_t dz = get_random_size_t(0, (depth[dst] - dd), mt);
+        size_t doffset[] = { dx, dy, dz };
+        size_t dregion[] = { dw, dh, dd };
+
+        // Execute one of three operations:
+        // - Copy: Copies between src and dst within each set of host, buffer, and images.
+        // - Read & verify: Reads src region from buffer and image, and compares to host.
+        // - Write: Generates new buffer with src dimensions, and writes to cl buffer and image.
+
+        enum { TotalOperations = 3 };
+        size_t operation = get_random_size_t(0,TotalOperations,mt);
+
+        switch (operation) {
+            case 0:
+                log_info("%lu Copy %lu offset (%lu,%lu,%lu) -> %lu offset (%lu,%lu,%lu) region (%lux%lux%lu = %lu)\n",
+                         iter,
+                         src, soffset[0], soffset[1], soffset[2],
+                         dst, doffset[0], doffset[1], doffset[2],
+                         sregion[0], sregion[1], sregion[2],
+                         sregion[0]*sregion[1]*sregion[2]);
+                if ((err = copy_region(src, soffset, sregion, dst, doffset, dregion)))
+                    return err;
+                break;
+            case 1:
+                log_info("%lu Read %lu offset (%lu,%lu,%lu) -> %lu offset (%lu,%lu,%lu) region (%lux%lux%lu = %lu)\n",
+                         iter,
+                         src, soffset[0], soffset[1], soffset[2],
+                         dst, doffset[0], doffset[1], doffset[2],
+                         sregion[0], sregion[1], sregion[2],
+                         sregion[0]*sregion[1]*sregion[2]);
+                if ((err = read_verify_region(src, soffset, sregion, dst, doffset, dregion)))
+                    return err;
+                break;
+            case 2:
+                log_info("%lu Write %lu offset (%lu,%lu,%lu) -> %lu offset (%lu,%lu,%lu) region (%lux%lux%lu = %lu)\n",
+                         iter,
+                         src, soffset[0], soffset[1], soffset[2],
+                         dst, doffset[0], doffset[1], doffset[2],
+                         sregion[0], sregion[1], sregion[2],
+                         sregion[0]*sregion[1]*sregion[2]);
+                if ((err = write_region(src, soffset, sregion, dst, doffset, dregion)))
+                    return err;
+                break;
+        }
+
+#if 0
+        // Uncomment this section to verify each operation.
+        // If commented out, verification won't occur until the end of the
+        // test, and it will not be possible to determine which operation failed.
+        log_info("Verify src %lu offset (%u,%u,%u) region (%lux%lux%lu)\n", src, 0, 0, 0, width[src], height[src], depth[src]);
+        if (err = map_verify_region(src))
+            return err;
+
+        log_info("Verify dst %lu offset (%u,%u,%u) region (%lux%lux%lu)\n", dst, 0, 0, 0, width[dst], height[dst], depth[dst]);
+        if (err = map_verify_region(dst))
+            return err;
+
+
+#endif
+
+    } // end main for loop.
+
+    for (unsigned i=0;i<TotalImages;++i) {
+        log_info("Verify %u offset (%u,%u,%u) region (%lux%lux%lu)\n", i, 0, 0, 0, width[i], height[i], depth[i]);
+        if ((err = map_verify_region(i)))
+            return err;
+    }
+
+    // Clean-up.
+    free_mtdata(mt);
+    for (unsigned i=0;i<TotalImages;++i) {
+        free( verify[i] );
+        clReleaseMemObject( buffer[i] );
+    }
+    free( tmp_buffer );
+
+    if (!err) {
+        log_info("RECT read, write test passed\n");
+    }
+
+    return err;
+}
+
+
+
--- a/test_conformance/basic/test_constant.c
+++ b/test_conformance/basic/test_constant.c
@@ -0,0 +1,262 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+
+// OpenCL C source for "constant_kernel": each work-item reads one float and one
+// int from two __constant buffers at its global id, converts the int to float,
+// and stores the product of the two in out at the same index.
+const char *constant_kernel_code =
+"__kernel void constant_kernel(__global float *out, __constant float *tmpF, __constant int *tmpI)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    float ftmp = tmpF[tid]; \n"
+"    float Itmp = tmpI[tid]; \n"
+"    out[tid] = ftmp * Itmp; \n"
+"}\n";
+
+// OpenCL C source for "loop_constant_kernel": every work-item sums the elements
+// i_pos[0], i_pos[3], ..., i_pos[(num-1)*3] of a constant buffer in a loop and
+// writes that sum to out at its global id (all work-items produce the same sum).
+const char *loop_constant_kernel_code =
+"kernel void loop_constant_kernel(global float *out, constant float *i_pos, int num)\n"
+"{\n"
+"    int tid = get_global_id(0);\n"
+"    float sum = 0;\n"
+"    for (int i = 0; i < num; i++) {\n"
+"        float  pos  = i_pos[i*3];\n"
+"        sum += pos;\n"
+"    }\n"
+"    out[tid] = sum;\n"
+"}\n";
+
+
+// Host-side check for constant_kernel: each of the n output elements must equal
+// the product of the matching float and int inputs.
+// Logs the outcome and returns 0 when everything matches, -1 on the first mismatch.
+static int
+verify(cl_float *tmpF, cl_int *tmpI, cl_float *out, int n)
+{
+    int idx = 0;
+
+    while (idx < n)
+    {
+        // Store the product in a float so the comparison is made on a float value.
+        const float expected = tmpF[idx] * tmpI[idx];
+        if( expected != out[idx] )
+        {
+            log_error("CONSTANT test failed\n");
+            return -1;
+        }
+        ++idx;
+    }
+
+    log_info("CONSTANT test passed\n");
+    return 0;
+}
+
+
+// Host-side check for loop_constant_kernel: each of the n output elements must
+// equal tmp[0] + tmp[3] + ... + tmp[(l-1)*3], accumulated in that order.
+// Logs the outcome and returns 0 when everything matches, -1 on the first mismatch.
+static int
+verify_loop_constant(const cl_float *tmp, cl_float *out, cl_int l, int n)
+{
+    int elem;
+
+    for (elem = 0; elem != n && elem < n; ++elem)
+    {
+        // Recompute the reference sum for this element.
+        float expected = 0;
+        cl_int step = 0;
+        while (step < l)
+        {
+            expected += tmp[step*3];
+            ++step;
+        }
+
+        if( expected != out[elem] )
+        {
+            log_error("loop CONSTANT test failed\n");
+            return -1;
+        }
+    }
+
+    log_info("loop CONSTANT test passed\n");
+    return 0;
+}
+
+// Exercises kernels that read their inputs through __constant pointers.
+//
+// Buffer sizes are derived from CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: a quarter of
+// the reported size is split into an int array and a float array of equal
+// length. Two kernels are then run over that many work-items:
+//   1. constant_kernel      - out[i] = tmpF[i] * tmpI[i]
+//   2. loop_constant_kernel - out[i] = tmpF[0] + tmpF[3] (limit == 2)
+// and their results are checked on the host by verify() and
+// verify_loop_constant().
+//
+// num_elements is not used; the problem size comes from the device query.
+// Returns 0 when both kernels verify, non-zero if any step or check fails.
+int
+test_constant(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_mem          streams[3];
+    cl_int          *tmpI;
+    cl_float        *tmpF, *out;
+    cl_program      program;
+    cl_kernel       kernel;
+    size_t          global_threads[3];
+    int             err;
+    int             verify_err;     // result of the first kernel's verification
+    unsigned int    i;
+    cl_ulong        maxSize;
+    size_t          num_floats, num_ints, constant_values;
+    MTdata          d;
+    RoundingMode    oldRoundMode;
+    int             isRTZ = 0;
+
+    /* Verify our test buffer won't be bigger than allowed */
+    err = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 );
+    test_error( err, "Unable to get max constant buffer size" );
+
+    log_info("Device reports CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE %llu bytes.\n", maxSize);
+    maxSize/=4;
+    num_ints = (size_t)maxSize/sizeof(cl_int);
+    num_floats = (size_t)maxSize/sizeof(cl_float);
+    // Both constant arrays use the same element count: the smaller of the two.
+    if (num_ints >= num_floats) {
+        constant_values = num_floats;
+    } else {
+        constant_values = num_ints;
+    }
+
+    log_info("Test will attempt to use %lu bytes with one %lu byte constant int buffer and one %lu byte constant float buffer.\n",
+             constant_values*sizeof(cl_int) + constant_values*sizeof(cl_float), constant_values*sizeof(cl_int), constant_values*sizeof(cl_float));
+
+    tmpI = (cl_int*)malloc(sizeof(cl_int) * constant_values);
+    tmpF = (cl_float*)malloc(sizeof(cl_float) * constant_values);
+    out  = (cl_float*)malloc(sizeof(cl_float) * constant_values);
+    if (!tmpI || !tmpF || !out)
+    {
+        log_error("Failed to allocate host buffers\n");
+        free(tmpI);
+        free(tmpF);
+        free(out);
+        return -1;
+    }
+
+    // streams[0] = output, streams[1] = float input, streams[2] = int input.
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_float) * constant_values, NULL, NULL);
+    if (!streams[0])
+    {
+        log_error("clCreateBuffer failed\n");
+        return -1;
+    }
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_float) * constant_values, NULL, NULL);
+    if (!streams[1])
+    {
+        log_error("clCreateBuffer failed\n");
+        return -1;
+    }
+    streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * constant_values, NULL, NULL);
+    if (!streams[2])
+    {
+        log_error("clCreateBuffer failed\n");
+        return -1;
+    }
+
+    // Fill both inputs with random values generated from the harness seed.
+    d = init_genrand( gRandomSeed );
+    for (i=0; i<constant_values; i++) {
+        tmpI[i] = (int)get_random_float(-0x02000000, 0x02000000, d);
+        tmpF[i] = get_random_float(-0x02000000, 0x02000000, d);
+    }
+    free_mtdata(d); d = NULL;
+
+    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)tmpF, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("clWriteArray failed\n");
+        return -1;
+    }
+    err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, sizeof(cl_int)*constant_values, (void *)tmpI, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("clWriteArray failed\n");
+        return -1;
+    }
+
+    err = create_single_kernel_helper(context, &program, &kernel, 1, &constant_kernel_code, "constant_kernel" );
+    if (err) {
+        log_error("Failed to create kernel and program: %d\n", err);
+        return -1;
+    }
+
+
+    err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
+    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
+    err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
+    if (err != CL_SUCCESS)
+    {
+        log_error("clSetKernelArgs failed\n");
+        return -1;
+    }
+
+    global_threads[0] = constant_values;
+    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, NULL, 0, NULL, NULL );
+    if (err != CL_SUCCESS)
+    {
+        log_error("clEnqueueNDRangeKernel failed: %d\n", err);
+        return -1;
+    }
+    err = clEnqueueReadBuffer( queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)out, 0, NULL, NULL );
+    if (err != CL_SUCCESS)
+    {
+        log_error("clEnqueueReadBuffer failed\n");
+        return -1;
+    }
+
+    //If we only support rtz mode
+    if( CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device) && gIsEmbedded)
+    {
+        oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        isRTZ = 1;
+    }
+
+    // Keep this result in its own variable: err is reused by the loop test
+    // below, and a failure here must still fail the whole test.
+    verify_err = verify(tmpF, tmpI, out, (int)constant_values);
+
+    if (isRTZ)
+        (void)set_round(oldRoundMode, kfloat);
+
+    // Loop constant buffer test
+    cl_program loop_program;
+    cl_kernel  loop_kernel;
+    cl_int limit = 2;
+
+    memset(out, 0, sizeof(cl_float) * constant_values);
+    err = create_single_kernel_helper(context, &loop_program, &loop_kernel, 1,
+                                      &loop_constant_kernel_code, "loop_constant_kernel" );
+    if (err) {
+        log_error("Failed to create loop kernel and program: %d\n", err);
+        return -1;
+    }
+
+    err = clSetKernelArg(loop_kernel, 0, sizeof streams[0], &streams[0]);
+    err |= clSetKernelArg(loop_kernel, 1, sizeof streams[1], &streams[1]);
+    err |= clSetKernelArg(loop_kernel, 2, sizeof(limit), &limit);
+    if (err != CL_SUCCESS) {
+        log_error("clSetKernelArgs for loop kernel failed\n");
+        return -1;
+    }
+
+    err = clEnqueueNDRangeKernel( queue, loop_kernel, 1, NULL, global_threads, NULL, 0, NULL, NULL );
+    if (err != CL_SUCCESS) {
+        log_error("clEnqueueNDRangeKernel failed: %d\n", err);
+        return -1;
+    }
+    err = clEnqueueReadBuffer( queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*constant_values, (void *)out, 0, NULL, NULL );
+    if (err != CL_SUCCESS) {
+        log_error("clEnqueueReadBuffer failed\n");
+        return -1;
+    }
+
+    err = verify_loop_constant(tmpF, out, limit, (int)constant_values);
+
+    // cleanup
+    clReleaseMemObject(streams[0]);
+    clReleaseMemObject(streams[1]);
+    clReleaseMemObject(streams[2]);
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+    clReleaseKernel(loop_kernel);
+    clReleaseProgram(loop_program);
+    free(tmpI);
+    free(tmpF);
+    free(out);
+
+    // Report a failure if either of the two verifications failed.
+    return verify_err ? verify_err : err;
+}
+
+
+
+
+
--- a/test_conformance/basic/test_constant_source.cpp
+++ b/test_conformance/basic/test_constant_source.cpp
@@ -0,0 +1,101 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+
+// OpenCL C source for a program that declares program-scope __constant data
+// (a scalar, an index, and a 16-entry array) and a kernel "constant_kernel":
+//   - work-item 0 writes outVal to out[0] and outValues[outIndex] to out[1];
+//   - every other work-item writes outValues[tid] to out[tid + 1].
+const char *constant_source_kernel_code[] = {
+"__constant int outVal = 42;\n"
+"__constant int outIndex = 7;\n"
+"__constant int outValues[ 16 ] = { 17, 01, 11, 12, 1955, 11, 5, 1985, 113, 1, 24, 1984, 7, 23, 1979, 97 };\n"
+"\n"
+"__kernel void constant_kernel( __global int *out )\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    if( tid == 0 )\n"
+"    {\n"
+"        out[ 0 ] = outVal;\n"
+"        out[ 1 ] = outValues[ outIndex ];\n"
+"    }\n"
+"    else\n"
+"    {\n"
+"        out[ tid + 1 ] = outValues[ tid ];\n"
+"    }\n"
+"}\n" };
+
+// Builds the program in constant_source_kernel_code, runs "constant_kernel" with
+// 16 work-items and compares the 17 values it writes against a host-side table.
+// Slot 0 holds the scalar constant, slot 1 a constant-indexed array element and
+// slots 2..16 variable-indexed array elements.
+// Returns 0 on success, -1 on the first mismatching value; API failures return
+// through test_error.
+int test_constant_source(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper outStream;
+
+    cl_int expectedValues[ 17 ] = { 42, 1985, 01, 11, 12, 1955, 11, 5, 1985, 113, 1, 24, 1984, 7, 23, 1979, 97 };
+    cl_int outValues[ 17 ];
+    const int valueCount = (int)( sizeof( outValues ) / sizeof( outValues[ 0 ] ) );
+    size_t threads[ 1 ] = { 16 };
+    cl_int error;
+
+    // Build the program and fetch the kernel.
+    error = create_single_kernel_helper( context, &program, &kernel, 1, constant_source_kernel_code, "constant_kernel" );
+    test_error( error, "Unable to create testing kernel" );
+
+    // Device buffer that receives all 17 results.
+    outStream = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof( outValues ), NULL, &error );
+    test_error( error, "Unable to create output buffer" );
+
+    error = clSetKernelArg( kernel, 0, sizeof( outStream ), &outStream );
+    test_error( error, "Unable to set kernel argument" );
+
+    // Launch and read the results back with a blocking read.
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL );
+    test_error( error, "Unable to enqueue kernel" );
+
+    error = clEnqueueReadBuffer( queue, outStream, CL_TRUE, 0, sizeof( outValues ), outValues, 0, NULL, NULL );
+    test_error( error, "Unable to read results" );
+
+    // Compare slot by slot; the message names which kind of constant access failed.
+    for( int slot = 0; slot < valueCount; ++slot )
+    {
+        if( outValues[ slot ] == expectedValues[ slot ] )
+            continue;
+
+        switch( slot )
+        {
+            case 0:
+                log_error( "ERROR: Output value %d from constant source global did not validate! (Expected %d, got %d)\n", slot, expectedValues[ slot ], outValues[ slot ] );
+                break;
+            case 1:
+                log_error( "ERROR: Output value %d from constant-indexed constant array did not validate! (Expected %d, got %d)\n", slot, expectedValues[ slot ], outValues[ slot ] );
+                break;
+            default:
+                log_error( "ERROR: Output value %d from variable-indexed constant array did not validate! (Expected %d, got %d)\n", slot, expectedValues[ slot ], outValues[ slot ] );
+                break;
+        }
+        return -1;
+    }
+
+    return 0;
+}
+
+
+
+
+
--- a/test_conformance/basic/test_createkernelsinprogram.c
+++ b/test_conformance/basic/test_createkernelsinprogram.c
@@ -0,0 +1,105 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+
+// OpenCL C source containing exactly one kernel, "sample_test", which converts
+// src[tid] from float to int and stores it in dst[tid].
+const char *sample_single_kernel = {
+"__kernel void sample_test(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid];\n"
+"\n"
+"}\n"};
+
+// OpenCL C source containing two kernels, "sample_test" and "sample_test2",
+// with identical bodies: each converts src[tid] from float to int and stores it
+// in dst[tid].
+const char *sample_double_kernel = {
+"__kernel void sample_test(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid];\n"
+"\n"
+"}\n"
+"__kernel void sample_test2(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid];\n"
+"\n"
+"}\n"};
+
+
+// Verifies clCreateKernelsInProgram() on a program with one kernel and on a
+// program with two kernels: the call must succeed and report exactly as many
+// kernels as the source defines.
+//
+// device, queue and num_elements are not used by this test.
+// Returns 0 (CL_SUCCESS) when both cases pass and -1 otherwise.
+int
+test_createkernelsinprogram(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_program        program;
+    cl_kernel        kernel[2];
+    unsigned int    num_kernels = 0;
+    int                err;
+
+    // Case 1: a program that defines a single kernel.
+    err = create_single_kernel_helper(context, &program, NULL, 1, &sample_single_kernel, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("create_single_kernel_helper failed\n");
+        return -1;
+    }
+
+    err = clCreateKernelsInProgram(program, 1, kernel, &num_kernels);
+    if ( (err != CL_SUCCESS) || (num_kernels != 1) )
+    {
+        log_error("clCreateKernelsInProgram test failed for a single kernel\n");
+        // Do not leak the program when bailing out.
+        clReleaseProgram(program);
+        return -1;
+    }
+
+    clReleaseKernel(kernel[0]);
+    clReleaseProgram(program);
+
+    // Case 2: a program that defines two kernels.
+    num_kernels = 0;
+    err = create_single_kernel_helper(context, &program, NULL, 1, &sample_double_kernel, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("create_single_kernel_helper failed\n");
+        return -1;
+    }
+
+    err = clCreateKernelsInProgram(program, 2, kernel, &num_kernels);
+    if ( (err != CL_SUCCESS) || (num_kernels != 2) )
+    {
+        log_error("clCreateKernelsInProgram test failed for two kernels\n");
+        // Do not leak the program when bailing out.
+        clReleaseProgram(program);
+        return -1;
+    }
+
+    log_info("clCreateKernelsInProgram test passed\n");
+
+    clReleaseKernel(kernel[0]);
+    clReleaseKernel(kernel[1]);
+    clReleaseProgram(program);
+
+
+    return err;
+}
+
+
+
+
+
--- a/test_conformance/basic/test_enqueue_map.cpp
+++ b/test_conformance/basic/test_enqueue_map.cpp
@@ -0,0 +1,254 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+// Host-pointer related cl_mem_flags combinations that the map tests iterate
+// over. flag_set_names below must list the same entries in the same order: the
+// tests index both arrays with the same position to log the flags in use.
+const cl_mem_flags flag_set[] = {
+  CL_MEM_ALLOC_HOST_PTR,
+  CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
+  CL_MEM_USE_HOST_PTR,
+  CL_MEM_COPY_HOST_PTR,
+  0
+};
+// Printable names for the entries of flag_set, in matching order.
+const char* flag_set_names[] = {
+  "CL_MEM_ALLOC_HOST_PTR",
+  "CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR",
+  "CL_MEM_USE_HOST_PTR",
+  "CL_MEM_COPY_HOST_PTR",
+  "0"
+};
+
+// Tests clEnqueueMapBuffer for every entry of flag_set.
+//
+// For each flag combination a 256*256 byte buffer is created and filled with
+// random data. 128 randomly chosen sub-ranges are then mapped for read/write,
+// each mapped byte is replaced by (spin - byte) for a random spin, and the same
+// update is applied to the host reference copy. Finally the whole buffer is
+// read back and compared with the reference.
+// Returns 0 on success and -1 on a map failure or data mismatch; other API
+// failures return through test_error.
+int test_enqueue_map_buffer(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    const size_t bufferSize = 256*256;
+    const size_t bufferBytes = bufferSize * sizeof( cl_char );
+    const size_t flagCount = sizeof(flag_set)/sizeof(flag_set[0]);
+    int src_flag_id;
+    MTdata d = init_genrand( gRandomSeed );
+    cl_char *initialData = (cl_char*)malloc(bufferSize);
+    cl_char *finalData = (cl_char*)malloc(bufferSize);
+
+    for (src_flag_id=0; (size_t)src_flag_id < flagCount; src_flag_id++)
+    {
+        clMemWrapper memObject;
+        const cl_mem_flags flags = flag_set[src_flag_id];
+        // True when the buffer takes its initial contents from a host pointer.
+        const bool usesHostPtr = ( flags & ( CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR ) ) != 0;
+
+        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+
+        generate_random_data( kChar, (unsigned int)bufferSize, d, initialData );
+
+        memObject = clCreateBuffer(context, flags, bufferBytes, usesHostPtr ? initialData : NULL, &error);
+        test_error( error, "Unable to create testing buffer" );
+
+        // Without a host pointer the reference data has to be uploaded explicitly.
+        if (!usesHostPtr)
+        {
+            error = clEnqueueWriteBuffer(queue, memObject, CL_TRUE, 0, bufferBytes, initialData, 0, NULL, NULL);
+            test_error( error, "clEnqueueWriteBuffer failed");
+        }
+
+        for( int pass = 0; pass < 128; pass++ )
+        {
+            // Pick a random sub-range [offset, offset + length) inside the buffer.
+            size_t offset = (size_t)random_in_range( 0, (int)bufferSize - 1, d );
+            size_t length = (size_t)random_in_range( 1, (int)( bufferSize - offset ), d );
+
+            cl_char *mappedRegion = (cl_char *)clEnqueueMapBuffer( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+                                                                  offset, length, 0, NULL, NULL, &error );
+            if( error != CL_SUCCESS )
+            {
+                print_error( error, "clEnqueueMapBuffer call failed" );
+                log_error( "\tOffset: %d  Length: %d\n", (int)offset, (int)length );
+                free( initialData );
+                free( finalData );
+                free_mtdata(d);
+                return -1;
+            }
+
+            // Read and rewrite every mapped byte, then apply the same update to
+            // the host reference (in that order).
+            for( size_t j = 0; j < length; j++ )
+            {
+                cl_char spin = (cl_char)genrand_int32( d );
+                cl_char *reference = &initialData[ offset + j ];
+
+                mappedRegion[ j ] = (cl_char)( spin - mappedRegion[ j ] );
+                *reference = (cl_char)( spin - *reference );
+            }
+
+            error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
+            test_error( error, "Unable to unmap buffer" );
+        }
+
+        // Final validation: read the whole buffer back and compare with the reference.
+        error = clEnqueueReadBuffer( queue, memObject, CL_TRUE, 0, bufferBytes, finalData, 0, NULL, NULL );
+        test_error( error, "Unable to read results" );
+
+        for( size_t q = 0; q < bufferSize; q++ )
+        {
+            if( finalData[ q ] == initialData[ q ] )
+                continue;
+
+            log_error( "ERROR: Sample %d did not validate! Got %d, expected %d\n", (int)q, (int)finalData[ q ], (int)initialData[ q ] );
+            free( initialData );
+            free( finalData );
+            free_mtdata(d);
+            return -1;
+        }
+    } // cl_mem flags
+
+    free( initialData );
+    free( finalData );
+    free_mtdata(d);
+
+    return 0;
+}
+
+// Tests clEnqueueMapImage for every entry of flag_set.
+//
+// For each flag combination a 256x256 CL_RGBA / CL_UNSIGNED_INT32 2D image is
+// created and filled with random data. 128 randomly chosen rectangles are then
+// mapped for read/write, every mapped channel value is replaced by
+// (spin - value) for a random spin, and the same update is applied to the host
+// reference copy. Finally the whole image is read back and compared with the
+// reference.
+//
+// Returns 0 on success (or when PASSIVE_REQUIRE_IMAGE_SUPPORT skips the test)
+// and -1 on an unsupported format, allocation failure, map failure or data
+// mismatch; other API failures return through test_error.
+int test_enqueue_map_image(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    int error;
+    cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT32 };
+    const size_t imageSize = 256;
+    const size_t channelCount = 4;  // CL_RGBA: four cl_uint values per pixel
+    const size_t totalValues = imageSize * imageSize * channelCount;
+    int src_flag_id;
+    cl_uint *initialData;
+    cl_uint *finalData;
+    MTdata  d;
+
+    PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
+
+    initialData = (cl_uint*)malloc(totalValues * sizeof(cl_uint));
+    finalData = (cl_uint*)malloc(totalValues * sizeof(cl_uint));
+    if( !initialData || !finalData )
+    {
+        log_error( "ERROR: Unable to allocate host image data\n" );
+        free(initialData);
+        free(finalData);
+        return -1;
+    }
+
+    if( !is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &format ) )
+    {
+        log_error( "ERROR: Test requires basic OpenCL 1.0 format CL_RGBA:CL_UNSIGNED_INT32, which is unsupported by this device!\n" );
+        free(initialData);
+        free(finalData);
+        return -1;
+    }
+
+    d = init_genrand( gRandomSeed );
+  for (src_flag_id=0; src_flag_id < sizeof(flag_set)/sizeof(flag_set[0]); src_flag_id++) {
+    clMemWrapper memObject;
+    log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);
+
+    // Initialize every channel of every pixel; the reference array holds
+    // imageSize * imageSize * 4 values, not just one value per pixel.
+    generate_random_data( kUInt, (unsigned int)totalValues, d, initialData );
+
+    if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
+      memObject = create_image_2d( context, CL_MEM_READ_WRITE | flag_set[src_flag_id], &format,
+                                  imageSize, imageSize, 0, initialData, &error );
+    else
+      memObject = create_image_2d( context, CL_MEM_READ_WRITE | flag_set[src_flag_id], &format,
+                                  imageSize, imageSize, 0, NULL, &error );
+    test_error( error, "Unable to create testing buffer" );
+
+    // Without a host pointer the reference data has to be uploaded explicitly.
+    if (!(flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) && !(flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR)) {
+      size_t write_origin[3]={0,0,0}, write_region[3]={imageSize, imageSize, 1};
+      error = clEnqueueWriteImage(queue, memObject, CL_TRUE, write_origin, write_region, NULL, NULL, initialData, 0, NULL, NULL);
+      test_error( error, "Unable to write to testing buffer" );
+    }
+
+    for( int i = 0; i < 128; i++ )
+    {
+
+      size_t offset[3], region[3];
+      size_t rowPitch;
+
+      // Pick a random rectangle inside the image. The extent may be anything
+      // from 1 up to the distance to the image edge, so the requested range is
+      // never empty (maximum >= minimum) and offset + region <= imageSize.
+      offset[ 0 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
+      region[ 0 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 0 ] ), d );
+      offset[ 1 ] = (size_t)random_in_range( 0, (int)imageSize - 1, d );
+      region[ 1 ] = (size_t)random_in_range( 1, (int)( imageSize - offset[ 1 ] ), d );
+      offset[ 2 ] = 0;
+      region[ 2 ] = 1;
+      cl_uint *mappedRegion = (cl_uint *)clEnqueueMapImage( queue, memObject, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+                                                           offset, region, &rowPitch, NULL, 0, NULL, NULL, &error );
+      if( error != CL_SUCCESS )
+      {
+        print_error( error, "clEnqueueMapImage call failed" );
+        log_error( "\tOffset: %d,%d  Region: %d,%d\n", (int)offset[0], (int)offset[1], (int)region[0], (int)region[1] );
+        free(initialData);
+        free(finalData);
+        free_mtdata(d);
+        return -1;
+      }
+
+      // Write into the region
+      for( size_t y = 0; y < region[ 1 ]; y++ )
+      {
+        for( size_t x = 0; x < region[ 0 ] * channelCount; x++ )
+        {
+          // Unsigned arithmetic wraps instead of overflowing, whatever 32-bit
+          // value the image currently holds.
+          const cl_uint spin = (cl_uint)random_in_range( 16, 1024, d );
+          // rowPitch is in bytes; convert to an index into the cl_uint mapping.
+          const size_t mappedIndex = ( y * rowPitch/sizeof(cl_uint) ) + x;
+          const size_t hostIndex = ( ( offset[ 1 ] + y ) * imageSize + offset[ 0 ] ) * channelCount + x;
+
+          // Test read AND write in one swipe
+          mappedRegion[ mappedIndex ] = spin - mappedRegion[ mappedIndex ];
+
+          // Also update the initial data array
+          initialData[ hostIndex ] = spin - initialData[ hostIndex ];
+        }
+      }
+
+      // Unmap
+      error = clEnqueueUnmapMemObject( queue, memObject, mappedRegion, 0, NULL, NULL );
+      test_error( error, "Unable to unmap buffer" );
+    }
+
+    // Final validation: read actual values of buffer and compare against our reference
+    size_t finalOrigin[3] = { 0, 0, 0 }, finalRegion[3] = { imageSize, imageSize, 1 };
+    error = clEnqueueReadImage( queue, memObject, CL_TRUE, finalOrigin, finalRegion, 0, 0, finalData, 0, NULL, NULL );
+    test_error( error, "Unable to read results" );
+
+    for( size_t q = 0; q < totalValues; q++ )
+    {
+      if( initialData[ q ] != finalData[ q ] )
+      {
+        log_error( "ERROR: Sample %d (coord %d,%d) did not validate! Got %d, expected %d\n", (int)q, (int)( ( q / 4 ) % imageSize ), (int)( ( q / 4 ) / imageSize ),
+                                    (int)finalData[ q ], (int)initialData[ q ] );
+        free(initialData);
+        free(finalData);
+        free_mtdata(d);
+        return -1;
+      }
+    }
+  } // cl_mem_flags
+
+    free(initialData);
+    free(finalData);
+    free_mtdata(d);
+    return 0;
+}
+
+
--- a/test_conformance/basic/test_enqueued_local_size.c
+++ b/test_conformance/basic/test_enqueued_local_size.c
@@ -0,0 +1,123 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "../../test_common/harness/rounding_mode.h"
+
+#include "procs.h"
+
+// OpenCL C source for "test_enqueued_local_size_2d": the work-item with global
+// id (0, 0) stores get_enqueued_local_size(0) in dst[0] and
+// get_enqueued_local_size(1) in dst[1]; all other work-items do nothing.
+static const char *enqueued_local_size_2d_code =
+"__kernel void test_enqueued_local_size_2d(global int *dst)\n"
+"{\n"
+"    if ((get_global_id(0) == 0) && (get_global_id(1) == 0))\n"
+"    {\n"
+"        dst[0] = (int)get_enqueued_local_size(0)\n;"
+"        dst[1] = (int)get_enqueued_local_size(1)\n;"
+"    }\n"
+"}\n";
+
+// OpenCL C source for "test_enqueued_local_size_1d": the work-item with global
+// id 0 stores get_enqueued_local_size(0) in dst[0]; all other work-items do
+// nothing.
+static const char *enqueued_local_size_1d_code =
+"__kernel void test_enqueued_local_size_1d(global int *dst)\n"
+"{\n"
+"    int  tid_x = get_global_id(0);\n"
+"    if (get_global_id(0) == 0)\n"
+"    {\n"
+"        dst[tid_x] = (int)get_enqueued_local_size(0)\n;"
+"    }\n"
+"}\n";
+
+
+// Compares the first n values returned by the kernel with the local sizes that
+// were passed to the enqueue call (each expected size is narrowed to int first).
+// Logs the outcome and returns 0 if all n values match, -1 on the first mismatch.
+static int
+verify_enqueued_local_size(int *result, size_t *expected, int n)
+{
+    int dim = 0;
+
+    while (dim < n)
+    {
+        const int wanted = (int)expected[dim];
+        if (wanted != result[dim])
+        {
+            log_error("get_enqueued_local_size failed\n");
+            return -1;
+        }
+        ++dim;
+    }
+
+    log_info("get_enqueued_local_size passed\n");
+    return 0;
+}
+
+
+// Checks that get_enqueued_local_size() reports the local work size passed to
+// clEnqueueNDRangeKernel.
+//
+// Two launches are made, both with num_elements work-items per dimension:
+//   - a 2D launch of test_enqueued_local_size_2d with local size 16 x 11;
+//   - a 1D launch of test_enqueued_local_size_1d with local size 9.
+// Both programs are built with -cl-std=CL2.0. After each launch the values
+// written by the kernel are read back and compared with the requested local
+// size.
+//
+// Returns 0 when both launches verify and non-zero otherwise.
+int
+test_enqueued_local_size(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_mem streams;
+    cl_program program[2];   // [0] = 1D kernel, [1] = 2D kernel
+    cl_kernel kernel[2];
+
+    int *output_ptr;
+    size_t globalsize[2];
+    size_t localsize[2];
+    int err;
+    int result = 0;          // accumulates the outcome of both verifications
+
+    output_ptr   = (int*)malloc(2 * sizeof(int));
+    if (output_ptr == NULL)
+    {
+        log_error("Unable to allocate output buffer\n");
+        return -1;
+    }
+
+    streams = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), 2*sizeof(int), NULL, &err);
+    test_error( err, "clCreateBuffer failed.");
+
+    err = create_single_kernel_helper_with_build_options(context, &program[0], &kernel[0], 1, &enqueued_local_size_1d_code, "test_enqueued_local_size_1d", "-cl-std=CL2.0");
+    test_error( err, "create_single_kernel_helper failed");
+    err = create_single_kernel_helper_with_build_options(context, &program[1], &kernel[1], 1, &enqueued_local_size_2d_code, "test_enqueued_local_size_2d", "-cl-std=CL2.0");
+    test_error( err, "create_single_kernel_helper failed");
+
+    err  = clSetKernelArg(kernel[0], 0, sizeof streams, &streams);
+    test_error( err, "clSetKernelArgs failed.");
+    err  = clSetKernelArg(kernel[1], 0, sizeof streams, &streams);
+    test_error( err, "clSetKernelArgs failed.");
+
+    // 2D launch with the 2D kernel.
+    globalsize[0] = (size_t)num_elements;
+    globalsize[1] = (size_t)num_elements;
+    localsize[0] = 16;
+    localsize[1] = 11;
+    err = clEnqueueNDRangeKernel(queue, kernel[1], 2, NULL, globalsize, localsize, 0, NULL, NULL);
+    test_error( err, "clEnqueueNDRangeKernel failed.");
+
+    err = clEnqueueReadBuffer(queue, streams, CL_TRUE, 0, 2*sizeof(int), output_ptr, 0, NULL, NULL);
+    test_error( err, "clEnqueueReadBuffer failed.");
+
+    // Remember a failure here; err is reused by the API calls that follow.
+    result |= verify_enqueued_local_size(output_ptr, localsize, 2);
+
+    // 1D launch with the 1D kernel, which writes only dst[0].
+    globalsize[0] = (size_t)num_elements;
+    localsize[0] = 9;
+    err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, globalsize, localsize, 0, NULL, NULL);
+    test_error( err, "clEnqueueNDRangeKernel failed.");
+
+    err = clEnqueueReadBuffer(queue, streams, CL_TRUE, 0, 2*sizeof(int), output_ptr, 0, NULL, NULL);
+    test_error( err, "clEnqueueReadBuffer failed.");
+
+    result |= verify_enqueued_local_size(output_ptr, localsize, 1);
+
+    // cleanup
+    clReleaseMemObject(streams);
+    clReleaseKernel(kernel[0]);
+    clReleaseKernel(kernel[1]);
+    clReleaseProgram(program[0]);
+    clReleaseProgram(program[1]);
+    free(output_ptr);
+
+    return result;
+}
--- a/test_conformance/basic/test_explicit_s2v.cpp
+++ b/test_conformance/basic/test_explicit_s2v.cpp
@@ -0,0 +1,385 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+#include "../../test_common/harness/conversions.h"
+#include "../../test_common/harness/typeWrappers.h"
+
+/* Expands to the OpenCL C source (as adjacent string literals) of a kernel
+   named test_conversion.  Each work-item loads one scalar of type srctype from
+   sourceValues and stores it, explicitly cast to the vector type, in destValues.
+   srctype and dsttype must already be string literals; size is stringized here
+   and pasted directly after dsttype to form the vector type name. */
+#define DECLARE_S2V_IDENT_KERNEL(srctype,dsttype,size) \
+"__kernel void test_conversion(__global " srctype " *sourceValues, __global " dsttype #size " *destValues )\n"        \
+"{\n"                                                                            \
+"    int  tid = get_global_id(0);\n"                                        \
+"    " srctype "  src = sourceValues[tid];\n"                                        \
+"\n"                                                                            \
+"    destValues[tid] = (" dsttype #size ")src;\n"                        \
+"\n"                                                                            \
+"}\n"
+
+/* Expands to a brace-enclosed initializer holding four kernel sources, one for
+   each vector width 2, 4, 8 and 16 (in that order).  srctype is forwarded as a
+   string literal; dsttype is a bare token sequence that is stringized here. */
+#define DECLARE_S2V_IDENT_KERNELS(srctype,dsttype) \
+{        \
+DECLARE_S2V_IDENT_KERNEL(srctype,#dsttype,2), \
+DECLARE_S2V_IDENT_KERNEL(srctype,#dsttype,4), \
+DECLARE_S2V_IDENT_KERNEL(srctype,#dsttype,8), \
+DECLARE_S2V_IDENT_KERNEL(srctype,#dsttype,16) \
+}
+
+/* Initializer for a five-entry slot that has no kernel sources (all NULL). */
+#define DECLARE_EMPTY { NULL, NULL, NULL, NULL, NULL }
+
+/* Note: the next four arrays all must match in order and size to the ExplicitTypes enum in conversions.h!!! */
+
+/* Expands to one row of the kernel table: for the given source type (a bare
+   token sequence, stringized here), the kernel sources for each destination
+   type in the order listed, followed by one empty slot.  Fifteen slots total. */
+#define DECLARE_S2V_IDENT_KERNELS_SET(srctype)    \
+{                                                    \
+DECLARE_S2V_IDENT_KERNELS(#srctype,bool),            \
+            DECLARE_S2V_IDENT_KERNELS(#srctype,char),            \
+            DECLARE_S2V_IDENT_KERNELS(#srctype,uchar),            \
+            DECLARE_S2V_IDENT_KERNELS(#srctype,unsigned char),    \
+DECLARE_S2V_IDENT_KERNELS(#srctype,short),            \
+DECLARE_S2V_IDENT_KERNELS(#srctype,ushort),            \
+DECLARE_S2V_IDENT_KERNELS(#srctype,unsigned short),    \
+DECLARE_S2V_IDENT_KERNELS(#srctype,int),                \
+DECLARE_S2V_IDENT_KERNELS(#srctype,uint),            \
+DECLARE_S2V_IDENT_KERNELS(#srctype,unsigned int),    \
+DECLARE_S2V_IDENT_KERNELS(#srctype,long),            \
+DECLARE_S2V_IDENT_KERNELS(#srctype,ulong),            \
+DECLARE_S2V_IDENT_KERNELS(#srctype,unsigned long),    \
+DECLARE_S2V_IDENT_KERNELS(#srctype,float),            \
+DECLARE_EMPTY                                        \
+}
+
+/* Expands to a table row made of fifteen empty slots, i.e. a source type for
+   which no kernel sources are provided. */
+#define DECLARE_EMPTY_SET                \
+{                                                    \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY, \
+DECLARE_EMPTY    \
+}
+
+
+/* The overall array: kernel sources indexed as
+   [source type][destination type][vector-width index].
+   Rows and columns follow the order of the macro invocations below; the last
+   row and the last column of every row are empty (NULL) slots.  Width indices
+   0..3 hold the sources for vector widths 2, 4, 8 and 16; the fifth entry of a
+   populated slot gets no initializer and is therefore zero-initialized.
+   NOTE(review): fifteen rows/columns are provided here; this presumably equals
+   kNumExplicitTypes -- confirm against the ExplicitType enum in conversions.h. */
+const char * kernel_explicit_s2v_set[kNumExplicitTypes][kNumExplicitTypes][5] = {
+    DECLARE_S2V_IDENT_KERNELS_SET(bool),
+    DECLARE_S2V_IDENT_KERNELS_SET(char),
+    DECLARE_S2V_IDENT_KERNELS_SET(uchar),
+    DECLARE_S2V_IDENT_KERNELS_SET(unsigned char),
+    DECLARE_S2V_IDENT_KERNELS_SET(short),
+    DECLARE_S2V_IDENT_KERNELS_SET(ushort),
+    DECLARE_S2V_IDENT_KERNELS_SET(unsigned short),
+    DECLARE_S2V_IDENT_KERNELS_SET(int),
+    DECLARE_S2V_IDENT_KERNELS_SET(uint),
+    DECLARE_S2V_IDENT_KERNELS_SET(unsigned int),
+    DECLARE_S2V_IDENT_KERNELS_SET(long),
+    DECLARE_S2V_IDENT_KERNELS_SET(ulong),
+    DECLARE_S2V_IDENT_KERNELS_SET(unsigned long),
+    DECLARE_S2V_IDENT_KERNELS_SET(float),
+    DECLARE_EMPTY_SET
+};
+
+/* Helper for test_explicit_s2v_function: builds the test_conversion kernel from
+   programSrc, runs it over count work-items and validates the values read back
+   into outData.
+
+   outData is owned by the caller and must hold
+   get_explicit_type_size( destType ) * vecSize * count bytes.  Keeping the
+   ownership outside this function means every early exit below can simply
+   return without leaking the host buffer (test_error appears to return from
+   the enclosing function on failure -- confirm in the harness headers).
+
+   Returns 0 when every vector component matches the host conversion of the
+   corresponding input scalar, non-zero otherwise. */
+static int run_explicit_s2v_conversion(cl_context context, cl_command_queue queue, const char *programSrc,
+                                       ExplicitType srcType, unsigned int count, ExplicitType destType,
+                                       unsigned int vecSize, void *inputData, void *outData )
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    int error;
+    clMemWrapper streams[2];
+    unsigned char convertedData[ 8 ];    /* Max type size is 8 bytes */
+    size_t threadSize[3], groupSize[3];
+    unsigned int i, s;
+    unsigned char *inPtr, *outPtr;
+    size_t paramSize, destTypeSize, destStride;
+
+    const char* finalProgramSrc[2] = {
+        "", // optional pragma
+        programSrc
+    };
+
+    if (srcType == kDouble || destType == kDouble) {
+        finalProgramSrc[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+    }
+
+    paramSize = get_explicit_type_size( srcType );
+    destTypeSize = get_explicit_type_size( destType );
+    destStride = destTypeSize * vecSize;
+
+    if( create_single_kernel_helper( context, &program, &kernel, 2, finalProgramSrc, "test_conversion" ) )
+    {
+        /* Dump the source that failed to build to help diagnose the failure */
+        log_info( "****** %s%s *******\n", finalProgramSrc[0], finalProgramSrc[1] );
+        return -1;
+    }
+
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), paramSize * count, inputData, &error);
+    test_error( error, "clCreateBuffer failed");
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  destStride * count, NULL, &error);
+    test_error( error, "clCreateBuffer failed");
+
+    /* Set the arguments */
+    error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
+    test_error( error, "Unable to set indexed kernel arguments" );
+    error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
+    test_error( error, "Unable to set indexed kernel arguments" );
+
+    /* Run the kernel: one work-item per input scalar */
+    threadSize[0] = count;
+
+    error = get_max_common_work_group_size( context, kernel, threadSize[0], &groupSize[0] );
+    test_error( error, "Unable to get work group size to use" );
+
+    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threadSize, groupSize, 0, NULL, NULL );
+    test_error( error, "Unable to execute test kernel" );
+
+    /* Now verify the results. Each value should have been duplicated vecSize times, and we should be
+     able to just do a memcmp instead of relying on the actual type of data */
+    error = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, destStride * count, outData, 0, NULL, NULL );
+    test_error( error, "Unable to read output values!" );
+
+    inPtr = (unsigned char *)inputData;
+    outPtr = (unsigned char *)outData;
+
+    for( i = 0; i < count; i++ )
+    {
+        /* Convert the input data element to our output data type to compare against */
+        convert_explicit_value( (void *)inPtr, (void *)convertedData, srcType, false, kDefaultRoundingType, destType );
+
+        /* Now compare every element of the vector */
+        for( s = 0; s < vecSize; s++ )
+        {
+            if( memcmp( convertedData, outPtr + destTypeSize * s, destTypeSize ) != 0 )
+            {
+                /* Copy only the bytes that really belong to this element/vector into
+                   zeroed scratch words before printing.  Dereferencing the raw
+                   pointers as unsigned int would read past a 1- or 2-byte element
+                   (and past the end of the buffers for the last element), and the
+                   former mask shift was invalid for 8-byte source types. */
+                unsigned int inWord = 0;
+                unsigned int outWords[ 4 ] = { 0, 0, 0, 0 };
+                size_t inBytes = ( paramSize < sizeof( inWord ) ) ? paramSize : sizeof( inWord );
+                size_t outBytes = ( destStride < sizeof( outWords ) ) ? destStride : sizeof( outWords );
+
+                memcpy( &inWord, inPtr, inBytes );
+                memcpy( outWords, outPtr, outBytes );
+
+                log_error( "ERROR: Output value %d:%d does not validate for size %d:%d!\n", i, s, vecSize, (int)destTypeSize );
+                log_error( "       Input:   0x%0*x\n", (int)( paramSize * 2 ), inWord );
+                log_error( "       Actual:  0x%08x 0x%08x 0x%08x 0x%08x\n", outWords[ 0 ], outWords[ 1 ], outWords[ 2 ], outWords[ 3 ] );
+                return -1;
+            }
+        }
+        inPtr += paramSize;
+        outPtr += destStride;
+    }
+
+    return 0;
+}
+
+/* Tests the explicit cast of a scalar of srcType to a vecSize-wide vector of
+   destType using the kernel source in programSrc.
+
+   programSrc - kernel source; a NULL entry means "no kernel for this
+                combination" and is reported as success without running anything.
+   count      - number of scalars in inputData (and of work-items launched).
+   inputData  - count values of srcType.
+   deviceID is accepted for signature compatibility and is not used.
+
+   Returns 0 on success (or skip), non-zero on any failure.  The host output
+   buffer is released on every path. */
+int test_explicit_s2v_function(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *programSrc,
+                               ExplicitType srcType, unsigned int count, ExplicitType destType, unsigned int vecSize, void *inputData )
+{
+    size_t outBytes;
+    void *outData;
+    int result;
+
+    if( programSrc == NULL )
+        return 0;
+
+    outBytes = get_explicit_type_size( destType ) * vecSize * count;
+
+    outData = malloc( outBytes );
+    if( outData == NULL )
+    {
+        log_error( "ERROR: Unable to allocate %lu bytes for output values\n", (unsigned long)outBytes );
+        return -1;
+    }
+
+    result = run_explicit_s2v_conversion( context, queue, programSrc, srcType, count, destType, vecSize, inputData, outData );
+
+    free( outData );
+
+    return result;
+}
+
+/* Runs the scalar-to-vector cast test for one source type against every
+   destination type and vector width that is eligible.
+
+   A destination type is skipped when it is double without cl_khr_fp64, when it
+   is long/ulong and gHasLong is false, or when it differs from srcType (only
+   same-type casts are exercised).  Type names containing a space are skipped
+   as well.  After the first failing width for a destination type, the remaining
+   widths of that type are abandoned.
+
+   Returns 0 if nothing failed, -1 otherwise. */
+int test_explicit_s2v_function_set(cl_device_id deviceID, cl_context context, cl_command_queue queue, ExplicitType srcType,
+                                   unsigned int count, void *inputData )
+{
+    /* Zero-terminated list of vector widths to test */
+    unsigned int vectorWidths[] = { 2, 4, 8, 16, 0 };
+    int status = 0;
+    int widthIdx, dst;
+
+    for( dst = kBool; dst < kNumExplicitTypes; dst++ )
+    {
+        const bool needsFp64 = ( dst == kDouble );
+        const bool needsLong = ( dst == kLong || dst == kULong );
+
+        if( needsFp64 && !is_extension_available( deviceID, "cl_khr_fp64" ) )
+            continue;
+
+        if( needsLong && !gHasLong )
+            continue;
+
+        /* Only identity casts (same source and destination type) are run */
+        if( dst != srcType )
+            continue;
+
+        widthIdx = 0;
+        while( vectorWidths[ widthIdx ] != 0 )
+        {
+            const bool srcNameHasSpace = strchr( get_explicit_type_name( (ExplicitType)srcType ), ' ' ) != NULL;
+            const bool dstNameHasSpace = !srcNameHasSpace &&
+                                         strchr( get_explicit_type_name( (ExplicitType)dst ), ' ' ) != NULL;
+
+            if( !srcNameHasSpace && !dstNameHasSpace )
+            {
+                int rc = test_explicit_s2v_function( deviceID, context, queue,
+                                                    kernel_explicit_s2v_set[ srcType ][ dst ][ widthIdx ],
+                                                    srcType, count, (ExplicitType)dst, vectorWidths[ widthIdx ], inputData );
+                if( rc != 0 )
+                {
+                    log_error( "ERROR: Explicit cast of scalar %s to vector %s%d FAILED; skipping other %s vector tests\n",
+                              get_explicit_type_name(srcType), get_explicit_type_name((ExplicitType)dst), vectorWidths[ widthIdx ], get_explicit_type_name((ExplicitType)dst) );
+                    status = -1;
+                    break;
+                }
+            }
+
+            widthIdx++;
+        }
+    }
+
+    return status;
+}
+
+/* Scalar-to-vector cast test entry point for bool.
+   The test is always skipped: it logs a note that boolean vectors are not
+   defined and reports success without touching the device. */
+int test_explicit_s2v_bool(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    log_info( "NOTE: Boolean vectors not defined in OpenCL 1.0. Skipping test.\n" );
+    return 0;
+}
+
+/* Runs the scalar-to-vector cast tests with char as the source type over 128
+   randomly generated values.  Returns the result of the test set unchanged. */
+int test_explicit_s2v_char(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    char    inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kChar, valueCount, rng, inputValues );
+
+    int status = test_explicit_s2v_function_set( deviceID, context, queue, kChar, valueCount, inputValues );
+    return status;
+}
+
+/* Runs the scalar-to-vector cast tests with unsigned 8-bit input, first under
+   the kUChar type id and then under kUnsignedChar, over the same 128 random
+   values.  The second set is not run if the first fails.
+   Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_uchar(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    unsigned char    inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kUChar, valueCount, rng, inputValues );
+
+    int status = test_explicit_s2v_function_set( deviceID, context, queue, kUChar, valueCount, inputValues );
+    if( status == 0 )
+        status = test_explicit_s2v_function_set( deviceID, context, queue, kUnsignedChar, valueCount, inputValues );
+
+    return ( status != 0 ) ? -1 : 0;
+}
+
+/* Runs the scalar-to-vector cast tests with short as the source type over 128
+   randomly generated values.  Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_short(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    short            inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kShort, valueCount, rng, inputValues );
+
+    const int status = test_explicit_s2v_function_set( deviceID, context, queue, kShort, valueCount, inputValues );
+    return ( status != 0 ) ? -1 : 0;
+}
+
+/* Runs the scalar-to-vector cast tests with unsigned 16-bit input, first under
+   the kUShort type id and then under kUnsignedShort, over the same 128 random
+   values.  The second set is not run if the first fails.
+   Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_ushort(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    unsigned short    inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kUShort, valueCount, rng, inputValues );
+
+    int status = test_explicit_s2v_function_set( deviceID, context, queue, kUShort, valueCount, inputValues );
+    if( status == 0 )
+        status = test_explicit_s2v_function_set( deviceID, context, queue, kUnsignedShort, valueCount, inputValues );
+
+    return ( status != 0 ) ? -1 : 0;
+}
+
+/* Runs the scalar-to-vector cast tests with int as the source type over 128
+   randomly generated values.  Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_int(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    int                inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kInt, valueCount, rng, inputValues );
+
+    const int status = test_explicit_s2v_function_set( deviceID, context, queue, kInt, valueCount, inputValues );
+    return ( status != 0 ) ? -1 : 0;
+}
+
+/* Runs the scalar-to-vector cast tests with unsigned 32-bit input, first under
+   the kUInt type id and then under kUnsignedInt, over the same 128 random
+   values.  The second set is not run if the first fails.
+   Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_uint(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    unsigned int    inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kUInt, valueCount, rng, inputValues );
+
+    int status = test_explicit_s2v_function_set( deviceID, context, queue, kUInt, valueCount, inputValues );
+    if( status == 0 )
+        status = test_explicit_s2v_function_set( deviceID, context, queue, kUnsignedInt, valueCount, inputValues );
+
+    return ( status != 0 ) ? -1 : 0;
+}
+
+/* Runs the scalar-to-vector cast tests with cl_long as the source type over
+   128 randomly generated values.  Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_long(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    cl_long    inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kLong, valueCount, rng, inputValues );
+
+    const int status = test_explicit_s2v_function_set( deviceID, context, queue, kLong, valueCount, inputValues );
+    return ( status != 0 ) ? -1 : 0;
+}
+
+/* Runs the scalar-to-vector cast tests with cl_ulong input, first under the
+   kULong type id and then under kUnsignedLong, over the same 128 random values.
+   The second set is not run if the first fails.
+   Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_ulong(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    cl_ulong    inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kULong, valueCount, rng, inputValues );
+
+    int status = test_explicit_s2v_function_set( deviceID, context, queue, kULong, valueCount, inputValues );
+    if( status == 0 )
+        status = test_explicit_s2v_function_set( deviceID, context, queue, kUnsignedLong, valueCount, inputValues );
+
+    return ( status != 0 ) ? -1 : 0;
+}
+
+/* Runs the scalar-to-vector cast tests with float as the source type over 128
+   randomly generated values.  Returns 0 on success, -1 on failure. */
+int test_explicit_s2v_float(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    float            inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    generate_random_data( kFloat, valueCount, rng, inputValues );
+
+    const int status = test_explicit_s2v_function_set( deviceID, context, queue, kFloat, valueCount, inputValues );
+    return ( status != 0 ) ? -1 : 0;
+}
+
+
+/* Runs the scalar-to-vector cast tests with double as the source type over 128
+   randomly generated values.  Skipped (reported as success) when the device
+   does not expose cl_khr_fp64.  Returns 0 on success or skip, -1 on failure. */
+int test_explicit_s2v_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    const unsigned int valueCount = 128;
+    double            inputValues[valueCount];
+    RandomSeed rng(gRandomSeed);
+
+    const bool hasFp64 = is_extension_available( deviceID, "cl_khr_fp64" );
+    if( !hasFp64 )
+    {
+        log_info("Extension cl_khr_fp64 not supported. Skipping test.\n");
+        return 0;
+    }
+
+    generate_random_data( kDouble, valueCount, rng, inputValues );
+
+    const int status = test_explicit_s2v_function_set( deviceID, context, queue, kDouble, valueCount, inputValues );
+    return ( status != 0 ) ? -1 : 0;
+}
+
+
--- a/test_conformance/basic/test_float2int.c
+++ b/test_conformance/basic/test_float2int.c
@@ -0,0 +1,145 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+
+/* OpenCL C source of the test_float2int kernel: each work-item converts the
+   float at its global id in src to int with a C-style cast and stores the
+   result at the same index in dst. */
+const char *float2int_kernel_code =
+"__kernel void test_float2int(__global float *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = (int)src[tid];\n"
+"\n"
+"}\n";
+
+
+/* Checks the device results against the host's (int) cast of every input.
+   Logs a pass message and returns 0 when all n elements match; logs a failure
+   message and returns -1 at the first mismatch. */
+int
+verify_float2int(cl_float *inptr, cl_int *outptr, int n)
+{
+  int     idx = 0;
+
+  while (idx < n)
+  {
+    const int expected = (int)inptr[idx];
+
+    if (expected != outptr[idx])
+    {
+      log_error("FLOAT2INT test failed\n");
+      return -1;
+    }
+    ++idx;
+  }
+
+  log_info("FLOAT2INT test passed\n");
+  return 0;
+}
+
+
+/* Converts num_elements random floats to int on the device with the
+   test_float2int kernel and compares the result with the host conversion
+   (verify_float2int).
+
+   Returns 0 when the results verify and -1 on any failure.  Host buffers and
+   OpenCL objects created here are released on every exit path.  The device
+   parameter is not used. */
+int
+test_float2int(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    /* Everything is declared and initialised up front so that the jumps to the
+       cleanup label never bypass an initialisation (valid in both C and C++). */
+    cl_mem          streams[2] = { NULL, NULL };
+    cl_float        *input_ptr = NULL;
+    cl_int          *output_ptr = NULL;
+    cl_program      program = NULL;
+    cl_kernel       kernel = NULL;
+    size_t          threads[1];
+    int             err;
+    int             result = -1;
+    int             i;
+    MTdata          d;
+
+    input_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
+    output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
+    if (!input_ptr || !output_ptr)
+    {
+        log_error("malloc failed\n");
+        goto cleanup;
+    }
+
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_float) * num_elements, NULL, NULL);
+    if (!streams[0])
+    {
+        log_error("clCreateBuffer failed\n");
+        goto cleanup;
+    }
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeof(cl_int) * num_elements, NULL, NULL);
+    if (!streams[1])
+    {
+        log_error("clCreateBuffer failed\n");
+        goto cleanup;
+    }
+
+    /* Random inputs between -2^31 and 2^31 */
+    d = init_genrand( gRandomSeed );
+    for (i=0; i<num_elements; i++)
+        input_ptr[i] = get_random_float(-MAKE_HEX_FLOAT( 0x1.0p31f, 0x1, 31), MAKE_HEX_FLOAT( 0x1.0p31f, 0x1, 31), d);
+    free_mtdata(d); d = NULL;
+
+    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_float)*num_elements, (void *)input_ptr, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+        log_error("clEnqueueWriteBuffer failed\n");
+        goto cleanup;
+    }
+
+    err = create_single_kernel_helper(context, &program, &kernel, 1, &float2int_kernel_code, "test_float2int");
+    if (err != CL_SUCCESS)
+    {
+        log_error("create_single_kernel_helper failed\n");
+        /* NOTE(review): what the helper leaves in program/kernel after a
+           failure is not visible from this file, so do not release them here. */
+        program = NULL;
+        kernel = NULL;
+        goto cleanup;
+    }
+
+    /* Accumulate both results so a failure of the first call is not masked by
+       a successful second call. */
+    err  = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
+    err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1]);
+    if (err != CL_SUCCESS)
+    {
+        log_error("clSetKernelArgs failed\n");
+        goto cleanup;
+    }
+
+    threads[0] = (size_t)num_elements;
+    err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL );
+    if (err != CL_SUCCESS)
+    {
+        log_error("clEnqueueNDRangeKernel failed\n");
+        goto cleanup;
+    }
+
+    /* Blocking read: the data is ready for verification when this returns */
+    err = clEnqueueReadBuffer( queue, streams[1], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
+    if (err != CL_SUCCESS)
+    {
+        log_error("clEnqueueReadBuffer failed\n");
+        goto cleanup;
+    }
+
+    result = verify_float2int(input_ptr, output_ptr, num_elements);
+
+cleanup:
+    if (streams[0])
+        clReleaseMemObject(streams[0]);
+    if (streams[1])
+        clReleaseMemObject(streams[1]);
+    if (kernel)
+        clReleaseKernel(kernel);
+    if (program)
+        clReleaseProgram(program);
+    free(input_ptr);
+    free(output_ptr);
+
+    return result;
+}
+
+
+
+
+
--- a/test_conformance/basic/test_fpmath_float.c
+++ b/test_conformance/basic/test_fpmath_float.c
@@ -0,0 +1,271 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "../../test_common/harness/rounding_mode.h"
+
+#include "procs.h"
+
+/* OpenCL C source of test_fpadd: dst[tid] = srcA[tid] + srcB[tid] on floats,
+   one element per work-item. */
+static const char *fpadd_kernel_code =
+"__kernel void test_fpadd(__global float *srcA, __global float *srcB, __global float *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] + srcB[tid];\n"
+"}\n";
+
+/* OpenCL C source of test_fpsub: dst[tid] = srcA[tid] - srcB[tid] on floats,
+   one element per work-item. */
+static const char *fpsub_kernel_code =
+"__kernel void test_fpsub(__global float *srcA, __global float *srcB, __global float *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] - srcB[tid];\n"
+"}\n";
+
+/* OpenCL C source of test_fpmul: dst[tid] = srcA[tid] * srcB[tid] on floats,
+   one element per work-item. */
+static const char *fpmul_kernel_code =
+"__kernel void test_fpmul(__global float *srcA, __global float *srcB, __global float *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] * srcB[tid];\n"
+"}\n";
+
+
+/* NOTE(review): not referenced anywhere in this file -- the verify_fp*
+   functions compare for exact equality rather than against a tolerance. */
+static const float    MAX_ERR = 1e-5f;
+
+/* Host check for the add kernel: every outptr[k] must be exactly equal to
+   inptrA[k] + inptrB[k].  Logs and returns -1 at the first mismatch, otherwise
+   logs a pass message and returns 0. */
+static int
+verify_fpadd(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+
+    for (k = 0; k < n; ++k)
+    {
+        /* Hold the reference value in a float variable before comparing */
+        const float expected = inptrA[k] + inptrB[k];
+
+        if (expected == outptr[k])
+            continue;
+
+        log_error("FP_ADD float test failed\n");
+        return -1;
+    }
+
+    log_info("FP_ADD float test passed\n");
+    return 0;
+}
+
+/* Host check for the subtract kernel: every outptr[k] must be exactly equal to
+   inptrA[k] - inptrB[k].  Logs and returns -1 at the first mismatch, otherwise
+   logs a pass message and returns 0. */
+static int
+verify_fpsub(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+
+    for (k = 0; k < n; ++k)
+    {
+        /* Hold the reference value in a float variable before comparing */
+        const float expected = inptrA[k] - inptrB[k];
+
+        if (expected == outptr[k])
+            continue;
+
+        log_error("FP_SUB float test failed\n");
+        return -1;
+    }
+
+    log_info("FP_SUB float test passed\n");
+    return 0;
+}
+
+/* Host check for the multiply kernel: every outptr[k] must be exactly equal to
+   inptrA[k] * inptrB[k].  Logs and returns -1 at the first mismatch, otherwise
+   logs a pass message and returns 0. */
+static int
+verify_fpmul(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+
+    for (k = 0; k < n; ++k)
+    {
+        /* Hold the reference value in a float variable before comparing */
+        const float expected = inptrA[k] * inptrB[k];
+
+        if (expected == outptr[k])
+            continue;
+
+        log_error("FP_MUL float test failed\n");
+        return -1;
+    }
+
+    log_info("FP_MUL float test passed\n");
+    return 0;
+}
+
+
+/* Runs the float add, subtract and multiply kernels over num_elements random
+   inputs and checks each result for exact equality with the host computation.
+
+   If the device reports round-to-zero but not round-to-nearest for single
+   precision it must be an EMBEDDED_PROFILE device; in that case the host
+   rounding mode is switched to round-toward-zero around each verification so
+   the reference values are computed the same way.
+
+   Returns 0 when all three operations verify and a non-zero value otherwise. */
+int
+test_fpmath_float(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_mem streams[4];
+    cl_program program[3];
+    cl_kernel kernel[3];
+
+    float *input_ptr[3], *output_ptr, *p;
+    size_t threads[1];
+    int err, i, j;
+    MTdata d;
+    size_t length = sizeof(cl_float) * num_elements;
+    int isRTZ = 0;
+    RoundingMode oldMode = kDefaultRoundingMode;
+
+    // check for floating point capabilities
+    cl_device_fp_config single_config = 0;
+    err = clGetDeviceInfo( device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( single_config ), &single_config, NULL );
+    if (err) {
+      log_error("clGetDeviceInfo for CL_DEVICE_SINGLE_FP_CONFIG failed: %d\n", err);
+      test_finish();
+      return -1;
+    }
+    //If we only support rtz mode
+    if( CL_FP_ROUND_TO_ZERO == ( single_config & (CL_FP_ROUND_TO_ZERO|CL_FP_ROUND_TO_NEAREST) ) )
+    {
+        //Check to make sure we are an embedded device
+        char profile[32];
+        err = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL);
+        if( err )
+        {
+            log_error("clGetDeviceInfo for CL_DEVICE_PROFILE failed: %d\n", err);
+            test_finish();
+            return -1;
+        }
+        if( 0 != strcmp( profile, "EMBEDDED_PROFILE"))
+        {
+            log_error( "FAILURE:  Device doesn't support CL_FP_ROUND_TO_NEAREST and isn't EMBEDDED_PROFILE\n" );
+            test_finish();
+            return -1;
+        }
+
+        isRTZ = 1;
+        oldMode = get_round();
+    }
+
+
+    input_ptr[0] = (cl_float*)malloc(length);
+    input_ptr[1] = (cl_float*)malloc(length);
+    input_ptr[2] = (cl_float*)malloc(length);
+    output_ptr   = (cl_float*)malloc(length);
+    if (!input_ptr[0] || !input_ptr[1] || !input_ptr[2] || !output_ptr)
+    {
+        log_error("malloc failed\n");
+        free(input_ptr[0]);
+        free(input_ptr[1]);
+        free(input_ptr[2]);
+        free(output_ptr);
+        return -1;
+    }
+
+    // The generator is created only after the capability checks and released
+    // as soon as the inputs are filled, so no early return can leak it.
+    // Inputs are filled in order 0, 1, 2 with values between -2^31 and 2^31.
+    d = init_genrand( gRandomSeed );
+    for (j=0; j<3; j++)
+    {
+        p = input_ptr[j];
+        for (i=0; i<num_elements; i++)
+            p[i] = get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), d);
+    }
+    free_mtdata( d );
+
+    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), length, NULL, &err);
+    test_error( err, "clCreateBuffer failed.");
+    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), length, NULL, &err);
+    test_error( err, "clCreateBuffer failed.");
+    streams[2] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), length, NULL, &err);
+    test_error( err, "clCreateBuffer failed.");
+    streams[3] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), length, NULL, &err);
+    test_error( err, "clCreateBuffer failed.");
+
+    err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, length, input_ptr[0], 0, NULL, NULL);
+    test_error( err, "clEnqueueWriteBuffer failed.");
+
+    err = clEnqueueWriteBuffer(queue, streams[1], CL_TRUE, 0, length, input_ptr[1], 0, NULL, NULL);
+    test_error( err, "clEnqueueWriteBuffer failed.");
+
+    // streams[2] is uploaded but none of the three kernels below binds it
+    err = clEnqueueWriteBuffer(queue, streams[2], CL_TRUE, 0, length, input_ptr[2], 0, NULL, NULL);
+    test_error( err, "clEnqueueWriteBuffer failed.");
+
+    err = create_single_kernel_helper(context, &program[0], &kernel[0], 1, &fpadd_kernel_code, "test_fpadd");
+    test_error( err, "create_single_kernel_helper failed");
+
+    err = create_single_kernel_helper(context, &program[1], &kernel[1], 1, &fpsub_kernel_code, "test_fpsub");
+    test_error( err, "create_single_kernel_helper failed");
+
+    err = create_single_kernel_helper(context, &program[2], &kernel[2], 1, &fpmul_kernel_code, "test_fpmul");
+    test_error( err, "create_single_kernel_helper failed");
+
+    // Every kernel takes (srcA, srcB, dst) = (streams[0], streams[1], streams[3])
+    for (i=0; i<3; i++)
+    {
+        err  = clSetKernelArg(kernel[i], 0, sizeof streams[0], &streams[0]);
+        err |= clSetKernelArg(kernel[i], 1, sizeof streams[1], &streams[1]);
+        err |= clSetKernelArg(kernel[i], 2, sizeof streams[3], &streams[3]);
+        test_error( err, "clSetKernelArgs failed.");
+    }
+
+    threads[0] = (size_t)num_elements;
+    for (i=0; i<3; i++)
+    {
+        err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, threads, NULL, 0, NULL, NULL);
+        test_error( err, "clEnqueueNDRangeKernel failed.");
+
+        err = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, length, output_ptr, 0, NULL, NULL);
+        test_error( err, "clEnqueueReadBuffer failed.");
+
+        // Compute the host reference with the same rounding as the device
+        if( isRTZ )
+            set_round( kRoundTowardZero, kfloat );
+
+        switch (i)
+        {
+            case 0:
+                err = verify_fpadd(input_ptr[0], input_ptr[1], output_ptr, num_elements);
+                break;
+            case 1:
+                err = verify_fpsub(input_ptr[0], input_ptr[1], output_ptr, num_elements);
+                break;
+            case 2:
+                err = verify_fpmul(input_ptr[0], input_ptr[1], output_ptr, num_elements);
+                break;
+        }
+
+        if( isRTZ )
+            set_round( oldMode, kfloat );
+
+        if (err)
+            break;
+    }
+
+    // cleanup
+    clReleaseMemObject(streams[0]);
+    clReleaseMemObject(streams[1]);
+    clReleaseMemObject(streams[2]);
+    clReleaseMemObject(streams[3]);
+    for (i=0; i<3; i++)
+    {
+        clReleaseKernel(kernel[i]);
+        clReleaseProgram(program[i]);
+    }
+    free(input_ptr[0]);
+    free(input_ptr[1]);
+    free(input_ptr[2]);
+    free(output_ptr);
+
+    return err;
+}
+
+
--- a/test_conformance/basic/test_fpmath_float2.c
+++ b/test_conformance/basic/test_fpmath_float2.c
@@ -0,0 +1,269 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "../../test_common/harness/rounding_mode.h"
+
+
+#include "procs.h"
+
+/* OpenCL C source of test_fpadd2: component-wise dst[tid] = srcA[tid] + srcB[tid]
+   on float2 values, one vector per work-item. */
+const char *fpadd2_kernel_code =
+"__kernel void test_fpadd2(__global float2 *srcA, __global float2 *srcB, __global float2 *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] + srcB[tid];\n"
+"}\n";
+
+/* OpenCL C source of test_fpsub2: component-wise dst[tid] = srcA[tid] - srcB[tid]
+   on float2 values, one vector per work-item. */
+const char *fpsub2_kernel_code =
+"__kernel void test_fpsub2(__global float2 *srcA, __global float2 *srcB, __global float2 *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] - srcB[tid];\n"
+"}\n";
+
+/* OpenCL C source of test_fpmul2: component-wise dst[tid] = srcA[tid] * srcB[tid]
+   on float2 values, one vector per work-item. */
+const char *fpmul2_kernel_code =
+"__kernel void test_fpmul2(__global float2 *srcA, __global float2 *srcB, __global float2 *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] * srcB[tid];\n"
+"}\n";
+
+
+/* Host check for the float2 add kernel.  The buffers are treated as n plain
+   floats (presumably the caller passes the total component count -- confirm at
+   the call site): every outptr[k] must be exactly inptrA[k] + inptrB[k].
+   Logs and returns -1 at the first mismatch, otherwise logs a pass message and
+   returns 0. */
+int
+verify_fpadd2(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+
+    for (k = 0; k < n; ++k)
+    {
+        /* Hold the reference value in a float variable before comparing */
+        const float expected = inptrA[k] + inptrB[k];
+
+        if (expected == outptr[k])
+            continue;
+
+        log_error("FP_ADD float2 test failed\n");
+        return -1;
+    }
+
+    log_info("FP_ADD float2 test passed\n");
+    return 0;
+}
+
+/* Host check for the float2 subtract kernel.  The buffers are treated as n
+   plain floats (presumably the caller passes the total component count --
+   confirm at the call site): every outptr[k] must be exactly
+   inptrA[k] - inptrB[k].  Logs and returns -1 at the first mismatch, otherwise
+   logs a pass message and returns 0. */
+int
+verify_fpsub2(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+
+    for (k = 0; k < n; ++k)
+    {
+        /* Hold the reference value in a float variable before comparing */
+        const float expected = inptrA[k] - inptrB[k];
+
+        if (expected == outptr[k])
+            continue;
+
+        log_error("FP_SUB float2 test failed\n");
+        return -1;
+    }
+
+    log_info("FP_SUB float2 test passed\n");
+    return 0;
+}
+
+/* Host check for the float2 multiply kernel.  The buffers are treated as n
+   plain floats (presumably the caller passes the total component count --
+   confirm at the call site): every outptr[k] must be exactly
+   inptrA[k] * inptrB[k].  Logs and returns -1 at the first mismatch, otherwise
+   logs a pass message and returns 0. */
+int
+verify_fpmul2(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+
+    for (k = 0; k < n; ++k)
+    {
+        /* Hold the reference value in a float variable before comparing */
+        const float expected = inptrA[k] * inptrB[k];
+
+        if (expected == outptr[k])
+            continue;
+
+        log_error("FP_MUL float2 test failed\n");
+        return -1;
+    }
+
+    log_info("FP_MUL float2 test passed\n");
+    return 0;
+}
+
+
+// Runs the float2 add, subtract and multiply kernels on random inputs and
+// compares every component of each result with the same operation computed on
+// the host. When the device reports round-to-zero without round-to-nearest
+// (and is EMBEDDED_PROFILE), the host reference is computed with the rounding
+// mode switched to kRoundTowardZero.
+//
+// num_elements is the number of float2 work-items, so every buffer holds
+// 2 * num_elements floats. Returns 0 on success and non-zero on any failure.
+//
+// NOTE(review): test_error appears to return from this function on failure;
+// buffers and CL objects created before such an exit are not released.
+int
+test_fpmath_float2(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_mem streams[4];
+    cl_program program[3];
+    cl_kernel kernel[3];
+
+    cl_float *input_ptr[3], *output_ptr, *p;
+    size_t threads[1];
+    int err, i, j;
+    MTdata d;
+
+    size_t length = sizeof(cl_float) * 2 * num_elements;
+    int isRTZ = 0;
+    RoundingMode oldMode = kDefaultRoundingMode;
+
+    // check for floating point capabilities
+    cl_device_fp_config single_config = 0;
+    err = clGetDeviceInfo( device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( single_config ), &single_config, NULL );
+    if (err) {
+        log_error("clGetDeviceInfo for CL_DEVICE_SINGLE_FP_CONFIG failed: %d\n", err);
+        test_finish();
+        return -1;
+    }
+    //If we only support rtz mode
+    if( CL_FP_ROUND_TO_ZERO == ( single_config & (CL_FP_ROUND_TO_ZERO|CL_FP_ROUND_TO_NEAREST) ) )
+    {
+        //Check to make sure we are an embedded device
+        char profile[32];
+        err = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL);
+        if( err )
+        {
+            log_error("clGetDeviceInfo for CL_DEVICE_PROFILE failed: %d\n", err);
+            test_finish();
+            return -1;
+        }
+        if( 0 != strcmp( profile, "EMBEDDED_PROFILE"))
+        {
+            log_error( "FAILURE:  Device doesn't support CL_FP_ROUND_TO_NEAREST and isn't EMBEDDED_PROFILE\n" );
+            test_finish();
+            return -1;
+        }
+
+        isRTZ = 1;
+        oldMode = get_round();
+    }
+
+    input_ptr[0] = (cl_float*)malloc(length);
+    input_ptr[1] = (cl_float*)malloc(length);
+    input_ptr[2] = (cl_float*)malloc(length);
+    output_ptr   = (cl_float*)malloc(length);
+    if (!input_ptr[0] || !input_ptr[1] || !input_ptr[2] || !output_ptr)
+    {
+        log_error("Failed to allocate host buffers for the float2 test\n");
+        free(input_ptr[0]);
+        free(input_ptr[1]);
+        free(input_ptr[2]);
+        free(output_ptr);
+        return -1;
+    }
+
+    for (j=0; j<4; j++)
+    {
+        streams[j] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), length, NULL, &err);
+        test_error( err, "clCreateBuffer failed.");
+    }
+
+    // The generator is created only here and released right after the data is
+    // produced, so none of the early exits above can leak it.
+    d = init_genrand( gRandomSeed );
+    for (j=0; j<3; j++)
+    {
+        p = input_ptr[j];
+        for (i=0; i<num_elements*2; i++)
+            p[i] = get_random_float(-MAKE_HEX_FLOAT( 0x1.0p31f, 0x1, 31), MAKE_HEX_FLOAT( 0x1.0p31f, 0x1, 31), d);
+    }
+    free_mtdata(d);
+    d = NULL;
+
+    // input_ptr[2] / streams[2] are filled and uploaded but never bound to a
+    // kernel argument below.
+    for (j=0; j<3; j++)
+    {
+        err = clEnqueueWriteBuffer(queue, streams[j], CL_TRUE, 0, length, input_ptr[j], 0, NULL, NULL);
+        test_error(err, "clEnqueueWriteBuffer failed");
+    }
+
+    err = create_single_kernel_helper(context, &program[0], &kernel[0], 1, &fpadd2_kernel_code, "test_fpadd2");
+    test_error( err, "create_single_kernel_helper failed");
+
+    err = create_single_kernel_helper(context, &program[1], &kernel[1], 1, &fpsub2_kernel_code, "test_fpsub2");
+    test_error( err, "create_single_kernel_helper failed");
+
+    err = create_single_kernel_helper(context, &program[2], &kernel[2], 1, &fpmul2_kernel_code, "test_fpmul2");
+    test_error( err, "create_single_kernel_helper failed");
+
+    // Every kernel reads streams[0] and streams[1] and writes streams[3].
+    for (j=0; j<3; j++)
+    {
+        err  = clSetKernelArg(kernel[j], 0, sizeof streams[0], &streams[0]);
+        err |= clSetKernelArg(kernel[j], 1, sizeof streams[1], &streams[1]);
+        err |= clSetKernelArg(kernel[j], 2, sizeof streams[3], &streams[3]);
+        test_error( err, "clSetKernelArgs failed.");
+    }
+
+    threads[0] = (unsigned int)num_elements;
+    for (i=0; i<3; i++)
+    {
+        err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, threads, NULL, 0, NULL, NULL);
+        test_error( err, "clEnqueueNDRangeKernel failed.");
+
+        err = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, length, output_ptr, 0, NULL, NULL);
+        test_error( err, "clEnqueueReadBuffer failed.");
+
+        // Compute the host reference under round-toward-zero on RTZ-only devices.
+        if( isRTZ )
+            set_round( kRoundTowardZero, kfloat );
+
+        switch (i)
+        {
+            case 0:
+                err = verify_fpadd2(input_ptr[0], input_ptr[1], output_ptr, num_elements*2);
+                break;
+            case 1:
+                err = verify_fpsub2(input_ptr[0], input_ptr[1], output_ptr, num_elements*2);
+                break;
+            case 2:
+                err = verify_fpmul2(input_ptr[0], input_ptr[1], output_ptr, num_elements*2);
+                break;
+        }
+
+        if( isRTZ )
+            set_round( oldMode, kfloat );
+
+        if (err)
+            break;
+    }
+
+
+    // cleanup
+    clReleaseMemObject(streams[0]);
+    clReleaseMemObject(streams[1]);
+    clReleaseMemObject(streams[2]);
+    clReleaseMemObject(streams[3]);
+    for (i=0; i<3; i++)
+    {
+        clReleaseKernel(kernel[i]);
+        clReleaseProgram(program[i]);
+    }
+    free(input_ptr[0]);
+    free(input_ptr[1]);
+    free(input_ptr[2]);
+    free(output_ptr);
+    return err;
+}
+
+
--- a/test_conformance/basic/test_fpmath_float4.c
+++ b/test_conformance/basic/test_fpmath_float4.c
@@ -0,0 +1,270 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "../../test_common/harness/compat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+
+#include "procs.h"
+#include "../../test_common/harness/rounding_mode.h"
+
+// OpenCL C source for test_fpadd4: each work-item adds the float4 at its
+// global id in srcA to the one in srcB and stores the sum in dst.
+const char *fpadd4_kernel_code =
+"__kernel void test_fpadd4(__global float4 *srcA, __global float4 *srcB, __global float4 *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] + srcB[tid];\n"
+"}\n";
+
+// OpenCL C source for test_fpsub4: each work-item stores
+// srcA[tid] - srcB[tid] (float4) in dst[tid].
+const char *fpsub4_kernel_code =
+"__kernel void test_fpsub4(__global float4 *srcA, __global float4 *srcB, __global float4 *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] - srcB[tid];\n"
+"}\n";
+
+// OpenCL C source for test_fpmul4: each work-item stores the component-wise
+// product srcA[tid] * srcB[tid] (float4) in dst[tid].
+const char *fpmul4_kernel_code =
+"__kernel void test_fpmul4(__global float4 *srcA, __global float4 *srcB, __global float4 *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = srcA[tid] * srcB[tid];\n"
+"}\n";
+
+
+// Host-side check for the float4 add kernel: outptr[k] must be exactly equal
+// to inptrA[k] + inptrB[k] for all n floats. Returns 0 on a full match and -1
+// at the first mismatch.
+int
+verify_fpadd4(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+    float       expected;
+
+    for (k = 0; k < n; ++k)
+    {
+        expected = inptrA[k] + inptrB[k];
+        if (expected == outptr[k])
+            continue;
+
+        log_error("FP_ADD float4 test failed\n");
+        return -1;
+    }
+
+    log_info("FP_ADD float4 test passed\n");
+    return 0;
+}
+
+// Host-side check for the float4 subtract kernel: outptr[pos] must be exactly
+// equal to inptrA[pos] - inptrB[pos] for all n floats. Returns 0 on a full
+// match and -1 at the first mismatch.
+int
+verify_fpsub4(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int pos = 0;
+
+    while (pos < n)
+    {
+        // Hold the reference difference in a float variable before comparing.
+        float expected = inptrA[pos] - inptrB[pos];
+
+        if (expected != outptr[pos])
+        {
+            log_error("FP_SUB float4 test failed\n");
+            return -1;
+        }
+        ++pos;
+    }
+
+    log_info("FP_SUB float4 test passed\n");
+    return 0;
+}
+
+// Host-side check for the float4 multiply kernel: outptr[k] must be exactly
+// equal to inptrA[k] * inptrB[k] for all n floats. Returns 0 on a full match
+// and -1 at the first mismatch.
+int
+verify_fpmul4(float *inptrA, float *inptrB, float *outptr, int n)
+{
+    int         k;
+    float       product;
+
+    for (k = 0; k < n; ++k)
+    {
+        product = inptrA[k] * inptrB[k];
+        if (product == outptr[k])
+            continue;
+
+        log_error("FP_MUL float4 test failed\n");
+        return -1;
+    }
+
+    log_info("FP_MUL float4 test passed\n");
+    return 0;
+}
+
+
+// Runs the float4 add, subtract and multiply kernels on random inputs and
+// compares every component of each result with the same operation computed on
+// the host. When the device reports round-to-zero without round-to-nearest
+// (and is EMBEDDED_PROFILE), the host reference is computed with the rounding
+// mode switched to kRoundTowardZero.
+//
+// num_elements is the number of float4 work-items, so every buffer holds
+// 4 * num_elements floats. Returns 0 on success and non-zero on any failure.
+//
+// NOTE(review): test_error appears to return from this function on failure;
+// buffers and CL objects created before such an exit are not released.
+int
+test_fpmath_float4(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    cl_mem streams[4];
+    cl_program program[3];
+    cl_kernel kernel[3];
+
+    cl_float *input_ptr[3], *output_ptr, *p;
+    size_t threads[1];
+    int err, i, j;
+    MTdata d;
+
+    size_t length = sizeof(cl_float) * 4 * num_elements;
+    int isRTZ = 0;
+    RoundingMode oldMode = kDefaultRoundingMode;
+
+    // check for floating point capabilities
+    cl_device_fp_config single_config = 0;
+    err = clGetDeviceInfo( device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( single_config ), &single_config, NULL );
+    if (err) {
+        log_error("clGetDeviceInfo for CL_DEVICE_SINGLE_FP_CONFIG failed: %d\n", err);
+        test_finish();
+        return -1;
+    }
+    //If we only support rtz mode
+    if( CL_FP_ROUND_TO_ZERO == ( single_config & (CL_FP_ROUND_TO_ZERO|CL_FP_ROUND_TO_NEAREST) ) )
+    {
+        //Check to make sure we are an embedded device
+        char profile[32];
+        err = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL);
+        if( err )
+        {
+            log_error("clGetDeviceInfo for CL_DEVICE_PROFILE failed: %d\n", err);
+            test_finish();
+            return -1;
+        }
+        if( 0 != strcmp( profile, "EMBEDDED_PROFILE"))
+        {
+            log_error( "FAILURE:  Device doesn't support CL_FP_ROUND_TO_NEAREST and isn't EMBEDDED_PROFILE\n" );
+            test_finish();
+            return -1;
+        }
+
+        isRTZ = 1;
+        oldMode = get_round();
+    }
+
+    input_ptr[0] = (cl_float*)malloc(length);
+    input_ptr[1] = (cl_float*)malloc(length);
+    input_ptr[2] = (cl_float*)malloc(length);
+    output_ptr   = (cl_float*)malloc(length);
+    if (!input_ptr[0] || !input_ptr[1] || !input_ptr[2] || !output_ptr)
+    {
+        log_error("Failed to allocate host buffers for the float4 test\n");
+        free(input_ptr[0]);
+        free(input_ptr[1]);
+        free(input_ptr[2]);
+        free(output_ptr);
+        return -1;
+    }
+
+    for (j=0; j<4; j++)
+    {
+        streams[j] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), length, NULL, &err);
+        test_error( err, "clCreateBuffer failed.");
+    }
+
+    // The generator is created only here and released right after the data is
+    // produced, so none of the early exits above can leak it.
+    d = init_genrand( gRandomSeed );
+    for (j=0; j<3; j++)
+    {
+        p = input_ptr[j];
+        for (i=0; i<num_elements*4; i++)
+            p[i] = get_random_float(-MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), MAKE_HEX_FLOAT(0x1.0p31f, 0x1, 31), d);
+    }
+    free_mtdata(d);
+    d = NULL;
+
+    // input_ptr[2] / streams[2] are filled and uploaded but never bound to a
+    // kernel argument below.
+    for (j=0; j<3; j++)
+    {
+        err = clEnqueueWriteBuffer(queue, streams[j], CL_TRUE, 0, length, input_ptr[j], 0, NULL, NULL);
+        test_error(err, "clEnqueueWriteBuffer failed");
+    }
+
+    err = create_single_kernel_helper(context, &program[0], &kernel[0], 1, &fpadd4_kernel_code, "test_fpadd4");
+    test_error( err, "create_single_kernel_helper failed");
+
+    err = create_single_kernel_helper(context, &program[1], &kernel[1], 1, &fpsub4_kernel_code, "test_fpsub4");
+    test_error( err, "create_single_kernel_helper failed");
+
+    err = create_single_kernel_helper(context, &program[2], &kernel[2], 1, &fpmul4_kernel_code, "test_fpmul4");
+    test_error( err, "create_single_kernel_helper failed");
+
+    // Every kernel reads streams[0] and streams[1] and writes streams[3].
+    for (j=0; j<3; j++)
+    {
+        err  = clSetKernelArg(kernel[j], 0, sizeof streams[0], &streams[0]);
+        err |= clSetKernelArg(kernel[j], 1, sizeof streams[1], &streams[1]);
+        err |= clSetKernelArg(kernel[j], 2, sizeof streams[3], &streams[3]);
+        test_error( err, "clSetKernelArgs failed.");
+    }
+
+    threads[0] = (unsigned int)num_elements;
+    for (i=0; i<3; i++)
+    {
+        err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, threads, NULL, 0, NULL, NULL);
+        test_error( err, "clEnqueueNDRangeKernel failed.");
+
+        err = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, length, output_ptr, 0, NULL, NULL);
+        test_error( err, "clEnqueueReadBuffer failed.");
+
+        // Compute the host reference under round-toward-zero on RTZ-only devices.
+        if( isRTZ )
+            set_round( kRoundTowardZero, kfloat );
+
+        switch (i)
+        {
+            case 0:
+                err = verify_fpadd4(input_ptr[0], input_ptr[1], output_ptr, num_elements*4);
+                break;
+            case 1:
+                err = verify_fpsub4(input_ptr[0], input_ptr[1], output_ptr, num_elements*4);
+                break;
+            case 2:
+                err = verify_fpmul4(input_ptr[0], input_ptr[1], output_ptr, num_elements*4);
+                break;
+        }
+
+        if( isRTZ )
+            set_round( oldMode, kfloat );
+
+        if (err)
+            break;
+    }
+
+
+    // cleanup
+    clReleaseMemObject(streams[0]);
+    clReleaseMemObject(streams[1]);
+    clReleaseMemObject(streams[2]);
+    clReleaseMemObject(streams[3]);
+    for (i=0; i<3; i++)
+    {
+        clReleaseKernel(kernel[i]);
+        clReleaseProgram(program[i]);
+    }
+    free(input_ptr[0]);
+    free(input_ptr[1]);
+    free(input_ptr[2]);
+    free(output_ptr);
+    return err;
+}
+
+
--- a/test_conformance/basic/test_get_linear_ids.cpp
+++ b/test_conformance/basic/test_get_linear_ids.cpp
@@ -0,0 +1,191 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#include "procs.h"
+#include <ctype.h>
+
+// OpenCL C source for test_linear_ids. Each work-item recomputes its linear
+// global id (with the global offset subtracted) and its linear local id by
+// hand for the current get_work_dim(), then stores in out[gid].x whether the
+// global value equals get_global_linear_id() and in out[gid].y whether the
+// local value equals get_local_linear_id(). The host side treats 1 in both
+// components as a pass.
+static const char *linear_ids_source[1] = {
+"__kernel void test_linear_ids(__global int2 *out)\n"
+"{\n"
+"    size_t lid, gid;\n"
+"    uint d = get_work_dim();\n"
+"    if (d == 1U) {\n"
+"        gid = get_global_id(0) - get_global_offset(0);\n"
+"        lid = get_local_id(0);\n"
+"    } else if (d == 2U) {\n"
+"        gid = (get_global_id(1) - get_global_offset(1)) * get_global_size(0) +\n"
+"              (get_global_id(0) - get_global_offset(0));\n"
+"        lid = get_local_id(1) * get_local_size(0) + get_local_id(0);\n"
+"    } else {\n"
+"        gid = ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) +\n"
+"               (get_global_id(1) - get_global_offset(1))) * get_global_size(0) +\n"
+"               (get_global_id(0) - get_global_offset(0));\n"
+"        lid = (get_local_id(2) * get_local_size(1) +\n"
+"               get_local_id(1)) * get_local_size(0) + get_local_id(0);\n"
+"    }\n"
+"    out[gid].x = gid == get_global_linear_id();\n"
+"    out[gid].y = lid == get_local_linear_id();\n"
+"}\n"
+};
+
+// Tuning constants for test_get_linear_ids.
+// NUM_ITER:   number of launches; the dimensionality is iter % 3 + 1.
+// MAX_1D:     sizes the host result array (2 * MAX_1D cl_ints) and sets the
+//             range from which the 1D global size is drawn.
+// MAX_2D:     sets the range for each dimension of the 2D global size.
+// MAX_3D:     sets the range for each dimension of the 3D global size.
+// MAX_OFFSET: upper argument passed to random_in_range for each global offset.
+#define NUM_ITER 12
+#define MAX_1D 4096
+#define MAX_2D 64
+#define MAX_3D 16
+#define MAX_OFFSET 100000
+
+// Checks get_global_linear_id() and get_local_linear_id() against values the
+// kernel recomputes by hand, for NUM_ITER launches that cycle through 1D, 2D
+// and 3D NDRanges with random global sizes and random global offsets.
+//
+// The kernel is built with -cl-std=CL2.0. num_elements is not used.
+// Returns 0 when every work-item reported a match, non-zero otherwise.
+int
+test_get_linear_ids(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
+{
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    clMemWrapper outbuf;
+    int error, iter, i, j, k;
+    size_t lws[3], gws[3], gwo[3];
+    cl_uint dims;
+    cl_int outmem[2*MAX_1D], *om;
+
+
+    // Create the kernel
+    error = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, linear_ids_source, "test_linear_ids", "-cl-std=CL2.0");
+    if (error)
+        return error;
+
+    // Create the out buffer
+    outbuf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(outmem), NULL, &error);
+    test_error(error, "failed to create result buffer\n");
+
+    // Released on the normal exit and on every verification failure below.
+    // NOTE(review): the test_error exits presumably still skip the release.
+    MTdata seed = init_genrand(gRandomSeed);
+
+    // Run some tests
+    for (iter=0; iter<NUM_ITER; ++iter) {
+        dims = iter % 3 + 1;
+
+        switch (dims) {
+        case 1:
+            gwo[0] = random_in_range(0, MAX_OFFSET, seed);
+            gws[0] = random_in_range(MAX_1D/8, MAX_1D/4, seed)*4;
+            error = get_max_common_work_group_size(context, kernel, gws[0], lws);
+            break;
+        case 2:
+            gwo[0] = random_in_range(0, MAX_OFFSET, seed);
+            gwo[1] = random_in_range(0, MAX_OFFSET, seed);
+            gws[0] = random_in_range(MAX_2D/8, MAX_2D/4, seed)*4;
+            gws[1] = random_in_range(MAX_2D/8, MAX_2D/4, seed)*4;
+            error = get_max_common_2D_work_group_size(context, kernel, gws, lws);
+            break;
+        case 3:
+            gwo[0] = random_in_range(0, MAX_OFFSET, seed);
+            gwo[1] = random_in_range(0, MAX_OFFSET, seed);
+            gwo[2] = random_in_range(0, MAX_OFFSET, seed);
+            gws[0] = random_in_range(MAX_3D/4, MAX_3D/2, seed)*2;
+            gws[1] = random_in_range(MAX_3D/4, MAX_3D/2, seed)*2;
+            gws[2] = random_in_range(MAX_3D/4, MAX_3D/2, seed)*2;
+            error = get_max_common_3D_work_group_size(context, kernel, gws, lws);
+            break;
+        }
+
+        test_error(error, "Failed to determine local work size\n");
+
+
+        // The sizes are size_t; cast them so the arguments match the %u
+        // conversions on platforms where size_t is wider than unsigned int.
+        switch (dims) {
+        case 1:
+            log_info("  testing offset=%u global=%u local=%u...\n",
+                    (unsigned int)gwo[0], (unsigned int)gws[0], (unsigned int)lws[0]);
+            break;
+        case 2:
+            log_info("  testing offset=(%u,%u) global=(%u,%u) local=(%u,%u)...\n",
+                    (unsigned int)gwo[0], (unsigned int)gwo[1],
+                    (unsigned int)gws[0], (unsigned int)gws[1],
+                    (unsigned int)lws[0], (unsigned int)lws[1]);
+            break;
+        case 3:
+            log_info("  testing offset=(%u,%u,%u) global=(%u,%u,%u) local=(%u,%u,%u)...\n",
+                    (unsigned int)gwo[0], (unsigned int)gwo[1], (unsigned int)gwo[2],
+                    (unsigned int)gws[0], (unsigned int)gws[1], (unsigned int)gws[2],
+                    (unsigned int)lws[0], (unsigned int)lws[1], (unsigned int)lws[2]);
+            break;
+        }
+
+        // Set up and run
+        memset(outmem, 0, sizeof(outmem));
+
+        error = clSetKernelArg(kernel, 0, sizeof(outbuf), (void *)&outbuf);
+        test_error(error, "clSetKernelArg failed\n");
+
+        error = clEnqueueWriteBuffer(queue, outbuf, CL_FALSE, 0, sizeof(outmem), (void *)outmem, 0, NULL, NULL);
+        test_error(error, "clEnqueueWriteBuffer failed\n");
+
+        error = clEnqueueNDRangeKernel(queue, kernel, dims, gwo, gws, lws, 0, NULL, NULL);
+        test_error(error, "clEnqueueNDRangeKernel failed\n");
+
+        error = clEnqueueReadBuffer(queue, outbuf, CL_FALSE, 0, sizeof(outmem), (void *)outmem, 0, NULL, NULL);
+        test_error(error, "clEnqueueReadBuffer failed\n");
+
+        error = clFinish(queue);
+        test_error(error, "clFinish failed\n");
+
+        // Check the return: results are laid out by linear global id, two
+        // cl_ints (global match, local match) per work-item.
+        switch (dims) {
+        case 1:
+            for (i=0, om=outmem; i<(int)gws[0]; ++i, om+=2) {
+                if (om[0] != 1) {
+                    log_error("get_global_linear_id() failed at %d\n", i);
+                    free_mtdata(seed);
+                    return -1;
+                }
+                if (om[1] != 1) {
+                    log_error("get_local_linear_id() failed at (%d, %d)\n", i % (int)lws[0], i / (int)lws[0]);
+                    free_mtdata(seed);
+                    return -1;
+                }
+            }
+            break;
+        case 2:
+            for (j=0, om=outmem; j<(int)gws[1]; ++j) {
+                for (i=0; i<(int)gws[0]; ++i, om+=2) {
+                    if (om[0] != 1) {
+                        log_error("get_global_linear_id() failed at (%d,%d)\n", i, j);
+                        free_mtdata(seed);
+                        return -1;
+                    }
+                    if (om[1] != 1) {
+                        log_error("get_local_linear_id() failed at (%d, %d), (%d, %d)\n",
+                                i % (int)lws[0], j % (int)lws[1],
+                                i / (int)lws[0], j / (int)lws[1]);
+                        free_mtdata(seed);
+                        return -1;
+                    }
+                }
+            }
+            break;
+        case 3:
+            for (k=0, om=outmem; k<(int)gws[2]; ++k) {
+                for (j=0; j<(int)gws[1]; ++j) {
+                    for (i=0; i<(int)gws[0]; ++i, om+=2) {
+                        if (om[0] != 1) {
+                            log_error("get_global_linear_id() failed at (%d,%d, %d)\n", i, j, k);
+                            free_mtdata(seed);
+                            return -1;
+                        }
+                        if (om[1] != 1) {
+                            // First triple: position inside the work-group;
+                            // second triple: work-group index.
+                            log_error("get_local_linear_id() failed at (%d, %d, %d), (%d, %d, %d)\n",
+                                    i % (int)lws[0], j % (int)lws[1], k % (int)lws[2],
+                                    i / (int)lws[0], j / (int)lws[1], k / (int)lws[2]);
+                            free_mtdata(seed);
+                            return -1;
+                        }
+                    }
+                }
+            }
+            break;
+        }
+
+    }
+
+    free_mtdata(seed);
+    return 0;
+}
+
--- a/Show More
+++ b/Show More