Initial open source release of OpenCL 2.0 CTS.

This commit is contained in:
Kedar Patil
2017-05-16 18:50:35 +05:30
parent 6911ba5116
commit 3a440d17c8
883 changed files with 318212 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
# Build configuration for the OpenCL 2.0 SVM (Shared Virtual Memory)
# conformance-test module. Test sources plus the shared harness sources
# are compiled into a single test executable via CMakeCommon.txt.
set(MODULE_NAME SVM)

set(${MODULE_NAME}_SOURCES
    main.cpp
    test_allocate_shared_buffer.cpp
    test_byte_granularity.cpp
    test_cross_buffer_pointers.cpp
    test_enqueue_api.cpp
    test_fine_grain_memory_consistency.cpp
    test_fine_grain_sync_buffers.cpp
    test_pointer_passing.cpp
    test_set_kernel_exec_info_svm_ptrs.cpp
    test_shared_address_space_coarse_grain.cpp
    test_shared_address_space_fine_grain.cpp
    test_shared_address_space_fine_grain_buffers.cpp
    test_shared_sub_buffers.cpp
    # Shared harness code, built into every test module.
    ../../test_common/harness/testHarness.c
    ../../test_common/harness/errorHelpers.c
    ../../test_common/harness/kernelHelpers.c
    ../../test_common/harness/mt19937.c
    ../../test_common/harness/msvc9.c
)

include(../CMakeCommon.txt)

View File

@@ -0,0 +1,53 @@
# Standalone Makefile (Apple toolchain) for the OpenCL 2.0 SVM tests.
# Define BUILD_WITH_ATF to link against the ATF test framework.
ifdef BUILD_WITH_ATF
ATF = -framework ATF
USE_ATF = -DUSE_ATF
endif

# NOTE(review): this list previously began with main.c; the module's entry
# point is main.cpp (matching the CMake source list) -- confirm against the
# checked-in file names.
SRCS = main.cpp \
	test_allocate_shared_buffer.cpp \
	test_byte_granularity.cpp \
	test_cross_buffer_pointers.cpp \
	test_enqueue_api.cpp \
	test_fine_grain_memory_consistency.cpp \
	test_fine_grain_sync_buffers.cpp \
	test_pointer_passing.cpp \
	test_set_kernel_exec_info_svm_ptrs.cpp \
	test_shared_address_space_coarse_grain.cpp \
	test_shared_address_space_fine_grain_buffers.cpp \
	test_shared_address_space_fine_grain.cpp \
	test_shared_sub_buffers.cpp \
	../../test_common/harness/errorHelpers.c \
	../../test_common/harness/threadTesting.c \
	../../test_common/harness/testHarness.c \
	../../test_common/harness/kernelHelpers.c \
	../../test_common/harness/typeWrappers.cpp \
	../../test_common/harness/mt19937.c

# The source list above must NOT end with a continuation backslash: the
# original trailing "\" merged the DEFINES line into SRCS, so the macro
# below was never actually defined.
DEFINES = DONT_TEST_GARBAGE_POINTERS

SOURCES = $(abspath $(SRCS))
LIBPATH += -L/System/Library/Frameworks/OpenCL.framework/Libraries
LIBPATH += -L.
HEADERS =
TARGET = test_SVM
INCLUDE =
COMPILERFLAGS = -c -Wall -g -Wshorten-64-to-32
CC = c++
CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
CXXFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%) $(INCLUDE)
LIBRARIES = -framework OpenCL -framework OpenGL -framework GLUT -framework AppKit ${ATF}

# Map both .c and .cpp sources onto their object files.
OBJECTS := ${SOURCES:.c=.o}
OBJECTS := ${OBJECTS:.cpp=.o}
TARGETOBJECT =

all: $(TARGET)

$(TARGET): $(OBJECTS)
	$(CC) $(RC_CFLAGS) $(OBJECTS) -o $@ $(LIBPATH) $(LIBRARIES)

clean:
	rm -f $(TARGET) $(OBJECTS)

.DEFAULT:
	@echo The target \"$@\" does not exist in Makefile.

View File

@@ -0,0 +1,100 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef __COMMON_H__
#define __COMMON_H__

// Shared declarations for the OpenCL 2.0 SVM (Shared Virtual Memory)
// conformance tests: host-side atomic wrappers, linked-list helpers,
// common CL-object setup, and the per-test entry points.

#include "../../test_common/harness/compat.h"
#include "../../test_common/harness/testHarness.h"
#include "../../test_common/harness/errorHelpers.h"
#include "../../test_common/harness/kernelHelpers.h"
#include "../../test_common/harness/typeWrappers.h"

#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
#include <windows.h>
#endif

// Host-side mirror of the OpenCL C memory-order enumerants, so the host
// atomic wrappers share signatures with their device-side counterparts.
typedef enum {
    memory_order_relaxed,
    memory_order_acquire,
    memory_order_release,
    memory_order_acq_rel,
    memory_order_seq_cst
} cl_memory_order;

// Host-side SVM atomics (implemented in main.cpp). The memory-order argument
// is part of the signature for parity with device code.
cl_int AtomicLoadExplicit(volatile cl_int * pValue, cl_memory_order order);
cl_int AtomicFetchAddExplicit(volatile cl_int *object, cl_int operand, cl_memory_order o);

// Host-side strong compare-exchange with C11-style semantics: returns true on
// success; on failure stores the observed value into *expected and returns
// false.
// NOTE(review): both branches reinterpret *a as a LONG/intptr_t-sized object.
// For a 32-bit T on an LP64 platform the GCC path operates on 8 bytes --
// confirm this is only instantiated with pointer-sized types.
template <typename T>
bool AtomicCompareExchangeStrongExplicit(volatile T *a, T *expected, T desired,
                                         cl_memory_order order_success,
                                         cl_memory_order order_failure)
{
    T tmp;
#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32))
    tmp = (T)InterlockedCompareExchange((volatile LONG *)a, (LONG)desired, *(LONG *)expected);
#elif defined(__GNUC__)
    tmp = (T)__sync_val_compare_and_swap((volatile intptr_t*)a, (intptr_t)(*expected), (intptr_t)desired);
#else
    log_info("Host function not implemented: atomic_compare_exchange\n");
    tmp = 0;
#endif
    if(tmp == *expected)
        return true;
    *expected = tmp;
    return false;
}

// this checks for a NULL ptr and/or an error code
// NOTE(review): the macro tests the caller's local variable literally named
// `error`, not the error_code argument -- callers must name their status
// variable `error` for this to work as intended.
#define test_error2(error_code, ptr, msg) { if(error != 0) { test_error(error_code, msg); } else { if(NULL == ptr) {print_null_error(msg); return -1;} } }
#define print_null_error(msg) log_error("ERROR: %s! (NULL pointer detected %s:%d)\n", msg, __FILE__, __LINE__ );

// max possible number of queues needed, 1 for each device in platform.
#define MAXQ 32

// Linked-list node shared between host code and the OpenCL C kernels.
// The layout must match the struct declared inside the kernel sources.
typedef struct Node{
    cl_int global_id;          // id of the work-item (or host) that owns this list
    cl_int position_in_list;   // index of this node within its list
    struct Node* pNext;        // next node; may live in a different SVM buffer
} Node;

// Host- and device-side helpers for building/verifying the linked lists.
extern void create_linked_lists(Node* pNodes, size_t num_lists, int list_length);
extern cl_int verify_linked_lists(Node* pNodes, size_t num_lists, int list_length);
extern cl_int create_linked_lists_on_device(int qi, cl_command_queue q, cl_mem allocator, cl_kernel k, size_t numLists );
extern cl_int verify_linked_lists_on_device(int qi, cl_command_queue q, cl_mem num_correct, cl_kernel k, cl_int ListLength, size_t numLists );
extern cl_int create_linked_lists_on_device_no_map(int qi, cl_command_queue q, size_t *pAllocator, cl_kernel k, size_t numLists );
extern cl_int verify_linked_lists_on_device_no_map(int qi, cl_command_queue q, cl_int *pNum_correct, cl_kernel k, cl_int ListLength, size_t numLists );

// Per-test entry points, dispatched from basefn_list in main.cpp.
extern int test_byte_granularity(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_set_kernel_exec_info_svm_ptrs(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_fine_grain_memory_consistency(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_fine_grain_sync_buffers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_shared_address_space_coarse_grain_old_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_shared_address_space_coarse_grain_new_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_shared_address_space_fine_grain_buffers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_shared_address_space_fine_grain(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_cross_buffer_pointers_coarse_grain(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_svm_pointer_passing(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_allocate_shared_buffer(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_shared_sub_buffers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_enqueue_api(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);

// Builds the context/queues/program used by the tests (see main.cpp for the
// return-code convention: 0 = ok, 1 = unsupported-but-pass, <0 = fail).
extern cl_int create_cl_objects(cl_device_id device_from_harness, const char** ppCodeString, cl_context* context, cl_program *program, cl_command_queue *queues, cl_uint *num_devices, cl_device_svm_capabilities required_svm_caps);

extern const char *linked_list_create_and_verify_kernels[];

#endif    // #ifndef __COMMON_H__

View File

@@ -0,0 +1,338 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include <stdio.h>
#include <vector>
#include <sstream>
#include "../../test_common/harness/testHarness.h"
#include "common.h"
// SVM Atomic wrappers.
// Platforms that support SVM atomics (atomics that work across the host and devices) need to implement these host side functions correctly.
// Platforms that do not support SVM atomics can simpy implement these functions as empty stubs since the functions will not be called.
// For now only Windows x86 is implemented, add support for other platforms as needed.
// Host-side atomic load of a 32-bit value.
// The memory-order argument exists for signature parity with the device-side
// atomics and is not consulted: both implemented paths are strong enough for
// the orders used by these tests.
cl_int AtomicLoadExplicit(volatile cl_int * pValue, cl_memory_order order)
{
#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
    return *pValue; // provided the value is aligned x86 doesn't need anything more than this for seq_cst.
#elif defined(__GNUC__)
    // Non-x86 GCC: add-and-fetch of 0 performs a full-barrier atomic read.
    return __sync_add_and_fetch(pValue, 0);
#else
    log_error("ERROR: AtomicLoadExplicit function not implemented\n");
    return -1;
#endif
}
// all the x86 atomics are seq_cst, so don't need to do anything with the memory order parameter.
// all the x86 atomics are seq_cst, so don't need to do anything with the memory order parameter.
// Host-side atomic fetch-and-add on a 32-bit value; returns the value the
// object held before the addition.
cl_int AtomicFetchAddExplicit(volatile cl_int *object, cl_int operand, cl_memory_order o)
{
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
    return InterlockedExchangeAdd( (volatile LONG*) object, operand);
#elif defined(__GNUC__)
    return __sync_fetch_and_add(object, operand);
#else
    log_error("ERROR: AtomicFetchAddExplicit function not implemented\n");
    return -1;
#endif
}
// Host-side atomic exchange on a 32-bit value; returns the previous value.
// NOTE(review): __sync_lock_test_and_set is documented by GCC as an acquire
// barrier only, not a full seq_cst exchange -- confirm that is sufficient for
// the callers' ordering requirements.
cl_int AtomicExchangeExplicit(volatile cl_int *object, cl_int desired, cl_memory_order mo)
{
#if (defined(_WIN32) || defined(_WIN64)) && defined(_MSC_VER)
    return InterlockedExchange( (volatile LONG*) object, desired);
#elif defined(__GNUC__)
    return __sync_lock_test_and_set(object, desired);
#else
    log_error("ERROR: AtomicExchangeExplicit function not implemented\n");
    return -1;
#endif
}
// OpenCL C sources for the basic linked-list tests: create_linked_lists builds
// one list per work-item out of a shared node pool (allocating new nodes via
// an atomic bump index), and verify_linked_lists walks each list counting
// correctly-linked nodes into num_correct.
const char *linked_list_create_and_verify_kernels[] = {
    "typedef struct Node {\n"
    " int global_id;\n"
    " int position_in_list;\n"
    " __global struct Node* pNext;\n"
    "} Node;\n"
    "\n"
    // The allocation_index parameter must be initialized on the host to N work-items
    // The first N nodes in pNodes will be the heads of the lists.
    "__kernel void create_linked_lists(__global Node* pNodes, volatile __attribute__((nosvm)) __global int* allocation_index, int list_length)\n"
    "{\n"
    " size_t i = get_global_id(0);\n"
    " __global Node *pNode = &pNodes[i];\n"
    "\n"
    " pNode->global_id = i;\n"
    " pNode->position_in_list = 0;\n"
    "\n"
    " __global Node *pNew;\n"
    " for(int j=1; j < list_length; j++)\n"
    " {\n"
    " pNew = &pNodes[ atomic_inc(allocation_index) ];// allocate a new node\n"
    " pNew->global_id = i;\n"
    " pNew->position_in_list = j;\n"
    " pNode->pNext = pNew; // link new node onto end of list\n"
    " pNode = pNew; // move to end of list\n"
    " }\n"
    "}\n"
    // Walks each list and bumps num_correct once per node found in its
    // expected place; the host checks the total afterwards.
    "__kernel void verify_linked_lists(__global Node* pNodes, volatile __global uint* num_correct, int list_length)\n"
    "{\n"
    " size_t i = get_global_id(0);\n"
    " __global Node *pNode = &pNodes[i];\n"
    "\n"
    " for(int j=0; j < list_length; j++)\n"
    " {\n"
    " if( pNode->global_id == i && pNode->position_in_list == j)\n"
    " {\n"
    " atomic_inc(num_correct);\n"
    " } \n"
    " else {\n"
    " break;\n"
    " }\n"
    " pNode = pNode->pNext;\n"
    " }\n"
    "}\n"
};
// The first N nodes in pNodes will be the heads of the lists.
// Host-side construction of num_lists linked lists inside the pNodes pool.
// The first num_lists entries are the list heads; every additional node is
// taken from the remainder of the pool in simple bump-allocation order.
void create_linked_lists(Node* pNodes, size_t num_lists, int list_length)
{
    size_t next_free = num_lists; // heads of lists are in first num_lists nodes.
    for(cl_uint list = 0; list < num_lists; list++)
    {
        Node *tail = &pNodes[list];
        tail->global_id = list;
        tail->position_in_list = 0;

        for(int pos = 1; pos < list_length; pos++)
        {
            Node *fresh = &pNodes[ next_free++ ]; // allocate a new node
            fresh->global_id = list;
            fresh->position_in_list = pos;
            tail->pNext = fresh; // link new node onto end of list
            tail = fresh;        // move to end of list
        }
    }
}
// Host-side verification of the linked lists built in pNodes: every node must
// carry its list's id and its position within the list. Returns CL_SUCCESS
// when all list_length * num_lists nodes check out, -1 otherwise.
cl_int verify_linked_lists(Node* pNodes, size_t num_lists, int list_length)
{
    log_info(" and verifying on host ");

    int matched = 0;
    for(cl_uint list = 0; list < num_lists; list++)
    {
        Node *cursor = &pNodes[list];
        int pos = 0;
        // Walk the list while each node is in its expected place; stop at the
        // first mismatch (the remainder of that list cannot be trusted).
        while(pos < list_length
              && cursor->global_id == list
              && cursor->position_in_list == pos)
        {
            matched++;
            cursor = cursor->pNext;
            pos++;
        }
    }

    cl_int status = CL_SUCCESS;
    if(matched != list_length * (cl_uint)num_lists)
    {
        status = -1;
        log_info("Failed\n");
    }
    else
        log_info("Passed\n");
    return status;
}
// Note that we don't use the context provided by the test harness since it doesn't support multiple devices,
// so we create are own context here that has all devices, we use the same platform that the harness used.
// Note that we don't use the context provided by the test harness since it doesn't support multiple devices,
// so we create our own context here that has all devices, we use the same platform that the harness used.
// Returns 0 on success, 1 when no device supports required_svm_caps (the test
// should not execute but counts as passing), and a negative value on failure.
cl_int create_cl_objects(cl_device_id device_from_harness, const char** ppCodeString, cl_context* context, cl_program *program, cl_command_queue *queues, cl_uint *num_devices, cl_device_svm_capabilities required_svm_caps)
{
    cl_int error;
    cl_platform_id platform_id;

    // find out what platform the harness is using.
    error = clGetDeviceInfo(device_from_harness, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform_id, NULL);
    test_error(error,"clGetDeviceInfo failed");

    error = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, num_devices );
    test_error(error, "clGetDeviceIDs failed");

    std::vector<cl_device_id> devicesTmp(*num_devices), devices, capable_devices;
    error = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, *num_devices, &devicesTmp[0], NULL );
    test_error(error, "clGetDeviceIDs failed");

    // Keep the harness's device first so queues[0] always targets it.
    devices.push_back(device_from_harness);
    for (size_t i = 0; i < devicesTmp.size(); ++i)
    {
        if (device_from_harness != devicesTmp[i])
            devices.push_back(devicesTmp[i]);
    }

    // Select only the devices that support the SVM level needed for the test.
    // Note that if requested SVM capabilities are not supported by any device then the test still passes (even though it does not execute).
    cl_device_svm_capabilities caps;
    cl_uint num_capable_devices = 0;
    for(cl_uint i = 0; i < *num_devices; i++)
    {
        size_t ret_len = 0;
        error = clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, 0, 0, &ret_len);
        if (error != CL_SUCCESS)
        {
            log_error("clGetDeviceInfo failed %s\n", IGetErrorString(error));
            return -1;
        }
        std::vector<char> oclVersion(ret_len + 1);
        error = clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(char) * oclVersion.size(), &oclVersion[0], 0);
        if (error != CL_SUCCESS)
        {
            log_error("clGetDeviceInfo failed %s\n", IGetErrorString(error));
            return -1;
        }

        // CL_DEVICE_VERSION is "OpenCL <major>.<minor> ..."; parse the number
        // starting at offset 7, right after the mandated "OpenCL " prefix.
        std::string versionStr(&oclVersion[7]);
        std::stringstream stream;
        stream << versionStr;
        double version = 0.0;
        stream >> version;

        // Skip pre-2.0 devices (no SVM), except the harness's own device
        // which is always queried for its capabilities.
        if(device_from_harness != devices[i] && version < 2.0)
        {
            continue;
        }

        error = clGetDeviceInfo(devices[i], CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, NULL);
        // Error message previously referred to CL_DEVICE_MEM_SHARING, which
        // is not what is queried here.
        test_error(error,"clGetDeviceInfo failed for CL_DEVICE_SVM_CAPABILITIES");

        if(caps & (~(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM | CL_DEVICE_SVM_ATOMICS)))
        {
            log_error("clGetDeviceInfo returned an invalid cl_device_svm_capabilities value");
            return -1;
        }
        if((caps & required_svm_caps) == required_svm_caps)
        {
            capable_devices.push_back(devices[i]);
            ++num_capable_devices;
        }
    }

    devices = capable_devices; // the only devices we care about from here on are the ones capable of supporting the requested SVM level.
    *num_devices = num_capable_devices;
    if(num_capable_devices == 0)
    {
        log_info("Requested SVM level not supported by any device on this platform, test not executed.\n");
        return 1; // 1 indicates do not execute, but counts as passing.
    }

    cl_context_properties context_properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0 };
    *context = clCreateContext(context_properties, *num_devices, &devices[0], NULL, NULL, &error);
    test_error(error, "Unable to create context" );

    // One in-order queue per capable device.
    for(cl_uint i = 0; i < *num_devices; i++)
    {
        queues[i] = clCreateCommandQueueWithProperties(*context, devices[i], 0, &error);
        test_error(error, "clCreateCommandQueue failed");
    }

    if(ppCodeString)
    {
        *program = clCreateProgramWithSource(*context, 1, ppCodeString , NULL, &error);
        test_error( error, "clCreateProgramWithSource failed" );

        error = clBuildProgram(*program,0,NULL,"-cl-std=CL2.0",NULL,NULL);
        if (error != CL_SUCCESS)
        {
            print_error(error, "clBuildProgram failed");

            // Fetch and print the build log. Previously the log was copied
            // via sprintf into a fixed 15000-byte stack buffer, which could
            // overflow for long logs; log the heap buffer directly instead.
            size_t buildLogSize = 0;
            cl_int logError = clGetProgramBuildInfo(*program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize);
            if (logError != CL_SUCCESS)
            {
                print_error(logError, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed");
                return -1;
            }
            char *buildLog = (char*)malloc(buildLogSize);
            if (buildLog)
            {
                memset(buildLog, 0, buildLogSize);
                logError = clGetProgramBuildInfo(*program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL);
                if (logError == CL_SUCCESS)
                    log_info("%s", buildLog);
                else
                    print_error(logError, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed");
                free(buildLog);
            }
            return -1; // build failure always fails the setup
        }
    }
    return 0;
}
// Table of test entry points dispatched by the harness. Order must match
// basefn_names below; the ct_assert after the tables enforces equal lengths.
basefn basefn_list[] = {
    test_byte_granularity,
    test_set_kernel_exec_info_svm_ptrs,
    test_fine_grain_memory_consistency,
    test_fine_grain_sync_buffers,
    test_shared_address_space_fine_grain,
    test_shared_sub_buffers,
    test_shared_address_space_fine_grain_buffers,
    test_allocate_shared_buffer,
    test_shared_address_space_coarse_grain_old_api,
    test_shared_address_space_coarse_grain_new_api,
    test_cross_buffer_pointers_coarse_grain,
    test_svm_pointer_passing,
    test_enqueue_api,
};

// Command-line names for the tests above (index-aligned with basefn_list).
const char *basefn_names[] = {
    "svm_byte_granularity",
    "svm_set_kernel_exec_info_svm_ptrs",
    "svm_fine_grain_memory_consistency",
    "svm_fine_grain_sync_buffers",
    "svm_shared_address_space_fine_grain",
    "svm_shared_sub_buffers",
    "svm_shared_address_space_fine_grain_buffers",
    "svm_allocate_shared_buffer",
    "svm_shared_address_space_coarse_grain_old_api",
    "svm_shared_address_space_coarse_grain_new_api",
    "svm_cross_buffer_pointers_coarse_grain",
    "svm_pointer_passing",
    "svm_enqueue_api",
};

// Compile-time check that the two tables stay the same length.
ct_assert((sizeof(basefn_names) / sizeof(basefn_names[0])) == (sizeof(basefn_list) / sizeof(basefn_list[0])));

int num_fns = sizeof(basefn_names) / sizeof(char *);

// Harness entry point: hands the dispatch tables to the shared test runner.
int main(int argc, const char *argv[])
{
    return runTestHarness( argc, argv, num_fns, basefn_list, basefn_names, false, true, 0 );
}

View File

@@ -0,0 +1,107 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// clSVMAlloc flag combinations exercised by test_allocate_shared_buffer.
// flag_set_names below must stay index-aligned with this table.
const cl_mem_flags flag_set[] = {
    CL_MEM_READ_WRITE,
    CL_MEM_WRITE_ONLY,
    CL_MEM_READ_ONLY,
    CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
    CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER,
    CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER,
    CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
    CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
    CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
    0 // 0 defaults to CL_MEM_READ_WRITE
};

// Human-readable names for the combinations above, used in the test log.
const char* flag_set_names[] = {
    "CL_MEM_READ_WRITE",
    "CL_MEM_WRITE_ONLY",
    "CL_MEM_READ_ONLY",
    "CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER",
    "CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER",
    "CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER",
    "CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS",
    "CL_MEM_WRITE_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS",
    "CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS",
    "0"
};
// Tests clSVMAlloc with every flag combination in flag_set, and checks that
// wrapping the SVM allocation in a CL_MEM_USE_HOST_PTR buffer and mapping it
// returns the same pointer clSVMAlloc handed out.
int test_allocate_shared_buffer(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
    clContextWrapper    context = NULL;
    clProgramWrapper    program = NULL;
    cl_uint     num_devices = 0;
    cl_int      err = CL_SUCCESS;
    clCommandQueueWrapper queues[MAXQ];

    cl_device_svm_capabilities caps;
    err = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, NULL);
    test_error(err,"clGetDeviceInfo failed for CL_DEVICE_SVM_CAPABILITIES");

    err = create_cl_objects(deviceID, NULL, &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
    // create_cl_objects returns 1 when no device supports the requested SVM
    // level; that counts as a pass. (The original `if(err) return -1;` turned
    // "unsupported" into a failure, inconsistent with the other SVM tests.)
    if(err == 1) return 0;
    if(err < 0) return -1;

    size_t size = 1024;

    // iteration over flag combos
    int num_flags = sizeof(flag_set)/sizeof(cl_mem_flags);
    for(int i = 0; i < num_flags; i++)
    {
        // Skip combinations the device does not support.
        if (((flag_set[i] & CL_MEM_SVM_FINE_GRAIN_BUFFER) != 0 && (caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) == 0)
            || ((flag_set[i] & CL_MEM_SVM_ATOMICS) != 0 && (caps & CL_DEVICE_SVM_ATOMICS) == 0))
        {
            log_info("Skipping clSVMalloc with flags: %s\n", flag_set_names[i]);
            continue;
        }
        log_info("Testing clSVMalloc with flags: %s\n", flag_set_names[i]);

        cl_char *pBufData1 = (cl_char*) clSVMAlloc(context, flag_set[i], size, 0);
        if(pBufData1 == NULL)
        {
            log_error("SVMalloc returned NULL");
            return -1;
        }
        {
            clMemWrapper buf = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size, pBufData1, &err);
            test_error(err,"clCreateBuffer failed");

            cl_char *pBufData2 = NULL;
            // Start with both access bits set; the host-access restrictions
            // below then *clear* the disallowed bit. The original initializer
            // was CL_MAP_READ | CL_MAP_READ, so the XORs toggled the
            // disallowed bits ON instead of off.
            cl_uint flags = CL_MAP_READ | CL_MAP_WRITE;
            if(flag_set[i] & CL_MEM_HOST_READ_ONLY)  flags ^= CL_MAP_WRITE;
            if(flag_set[i] & CL_MEM_HOST_WRITE_ONLY) flags ^= CL_MAP_READ;

            if(!(flag_set[i] & CL_MEM_HOST_NO_ACCESS))
            {
                pBufData2 = (cl_char*) clEnqueueMapBuffer(queues[0], buf, CL_TRUE, flags, 0, size, 0, NULL,NULL, &err);
                test_error(err, "clEnqueueMapBuffer failed");
                if(pBufData2 != pBufData1 || NULL == pBufData1)
                {
                    log_error("SVM pointer returned by clEnqueueMapBuffer doesn't match pointer returned by clSVMalloc");
                    return -1;
                }
            }
        } // buf released here, before the underlying SVM memory is freed
        clSVMFree(context, pBufData1);
    }
    return 0;
}

View File

@@ -0,0 +1,148 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// OpenCL C sources for the byte-granularity test: each device writes its id
// into the bytes it owns, then every device re-reads the whole buffer to
// confirm it can see the single-byte updates made by the other devices and
// the host.
const char *byte_manipulation_kernels[] = {
    // Each device will write it's id into the bytes that it "owns", ownership is based on round robin (global_id % num_id)
    // num_id is equal to number of SVM devices in the system plus one (for the host code).
    // id is the index (id) of the device that this kernel is executing on.
    // For example, if there are 2 SVM devices and the host; the buffer should look like this after each device and the host write their id's:
    // 0, 1, 2, 0, 1, 2, 0, 1, 2...
    "__kernel void write_owned_locations(__global char* a, uint num_id, uint id)\n"
    "{\n"
    " size_t i = get_global_id(0);\n"
    " int owner = i % num_id;\n"
    " if(id == owner) \n"
    " a[i] = id;\n" // modify location if it belongs to this device, write id
    "}\n"

    // Verify that a device can see the byte sized updates from the other devices, sum up the device id's and see if they match expected value.
    // Note: this must be called with a reduced NDRange so that neighbor acesses don't go past end of buffer.
    // For example if there are two SVM devices and the host (3 total devices) the buffer should look like this:
    // 0,1,2,0,1,2...
    // and the expected sum at each point is 0+1+2 = 3.
    "__kernel void sum_neighbor_locations(__global char* a, uint num_devices, volatile __global uint* error_count)\n"
    "{\n"
    " size_t i = get_global_id(0);\n"
    " uint expected_sum = (num_devices * (num_devices - 1))/2;\n"
    " uint sum = 0;\n"
    " for(uint j=0; j<num_devices; j++) {\n"
    " sum += a[i + j];\n" // add my neighbors to the right
    " }\n"
    " if(sum != expected_sum)\n"
    " atomic_inc(error_count);\n"
    "}\n"
};
// Fine-grain SVM test: every SVM-capable device plus the host concurrently
// write their own id into the bytes they "own" (round-robin by index), then
// every device and the host verify they can see all the other writers'
// single-byte updates.
int test_byte_granularity(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
{
    clContextWrapper    context;
    clProgramWrapper    program;
    clKernelWrapper     k1, k2;
    clCommandQueueWrapper queues[MAXQ];
    cl_uint     num_devices = 0;
    cl_int      err = CL_SUCCESS;

    err = create_cl_objects(deviceID, &byte_manipulation_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
    if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
    if(err < 0) return -1; // fail test.

    cl_uint num_devices_plus_host = num_devices + 1;

    k1 = clCreateKernel(program, "write_owned_locations", &err);
    test_error(err, "clCreateKernel failed");
    k2 = clCreateKernel(program, "sum_neighbor_locations", &err);
    test_error(err, "clCreateKernel failed");

    // One shared byte array all devices and the host write into.
    cl_char *pA = (cl_char*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_char) * num_elements, 0);
    if(NULL == pA)
    {
        log_error("clSVMAlloc failed\n");
        return -1;
    }

    // One error counter per device, each in its own fine-grain SVM allocation.
    cl_uint **error_counts = (cl_uint**) malloc(sizeof(void*) * num_devices);
    if(NULL == error_counts)
    {
        clSVMFree(context, pA);
        log_error("malloc failed\n");
        return -1;
    }
    for(cl_uint i=0; i < num_devices; i++) {
        error_counts[i] = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint), 0);
        if(NULL == error_counts[i])
        {
            for(cl_uint j=0; j < i; j++) clSVMFree(context, error_counts[j]);
            free(error_counts);
            clSVMFree(context, pA);
            log_error("clSVMAlloc failed\n");
            return -1;
        }
        *error_counts[i] = 0;
    }
    for(int i=0; i < num_elements; i++) pA[i] = -1;

    err |= clSetKernelArgSVMPointer(k1, 0, pA);
    err |= clSetKernelArg(k1, 1, sizeof(cl_uint), &num_devices_plus_host);
    test_error(err, "clSetKernelArg failed");

    // get all the devices going simultaneously
    size_t element_num = num_elements;
    for(cl_uint d=0; d < num_devices; d++) // device ids starting at 1.
    {
        err = clSetKernelArg(k1, 2, sizeof(cl_uint), &d);
        test_error(err, "clSetKernelArg failed");
        err = clEnqueueNDRangeKernel(queues[d], k1, 1, NULL, &element_num, NULL, 0, NULL, NULL);
        test_error(err,"clEnqueueNDRangeKernel failed");
    }
    for(cl_uint d=0; d < num_devices; d++) clFlush(queues[d]);

    // The host writes its own id into the locations it owns while the device
    // kernels are (potentially) still running.
    cl_uint host_id = num_devices; // host code will take the id above the devices.
    for(int i = (int)num_devices; i < num_elements; i+= num_devices_plus_host) pA[i] = host_id;

    for(cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);

    // now check that each device can see the byte writes made by the other devices.
    err |= clSetKernelArgSVMPointer(k2, 0, pA);
    err |= clSetKernelArg(k2, 1, sizeof(cl_uint), &num_devices_plus_host);
    test_error(err, "clSetKernelArg failed");

    // adjusted so k2 doesn't read past end of buffer
    size_t adjusted_num_elements = num_elements - num_devices;
    for(cl_uint id = 0; id < num_devices; id++)
    {
        err = clSetKernelArgSVMPointer(k2, 2, error_counts[id]);
        test_error(err, "clSetKernelArg failed");
        err = clEnqueueNDRangeKernel(queues[id], k2, 1, NULL, &adjusted_num_elements, NULL, 0, NULL, NULL);
        test_error(err,"clEnqueueNDRangeKernel failed");
    }
    for(cl_uint id = 0; id < num_devices; id++) clFinish(queues[id]);

    bool failed = false;
    // see if any of the devices found errors
    for(cl_uint i=0; i < num_devices; i++) {
        if(*error_counts[i] > 0)
            failed = true;
    }
    cl_uint expected = (num_devices_plus_host * (num_devices_plus_host - 1))/2;
    // check that host can see the byte writes made by the devices.
    for(cl_uint i = 0; i < num_elements - num_devices_plus_host; i++)
    {
        int sum = 0;
        for(cl_uint j=0; j < num_devices_plus_host; j++) sum += pA[i+j];
        if(sum != expected)
            failed = true;
    }

    clSVMFree(context, pA);
    for(cl_uint i=0; i < num_devices; i++) clSVMFree(context, error_counts[i]);
    free(error_counts); // the pointer array itself was leaked in the original code

    if(failed)
        return -1;
    return 0;
}

View File

@@ -0,0 +1,219 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// create linked lists that use nodes from two different buffers.
// OpenCL C sources for the cross-buffer-pointer test: linked lists are built
// from nodes drawn out of TWO different SVM buffers, so a node's pNext may
// point into the other buffer. Verifying the lists therefore exercises
// pointers that cross SVM buffer boundaries.
const char *SVMCrossBufferPointers_test_kernel[] = {
    "\n"
    "typedef struct Node {\n"
    " int global_id;\n"
    " int position_in_list;\n"
    " __global struct Node* pNext;\n"
    "} Node;\n"
    "\n"
    // Bump-allocates a node from one of the two pools, alternating by
    // work-item parity so adjacent work items draw from different buffers.
    "__global Node* allocate_node(__global Node* pNodes1, __global Node* pNodes2, volatile __global int* allocation_index, size_t i)\n"
    "{\n"
    // mix things up, adjacent work items will allocate from different buffers
    " if(i & 0x1)\n"
    " return &pNodes1[atomic_inc(allocation_index)];\n"
    " else\n"
    " return &pNodes2[atomic_inc(allocation_index)];\n"
    "}\n"
    "\n"
    // The allocation_index parameter must be initialized on the host to N work-items
    // The first N nodes in pNodes will be the heads of the lists.
    "__kernel void create_linked_lists(__global Node* pNodes, __global Node* pNodes2, volatile __global int* allocation_index, int list_length)\n"
    "{\n"
    " size_t i = get_global_id(0);\n"
    " __global Node *pNode = &pNodes[i];\n"
    "\n"
    " pNode->global_id = i;\n"
    " pNode->position_in_list = 0;\n"
    "\n"
    " __global Node *pNew;\n"
    " for(int j=1; j < list_length; j++)\n"
    " {\n"
    " pNew = allocate_node(pNodes, pNodes2, allocation_index, i);\n"
    " pNew->global_id = i;\n"
    " pNew->position_in_list = j;\n"
    " pNode->pNext = pNew; // link new node onto end of list\n"
    " pNode = pNew; // move to end of list\n"
    " }\n"
    "}\n"
    "\n"
    // Walks each list (following pointers that may cross between the two
    // buffers) and counts correctly-placed nodes into num_correct.
    "__kernel void verify_linked_lists(__global Node* pNodes, __global Node* pNodes2, volatile __global uint* num_correct, int list_length)\n"
    "{\n"
    " size_t i = get_global_id(0);\n"
    " __global Node *pNode = &pNodes[i];\n"
    "\n"
    " for(int j=0; j < list_length; j++)\n"
    " {\n"
    " if( pNode->global_id == i && pNode->position_in_list == j)\n"
    " {\n"
    " atomic_inc(num_correct);\n"
    " }\n"
    " else {\n"
    " break;\n"
    " }\n"
    " pNode = pNode->pNext;\n"
    " }\n"
    "}\n"
};
// Creates linked list using host code.
// Builds the linked lists on the host: maps both node buffers for coarse-grain
// access, runs the shared host-side list builder over the first buffer, then
// unmaps and waits for the queue to drain.
// (The local MUST be named `error`: the test_error2 macro reads it by name.)
cl_int create_linked_lists_on_host(cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
{
    cl_int error = CL_SUCCESS;
    log_info("SVM: creating linked list on host ");

    const size_t poolBytes = sizeof(Node)*ListLength*numLists;

    Node *pFirstPool = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, poolBytes, 0, NULL,NULL, &error);
    test_error2(error, pFirstPool, "clEnqueueMapBuffer failed");
    Node *pSecondPool = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, poolBytes, 0, NULL,NULL, &error);
    test_error2(error, pSecondPool, "clEnqueueMapBuffer failed");

    // The host builder only allocates out of the first pool; the second is
    // mapped so the whole region is coherent before device kernels run.
    create_linked_lists(pFirstPool, numLists, ListLength);

    error = clEnqueueUnmapMemObject(cmdq, nodes, pFirstPool, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");
    error = clEnqueueUnmapMemObject(cmdq, nodes2, pSecondPool, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");

    error = clFinish(cmdq);
    test_error(error, "clFinish failed");
    return error;
}
// Verify correctness of the linked list using host code.
// Verify correctness of the linked list using host code.
// Both buffers must be mapped because a node's pNext may point into either
// one. Returns CL_SUCCESS when every list checks out, -1 otherwise.
cl_int verify_linked_lists_on_host(int ci, cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
{
    cl_int error = CL_SUCCESS;
    //log_info(" and verifying on host ");
    Node *pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
    test_error2(error, pNodes, "clEnqueueMapBuffer failed");
    Node *pNodes2 = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
    // Fixed: the original rechecked pNodes here, so a NULL map of the second
    // buffer went undetected.
    test_error2(error, pNodes2, "clEnqueueMapBuffer failed");

    error = verify_linked_lists(pNodes, numLists, ListLength);
    if(error) return -1;

    error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");
    error = clEnqueueUnmapMemObject(cmdq, nodes2, pNodes2, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");

    error = clFinish(cmdq);
    test_error(error, "clFinish failed");
    return error;
}
// This tests that shared buffers are able to contain pointers that point to other shared buffers.
// This tests that all devices and the host share a common address space; using only the coarse-grain features.
// This is done by creating a linked list on a device and then verifying the correctness of the list
// on another device or the host.
// The linked list nodes are allocated from two different buffers this is done to ensure that cross buffer pointers work correctly.
// This basic test is performed for all combinations of devices and the host.
int test_cross_buffer_pointers_coarse_grain(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
clContextWrapper context = NULL;
clProgramWrapper program = NULL;
cl_uint num_devices = 0;
cl_int error = CL_SUCCESS;
clCommandQueueWrapper queues[MAXQ];
// Build the program and one queue per device in the platform; requires only
// coarse-grain buffer SVM.
error = create_cl_objects(deviceID, &SVMCrossBufferPointers_test_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
if(error) return -1;
size_t numLists = num_elements;
cl_int ListLength = 32;
clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
test_error(error, "clCreateKernel failed");
clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
test_error(error, "clCreateKernel failed");
// NOTE(review): pNodes and pNodes2 are leaked on every early "return -1"
// below (including the returns hidden inside the test_error macros) —
// confirm whether cleanup is wanted on failure paths. The allocations are
// also not NULL-checked before being wrapped in buffers.
// this buffer holds some of the linked list nodes.
Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(Node)*ListLength*numLists, 0);
// this buffer holds some of the linked list nodes.
Node* pNodes2 = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(Node)*ListLength*numLists, 0);
// Inner scope so the clMemWrapper objects release their cl_mems before the
// SVM pointers backing them are freed below.
{
clMemWrapper nodes = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Node)*ListLength*numLists, pNodes, &error);
test_error(error, "clCreateBuffer failed.");
clMemWrapper nodes2 = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Node)*ListLength*numLists, pNodes2, &error);
test_error(error, "clCreateBuffer failed.");
// this buffer holds the index into the nodes buffer that is used for node allocation
clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
test_error(error, "clCreateBuffer failed.");
// this buffer holds the count of correct nodes which is computed by the verify kernel.
clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
test_error(error, "clCreateBuffer failed.");
error |= clSetKernelArg(kernel_create_lists, 0, sizeof(void*), (void *) &nodes);
//error |= clSetKernelArgSVMPointer(kernel_create_lists, 0, (void *) pNodes);
error |= clSetKernelArg(kernel_create_lists, 1, sizeof(void*), (void *) &nodes2);
error |= clSetKernelArg(kernel_create_lists, 2, sizeof(void*), (void *) &allocator);
error |= clSetKernelArg(kernel_create_lists, 3, sizeof(cl_int), (void *) &ListLength);
error |= clSetKernelArg(kernel_verify_lists, 0, sizeof(void*), (void *) &nodes);
error |= clSetKernelArg(kernel_verify_lists, 1, sizeof(void*), (void *) &nodes2);
error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(void*), (void *) &num_correct);
error |= clSetKernelArg(kernel_verify_lists, 3, sizeof(cl_int), (void *) &ListLength);
test_error(error, "clSetKernelArg failed");
// Create linked list on one device and verify on another device (or the host).
// Do this for all possible combinations of devices and host within the platform.
// NOTE(review): ci/vi are int while num_devices is cl_uint, so the
// comparisons below mix signedness — benign for realistic device counts
// but worth a cast.
for (int ci=0; ci<(int)num_devices+1; ci++) // ci is CreationIndex, index of device/q to create linked list on
{
for (int vi=0; vi<(int)num_devices+1; vi++) // vi is VerificationIndex, index of device/q to verify linked list on
{
if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
{
// Host creation path still needs a queue for map/unmap; queue 0 is used.
error = create_linked_lists_on_host(queues[0], nodes, nodes2, ListLength, numLists);
if(error) return -1;
}
else
{
error = create_linked_lists_on_device(ci, queues[ci], allocator, kernel_create_lists, numLists);
if(error) return -1;
}
if(vi == num_devices)
{
error = verify_linked_lists_on_host(vi, queues[0], nodes, nodes2, ListLength, numLists);
if(error) return -1;
}
else
{
error = verify_linked_lists_on_device(vi, queues[vi], num_correct, kernel_verify_lists, ListLength, numLists);
if(error) return -1;
}
} // inner loop, vi
} // outer loop, ci
}
clSVMFree(context, pNodes2);
clSVMFree(context, pNodes);
return 0;
}

View File

@@ -0,0 +1,254 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
#include "../../test_common/harness/mt19937.h"
#include <vector>
// Data handed to the clEnqueueSVMFree callback so it can report back to the
// host thread what the runtime passed it.
typedef struct
{
    // Set to 1 by the callback when it is done; the host busy-waits on this
    // flag from another thread, so it is volatile to force re-reads.
    // NOTE(review): volatile is not a full synchronization primitive — a
    // C++11 atomic would be stronger — but the only communication here is a
    // single 0 -> 1 transition written once by the callback.
    volatile cl_uint status;
    cl_uint num_svm_pointers;          // count reported by the runtime
    std::vector<void *> svm_pointers;  // the pointers handed to the callback
} CallbackData;
// Fill 'data' with 'size' pseudo-random bytes drawn from the mt19937 stream.
// Each 32-bit draw supplies four bytes before a fresh draw is taken.
void generate_data(std::vector<cl_uchar> &data, size_t size, MTdata seed)
{
    cl_uint randomData = genrand_int32(seed);
    cl_uint bitsLeft = 32;

    for( size_t i = 0; i < size; i++ )
    {
        if( 0 == bitsLeft)
        {
            randomData = genrand_int32(seed);
            bitsLeft = 32;
        }
        data[i] = (cl_uchar)( randomData & 255 );
        randomData >>= 8;
        // BUGFIX: consume 8 bits from the bookkeeping counter. The original
        // decremented randomData instead of bitsLeft, so bitsLeft never
        // reached 0 and every byte after the first four of a draw was 0.
        bitsLeft -= 8;
    }
}
// Callback registered with clEnqueueSVMFree: records the pointers the
// runtime hands over, frees them, and signals the waiting host thread.
void CL_CALLBACK callback_svm_free(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void * user_data)
{
    CallbackData *result = (CallbackData *)user_data;

    // Remember how many pointers we were given and make room for them.
    result->num_svm_pointers = num_svm_pointers;
    result->svm_pointers.resize(num_svm_pointers, 0);

    // The callback only receives the queue; look up its context so the SVM
    // allocations can be released.
    cl_context context;
    cl_int err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &context, 0);
    if (err != CL_SUCCESS)
    {
        log_error("clGetCommandQueueInfo failed in the callback\n");
        return;
    }

    // Record each pointer for later verification, then free it.
    for (cl_uint idx = 0; idx < num_svm_pointers; ++idx)
    {
        result->svm_pointers[idx] = svm_pointers[idx];
        clSVMFree(context, svm_pointers[idx]);
    }

    // Signal the host thread that the callback has completed.
    result->status = 1;
}
// Exercises the OpenCL 2.0 SVM enqueue APIs (clEnqueueSVMMemFill / SVMMemcpy /
// SVMMap / SVMUnmap / SVMFree) for every scalar and vector element size,
// checks the transferred data, verifies the command type reported on each
// event, and finally checks that the clEnqueueSVMFree callback receives the
// expected pointers.
int test_enqueue_api(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
{
clContextWrapper context = NULL;
clCommandQueueWrapper queues[MAXQ];
cl_uint num_devices = 0;
const size_t elementNum = 1024;
const size_t numSVMBuffers = 32;
cl_int error = CL_SUCCESS;
RandomSeed seed(0);
error = create_cl_objects(deviceID, NULL, &context, NULL, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
if(error) return -1;
queue = queues[0];
//all possible sizes of vectors and scalars
size_t typeSizes[] = {
sizeof(cl_uchar),
sizeof(cl_uchar2),
sizeof(cl_uchar3),
sizeof(cl_uchar4),
sizeof(cl_uchar8),
sizeof(cl_uchar16),
sizeof(cl_ushort),
sizeof(cl_ushort2),
sizeof(cl_ushort3),
sizeof(cl_ushort4),
sizeof(cl_ushort8),
sizeof(cl_ushort16),
sizeof(cl_uint),
sizeof(cl_uint2),
sizeof(cl_uint3),
sizeof(cl_uint4),
sizeof(cl_uint8),
sizeof(cl_uint16),
sizeof(cl_ulong),
sizeof(cl_ulong2),
sizeof(cl_ulong3),
sizeof(cl_ulong4),
sizeof(cl_ulong8),
sizeof(cl_ulong16),
};
for (size_t i = 0; i < ( sizeof(typeSizes) / sizeof(typeSizes[0]) ); ++i)
{
//generate initial data
// NOTE(review): fillData1 appears unused below — confirm it can be removed.
std::vector<cl_uchar> fillData0(typeSizes[i]), fillData1(typeSizes[i], 0), fillData2(typeSizes[i]);
generate_data(fillData0, typeSizes[i], seed);
generate_data(fillData2, typeSizes[i], seed);
// NOTE(review): these allocations are not NULL-checked and are leaked on
// every early "return -1" below; on the success path clEnqueueSVMFree
// releases them.
cl_uchar *srcBuffer = (cl_uchar *)clSVMAlloc(context, CL_MEM_READ_WRITE, elementNum * typeSizes[i], 0);
cl_uchar *dstBuffer = (cl_uchar *)clSVMAlloc(context, CL_MEM_READ_WRITE, elementNum * typeSizes[i], 0);
// A user event gates the fill so the fill -> copy -> map dependency chain
// can be observed to execute in order once the event is completed.
clEventWrapper userEvent = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
clEventWrapper eventMemFill;
error = clEnqueueSVMMemFill(queue, srcBuffer, &fillData0[0], typeSizes[i], elementNum * typeSizes[i], 1, &userEvent, &eventMemFill);
test_error(error, "clEnqueueSVMMemFill failed");
clEventWrapper eventMemcpy;
error = clEnqueueSVMMemcpy(queue, CL_FALSE, dstBuffer, srcBuffer, elementNum * typeSizes[i], 1, &eventMemFill, &eventMemcpy);
test_error(error, "clEnqueueSVMMemcpy failed");
// Release the gate; the chain can now run.
error = clSetUserEventStatus(userEvent, CL_COMPLETE);
test_error(error, "clSetUserEventStatus failed");
clEventWrapper eventMap;
error = clEnqueueSVMMap(queue, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, dstBuffer, elementNum * typeSizes[i], 1, &eventMemcpy, &eventMap);
test_error(error, "clEnqueueSVMMap failed");
error = clWaitForEvents(1, &eventMap);
test_error(error, "clWaitForEvents failed");
//data verification
// NOTE(review): %ld is used for size_t — %zu would be the portable choice.
for (size_t j = 0; j < elementNum * typeSizes[i]; ++j)
{
if (dstBuffer[j] != fillData0[j % typeSizes[i]])
{
log_error("Invalid data at index %ld, expected %d, got %d\n", j, fillData0[j % typeSizes[i]], dstBuffer[j]);
return -1;
}
}
clEventWrapper eventUnmap;
error = clEnqueueSVMUnmap(queue, dstBuffer, 0, 0, &eventUnmap);
test_error(error, "clEnqueueSVMUnmap failed");
// Exercise the fill/copy paths again on buffer halves, without events.
error = clEnqueueSVMMemFill(queue, srcBuffer, &fillData2[0], typeSizes[i], elementNum * typeSizes[i] / 2, 0, 0, 0);
test_error(error, "clEnqueueSVMMemFill failed");
error = clEnqueueSVMMemFill(queue, dstBuffer + elementNum * typeSizes[i] / 2, &fillData2[0], typeSizes[i], elementNum * typeSizes[i] / 2, 0, 0, 0);
test_error(error, "clEnqueueSVMMemFill failed");
error = clEnqueueSVMMemcpy(queue, CL_FALSE, dstBuffer, srcBuffer, elementNum * typeSizes[i] / 2, 0, 0, 0);
test_error(error, "clEnqueueSVMMemcpy failed");
error = clEnqueueSVMMemcpy(queue, CL_TRUE, dstBuffer + elementNum * typeSizes[i] / 2, srcBuffer + elementNum * typeSizes[i] / 2, elementNum * typeSizes[i] / 2, 0, 0, 0);
test_error(error, "clEnqueueSVMMemcpy failed");
// Free both SVM buffers via the enqueue API (no callback this time).
void *ptrs[] = {(void *)srcBuffer, (void *)dstBuffer};
clEventWrapper eventFree;
error = clEnqueueSVMFree(queue, 2, ptrs, 0, 0, 0, 0, &eventFree);
test_error(error, "clEnqueueSVMFree failed");
error = clWaitForEvents(1, &eventFree);
test_error(error, "clWaitForEvents failed");
//event info verification for new SVM commands
cl_command_type commandType;
error = clGetEventInfo(eventMemFill, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
test_error(error, "clGetEventInfo failed");
if (commandType != CL_COMMAND_SVM_MEMFILL)
{
log_error("Invalid command type returned for clEnqueueSVMMemFill\n");
return -1;
}
error = clGetEventInfo(eventMemcpy, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
test_error(error, "clGetEventInfo failed");
if (commandType != CL_COMMAND_SVM_MEMCPY)
{
log_error("Invalid command type returned for clEnqueueSVMMemcpy\n");
return -1;
}
error = clGetEventInfo(eventMap, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
test_error(error, "clGetEventInfo failed");
if (commandType != CL_COMMAND_SVM_MAP)
{
log_error("Invalid command type returned for clEnqueueSVMMap\n");
return -1;
}
error = clGetEventInfo(eventUnmap, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
test_error(error, "clGetEventInfo failed");
if (commandType != CL_COMMAND_SVM_UNMAP)
{
log_error("Invalid command type returned for clEnqueueSVMUnmap\n");
return -1;
}
error = clGetEventInfo(eventFree, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &commandType, NULL);
test_error(error, "clGetEventInfo failed");
if (commandType != CL_COMMAND_SVM_FREE)
{
log_error("Invalid command type returned for clEnqueueSVMFree\n");
return -1;
}
}
// Allocate a batch of SVM buffers to be released through the callback path.
std::vector<void *> buffers(numSVMBuffers, 0);
for(size_t i = 0; i < numSVMBuffers; ++i) buffers[i] = clSVMAlloc(context, CL_MEM_READ_WRITE, elementNum, 0);
//verify if callback is triggered correctly
CallbackData data;
data.status = 0;
error = clEnqueueSVMFree(queue, buffers.size(), &buffers[0], callback_svm_free, &data, 0, 0, 0);
test_error(error, "clEnqueueSVMFree failed");
error = clFinish(queue);
test_error(error, "clFinish failed");
//wait for the callback
// NOTE(review): busy-wait on a plain struct field written from the
// callback thread — relies on status being volatile/atomic for the
// compiler not to hoist the load; confirm CallbackData::status qualifies.
while(data.status == 0) { }
//check if number of SVM pointers returned in the callback matches with expected
if (data.num_svm_pointers != buffers.size())
{
log_error("Invalid number of SVM pointers returned in the callback, expected: %ld, got: %d\n", buffers.size(), data.num_svm_pointers);
return -1;
}
//check if pointers returned in callback are correct
for (size_t i = 0; i < buffers.size(); ++i)
{
if (data.svm_pointers[i] != buffers[i])
{
log_error("Invalid SVM pointer returned in the callback, idx: %ld\n", i);
return -1;
}
}
return 0;
}

View File

@@ -0,0 +1,176 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// OpenCL C source for the device-side hash-table builder: each work item
// claims a node from the shared pool and pushes it onto the head of its
// bin's list with an all-svm-devices-scope compare-exchange.
// NOTE: test_fine_grain_memory_consistency patches hash_table_kernel[4]
// (the '0' of the leading "#if 0") to '1' on 64-bit builds to enable the
// int64 atomic pragmas — the first string fragment must stay exactly
// "#if 0\n", which is why this array is mutable (char[], not const).
static char hash_table_kernel[] =
"#if 0\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
"#endif\n"
"typedef struct BinNode {\n"
" int value;\n"
" atomic_uintptr_t pNext;\n"
"} BinNode;\n"
"__kernel void build_hash_table(__global uint* input, __global BinNode* pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n"
"{\n"
" __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, 1, memory_order_relaxed, memory_scope_all_svm_devices) ];\n"
" uint i = get_global_id(0);\n"
" uint b = input[i] % numBins;\n"
" pNew->value = input[i];\n"
" uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), memory_order_seq_cst, memory_scope_all_svm_devices);\n"
" do\n"
" {\n"
" atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, memory_scope_all_svm_devices);\n" // always inserting at head of list
" } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), &next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, memory_scope_all_svm_devices));\n"
"}\n";
// Host-side mirror of the device BinNode. The device declares pNext as
// atomic_uintptr_t; here it is a plain pointer, and concurrent updates go
// through the harness atomic helpers (AtomicCompareExchangeStrongExplicit).
typedef struct BinNode{
cl_uint value;
struct BinNode* pNext;
} BinNode;
// Host-side equivalent of the build_hash_table kernel: insert every element
// of 'input' at the head of its hash bin's lock-free singly linked list.
// Runs concurrently with the device kernels, so the head update uses an
// atomic compare-exchange.
void build_hash_table_on_host(cl_context c, cl_uint* input, size_t inputSize, BinNode* pNodes, cl_int volatile *pNumNodes, cl_uint numBins)
{
    for(cl_uint i = 0; i < inputSize; i++)
    {
        // Claim a fresh node from the pool shared with the devices.
        BinNode *pNew = &pNodes[ AtomicFetchAddExplicit(pNumNodes, 1, memory_order_relaxed) ];
        cl_uint b = input[i] % numBins;
        pNew->value = input[i];
        BinNode *next = pNodes[b].pNext;
        do {
            pNew->pNext = next; // always inserting at head of list
            // BUGFIX: the success/failure memory orders were swapped
            // (relaxed on success, seq_cst on failure), which both
            // mismatches the device kernel's seq_cst publish and violates
            // the C11 rule that the failure order must not be stronger
            // than the success order.
        } while(!AtomicCompareExchangeStrongExplicit(&(pNodes[b].pNext), &next, pNew, memory_order_seq_cst, memory_order_relaxed));
    }
}
// Launch the hash-table kernel on every device while the host runs the
// equivalent insertion code, then verify that every value landed in the
// correct bin and that no insertions were lost.
// Returns 0 on success, -1 on failure.
int launch_kernels_and_verify(clContextWrapper &context, clCommandQueueWrapper* queues, clKernelWrapper &kernel, cl_uint num_devices, cl_uint numBins, size_t num_pixels)
{
    int err = CL_SUCCESS;
    // Fine-grain SVM with atomics: visible to the host and all devices while
    // the kernels are still running.
    cl_uint *pInputImage = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0);
    BinNode *pNodes = (BinNode*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(BinNode) * (num_pixels * (num_devices + 1) + numBins), 0);
    cl_int *pNumNodes = (cl_int*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int), 0);
    // BUGFIX: the allocations were previously used unchecked; fail cleanly
    // instead of dereferencing NULL.
    if (pInputImage == NULL || pNodes == NULL || pNumNodes == NULL)
    {
        log_error("clSVMAlloc failed\n");
        clSVMFree(context, pInputImage);
        clSVMFree(context, pNodes);
        clSVMFree(context, pNumNodes);
        return -1;
    }
    *pNumNodes = numBins; // using the first numBins nodes to hold the list heads.
    for(cl_uint i=0;i<numBins;i++) {
        pNodes[i].pNext = NULL;
    }
    for(cl_uint i=0; i < num_pixels; i++) pInputImage[i] = i;
    err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage);
    err |= clSetKernelArgSVMPointer(kernel, 1, pNodes);
    err |= clSetKernelArgSVMPointer(kernel, 2, pNumNodes);
    err |= clSetKernelArg(kernel, 3, sizeof(cl_uint), (void*) &numBins);
    // NOTE(review): the test_error early returns below still leak the SVM
    // allocations; acceptable for a failing test run, but worth confirming.
    test_error(err, "clSetKernelArg failed");
    // get all the devices going simultaneously, each device (and the host) will insert all the pixels.
    for(cl_uint d=0; d<num_devices; d++)
    {
        // BUGFIX: the original requested a completion event here and never
        // released (or used) it, leaking one event per enqueue; completion
        // is tracked via clFinish below, so no event is needed.
        err = clEnqueueNDRangeKernel(queues[d], kernel, 1, NULL, &num_pixels, 0, 0, NULL, NULL);
        test_error(err,"clEnqueueNDRangeKernel failed");
    }
    for(cl_uint d=0; d<num_devices; d++) clFlush(queues[d]);
    // wait until we see some activity from a device (try to run host side simultaneously).
    while(numBins == AtomicLoadExplicit(pNumNodes, memory_order_relaxed));
    build_hash_table_on_host(context, pInputImage, num_pixels, pNodes, pNumNodes, numBins);
    for(cl_uint d=0; d<num_devices; d++) clFinish(queues[d]);
    cl_uint num_items = 0;
    // check correctness of each bin in the hash table.
    for(cl_uint i = 0; i < numBins; i++)
    {
        BinNode *pNode = pNodes[i].pNext;
        while(pNode)
        {
            if((pNode->value % numBins) != i)
            {
                log_error("Something went wrong, item is in wrong hash bucket\n");
                break;
            }
            num_items++;
            pNode = pNode->pNext;
        }
    }
    clSVMFree(context, pInputImage);
    clSVMFree(context, pNodes);
    clSVMFree(context, pNumNodes);
    // each device and the host inserted all of the pixels, check that none are missing.
    if(num_items != num_pixels * (num_devices + 1) )
    {
        // BUGFIX: cast the operands so the format specifiers match
        // (num_pixels is size_t, which %d does not cover).
        log_error("The hash table is not correct, num items %u, expected num items: %u\n", (unsigned)num_items, (unsigned)(num_pixels * (num_devices + 1)));
        return -1; // test did not pass
    }
    return 0;
}
// This tests for memory consistency across devices and the host.
// Each device and the host simultaneously insert values into a single hash table.
// Each bin in the hash table is a linked list. Each bin is protected against simultaneous
// update using a lock free technique. The correctness of the list is verfied on the host.
// This test requires the new OpenCL 2.0 atomic operations that implement the new seq_cst memory ordering.
int test_fine_grain_memory_consistency(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
{
    clContextWrapper context;
    clProgramWrapper program;
    clKernelWrapper kernel;
    clCommandQueueWrapper queues[MAXQ];
    cl_uint num_devices = 0;
    cl_int err = CL_SUCCESS;

    // On 64-bit builds the kernel's uintptr atomics need the int64 atomic
    // extensions; skip (as a pass) if they are unavailable.
    if (sizeof(void *) == 8 && (!is_extension_available(deviceID, "cl_khr_int64_base_atomics") || !is_extension_available(deviceID, "cl_khr_int64_extended_atomics")))
    {
        log_info("WARNING: test skipped. 'cl_khr_int64_base_atomics' and 'cl_khr_int64_extended_atomics' extensions are not supported\n");
        return 0;
    }

    // Make pragmas visible for 64-bit addresses
    hash_table_kernel[4] = sizeof(void *) == 8 ? '1' : '0';

    char *source[] = { hash_table_kernel };
    err = create_cl_objects(deviceID, (const char**)source, &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS);
    if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
    if(err < 0) return -1; // fail test.

    kernel = clCreateKernel(program, "build_hash_table", &err);
    test_error(err, "clCreateKernel failed");

    size_t num_pixels = num_elements;

    // Bin counts chosen to stress different contention patterns:
    //  1  - every work group on every device plus the host hammers one lock,
    //  2  - two locks in the same cache line are hit from different devices,
    //  29 - the locks span a few cache lines.
    const cl_uint bin_counts[] = { 1, 2, 29 };
    for (size_t bi = 0; bi < sizeof(bin_counts) / sizeof(bin_counts[0]); ++bi)
    {
        int result = launch_kernels_and_verify(context, queues, kernel, num_devices, bin_counts[bi], num_pixels);
        if (result == -1) return result;
    }
    return 0;
}

View File

@@ -0,0 +1,105 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// Device kernel: scan the image and, for every pixel matching 'target',
// atomically claim the next slot in targetLocations (device scope) and
// publish the pixel index with all-svm-devices scope so the host can
// observe new hits while the kernel is still running.
const char *find_targets_kernel[] = {
"__kernel void find_targets(__global uint* image, uint target, volatile __global atomic_uint *numTargetsFound, volatile __global atomic_uint *targetLocations)\n"
"{\n"
" size_t i = get_global_id(0);\n"
" uint index;\n"
" if(image[i] == target) {\n"
" index = atomic_fetch_add_explicit(numTargetsFound, 1, memory_order_relaxed, memory_scope_device); \n"
" atomic_exchange_explicit(&targetLocations[index], i, memory_order_relaxed, memory_scope_all_svm_devices); \n"
" }\n"
"}\n"
};
// Stub for host-side work that would analyze a found target. Kept empty so
// the test measures only the host/device synchronization; the cast silences
// the unused-parameter warning.
void spawnAnalysisTask(int location)
{
    (void)location;
    // printf("found target at location %d\n", location);
}
#define MAX_TARGETS 1024
// Goals: demonstrate use of SVM's atomics to do fine grain synchronization between the device and host.
// Concept: a device kernel is used to search an input image for regions that match a target pattern.
// The device immediately notifies the host when it finds a target (via an atomic operation that works across host and devices).
// The host is then able to spawn a task that further analyzes the target while the device continues searching for more targets.
int test_fine_grain_sync_buffers(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
{
clContextWrapper context = NULL;
clProgramWrapper program = NULL;
cl_uint num_devices = 0;
cl_int err = CL_SUCCESS;
clCommandQueueWrapper queues[MAXQ];
err = create_cl_objects(deviceID, &find_targets_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS);
if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
if(err < 0) return -1; // fail test.
clKernelWrapper kernel = clCreateKernel(program, "find_targets", &err);
test_error(err, "clCreateKernel failed");
size_t num_pixels = num_elements;
//cl_uint num_pixels = 1024*1024*32;
// Fine-grain SVM so the host can watch pTargetLocations change while the
// kernel is still executing. NOTE(review): the allocations are not
// NULL-checked and are leaked on the test_error early returns.
cl_uint *pInputImage = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_ONLY | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_uint) * num_pixels, 0);
cl_uint *pNumTargetsFound = (cl_uint*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_uint), 0);
cl_int *pTargetLocations = (cl_int* ) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, sizeof(cl_int) * MAX_TARGETS, 0);
cl_uint targetDescriptor = 777;
*pNumTargetsFound = 0;
cl_uint i;
// -1 marks an unused slot in the result list; the polling loop below relies on it.
for(i=0; i < MAX_TARGETS; i++) pTargetLocations[i] = -1;
// Plant exactly three targets: first, fourth, and last pixel.
for(i=0; i < num_pixels; i++) pInputImage[i] = 0;
pInputImage[0] = targetDescriptor;
pInputImage[3] = targetDescriptor;
pInputImage[num_pixels - 1] = targetDescriptor;
err |= clSetKernelArgSVMPointer(kernel, 0, pInputImage);
err |= clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &targetDescriptor);
err |= clSetKernelArgSVMPointer(kernel, 2, pNumTargetsFound);
err |= clSetKernelArgSVMPointer(kernel, 3, pTargetLocations);
test_error(err, "clSetKernelArg failed");
// NOTE(review): 'done' is never released — one event leaks per run.
cl_event done;
err = clEnqueueNDRangeKernel(queues[0], kernel, 1, NULL, &num_pixels, NULL, 0, NULL, &done);
test_error(err,"clEnqueueNDRangeKernel failed");
clFlush(queues[0]);
i=0;
cl_int status;
// check for new targets, if found spawn a task to analyze target.
// Poll the kernel's completion status while simultaneously consuming
// published target locations; keep draining after completion until an
// unused (-1) slot is reached. NOTE(review): assumes fewer than
// MAX_TARGETS hits, otherwise pTargetLocations[i] reads out of bounds.
do {
err = clGetEventInfo(done,CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL);
test_error(err,"clGetEventInfo failed");
if( AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1) // -1 indicates slot not used yet.
{
spawnAnalysisTask(pTargetLocations[i]);
i++;
}
} while (status != CL_COMPLETE || AtomicLoadExplicit(&pTargetLocations[i], memory_order_relaxed) != -1);
clSVMFree(context, pInputImage);
clSVMFree(context, pNumTargetsFound);
clSVMFree(context, pTargetLocations);
// Exactly the three planted targets must have been observed.
if(i != 3) return -1;
return 0;
}

View File

@@ -0,0 +1,115 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// Device kernel: work item 0 compares the byte at the given SVM pointer
// with 'expected' and writes 1 to num_correct on a match, 0 otherwise.
const char *SVMPointerPassing_test_kernel[] = {
"__kernel void verify_char(__global uchar* pChar, volatile __global uint* num_correct, uchar expected)\n"
"{\n"
" if(0 == get_global_id(0))\n"
" {\n"
" *num_correct = 0;\n"
" if(*pChar == expected)\n"
" {\n"
" *num_correct=1;\n"
" }\n"
" }\n"
"}\n"
};
// Test that arbitrarily aligned char pointers into shared buffers can be passed directly to a kernel.
// This iterates through a buffer passing a pointer to each location to the kernel.
// The buffer is initialized to known values at each location.
// The kernel checks that it finds the expected value at each location.
// TODO: possibly make this work across all base types (including typeN?), also check ptr arithmetic ++,--.
int test_svm_pointer_passing(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
    clContextWrapper context = NULL;
    clProgramWrapper program = NULL;
    cl_uint num_devices = 0;
    cl_int error = CL_SUCCESS;
    clCommandQueueWrapper queues[MAXQ];
    error = create_cl_objects(deviceID, &SVMPointerPassing_test_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
    if(error) return -1;
    clKernelWrapper kernel_verify_char = clCreateKernel(program, "verify_char", &error);
    test_error(error,"clCreateKernel failed");
    size_t bufSize = 256;
    // Coarse-grain SVM allocations backing the buffers created below.
    char *pbuf = (char*) clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(cl_uchar)*bufSize, 0);
    cl_int *pNumCorrect = NULL;
    pNumCorrect = (cl_int*) clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(cl_int), 0);
    // Inner scope so the clMemWrappers release their cl_mems before the SVM
    // pointers backing them are freed.
    {
        clMemWrapper buf = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar)*bufSize, pbuf, &error);
        test_error(error, "clCreateBuffer failed.");
        clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(cl_int), pNumCorrect, &error);
        test_error(error, "clCreateBuffer failed.");
        error = clSetKernelArg(kernel_verify_char, 1, sizeof(void*), (void *) &num_correct);
        test_error(error, "clSetKernelArg failed");
        // put values into buf so that we can expect to see these values in the kernel when we pass a pointer to them.
        cl_command_queue cmdq = queues[0];
        cl_uchar* pBuf = (cl_uchar*) clEnqueueMapBuffer(cmdq, buf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_uchar)*bufSize, 0, NULL,NULL, &error);
        test_error2(error, pBuf, "clEnqueueMapBuffer failed");
        for(int i = 0; i<(int)bufSize; i++)
        {
            pBuf[i]= (cl_uchar)i;
        }
        error = clEnqueueUnmapMemObject(cmdq, buf, pBuf, 0,NULL,NULL);
        test_error(error, "clEnqueueUnmapMemObject failed.");
        for (cl_uint ii = 0; ii<num_devices; ++ii) // iterate over all devices in the platform.
        {
            cmdq = queues[ii];
            for(int i = 0; i<(int)bufSize; i++)
            {
                cl_uchar* pChar = &pBuf[i];
                error = clSetKernelArgSVMPointer(kernel_verify_char, 0, pChar); // pass a pointer to a location within the buffer
                test_error(error, "clSetKernelArg failed");
                // BUGFIX: the expected value must be passed as a cl_uchar.
                // The original passed &i (an int) with sizeof(cl_uchar),
                // which reads the wrong byte on big-endian hosts.
                cl_uchar expected = (cl_uchar)i;
                error = clSetKernelArg(kernel_verify_char, 2, sizeof(cl_uchar), (void *) &expected ); // pass the expected value at the above location.
                test_error(error, "clSetKernelArg failed");
                error = clEnqueueNDRangeKernel(cmdq, kernel_verify_char, 1, NULL, &bufSize, NULL, 0, NULL, NULL);
                test_error(error,"clEnqueueNDRangeKernel failed");
                pNumCorrect = (cl_int*) clEnqueueMapBuffer(cmdq, num_correct, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
                test_error2(error, pNumCorrect, "clEnqueueMapBuffer failed");
                cl_int correct_count = *pNumCorrect;
                error = clEnqueueUnmapMemObject(cmdq, num_correct, pNumCorrect, 0,NULL,NULL);
                test_error(error, "clEnqueueUnmapMemObject failed.");
                if(correct_count != 1)
                {
                    log_error("Passing pointer directly to kernel for byte #%d failed on device %d\n", i, ii);
                    // BUGFIX: release the SVM allocations before bailing
                    // out; the original leaked them on this failure path.
                    clSVMFree(context, pbuf);
                    clSVMFree(context, pNumCorrect);
                    return -1;
                }
            }
        }
        error = clFinish(cmdq);
        test_error(error, "clFinish failed");
    }
    clSVMFree(context, pbuf);
    clSVMFree(context, pNumCorrect);
    return 0;
}

View File

@@ -0,0 +1,153 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// Host-side mirror of the kernel's BufPtrs struct: three SVM pointers stored
// inside another SVM allocation, used to exercise
// CL_KERNEL_EXEC_INFO_SVM_PTRS for indirectly referenced buffers.
typedef struct {
cl_int *pA;
cl_int *pB;
cl_int *pC;
} BufPtrs;
// Device kernel: dereferences the three buffers reachable only through the
// pBufs struct (not passed as direct kernel arguments), so the runtime must
// learn about them via clSetKernelExecInfo(CL_KERNEL_EXEC_INFO_SVM_PTRS).
// Each work item increments its element in all three buffers.
const char *set_kernel_exec_info_svm_ptrs_kernel[] = {
"struct BufPtrs;\n"
"\n"
"typedef struct {\n"
" __global int *pA;\n"
" __global int *pB;\n"
" __global int *pC;\n"
"} BufPtrs;\n"
"\n"
"__kernel void set_kernel_exec_info_test(__global BufPtrs* pBufs)\n"
"{\n"
" size_t i;\n"
" i = get_global_id(0);\n"
" pBufs->pA[i]++;\n"
" pBufs->pB[i]++;\n"
" pBufs->pC[i]++;\n"
"}\n"
};
// Test that clSetKernelExecInfo works correctly with CL_KERNEL_EXEC_INFO_SVM_PTRS flag.
//
// Three SVM arrays (pA, pB, pC) are reachable by the kernel only through
// pointers stored in an SVM-resident BufPtrs struct, which is the sole kernel
// argument. clSetKernelExecInfo(CL_KERNEL_EXEC_INFO_SVM_PTRS) is the only way
// the runtime learns about pA/pB/pC, so the test passes only if that mechanism
// works: after the kernel runs, every element of each array must be 1.
// Returns 0 on pass (or when no device supports coarse-grain buffer SVM),
// -1 on failure.
int test_set_kernel_exec_info_svm_ptrs(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
clContextWrapper c = NULL;
clProgramWrapper program = NULL;
cl_uint num_devices = 0;
cl_int error = CL_SUCCESS;
clCommandQueueWrapper queues[MAXQ];
//error = create_cl_objects(deviceID, &set_kernel_exec_info_svm_ptrs_kernel[0], &context, &program, &q, &num_devices, CL_DEVICE_SVM_FINE_GRAIN);
error = create_cl_objects(deviceID, &set_kernel_exec_info_svm_ptrs_kernel[0], &c, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
if(error == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
if(error < 0) return -1; // fail test.
clKernelWrapper k = clCreateKernel(program, "set_kernel_exec_info_test", &error);
test_error(error, "clCreateKernel failed");
size_t size = num_elements*sizeof(int);
// NOTE(review): the clSVMAlloc results below are not checked for NULL before
// use, and they leak if any of the test_error macros returns early — confirm
// whether the harness tolerates this for a failing run.
//int* pA = (int*) clSVMalloc(c, CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM, sizeof(int)*num_elements, 0);
//int* pB = (int*) clSVMalloc(c, CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM, sizeof(int)*num_elements, 0);
//int* pC = (int*) clSVMalloc(c, CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM, sizeof(int)*num_elements, 0);
int* pA = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
int* pB = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
int* pC = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
BufPtrs* pBuf = (BufPtrs*) clSVMAlloc(c, CL_MEM_READ_WRITE, sizeof(BufPtrs), 0);
bool failed = false;
// Inner scope so the clMemWrapper buffers are released before clSVMFree below.
{
clMemWrapper ba,bb,bc,bBuf;
// Wrap each SVM allocation in a buffer via CL_MEM_USE_HOST_PTR so the
// coarse-grain map/unmap protocol can be used for host access.
ba = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pA, &error);
test_error(error, "clCreateBuffer failed");
bb = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pB, &error);
test_error(error, "clCreateBuffer failed");
bc = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pC, &error);
test_error(error, "clCreateBuffer failed");
bBuf = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, sizeof(BufPtrs), pBuf, &error);
test_error(error, "clCreateBuffer failed");
// The pointers returned by clEnqueueMapBuffer are deliberately discarded and
// the original SVM pointers used instead — presumably relying on the
// CL_MEM_USE_HOST_PTR guarantee that the map returns the host pointer;
// NOTE(review): verify against the spec.
clEnqueueMapBuffer(queues[0], ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bBuf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(BufPtrs), 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
// Zero the data arrays and stash the three SVM pointers inside the struct.
for(int i = 0; i < num_elements; i++) pA[i] = pB[i] = pC[i] = 0;
pBuf->pA = pA;
pBuf->pB = pB;
pBuf->pC = pC;
error = clEnqueueUnmapMemObject(queues[0], ba, pA, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bb, pB, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bc, pC, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bBuf, pBuf, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
// pBuf is the ONLY kernel argument; pA/pB/pC are made known to the runtime
// solely through the clSetKernelExecInfo call below.
error = clSetKernelArgSVMPointer(k, 0, pBuf);
test_error(error, "clSetKernelArg failed");
error = clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(BufPtrs), pBuf);
test_error(error, "clSetKernelExecInfo failed");
size_t range = num_elements;
error = clEnqueueNDRangeKernel(queues[0], k, 1, NULL, &range, NULL, 0, NULL, NULL);
test_error(error,"clEnqueueNDRangeKernel failed");
error = clFinish(queues[0]);
test_error(error, "clFinish failed.");
// Map the three arrays again to check the kernel's increments on the host
// (the struct buffer is not remapped because it is only read by the device).
clEnqueueMapBuffer(queues[0], ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
// Each array element started at 0 and is incremented once per work-item,
// so the per-index sum across the three arrays must be exactly 3.
for(int i = 0; i < num_elements; i++)
{
if(pA[i] + pB[i] + pC[i] != 3)
failed = true;
}
error = clEnqueueUnmapMemObject(queues[0], ba, pA, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bb, pB, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bc, pC, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
}
error = clFinish(queues[0]);
test_error(error, " clFinish failed.");
clSVMFree(c, pA);
clSVMFree(c, pB);
clSVMFree(c, pC);
clSVMFree(c, pBuf);
if(failed) return -1;
return 0;
}

View File

@@ -0,0 +1,282 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// Builds the linked lists on the host CPU. The node storage is made
// host-accessible either by mapping the cl_mem buffer (legacy path,
// useNewAPI == CL_FALSE) or by mapping the SVM allocation directly with the
// OpenCL 2.0 SVM map API. Returns CL_SUCCESS or an OpenCL error code.
cl_int create_linked_lists_on_host(cl_command_queue cmdq, cl_mem nodes, Node *pNodes2, cl_int ListLength, size_t numLists, cl_bool useNewAPI )
{
    cl_int error = CL_SUCCESS;
    log_info("SVM: creating linked list on host ");
    const size_t regionBytes = sizeof(Node)*ListLength*numLists;
    Node *pNodes;
    if (useNewAPI != CL_FALSE)
    {
        // SVM path: make the allocation coherent for host access.
        pNodes = pNodes2;
        error = clEnqueueSVMMap(cmdq, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, pNodes2, regionBytes, 0, NULL,NULL);
        test_error2(error, pNodes, "clEnqueueSVMMap failed");
    }
    else
    {
        // Legacy path: map the buffer object wrapping the same memory.
        pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, regionBytes, 0, NULL,NULL, &error);
        test_error2(error, pNodes, "clEnqMapBuffer failed");
    }
    create_linked_lists(pNodes, numLists, ListLength);
    // Release host access through the same API family used for mapping.
    if (useNewAPI != CL_FALSE)
    {
        error = clEnqueueSVMUnmap(cmdq, pNodes2, 0, NULL, NULL);
        test_error(error, "clEnqueueSVMUnmap failed.");
    }
    else
    {
        error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
        test_error(error, "clEnqueueUnmapMemObject failed.");
    }
    error = clFinish(cmdq);
    test_error(error, "clFinish failed.");
    return error;
}
// Purpose: uses host code to verify correctness of the linked list.
// ci is the creation index, kept for signature symmetry with the device
// verifiers (unused here).
// Fixes vs. original: the dead local 'correct_count' is removed, and the
// mapped region is now unmapped even when verification fails, so a content
// mismatch no longer also leaks a mapping.
cl_int verify_linked_lists_on_host(int ci, cl_command_queue cmdq, cl_mem nodes, Node *pNodes2, cl_int ListLength, size_t numLists, cl_bool useNewAPI )
{
    cl_int error = CL_SUCCESS;
    Node *pNodes;
    if (useNewAPI == CL_FALSE)
    {
        pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
        test_error2(error, pNodes, "clEnqueueMapBuffer failed");
    }
    else
    {
        pNodes = pNodes2;
        error = clEnqueueSVMMap(cmdq, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, pNodes2, sizeof(Node)*ListLength * numLists, 0, NULL,NULL);
        test_error2(error, pNodes, "clEnqueueSVMMap failed");
    }
    // Remember the verification outcome; unmap BEFORE acting on it so the
    // mapping is released on both the pass and the fail paths.
    cl_int verify_result = verify_linked_lists(pNodes, numLists, ListLength);
    if (useNewAPI == CL_FALSE)
    {
        error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
        test_error(error, "clEnqueueUnmapMemObject failed.");
    }
    else
    {
        error = clEnqueueSVMUnmap(cmdq, pNodes2, 0,NULL,NULL);
        test_error(error, "clEnqueueSVMUnmap failed.");
    }
    error = clFinish(cmdq);
    test_error(error, "clFinish failed.");
    if (verify_result) return -1;
    return error;
}
// Resets the allocation index, then launches the list-creation kernel on
// device ci.
// Fix: the allocator buffer is sizeof(cl_int) bytes and the kernel treats it
// as an int, but the original code mapped sizeof(cl_int) bytes and then wrote
// through a size_t* — storing sizeof(size_t) bytes, i.e. past the end of the
// mapped region on 64-bit hosts. The index is now accessed as cl_int.
cl_int create_linked_lists_on_device(int ci, cl_command_queue cmdq, cl_mem allocator, cl_kernel kernel_create_lists, size_t numLists )
{
    cl_int error = CL_SUCCESS;
    log_info("SVM: creating linked list on device: %d ", ci);
    cl_int *pAllocator = (cl_int*) clEnqueueMapBuffer(cmdq, allocator, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
    test_error2(error, pAllocator, "clEnqueueMapBuffer failed");
    // reset allocator index: the first numLists elements of the nodes array
    // are already allocated (they hold the head of each list).
    *pAllocator = (cl_int) numLists;
    error = clEnqueueUnmapMemObject(cmdq, allocator, pAllocator, 0,NULL,NULL);
    test_error(error, " clEnqueueUnmapMemObject failed.");
    error = clEnqueueNDRangeKernel(cmdq, kernel_create_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
    test_error(error, "clEnqueueNDRange failed.");
    error = clFinish(cmdq);
    test_error(error, "clFinish failed.");
    return error;
}
// Runs the verification kernel on device vi and checks on the host that
// every node of every list was counted as correct.
// Fix: the result of clFinish was previously discarded ('clFinish(cmdq);'),
// so the following test_error examined a stale 'error' value; the return
// code is now captured.
cl_int verify_linked_lists_on_device(int vi, cl_command_queue cmdq,cl_mem num_correct, cl_kernel kernel_verify_lists, cl_int ListLength, size_t numLists )
{
    cl_int error = CL_SUCCESS;
    log_info(" and verifying on device: %d ", vi);
    // Reset the device-side counter before launching the kernel.
    cl_int *pNumCorrect = (cl_int*) clEnqueueMapBuffer(cmdq, num_correct, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
    test_error2(error, pNumCorrect, "clEnqueueMapBuffer failed");
    *pNumCorrect = 0; // reset numCorrect to zero
    error = clEnqueueUnmapMemObject(cmdq, num_correct, pNumCorrect, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed.");
    error = clEnqueueNDRangeKernel(cmdq, kernel_verify_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
    test_error(error,"clEnqueueNDRangeKernel failed");
    // Read back the count of correctly linked nodes.
    pNumCorrect = (cl_int*) clEnqueueMapBuffer(cmdq, num_correct, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL,NULL, &error);
    test_error2(error, pNumCorrect, "clEnqueueMapBuffer failed");
    cl_int correct_count = *pNumCorrect;
    error = clEnqueueUnmapMemObject(cmdq, num_correct, pNumCorrect, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");
    error = clFinish(cmdq); // was: clFinish(cmdq); with the result discarded
    test_error(error,"clFinish failed");
    // Each of numLists work-items should have counted ListLength correct nodes.
    if(correct_count != ListLength * (cl_uint)numLists)
    {
        error = -1;
        log_info("Failed\n");
    }
    else
        log_info("Passed\n");
    return error;
}
// This tests that all devices and the host share a common address space; using only the coarse-grain features.
// This is done by creating a linked list on a device and then verifying the correctness of the list
// on another device or the host. This basic test is performed for all combinations of devices and the host that exist within
// the platform. The test passes only if every combination passes.
// Returns 0 on pass, -1 on any failure.
int shared_address_space_coarse_grain(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements, cl_bool useNewAPI)
{
clContextWrapper context = NULL;
clProgramWrapper program = NULL;
cl_uint num_devices = 0;
cl_int error = CL_SUCCESS;
clCommandQueueWrapper queues[MAXQ];
// NOTE(review): unlike the fine-grain tests, ANY nonzero result here —
// including error==1 ("requested SVM level unsupported") — fails the test,
// presumably because coarse-grain buffer SVM is mandatory for OpenCL 2.0
// devices; confirm against the spec.
error = create_cl_objects(deviceID, &linked_list_create_and_verify_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
if(error) return -1;
size_t numLists = num_elements;
cl_int ListLength = 32;
clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
test_error(error, "clCreateKernel failed");
clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
test_error(error, "clCreateKernel failed");
// this buffer holds the linked list nodes.
// NOTE(review): the clSVMAlloc result is not NULL-checked, and pNodes leaks
// on every early return below — confirm whether the harness tolerates this.
Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(Node)*ListLength*numLists, 0);
// Inner scope so the clMemWrapper objects are released before clSVMFree.
{
cl_bool usesSVMpointer = CL_FALSE;
clMemWrapper nodes;
if (useNewAPI == CL_FALSE)
{
// Legacy path: wrap the SVM allocation in a buffer object.
nodes = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Node)*ListLength*numLists, pNodes, &error);
test_error(error, "clCreateBuffer failed.");
// verify if buffer uses SVM pointer
size_t paramSize = 0;
error = clGetMemObjectInfo(nodes, CL_MEM_USES_SVM_POINTER, 0, 0, &paramSize);
test_error(error, "clGetMemObjectInfo failed.");
if (paramSize != sizeof(cl_bool))
{
log_error("clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) returned wrong size.");
return -1;
}
error = clGetMemObjectInfo(nodes, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), &usesSVMpointer, 0);
test_error(error, "clGetMemObjectInfo failed.");
if (usesSVMpointer != CL_TRUE)
{
log_error("clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) returned CL_FALSE for buffer created from SVM pointer.");
return -1;
}
}
// this buffer holds an index into the nodes buffer, it is used for node allocation
clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
test_error(error, "clCreateBuffer failed.");
// Counter-check: a plain (non-SVM-backed) buffer must report CL_FALSE.
error = clGetMemObjectInfo(allocator, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), &usesSVMpointer, 0);
test_error(error, "clGetMemObjectInfo failed.");
if (usesSVMpointer != CL_FALSE)
{
log_error("clGetMemObjectInfo(CL_MEM_USES_SVM_POINTER) returned CL_TRUE for non-SVM buffer.");
return -1;
}
// this buffer holds the count of correct nodes, which is computed by the verify kernel.
clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
test_error(error, "clCreateBuffer failed.");
// The nodes argument is passed either as a raw SVM pointer (new API) or as
// the wrapping buffer object (old API); the verify kernel always gets the
// SVM pointer.
if (useNewAPI == CL_TRUE)
error |= clSetKernelArgSVMPointer(kernel_create_lists, 0, pNodes);
else
error |= clSetKernelArg(kernel_create_lists, 0, sizeof(void*), (void *) &nodes);
error |= clSetKernelArg(kernel_create_lists, 1, sizeof(void*), (void *) &allocator);
error |= clSetKernelArg(kernel_create_lists, 2, sizeof(cl_int), (void *) &ListLength);
error |= clSetKernelArgSVMPointer(kernel_verify_lists, 0, pNodes);
error |= clSetKernelArg(kernel_verify_lists, 1, sizeof(void*), (void *) &num_correct);
error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(cl_int), (void *) &ListLength);
test_error(error, "clSetKernelArg failed");
// Create linked list on one device and verify on another device (or the host).
// Do this for all possible combinations of devices and host within the platform.
for (int ci=0; ci<(int)num_devices+1; ci++) // ci is CreationIndex, index of device/q to create linked list on
{
for (int vi=0; vi<(int)num_devices+1; vi++) // vi is VerificationIndex, index of device/q to verify linked list on
{
if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
{
error = create_linked_lists_on_host(queues[0], nodes, pNodes, ListLength, numLists, useNewAPI);
if(error) return -1;
}
else
{
error = create_linked_lists_on_device(ci, queues[ci], allocator, kernel_create_lists, numLists);
if(error) return -1;
}
if(vi == num_devices)
{
error = verify_linked_lists_on_host(vi, queues[0], nodes, pNodes, ListLength, numLists, useNewAPI);
if(error) return -1;
}
else
{
error = verify_linked_lists_on_device(vi, queues[vi], num_correct, kernel_verify_lists, ListLength, numLists);
if(error) return -1;
}
}
}
}
clSVMFree(context, pNodes);
return 0;
}
// Entry point: runs the coarse-grain shared-address-space test using the
// pre-2.0 buffer map/unmap API path.
int test_shared_address_space_coarse_grain_old_api(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
    const cl_bool useNewAPI = CL_FALSE;
    return shared_address_space_coarse_grain(deviceID, context2, queue, num_elements, useNewAPI);
}
// Entry point: runs the coarse-grain shared-address-space test using the
// OpenCL 2.0 SVM map/unmap API path.
int test_shared_address_space_coarse_grain_new_api(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
    const cl_bool useNewAPI = CL_TRUE;
    return shared_address_space_coarse_grain(deviceID, context2, queue, num_elements, useNewAPI);
}

View File

@@ -0,0 +1,101 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// This tests that all devices and the host share a common address space using fine-grain mode with no buffers.
// This is done by creating a linked list on a device and then verifying the correctness of the list
// on another device or the host. This basic test is performed for all combinations of devices and the host that exist within
// the platform. The test passes only if every combination passes.
// Returns 0 on pass (or when no device supports fine-grain-system SVM), -1 on failure.
int test_shared_address_space_fine_grain(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
    clContextWrapper context = NULL;
    clProgramWrapper program = NULL;
    cl_uint num_devices = 0;
    cl_int error = CL_SUCCESS;
    clCommandQueueWrapper queues[MAXQ];
    error = create_cl_objects(deviceID, &linked_list_create_and_verify_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
    if(error == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
    if(error < 0) return -1; // fail test.
    size_t numLists = num_elements;
    cl_int ListLength = 32;
    clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
    test_error(error, "clCreateKernel failed");
    clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
    test_error(error, "clCreateKernel failed");
    // this allocation holds the linked list nodes.
    // FIXME: remove the alignment once prototype can handle it
    Node* pNodes = (Node*) align_malloc(numLists*ListLength*sizeof(Node),128);
    test_error2(error, pNodes, "malloc failed");
    // This allocation holds the index used for node allocation. It is written
    // through a size_t* in create_linked_lists_on_device_no_map, so it must be
    // at least sizeof(size_t) bytes; the original allocated only sizeof(cl_int),
    // causing an out-of-bounds store on 64-bit hosts.
    size_t* pAllocator = (size_t*) align_malloc(sizeof(size_t), 128);
    test_error2(error, pAllocator, "malloc failed");
    // this allocation holds the count of correct nodes, which is computed by the verify kernel.
    cl_int* pNum_correct = (cl_int*) align_malloc(sizeof(cl_int), 128);
    test_error2(error, pNum_correct, "malloc failed");
    error |= clSetKernelArgSVMPointer(kernel_create_lists, 0, pNodes);
    error |= clSetKernelArgSVMPointer(kernel_create_lists, 1, pAllocator);
    error |= clSetKernelArg(kernel_create_lists, 2, sizeof(cl_int),(void *) &ListLength);
    error |= clSetKernelArgSVMPointer(kernel_verify_lists, 0, pNodes);
    error |= clSetKernelArgSVMPointer(kernel_verify_lists, 1, pNum_correct);
    error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(cl_int), (void *) &ListLength);
    test_error(error, "clSetKernelArg failed");
    // Create linked list on one device and verify on another device (or the host).
    // Do this for all possible combinations of devices and host within the platform.
    // NOTE(review): the early returns in these loops leak the three host
    // allocations; acceptable for a failing run, but worth confirming.
    for (int ci=0; ci<(int)num_devices+1; ci++) // ci is CreationIndex, index of device/q to create linked list on
    {
        for (int vi=0; vi<(int)num_devices+1; vi++) // vi is VerificationIndex, index of device/q to verify linked list on
        {
            if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
            {
                log_info("creating linked list on host ");
                create_linked_lists(pNodes, numLists, ListLength);
            }
            else
            {
                error = create_linked_lists_on_device_no_map(ci, queues[ci], pAllocator, kernel_create_lists, numLists);
                if(error) return -1;
            }
            if(vi == num_devices)
            {
                error = verify_linked_lists(pNodes, numLists, ListLength);
                if(error) return -1;
            }
            else
            {
                error = verify_linked_lists_on_device_no_map(vi, queues[vi], pNum_correct, kernel_verify_lists, ListLength, numLists);
                if(error) return -1;
            }
        }
    }
    align_free(pNodes);
    align_free(pAllocator);
    align_free(pNum_correct);
    return 0;
}

View File

@@ -0,0 +1,138 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// Launches the list-creation kernel on device ci after resetting the shared
// allocation index. No map/unmap calls are needed: pAllocator lives in
// fine-grain SVM and is directly host-accessible.
cl_int create_linked_lists_on_device_no_map(int ci, cl_command_queue cmdq, size_t* pAllocator, cl_kernel kernel_create_lists, size_t numLists )
{
    cl_int status = CL_SUCCESS;
    log_info("SVM: creating linked list on device: %d ", ci);
    // The first numLists nodes are pre-allocated as list heads, so the
    // allocator starts handing out indices from numLists onward.
    *pAllocator = numLists;
    status = clEnqueueNDRangeKernel(cmdq, kernel_create_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
    test_error(status, "clEnqueueNDRange failed.");
    status = clFinish(cmdq);
    test_error(status, "clFinish failed.");
    return status;
}
// Runs the verification kernel on device vi and checks the fine-grain-SVM
// counter on the host (no map/unmap needed for fine-grain allocations).
// Fix: the result of clFinish was previously discarded ('clFinish(cmdq);'),
// so the following test_error checked a stale 'error' value; the return
// code is now captured.
cl_int verify_linked_lists_on_device_no_map(int vi, cl_command_queue cmdq,cl_int* pNumCorrect, cl_kernel kernel_verify_lists, cl_int ListLength, size_t numLists )
{
    cl_int error = CL_SUCCESS;
    log_info(" and verifying on device: %d ", vi);
    *pNumCorrect = 0; // reset numCorrect to zero
    error = clEnqueueNDRangeKernel(cmdq, kernel_verify_lists, 1, NULL, &numLists, NULL, 0, NULL, NULL);
    test_error(error,"clEnqueueNDRangeKernel failed");
    error = clFinish(cmdq); // was: clFinish(cmdq); with the result discarded
    test_error(error,"clFinish failed");
    // Each of numLists work-items should have counted ListLength correct nodes.
    cl_int correct_count = *pNumCorrect;
    if(correct_count != ListLength * (cl_uint)numLists)
    {
        error = -1;
        log_info("Failed\n");
    }
    else
        log_info("Passed\n");
    return error;
}
// This tests that all devices and the host share a common address space; using only the fine-grain with buffers mode.
// This is done by creating a linked list on a device and then verifying the correctness of the list
// on another device or the host. This basic test is performed for all combinations of devices and the host that exist within
// the platform. The test passes only if every combination passes.
// Returns 0 on pass (or when no device supports fine-grain-buffer SVM), -1 on failure.
int test_shared_address_space_fine_grain_buffers(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
    clContextWrapper context = NULL;
    clProgramWrapper program = NULL;
    cl_uint num_devices = 0;
    cl_int error = CL_SUCCESS;
    clCommandQueueWrapper queues[MAXQ];
    error = create_cl_objects(deviceID, &linked_list_create_and_verify_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER);
    if(error == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
    if(error < 0) return -1; // fail test.
    size_t numLists = num_elements;
    cl_int ListLength = 32;
    clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
    test_error(error, "clCreateKernel failed");
    clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
    test_error(error, "clCreateKernel failed");
    // this buffer holds the linked list nodes.
    Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(Node)*ListLength*numLists, 0);
    // this buffer holds an index into the nodes buffer, it is used for node allocation
    size_t *pAllocator = (size_t*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(size_t), 0);
    // this buffer holds the count of correct nodes, which is computed by the verify kernel.
    cl_int *pNumCorrect = (cl_int*) clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_int), 0);
    // Fix: clSVMAlloc returns NULL on failure; the original used these
    // pointers unchecked, which would crash instead of reporting a failure.
    if(pNodes == NULL || pAllocator == NULL || pNumCorrect == NULL)
    {
        log_error("clSVMAlloc failed.\n");
        clSVMFree(context, pNodes);      // clSVMFree ignores NULL pointers
        clSVMFree(context, pAllocator);
        clSVMFree(context, pNumCorrect);
        return -1;
    }
    error |= clSetKernelArgSVMPointer(kernel_create_lists, 0, pNodes);
    error |= clSetKernelArgSVMPointer(kernel_create_lists, 1, pAllocator);
    error |= clSetKernelArg(kernel_create_lists, 2, sizeof(cl_int), (void *) &ListLength);
    error |= clSetKernelArgSVMPointer(kernel_verify_lists, 0, pNodes);
    error |= clSetKernelArgSVMPointer(kernel_verify_lists, 1, pNumCorrect);
    error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(cl_int), (void *) &ListLength);
    test_error(error, "clSetKernelArg failed");
    // Create linked list on one device and verify on another device (or the host).
    // Do this for all possible combinations of devices and host within the platform.
    // NOTE(review): the early returns in these loops leak the SVM allocations;
    // acceptable for a failing run, but worth confirming.
    for (int ci=0; ci<(int)num_devices+1; ci++) // ci is CreationIndex, index of device/q to create linked list on
    {
        for (int vi=0; vi<(int)num_devices+1; vi++) // vi is VerificationIndex, index of device/q to verify linked list on
        {
            if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
            {
                log_info("SVM: creating linked list on host ");
                create_linked_lists(pNodes, numLists, ListLength);
            }
            else
            {
                error = create_linked_lists_on_device_no_map(ci, queues[ci], pAllocator, kernel_create_lists, numLists);
                if(error) return -1;
            }
            if(vi == num_devices)
            {
                error = verify_linked_lists(pNodes, numLists, ListLength);
                if(error) return -1;
            }
            else
            {
                error = verify_linked_lists_on_device_no_map(vi, queues[vi], pNumCorrect, kernel_verify_lists, ListLength, numLists);
                if(error) return -1;
            }
        }
    }
    clSVMFree(context, pNodes);
    clSVMFree(context, pAllocator);
    clSVMFree(context, pNumCorrect);
    return 0;
}

View File

@@ -0,0 +1,241 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "common.h"
// OpenCL C source for the shared sub-buffers test: linked lists whose nodes
// are allocated alternately from two different buffers, passed to the kernels
// as four sub-buffers (two per parent buffer). Cross-(sub-)buffer pointers
// must remain valid because all allocations share the SVM address space.
const char *shared_sub_buffers_test_kernel[] = {
"typedef struct Node {\n"
" int global_id;\n"
" int position_in_list;\n"
" __global struct Node* pNext;\n"
"} Node;\n"
// create linked lists that use nodes from 2 different buffers
"__global Node* allocate_node(__global Node* pNodes1, __global Node* pNodes2, volatile __global int* allocation_index, size_t i)\n"
"{\n"
// mix things up, adjacent work items will allocate from different buffers
" if(i & 0x1)\n"
" return &pNodes1[atomic_inc(allocation_index)];\n"
" else\n"
" return &pNodes2[atomic_inc(allocation_index)];\n"
"}\n"
// The allocation_index parameter must be initialized on the host to N work-items
// The first N nodes in pNodes will be the heads of the lists.
// This tests passing 4 different sub-buffers that come from two parent buffers.
// Note that we have arguments that appear to be unused, but they are required so that system knows to get all the sub-buffers on to the device
"__kernel void create_linked_lists(__global Node* pNodes_sub1, __global Node* pNodes2_sub1, __global Node* pNodes_sub2, __global Node* pNodes2_sub2, volatile __global int* allocation_index, int list_length) \n"
"{\n"
" size_t i = get_global_id(0);\n"
// each work-item owns list head i, then appends list_length-1 nodes drawn
// alternately from the two buffers via allocate_node.
" __global Node *pNode = &pNodes_sub1[i];\n"
" pNode->global_id = i;\n"
" pNode->position_in_list = 0;\n"
" __global Node *pNew;\n"
" for(int j=1; j < list_length; j++) {\n"
" pNew = allocate_node(pNodes_sub1, pNodes2_sub1, allocation_index, i);\n"
" pNew->global_id = i;\n"
" pNew->position_in_list = j;\n"
" pNode->pNext = pNew; // link new node onto end of list\n"
" pNode = pNew; // move to end of list\n"
" }\n"
"}\n"
// Walks list i and atomically counts nodes whose (global_id, position) match;
// the host compares the total against list_length * num_lists.
// Note that we have arguments that appear to be unused, but they are required so that system knows to get all the sub-buffers on to the device
"__kernel void verify_linked_lists(__global Node* pNodes_sub1, __global Node* pNodes2_sub1, __global Node* pNodes_sub2, __global Node* pNodes2_sub2, volatile __global uint* num_correct, int list_length)\n"
"{\n"
" size_t i = get_global_id(0);\n"
" __global Node *pNode = &pNodes_sub1[i];\n"
" for(int j=0; j < list_length; j++) {\n"
" if( pNode->global_id == i && pNode->position_in_list == j)\n"
" atomic_inc(num_correct);\n"
" else \n"
" break;\n"
" pNode = pNode->pNext;\n"
" }\n"
"}\n"
};
// Creates the linked lists with host code. Both node buffers are mapped even
// though create_linked_lists only writes through the first — the second is
// presumably mapped to keep its contents host-coherent before the kernels
// run; TODO confirm intent.
cl_int create_linked_lists_on_host_sb(cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
{
    cl_int error = CL_SUCCESS;
    log_info("SVM: creating linked list on host ");
    const size_t regionBytes = sizeof(Node)*ListLength*numLists;
    Node *pFirst = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, regionBytes, 0, NULL,NULL, &error);
    test_error2(error, pFirst, "clEnqueueMapBuffer failed");
    Node *pSecond = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, regionBytes, 0, NULL,NULL, &error);
    test_error2(error, pSecond, "clEnqueueMapBuffer failed");
    create_linked_lists(pFirst, numLists, ListLength);
    error = clEnqueueUnmapMemObject(cmdq, nodes, pFirst, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");
    error = clEnqueueUnmapMemObject(cmdq, nodes2, pSecond, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");
    error = clFinish(cmdq);
    test_error(error, "clFinish failed");
    return error;
}
// Verify correctness of the linked list using host code.
// ci is the creation index, kept for signature symmetry (unused).
// Fixes vs. original: the second test_error2 incorrectly re-checked pNodes
// instead of pNodes2, so a failed map of nodes2 went undetected; and the
// mappings are now released even when verification fails instead of leaking
// via the early return.
cl_int verify_linked_lists_on_host_sb(int ci, cl_command_queue cmdq, cl_mem nodes, cl_mem nodes2, cl_int ListLength, size_t numLists )
{
    cl_int error = CL_SUCCESS;
    Node *pNodes = (Node*) clEnqueueMapBuffer(cmdq, nodes, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
    test_error2(error, pNodes, "clEnqueueMapBuffer failed");
    Node *pNodes2 = (Node*) clEnqueueMapBuffer(cmdq, nodes2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(Node)*ListLength * numLists, 0, NULL,NULL, &error);
    test_error2(error, pNodes2, "clEnqueueMapBuffer failed"); // was checking pNodes
    // Remember the verification outcome; unmap BEFORE acting on it so the
    // mappings are released on both the pass and the fail paths.
    cl_int verify_result = verify_linked_lists(pNodes, numLists, ListLength);
    error = clEnqueueUnmapMemObject(cmdq, nodes, pNodes, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");
    error = clEnqueueUnmapMemObject(cmdq, nodes2, pNodes2, 0,NULL,NULL);
    test_error(error, "clEnqueueUnmapMemObject failed");
    error = clFinish(cmdq);
    test_error(error, "clFinish failed");
    if(verify_result) return -1;
    return error;
}
// This tests that shared sub-buffers can be created and that they inherit the flags from the parent buffer when no flags are specified.
// This tests that passing only the sub-buffers to a kernel works.
// The test is derived from the cross-buffer pointers test which
// tests that shared buffers are able to contain pointers that point to other shared buffers.
// This tests that all devices and the host share a common address space; using only the coarse-grain features.
// This is done by creating a linked list on a device and then verifying the correctness of the list
// on another device or the host.
// The linked list nodes are allocated from two different buffers this is done to ensure that cross buffer pointers work correctly.
// This basic test is performed for all combinations of devices and the host.
int test_shared_sub_buffers(cl_device_id deviceID, cl_context context2, cl_command_queue queue, int num_elements)
{
clContextWrapper context = NULL;
clProgramWrapper program = NULL;
cl_uint num_devices = 0;
cl_int error = CL_SUCCESS;
clCommandQueueWrapper queues[MAXQ];
error = create_cl_objects(deviceID, &shared_sub_buffers_test_kernel[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
if(error) return -1;
size_t numLists = num_elements;
if(numLists & 0x1) numLists++; // force even size, so we can easily create two sub-buffers of same size.
    // Tail of the shared-sub-buffers SVM test: builds linked lists that span
    // two SVM allocations (each wrapped in a cl_mem and split into two
    // sub-buffers), then creates/verifies the lists on every combination of
    // device and host within the platform.
    cl_int ListLength = 32;
    clKernelWrapper kernel_create_lists = clCreateKernel(program, "create_linked_lists", &error);
    test_error(error, "clCreateKernel failed");
    clKernelWrapper kernel_verify_lists = clCreateKernel(program, "verify_linked_lists", &error);
    test_error(error, "clCreateKernel failed");
    size_t nodes_bufsize = sizeof(Node)*ListLength*numLists;
    // Coarse-grained SVM backing store for the node pools.
    // NOTE(review): clSVMAlloc can return NULL on failure; pNodes/pNodes2 are
    // used below without a NULL check.
    Node* pNodes = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, nodes_bufsize, 0);
    Node* pNodes2 = (Node*) clSVMAlloc(context, CL_MEM_READ_WRITE, nodes_bufsize, 0);
    // Inner scope so the clMemWrapper destructors release all cl_mem objects
    // before the SVM pointers are freed below.
    // NOTE(review): the early "return -1" paths inside this scope skip the
    // clSVMFree calls at the end of the function and leak both allocations.
    {
        // this buffer holds some of the linked list nodes.
        // CL_MEM_USE_HOST_PTR wraps the SVM allocation so it can also be
        // accessed as a regular buffer / sub-buffers.
        clMemWrapper nodes = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, nodes_bufsize, pNodes, &error);
        test_error(error, "clCreateBuffer failed.");
        // Split each node buffer into two equal halves via sub-buffers.
        cl_buffer_region r;
        r.origin = 0;
        r.size = nodes_bufsize / 2;
        // this should inherit the flag settings from nodes buffer
        // (flags argument is 0, so read/write and host-ptr usage come from the
        // parent buffer).
        clMemWrapper nodes_sb1 = clCreateSubBuffer(nodes, 0, CL_BUFFER_CREATE_TYPE_REGION, (void*)&r, &error);
        test_error(error, "clCreateSubBuffer");
        r.origin = nodes_bufsize / 2;
        clMemWrapper nodes_sb2 = clCreateSubBuffer(nodes, 0, CL_BUFFER_CREATE_TYPE_REGION, (void*)&r, &error);
        test_error(error, "clCreateSubBuffer");
        // this buffer holds some of the linked list nodes.
        // NOTE(review): size expression is equivalent to nodes_bufsize; written
        // out long-hand here, inconsistently with the first buffer.
        clMemWrapper nodes2 = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Node)*ListLength*numLists, pNodes2, &error);
        test_error(error, "clCreateBuffer failed.");
        r.origin = 0;
        r.size = nodes_bufsize / 2;
        // this should inherit the flag settings from nodes buffer
        clMemWrapper nodes2_sb1 = clCreateSubBuffer(nodes2, 0, CL_BUFFER_CREATE_TYPE_REGION, (void*)&r, &error);
        test_error(error, "clCreateSubBuffer");
        r.origin = nodes_bufsize / 2;
        clMemWrapper nodes2_sb2 = clCreateSubBuffer(nodes2, 0, CL_BUFFER_CREATE_TYPE_REGION,(void*)&r, &error);
        test_error(error, "clCreateSubBuffer");
        // this buffer holds the index into the nodes buffer that is used for node allocation
        clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
        test_error(error, "clCreateBuffer failed.");
        // this buffer holds the count of correct nodes which is computed by the verify kernel.
        clMemWrapper num_correct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error);
        test_error(error, "clCreateBuffer failed.");
        // Both kernels take the four sub-buffer halves in the same order;
        // errors are OR-accumulated and checked once after all args are set.
        // (error is CL_SUCCESS here, guaranteed by the preceding test_error.)
        error |= clSetKernelArg(kernel_create_lists, 0, sizeof(void*), (void *) &nodes_sb1);
        error |= clSetKernelArg(kernel_create_lists, 1, sizeof(void*), (void *) &nodes2_sb1);
        error |= clSetKernelArg(kernel_create_lists, 2, sizeof(void*), (void *) &nodes_sb2);
        error |= clSetKernelArg(kernel_create_lists, 3, sizeof(void*), (void *) &nodes2_sb2);
        error |= clSetKernelArg(kernel_create_lists, 4, sizeof(void*), (void *) &allocator);
        error |= clSetKernelArg(kernel_create_lists, 5, sizeof(cl_int),(void *) &ListLength);
        error |= clSetKernelArg(kernel_verify_lists, 0, sizeof(void*), (void *) &nodes_sb1);
        error |= clSetKernelArg(kernel_verify_lists, 1, sizeof(void*), (void *) &nodes2_sb1);
        error |= clSetKernelArg(kernel_verify_lists, 2, sizeof(void*), (void *) &nodes_sb2);
        error |= clSetKernelArg(kernel_verify_lists, 3, sizeof(void*), (void *) &nodes2_sb2);
        error |= clSetKernelArg(kernel_verify_lists, 4, sizeof(void*), (void *) &num_correct);
        error |= clSetKernelArg(kernel_verify_lists, 5, sizeof(cl_int),(void *) &ListLength);
        test_error(error, "clSetKernelArg failed");
        // Create linked list on one device and verify on another device (or the host).
        // Do this for all possible combinations of devices and host within the platform.
        for (int ci=0; ci<(int)num_devices+1; ci++) // ci is CreationIndex, index of device/q to create linked list on
        {
            for (int vi=0; vi<(int)num_devices+1; vi++) // vi is VerificationIndex, index of device/q to verify linked list on
            {
                if(ci == num_devices) // last device index represents the host, note the num_device+1 above.
                {
                    // Host-side creation still needs a queue (queues[0]) to
                    // map/unmap the coarse-grained buffers.
                    error = create_linked_lists_on_host_sb(queues[0], nodes, nodes2, ListLength, numLists);
                    if(error) return -1;
                }
                else
                {
                    error = create_linked_lists_on_device(ci, queues[ci], allocator, kernel_create_lists, numLists);
                    if(error) return -1;
                }
                if(vi == num_devices)
                {
                    // NOTE(review): vi is passed as the first argument here;
                    // presumably an id used only for logging — confirm against
                    // verify_linked_lists_on_host_sb's signature.
                    error = verify_linked_lists_on_host_sb(vi, queues[0], nodes, nodes2, ListLength, numLists);
                    if(error) return -1;
                }
                else
                {
                    error = verify_linked_lists_on_device(vi, queues[vi], num_correct, kernel_verify_lists, ListLength, numLists);
                    if(error) return -1;
                }
            } // inner loop, vi
        } // outer loop, ci
    }
    // All cl_mem wrappers are destroyed by now; safe to release the SVM memory.
    clSVMFree(context, pNodes2);
    clSVMFree(context, pNodes);
    return 0;
}