Initial open source release of OpenCL 2.0 CTS.

This commit is contained in:
Kedar Patil
2017-05-16 18:50:35 +05:30
parent 6911ba5116
commit 3a440d17c8
883 changed files with 318212 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
# CMake build rules for the OpenCL CTS "buffers" conformance test module.
set(MODULE_NAME BUFFERS)
# Module test sources plus the shared harness sources compiled directly into
# this target (the harness is not built as a separate library here).
set(${MODULE_NAME}_SOURCES
    main.c
    test_buffer_copy.c
    test_buffer_read.c
    test_buffer_write.c
    test_buffer_mem.c
    array_info.c
    test_buffer_map.c
    test_sub_buffers.cpp
    test_buffer_fill.c
    test_buffer_migrate.c
    test_image_migrate.c
    ../../test_common/harness/errorHelpers.c
    ../../test_common/harness/threadTesting.c
    ../../test_common/harness/testHarness.c
    ../../test_common/harness/kernelHelpers.c
    ../../test_common/harness/typeWrappers.cpp
    ../../test_common/harness/mt19937.c
    ../../test_common/harness/conversions.c
    ../../test_common/harness/msvc9.c
)
# Common target/flag setup shared by all conformance test modules.
include(../CMakeCommon.txt)

View File

@@ -0,0 +1,24 @@
# Boost.Build (Jamfile) rules for the buffers conformance test executable.
project
    # Compile the .c test sources as C++ (gcc: -xc++, msvc: /TP), matching
    # how the other build systems treat these files.
    : requirements
      <toolset>gcc:<cflags>-xc++
      <toolset>msvc:<cflags>"/TP"
    ;
# NOTE(review): unlike the CMake and Makefile builds, this source list omits
# test_sub_buffers.cpp, test_buffer_migrate.c, test_image_migrate.c and the
# harness sources -- confirm whether this Jamfile is stale.
exe test_buffers
    : array_info.c
      main.c
      test_buffer_copy.c
      test_buffer_map.c
      test_buffer_mem.c
      test_buffer_read.c
      test_buffer_write.c
      test_buffer_fill.c
    : <library>../..//glew
    ;
# Stage the built binary into the per-variant distribution tree.
install dist
    : test_buffers
    : <variant>debug:<location>$(DIST)/debug/tests/test_conformance/buffers
      <variant>release:<location>$(DIST)/release/tests/test_conformance/buffers
    ;

View File

@@ -0,0 +1,49 @@
# Hand-written Makefile for the Apple (Mac OS X) build of the buffers
# conformance test; links against the OpenCL framework directly.
ifdef BUILD_WITH_ATF
ATF = -framework ATF
USE_ATF = -DUSE_ATF
endif

# Test sources plus shared harness sources compiled into this binary.
SRCS = main.c test_buffer_copy.c test_buffer_read.c test_buffer_write.c \
	test_buffer_mem.c array_info.c test_buffer_map.c \
	test_sub_buffers.cpp test_buffer_fill.c \
	test_buffer_migrate.c test_image_migrate.c \
	../../test_common/harness/errorHelpers.c \
	../../test_common/harness/threadTesting.c \
	../../test_common/harness/testHarness.c \
	../../test_common/harness/kernelHelpers.c \
	../../test_common/harness/conversions.c \
	../../test_common/harness/mt19937.c \
	../../test_common/harness/typeWrappers.cpp

DEFINES =
SOURCES = $(abspath $(SRCS))
LIBPATH += -L/System/Library/Frameworks/OpenCL.framework/Libraries
LIBPATH += -L.
# NOTE(review): FRAMEWORK is assigned the source list -- looks like a
# copy/paste slip; confirm this variable is actually unused.
FRAMEWORK = $(SOURCES)
HEADERS =
TARGET = test_buffers
INCLUDE =
# -Wshorten-64-to-32 is a clang/Apple-specific warning flag.
COMPILERFLAGS = -c -Wall -g -O0 -Wshorten-64-to-32
# Everything (including .c files) is compiled with the C++ driver.
CC = c++
CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%)
CXXFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF} $(DEFINES:%=-D%)
LIBRARIES = -framework OpenCL -framework AppKit ${ATF}
# Map both .c and .cpp sources to their object files.
OBJECTS := ${SOURCES:.c=.o}
OBJECTS := ${OBJECTS:.cpp=.o}
TARGETOBJECT =

all: $(TARGET)

$(TARGET): $(OBJECTS)
	$(CC) $(RC_CFLAGS) $(OBJECTS) -o $@ $(LIBPATH) $(LIBRARIES)

clean:
	rm -f $(TARGET) $(OBJECTS)

# Catch-all: report unknown targets instead of silently doing nothing.
.DEFAULT:
	@echo The target \"$@\" does not exist in Makefile.

View File

@@ -0,0 +1,63 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "procs.h"
/*
 * Verify that clGetMemObjectInfo(CL_MEM_SIZE) reports exactly the size a
 * buffer was created with (32*32*32 cl_ints).  Returns 0 on success, -1 on
 * any failure.  deviceID/queue/num_elements are unused (harness signature).
 */
int testBufferSize( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    const size_t width = 32, height = 32, depth = 32;
    const size_t expectedSize = sizeof( cl_int ) * width * height * depth;
    size_t reportedSize = 0;
    cl_int err;

    cl_mem memobj = clCreateBuffer( context, (cl_mem_flags)(CL_MEM_READ_WRITE), expectedSize, NULL, &err );
    test_error(err, "clCreateBuffer failed.");

    err = clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof( size_t ), (void *)&reportedSize, NULL);
    if ( err ){
        log_error( "Error calling clGetMemObjectInfo(): %d\n", err );
        clReleaseMemObject(memobj);
        return -1;
    }

    if ( reportedSize != expectedSize ) {
        log_error( "Error in clGetMemObjectInfo() check of size\n" );
        clReleaseMemObject(memobj);
        return -1;
    }
    log_info( " CL_MEM_SIZE passed.\n" );

    // cleanup
    clReleaseMemObject(memobj);
    return err;
}   // end testBufferSize()

// FIXME: need to test other flags

View File

@@ -0,0 +1,246 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "procs.h"
#include "../../test_common/harness/testHarness.h"
// Table of test entry points handed to the conformance harness.
// ORDER MATTERS: entry i here must correspond to bufferfn_names[i] below
// (a ct_assert after the names table checks only that the lengths match).
// A 0 entry marks a test that is listed by name but not implemented.
basefn bufferfn_list[] = {
test_buffer_read_async_int,
test_buffer_read_async_uint,
test_buffer_read_async_long,
test_buffer_read_async_ulong,
test_buffer_read_async_short,
test_buffer_read_async_ushort,
test_buffer_read_async_char,
test_buffer_read_async_uchar,
test_buffer_read_async_float,
test_buffer_read_array_barrier_int,
test_buffer_read_array_barrier_uint,
test_buffer_read_array_barrier_long,
test_buffer_read_array_barrier_ulong,
test_buffer_read_array_barrier_short,
test_buffer_read_array_barrier_ushort,
test_buffer_read_array_barrier_char,
test_buffer_read_array_barrier_uchar,
test_buffer_read_array_barrier_float,
test_buffer_read_int,
test_buffer_read_uint,
test_buffer_read_long,
test_buffer_read_ulong,
test_buffer_read_short,
test_buffer_read_ushort,
test_buffer_read_float,
0, //test_buffer_read_half,
test_buffer_read_char,
test_buffer_read_uchar,
test_buffer_read_struct,
test_buffer_read_random_size,
test_buffer_map_read_int,
test_buffer_map_read_uint,
test_buffer_map_read_long,
test_buffer_map_read_ulong,
test_buffer_map_read_short,
test_buffer_map_read_ushort,
test_buffer_map_read_char,
test_buffer_map_read_uchar,
test_buffer_map_read_float,
test_buffer_map_read_struct,
test_buffer_map_write_int,
test_buffer_map_write_uint,
test_buffer_map_write_long,
test_buffer_map_write_ulong,
test_buffer_map_write_short,
test_buffer_map_write_ushort,
test_buffer_map_write_char,
test_buffer_map_write_uchar,
test_buffer_map_write_float,
test_buffer_map_write_struct,
test_buffer_write_int,
test_buffer_write_uint,
test_buffer_write_short,
test_buffer_write_ushort,
test_buffer_write_char,
test_buffer_write_uchar,
test_buffer_write_float,
0, //test_buffer_write_half,
test_buffer_write_long,
test_buffer_write_ulong,
test_buffer_write_struct,
test_buffer_write_async_int,
test_buffer_write_async_uint,
test_buffer_write_async_short,
test_buffer_write_async_ushort,
test_buffer_write_async_char,
test_buffer_write_async_uchar,
test_buffer_write_async_float,
test_buffer_write_async_long,
test_buffer_write_async_ulong,
test_buffer_copy,
test_buffer_partial_copy,
test_mem_read_write_flags,
test_mem_write_flags,
test_mem_read_flags,
test_mem_copy_host_flags,
0, //test_mem_alloc_ref_flags,
testBufferSize,
test_sub_buffers_read_write,
test_sub_buffers_read_write_dual_devices,
test_sub_buffers_overlapping,
test_buffer_fill_int,
test_buffer_fill_uint,
test_buffer_fill_short,
test_buffer_fill_ushort,
test_buffer_fill_char,
test_buffer_fill_uchar,
test_buffer_fill_long,
test_buffer_fill_ulong,
test_buffer_fill_float,
test_buffer_fill_struct,
test_buffer_migrate,
test_image_migrate,
};
// Printable test names, positionally parallel to bufferfn_list above.
const char *bufferfn_names[] = {
"buffer_read_async_int",
"buffer_read_async_uint",
"buffer_read_async_long",
"buffer_read_async_ulong",
"buffer_read_async_short",
"buffer_read_async_ushort",
"buffer_read_async_char",
"buffer_read_async_uchar",
"buffer_read_async_float",
"buffer_read_array_barrier_int",
"buffer_read_array_barrier_uint",
"buffer_read_array_barrier_long",
"buffer_read_array_barrier_ulong",
"buffer_read_array_barrier_short",
"buffer_read_array_barrier_ushort",
"buffer_read_array_barrier_char",
"buffer_read_array_barrier_uchar",
"buffer_read_array_barrier_float",
"buffer_read_int",
"buffer_read_uint",
"buffer_read_long",
"buffer_read_ulong",
"buffer_read_short",
"buffer_read_ushort",
"buffer_read_float",
"buffer_read_half",
"buffer_read_char",
"buffer_read_uchar",
"buffer_read_struct",
"buffer_read_random_size",
"buffer_map_read_int",
"buffer_map_read_uint",
"buffer_map_read_long",
"buffer_map_read_ulong",
"buffer_map_read_short",
"buffer_map_read_ushort",
"buffer_map_read_char",
"buffer_map_read_uchar",
"buffer_map_read_float",
"buffer_map_read_struct",
"buffer_map_write_int",
"buffer_map_write_uint",
"buffer_map_write_long",
"buffer_map_write_ulong",
"buffer_map_write_short",
"buffer_map_write_ushort",
"buffer_map_write_char",
"buffer_map_write_uchar",
"buffer_map_write_float",
"buffer_map_write_struct",
"buffer_write_int",
"buffer_write_uint",
"buffer_write_short",
"buffer_write_ushort",
"buffer_write_char",
"buffer_write_uchar",
"buffer_write_float",
"buffer_write_half",
"buffer_write_long",
"buffer_write_ulong",
"buffer_write_struct",
"buffer_write_async_int",
"buffer_write_async_uint",
"buffer_write_async_short",
"buffer_write_async_ushort",
"buffer_write_async_char",
"buffer_write_async_uchar",
"buffer_write_async_float",
"buffer_write_async_long",
"buffer_write_async_ulong",
"buffer_copy",
"buffer_partial_copy",
"mem_read_write_flags",
"mem_write_only_flags",
"mem_read_only_flags",
"mem_copy_host_flags",
"mem_alloc_ref_flags",
"array_info_size",
"sub_buffers_read_write",
"sub_buffers_read_write_dual_devices",
"sub_buffers_overlapping",
"buffer_fill_int",
"buffer_fill_uint",
"buffer_fill_short",
"buffer_fill_ushort",
"buffer_fill_char",
"buffer_fill_uchar",
"buffer_fill_long",
"buffer_fill_ulong",
"buffer_fill_float",
"buffer_fill_struct",
"buffer_migrate",
"image_migrate",
};
// Compile-time guard: the two tables must stay the same length.
ct_assert((sizeof(bufferfn_names) / sizeof(bufferfn_names[0])) == (sizeof(bufferfn_list) / sizeof(bufferfn_list[0])));
// Number of tests, as passed to the harness.
int num_bufferfns = sizeof(bufferfn_names) / sizeof(char *);
// cl_mem_flags combinations exercised by the buffer tests; the trailing 0
// entry means "no host-pointer flags".  flag_set_names[i] is the printable
// name of flag_set[i], and NUM_FLAGS (procs.h) must equal the entry count.
const cl_mem_flags flag_set[] = {
CL_MEM_ALLOC_HOST_PTR,
CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
CL_MEM_USE_HOST_PTR,
CL_MEM_COPY_HOST_PTR,
0
};
const char* flag_set_names[] = {
"CL_MEM_ALLOC_HOST_PTR",
"CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR",
"CL_MEM_USE_HOST_PTR",
"CL_MEM_COPY_HOST_PTR",
"0"
};
// Entry point: hand the test table to the shared conformance harness,
// which handles argument parsing, device selection and test dispatch.
int main( int argc, const char *argv[] )
{
    return runTestHarness( argc, argv, num_bufferfns, bufferfn_list, bufferfn_names,
                           false, false, 0 );
}

View File

@@ -0,0 +1,132 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef __PROCS_H__
#define __PROCS_H__
#include "../../test_common/harness/kernelHelpers.h"
#include "../../test_common/harness/testHarness.h"
#include "../../test_common/harness/errorHelpers.h"
#include "../../test_common/harness/typeWrappers.h"
#include "../../test_common/harness/mt19937.h"
#include "../../test_common/harness/conversions.h"
#ifndef __APPLE__
#include <CL/cl.h>
#endif
// Shared table of cl_mem_flags combinations (defined in main.c);
// flag_set_names[i] is the printable name of flag_set[i].
extern const cl_mem_flags flag_set[];
extern const char* flag_set_names[];
// Number of entries in flag_set / flag_set_names (kept in sync manually).
#define NUM_FLAGS 5
extern int test_buffer_read_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_random_size( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_async_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_read_array_barrier_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_write_async_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_copy( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_partial_copy( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int testBufferSize( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_mem_read_write_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_mem_write_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_mem_read_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_mem_copy_host_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_mem_alloc_ref_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_read_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_map_write_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_sub_buffers_read_write( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_sub_buffers_read_write_dual_devices( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_sub_buffers_overlapping( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_migrate(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_image_migrate(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
extern int test_buffer_fill_int( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_uint( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_short( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_ushort( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_char( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_uchar( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_long( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_ulong( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_float( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
extern int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
#endif // #ifndef __PROCS_H__

View File

@@ -0,0 +1,295 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "procs.h"
#include "../../test_common/harness/errorHelpers.h"
/*
 * Element-wise comparison of two int arrays of length n.
 * Returns 0 when they match, -1 on the first mismatch.
 */
static int verify_copy_buffer(int *inptr, int *outptr, int n)
{
    int idx = 0;
    while (idx < n) {
        if (inptr[idx] != outptr[idx])
            return -1;
        ++idx;
    }
    return 0;
}
/*
 * Copy a num_elements cl_int buffer between every src/dst pair of
 * cl_mem_flags in flag_set and verify the destination contents.
 * Returns the number of failing flag combinations, or -1 on a hard error.
 */
static int test_copy( cl_command_queue queue, cl_context context, int num_elements, MTdata d )
{
    cl_mem buffers[2];
    cl_int *int_input_ptr, *int_output_ptr;
    cl_int err;
    int i;
    int src_flag_id, dst_flag_id;
    int errors = 0;
    size_t min_alignment = get_min_alignment(context);

    int_input_ptr = (cl_int*) align_malloc(sizeof(cl_int) * num_elements, min_alignment);
    // Check allocations, matching testPartialCopy() below (the original
    // dereferenced these unchecked).
    if ( ! int_input_ptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(int) * num_elements );
        return -1;
    }
    int_output_ptr = (cl_int*)align_malloc(sizeof(cl_int) * num_elements, min_alignment);
    if ( ! int_output_ptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(int) * num_elements );
        align_free( (void *)int_input_ptr );
        return -1;
    }

    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
        for (dst_flag_id=0; dst_flag_id < NUM_FLAGS; dst_flag_id++) {
            log_info("Testing with cl_mem_flags src: %s dst: %s\n", flag_set_names[src_flag_id], flag_set_names[dst_flag_id]);

            // Fresh random input each round; destination seeded with a
            // sentinel so a missed copy is detected.
            for (i=0; i<num_elements; i++){
                int_input_ptr[i] = (int)genrand_int32( d );
                int_output_ptr[i] = 0xdeaddead; // seed with incorrect data
            }

            // USE/COPY_HOST_PTR flags require passing the host pointer at
            // creation time; otherwise create empty and write explicitly.
            if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
                buffers[0] = clCreateBuffer(context, flag_set[src_flag_id], sizeof(cl_int) * num_elements, int_input_ptr, &err);
            else
                buffers[0] = clCreateBuffer(context, flag_set[src_flag_id], sizeof(cl_int) * num_elements, NULL, &err);
            if ( err != CL_SUCCESS ){
                print_error(err, " clCreateBuffer failed\n" );
                align_free( (void *)int_input_ptr );
                align_free( (void *)int_output_ptr );
                return -1;
            }

            if ((flag_set[dst_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[dst_flag_id] & CL_MEM_COPY_HOST_PTR))
                buffers[1] = clCreateBuffer(context, flag_set[dst_flag_id], sizeof(cl_int) * num_elements, int_output_ptr, &err);
            else
                buffers[1] = clCreateBuffer(context, flag_set[dst_flag_id], sizeof(cl_int) * num_elements, NULL, &err);
            if ( err != CL_SUCCESS ){
                print_error(err, " clCreateBuffer failed\n" );
                clReleaseMemObject( buffers[0] );
                align_free( (void *)int_input_ptr );
                align_free( (void *)int_output_ptr );
                return -1;
            }

            // Source buffer created empty: upload the input data now.
            if (!(flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) && !(flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR)) {
                err = clEnqueueWriteBuffer(queue, buffers[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)int_input_ptr, 0, NULL, NULL);
                if ( err != CL_SUCCESS ){
                    print_error( err, "clEnqueueWriteBuffer failed" );
                    clReleaseMemObject( buffers[0] );
                    clReleaseMemObject( buffers[1] );
                    align_free( (void *)int_output_ptr );
                    align_free( (void *)int_input_ptr );
                    return -1;
                }
            }

            err = clEnqueueCopyBuffer(queue, buffers[0], buffers[1], 0, 0, sizeof(cl_int)*num_elements, 0, NULL, NULL);
            if ( err != CL_SUCCESS ){
                print_error( err, "clCopyArray failed" );
                clReleaseMemObject( buffers[0] );
                clReleaseMemObject( buffers[1] );
                align_free( (void *)int_output_ptr );
                align_free( (void *)int_input_ptr );
                return -1;
            }

            // Blocking read so the host copy is complete before verifying.
            err = clEnqueueReadBuffer( queue, buffers[1], true, 0, sizeof(int)*num_elements, (void *)int_output_ptr, 0, NULL, NULL );
            if ( err != CL_SUCCESS ){
                print_error( err, "clEnqueueReadBuffer failed" );
                clReleaseMemObject( buffers[0] );
                clReleaseMemObject( buffers[1] );
                align_free( (void *)int_output_ptr );
                align_free( (void *)int_input_ptr );
                return -1;
            }

            if ( verify_copy_buffer(int_input_ptr, int_output_ptr, num_elements) ){
                log_error( " test failed\n" );
                errors++;
            }
            else{
                log_info( " test passed\n" );
            }

            // cleanup
            clReleaseMemObject( buffers[0] );
            clReleaseMemObject( buffers[1] );
        } // dst flags
    } // src flags

    // cleanup
    align_free( (void *)int_output_ptr );
    align_free( (void *)int_input_ptr );
    return errors;
}   // end test_copy()
/*
 * Copy a sub-range of `size` cl_ints from offset srcStart of a source
 * buffer to offset dstStart of a destination buffer, for every src/dst
 * cl_mem_flags pair in flag_set, and verify the copied range.
 * Returns the number of failing flag combinations, or -1 on a hard error.
 */
static int testPartialCopy( cl_command_queue queue, cl_context context, int num_elements, cl_uint srcStart, cl_uint dstStart, int size, MTdata d )
{
    cl_mem buffers[2];
    int *inptr, *outptr;
    cl_int err;
    int i;
    int src_flag_id, dst_flag_id;
    int errors = 0;
    size_t min_alignment = get_min_alignment(context);

    inptr = (int *)align_malloc( sizeof(int) * num_elements, min_alignment);
    if ( ! inptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(int) * num_elements );
        return -1;
    }
    outptr = (int *)align_malloc( sizeof(int) * num_elements, min_alignment);
    if ( ! outptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(int) * num_elements );
        align_free( (void *)inptr );
        return -1;
    }

    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
        for (dst_flag_id=0; dst_flag_id < NUM_FLAGS; dst_flag_id++) {
            log_info("Testing with cl_mem_flags src: %s dst: %s\n", flag_set_names[src_flag_id], flag_set_names[dst_flag_id]);

            // Fresh random input each round; destination seeded with a
            // sentinel so a missed copy is detected.
            for (i=0; i<num_elements; i++){
                inptr[i] = (int)genrand_int32( d );
                outptr[i] = (int)0xdeaddead; // seed with incorrect data
            }

            // USE/COPY_HOST_PTR flags require the host pointer at creation
            // time; otherwise create empty and write explicitly below.
            if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
                buffers[0] = clCreateBuffer(context, flag_set[src_flag_id], sizeof(cl_int) * num_elements, inptr, &err);
            else
                buffers[0] = clCreateBuffer(context, flag_set[src_flag_id], sizeof(cl_int) * num_elements, NULL, &err);
            if ( err != CL_SUCCESS ){
                // BUG FIX: the original was missing the ';' after this call.
                print_error(err, " clCreateBuffer failed\n" );
                align_free( (void *)outptr );
                align_free( (void *)inptr );
                return -1;
            }

            if ((flag_set[dst_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[dst_flag_id] & CL_MEM_COPY_HOST_PTR))
                buffers[1] = clCreateBuffer(context, flag_set[dst_flag_id], sizeof(cl_int) * num_elements, outptr, &err);
            else
                buffers[1] = clCreateBuffer(context, flag_set[dst_flag_id], sizeof(cl_int) * num_elements, NULL, &err);
            if ( err != CL_SUCCESS ){
                print_error(err, " clCreateBuffer failed\n" );
                clReleaseMemObject( buffers[0] );
                align_free( (void *)outptr );
                align_free( (void *)inptr );
                return -1;
            }

            // Source buffer created empty: upload the input data now.
            if (!(flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) && !(flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR)){
                err = clEnqueueWriteBuffer(queue, buffers[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)inptr, 0, NULL, NULL);
                if ( err != CL_SUCCESS ){
                    print_error( err, "clEnqueueWriteBuffer failed" );
                    clReleaseMemObject( buffers[1] );
                    clReleaseMemObject( buffers[0] );
                    align_free( (void *)outptr );
                    align_free( (void *)inptr );
                    return -1;
                }
            }

            // Offsets/length are in bytes for clEnqueueCopyBuffer.
            err = clEnqueueCopyBuffer(queue, buffers[0], buffers[1], srcStart*sizeof(cl_int), dstStart*sizeof(cl_int), sizeof(cl_int)*size, 0, NULL, NULL);
            if ( err != CL_SUCCESS){
                print_error( err, "clEnqueueCopyBuffer failed" );
                clReleaseMemObject( buffers[1] );
                clReleaseMemObject( buffers[0] );
                align_free( (void *)outptr );
                align_free( (void *)inptr );
                return -1;
            }

            err = clEnqueueReadBuffer( queue, buffers[1], true, 0, sizeof(int)*num_elements, (void *)outptr, 0, NULL, NULL );
            if ( err != CL_SUCCESS){
                print_error( err, "clEnqueueReadBuffer failed" );
                clReleaseMemObject( buffers[1] );
                clReleaseMemObject( buffers[0] );
                align_free( (void *)outptr );
                align_free( (void *)inptr );
                return -1;
            }

            // Only the copied sub-range is compared.
            if ( verify_copy_buffer(inptr + srcStart, outptr + dstStart, size) ){
                log_error("buffer_COPY test failed\n");
                errors++;
            }
            else{
                log_info("buffer_COPY test passed\n");
            }

            // cleanup
            clReleaseMemObject( buffers[1] );
            clReleaseMemObject( buffers[0] );
        } // dst mem flags
    } // src mem flags

    // cleanup
    align_free( (void *)outptr );
    align_free( (void *)inptr );
    return errors;
}   // end testPartialCopy()
/*
 * Harness entry: run the full-buffer copy test at the preset element count
 * and at eight pseudo-random sizes.  Returns the number of failing runs.
 */
int test_buffer_copy( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    int failures = 0;
    MTdata rng = init_genrand( gRandomSeed );

    // test the preset size
    log_info( "set size: %d: ", num_elements );
    if (test_copy( queue, context, num_elements, rng ))
        failures++;

    // now test random sizes
    for ( int iter = 0; iter < 8; iter++ ){
        int sz = (int)get_random_float(2.f,131072.f, rng);
        log_info( "random size: %d: ", sz );
        if (test_copy( queue, context, sz, rng ))
            failures++;
    }

    free_mtdata(rng);
    return failures;
}   // end test_buffer_copy()
int test_buffer_partial_copy( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    // Exercise copies between randomly chosen sub-ranges of two buffers;
    // offsets and sizes are picked so every range stays inside num_elements.
    // Returns the number of failing runs.
    MTdata rng = init_genrand( gRandomSeed );
    int failures = 0;
    int iter;

    for ( iter = 0; iter < 8; iter++ ){
        cl_uint srcOffset = (cl_uint)get_random_float( 0.f, (float)(num_elements - 8), rng );
        int copySize = (int)get_random_float( 8.f, (float)(num_elements - srcOffset), rng );
        cl_uint dstOffset = (cl_uint)get_random_float( 0.f, (float)(num_elements - copySize), rng );
        log_info( "random partial copy from %d to %d, size: %d: ", (int)srcOffset, (int)dstOffset, copySize );
        if (testPartialCopy( queue, context, num_elements, srcOffset, dstOffset, copySize, rng ))
            failures++;
    }

    free_mtdata( rng );
    return failures;
}   // end test_buffer_partial_copy()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,703 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "procs.h"
#include "../../test_common/harness/errorHelpers.h"
// Reference values the kernels below store into every buffer element; the
// verify_read_* helpers compare read-back data against these.
#define TEST_PRIME_INT      ((1<<16)+1)
#define TEST_PRIME_UINT     ((1U<<16)+1U)
#define TEST_PRIME_LONG     ((1LL<<32)+1LL)
#define TEST_PRIME_ULONG    ((1ULL<<32)+1ULL)
// Fix: "1S" is not a valid integer-constant suffix in C/C++, so the original
// ((1S<<8)+1S) could never compile if the macro were expanded; cast instead.
#define TEST_PRIME_SHORT    ((short)((1<<8)+1))
#define TEST_PRIME_FLOAT    (float)3.40282346638528860e+38
#define TEST_PRIME_HALF     119.f
#define TEST_BOOL           true
#define TEST_PRIME_CHAR     0x77

// Host-side mirror of the TestStruct declared inside the struct kernel source.
#ifndef TestStruct
typedef struct{
    int     a;
    float   b;
} TestStruct;
#endif
//--- the code for the kernel executables
// Kernels that fill a buffer of int / int2 / int4 / int8 / int16 with
// TEST_PRIME_INT; paired one-to-one with int_kernel_name below.
static const char *buffer_read_int_kernel_code[] = {
"__kernel void test_buffer_read_int(__global int *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1<<16)+1);\n"
"}\n",
"__kernel void test_buffer_read_int2(__global int2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1<<16)+1);\n"
"}\n",
"__kernel void test_buffer_read_int4(__global int4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1<<16)+1);\n"
"}\n",
"__kernel void test_buffer_read_int8(__global int8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1<<16)+1);\n"
"}\n",
"__kernel void test_buffer_read_int16(__global int16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1<<16)+1);\n"
"}\n" };
static const char *int_kernel_name[] = { "test_buffer_read_int", "test_buffer_read_int2", "test_buffer_read_int4", "test_buffer_read_int8", "test_buffer_read_int16" };
// Kernels that fill a buffer of uint vector widths 1..16 with TEST_PRIME_UINT.
static const char *buffer_read_uint_kernel_code[] = {
"__kernel void test_buffer_read_uint(__global uint *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1U<<16)+1U);\n"
"}\n",
"__kernel void test_buffer_read_uint2(__global uint2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1U<<16)+1U);\n"
"}\n",
"__kernel void test_buffer_read_uint4(__global uint4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1U<<16)+1U);\n"
"}\n",
"__kernel void test_buffer_read_uint8(__global uint8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1U<<16)+1U);\n"
"}\n",
"__kernel void test_buffer_read_uint16(__global uint16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1U<<16)+1U);\n"
"}\n" };
static const char *uint_kernel_name[] = { "test_buffer_read_uint", "test_buffer_read_uint2", "test_buffer_read_uint4", "test_buffer_read_uint8", "test_buffer_read_uint16" };
// Kernels that fill a buffer of long vector widths 1..16 with TEST_PRIME_LONG.
// Skipped at run time when the device lacks 64-bit integer support (gHasLong).
static const char *buffer_read_long_kernel_code[] = {
"__kernel void test_buffer_read_long(__global long *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1L<<32)+1L);\n"
"}\n",
"__kernel void test_buffer_read_long2(__global long2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1L<<32)+1L);\n"
"}\n",
"__kernel void test_buffer_read_long4(__global long4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1L<<32)+1L);\n"
"}\n",
"__kernel void test_buffer_read_long8(__global long8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1L<<32)+1L);\n"
"}\n",
"__kernel void test_buffer_read_long16(__global long16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1L<<32)+1L);\n"
"}\n" };
static const char *long_kernel_name[] = { "test_buffer_read_long", "test_buffer_read_long2", "test_buffer_read_long4", "test_buffer_read_long8", "test_buffer_read_long16" };
// Kernels that fill a buffer of ulong vector widths 1..16 with TEST_PRIME_ULONG.
// Skipped at run time when the device lacks 64-bit integer support (gHasLong).
static const char *buffer_read_ulong_kernel_code[] = {
"__kernel void test_buffer_read_ulong(__global ulong *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1UL<<32)+1UL);\n"
"}\n",
"__kernel void test_buffer_read_ulong2(__global ulong2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1UL<<32)+1UL);\n"
"}\n",
"__kernel void test_buffer_read_ulong4(__global ulong4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1UL<<32)+1UL);\n"
"}\n",
"__kernel void test_buffer_read_ulong8(__global ulong8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1UL<<32)+1UL);\n"
"}\n",
"__kernel void test_buffer_read_ulong16(__global ulong16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = ((1UL<<32)+1UL);\n"
"}\n" };
static const char *ulong_kernel_name[] = { "test_buffer_read_ulong", "test_buffer_read_ulong2", "test_buffer_read_ulong4", "test_buffer_read_ulong8", "test_buffer_read_ulong16" };
// Kernels that fill a buffer of short vector widths 1..16 with (short)257
// (the value TEST_PRIME_SHORT is meant to denote).
static const char *buffer_read_short_kernel_code[] = {
"__kernel void test_buffer_read_short(__global short *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (short)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_short2(__global short2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (short)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_short4(__global short4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (short)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_short8(__global short8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (short)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_short16(__global short16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (short)((1<<8)+1);\n"
"}\n" };
static const char *short_kernel_name[] = { "test_buffer_read_short", "test_buffer_read_short2", "test_buffer_read_short4", "test_buffer_read_short8", "test_buffer_read_short16" };
// Kernels that fill a buffer of ushort vector widths 1..16 with (ushort)257.
static const char *buffer_read_ushort_kernel_code[] = {
"__kernel void test_buffer_read_ushort(__global ushort *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (ushort)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_ushort2(__global ushort2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (ushort)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_ushort4(__global ushort4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (ushort)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_ushort8(__global ushort8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (ushort)((1<<8)+1);\n"
"}\n",
"__kernel void test_buffer_read_ushort16(__global ushort16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (ushort)((1<<8)+1);\n"
"}\n" };
static const char *ushort_kernel_name[] = { "test_buffer_read_ushort", "test_buffer_read_ushort2", "test_buffer_read_ushort4", "test_buffer_read_ushort8", "test_buffer_read_ushort16" };
// Kernels that fill a buffer of float vector widths 1..16 with FLT_MAX
// (TEST_PRIME_FLOAT); the value is exactly representable, so == compare is safe.
static const char *buffer_read_float_kernel_code[] = {
"__kernel void test_buffer_read_float(__global float *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (float)3.40282346638528860e+38;\n"
"}\n",
"__kernel void test_buffer_read_float2(__global float2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (float)3.40282346638528860e+38;\n"
"}\n",
"__kernel void test_buffer_read_float4(__global float4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (float)3.40282346638528860e+38;\n"
"}\n",
"__kernel void test_buffer_read_float8(__global float8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (float)3.40282346638528860e+38;\n"
"}\n",
"__kernel void test_buffer_read_float16(__global float16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (float)3.40282346638528860e+38;\n"
"}\n" };
static const char *float_kernel_name[] = { "test_buffer_read_float", "test_buffer_read_float2", "test_buffer_read_float4", "test_buffer_read_float8", "test_buffer_read_float16" };
// Kernels that fill a buffer of char vector widths 1..16 with 'w'
// (0x77 == TEST_PRIME_CHAR).
static const char *buffer_read_char_kernel_code[] = {
"__kernel void test_buffer_read_char(__global char *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (char)'w';\n"
"}\n",
"__kernel void test_buffer_read_char2(__global char2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (char)'w';\n"
"}\n",
"__kernel void test_buffer_read_char4(__global char4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (char)'w';\n"
"}\n",
"__kernel void test_buffer_read_char8(__global char8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (char)'w';\n"
"}\n",
"__kernel void test_buffer_read_char16(__global char16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (char)'w';\n"
"}\n" };
static const char *char_kernel_name[] = { "test_buffer_read_char", "test_buffer_read_char2", "test_buffer_read_char4", "test_buffer_read_char8", "test_buffer_read_char16" };
// Kernels that fill a buffer of uchar vector widths 1..16 with 'w' (0x77).
// (The scalar variant omits the (uchar) cast the others carry; harmless,
// since 'w' converts implicitly.)
static const char *buffer_read_uchar_kernel_code[] = {
"__kernel void test_buffer_read_uchar(__global uchar *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = 'w';\n"
"}\n",
"__kernel void test_buffer_read_uchar2(__global uchar2 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (uchar)'w';\n"
"}\n",
"__kernel void test_buffer_read_uchar4(__global uchar4 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (uchar)'w';\n"
"}\n",
"__kernel void test_buffer_read_uchar8(__global uchar8 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (uchar)'w';\n"
"}\n",
"__kernel void test_buffer_read_uchar16(__global uchar16 *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = (uchar)'w';\n"
"}\n" };
static const char *uchar_kernel_name[] = { "test_buffer_read_uchar", "test_buffer_read_uchar2", "test_buffer_read_uchar4", "test_buffer_read_uchar8", "test_buffer_read_uchar16" };
// Single kernel that fills a buffer of TestStruct with the
// (TEST_PRIME_INT, TEST_PRIME_FLOAT) pair; device-side struct layout must
// match the host-side typedef above.
static const char *buffer_read_struct_kernel_code[] = {
"typedef struct{\n"
"int a;\n"
"float b;\n"
"} TestStruct;\n"
"__kernel void test_buffer_read_struct(__global TestStruct *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid].a = ((1<<16)+1);\n"
" dst[tid].b = (float)3.40282346638528860e+38;\n"
"}\n" };
static const char *struct_kernel_name[] = { "test_buffer_read_struct" };
//--- the verify functions
// Check that each of the n ints at ptr equals TEST_PRIME_INT.
// Returns 0 on success, -1 on the first mismatch.
static int verify_read_int(void *ptr, int n)
{
    const int *data = (const int *)ptr;
    int idx;
    for (idx = 0; idx < n; idx++)
        if (data[idx] != TEST_PRIME_INT)
            return -1;
    return 0;
}
// Check that each of the n cl_uints at ptr equals TEST_PRIME_UINT.
// Returns 0 on success, -1 on the first mismatch.
static int verify_read_uint(void *ptr, int n)
{
    const cl_uint *data = (const cl_uint *)ptr;
    int idx;
    for (idx = 0; idx < n; idx++)
        if (data[idx] != TEST_PRIME_UINT)
            return -1;
    return 0;
}
static int verify_read_long(void *ptr, int n)
{
int i;
cl_long *outptr = (cl_long *)ptr;
for (i=0; i<n; i++){
if ( outptr[i] != TEST_PRIME_LONG )
return -1;
}
return 0;
}
// Check that each of the n cl_ulongs at ptr equals TEST_PRIME_ULONG.
// Returns 0 on success, -1 on the first mismatch.
static int verify_read_ulong(void *ptr, int n)
{
    const cl_ulong *data = (const cl_ulong *)ptr;
    int idx;
    for (idx = 0; idx < n; idx++)
        if (data[idx] != TEST_PRIME_ULONG)
            return -1;
    return 0;
}
// Check that each of the n shorts at ptr equals (short)257 -- the value the
// short kernels store.  Returns 0 on success, -1 on the first mismatch.
static int verify_read_short(void *ptr, int n)
{
    const short *data = (const short *)ptr;
    int idx;
    for (idx = 0; idx < n; idx++)
        if (data[idx] != (short)((1<<8)+1))
            return -1;
    return 0;
}
// Check that each of the n cl_ushorts at ptr equals (cl_ushort)257.
// Returns 0 on success, -1 on the first mismatch.
static int verify_read_ushort(void *ptr, int n)
{
    const cl_ushort *data = (const cl_ushort *)ptr;
    int idx;
    for (idx = 0; idx < n; idx++)
        if (data[idx] != (cl_ushort)((1<<8)+1))
            return -1;
    return 0;
}
// Check that each of the n floats at ptr equals TEST_PRIME_FLOAT (FLT_MAX,
// exactly representable, so == is a valid comparison here).
// Returns 0 on success, -1 on the first mismatch.
static int verify_read_float( void *ptr, int n )
{
    const float *data = (const float *)ptr;
    int idx;
    for ( idx = 0; idx < n; idx++ )
        if ( data[idx] != TEST_PRIME_FLOAT )
            return -1;
    return 0;
}
// Check that each of the n chars at ptr equals TEST_PRIME_CHAR ('w').
// Returns 0 on success, -1 on the first mismatch.
static int verify_read_char(void *ptr, int n)
{
    const char *data = (const char *)ptr;
    int idx;
    for (idx = 0; idx < n; idx++)
        if (data[idx] != TEST_PRIME_CHAR)
            return -1;
    return 0;
}
// Check that each of the n cl_uchars at ptr equals TEST_PRIME_CHAR ('w').
// Returns 0 on success, -1 on the first mismatch.
static int verify_read_uchar( void *ptr, int n )
{
    const cl_uchar *data = (const cl_uchar *)ptr;
    int idx;
    for ( idx = 0; idx < n; idx++ )
        if ( data[idx] != TEST_PRIME_CHAR )
            return -1;
    return 0;
}
static int verify_read_struct( void *ptr, int n )
{
int i;
TestStruct *outptr = (TestStruct *)ptr;
for ( i = 0; i < n; i++ ){
if ( ( outptr[i].a != TEST_PRIME_INT ) ||
( outptr[i].b != TEST_PRIME_FLOAT ) )
return -1;
}
return 0;
}
//----- the test functions
// For each cl_mem_flags combination in flag_set and each vector width
// (1, 2, 4, 8, 16 -- "loops" of them), run the matching fill kernel, map the
// destination buffer for reading with a blocking clEnqueueMapBuffer, and
// verify the mapped contents via fn.  Returns the number of verification
// failures, or -1 on any allocation/API error.
static int test_buffer_map_read( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, size_t size, char *type, int loops,
                                 const char *kernelCode[], const char *kernelName[], int (*fn)(void *,int) )
{
    cl_mem      buffers[5];
    void        *outptr[5];
    cl_program  program[5];
    cl_kernel   kernel[5];
    size_t      threads[3], localThreads[3];
    cl_int      err;
    int         i;
    size_t      ptrSizes[5];
    int         src_flag_id;
    int         total_errors = 0;
    void        *mappedPtr;
    size_t      min_alignment = get_min_alignment(context);

    threads[0] = (cl_uint)num_elements;

    // per-element sizes for the 1-, 2-, 4-, 8- and 16-wide kernel variants
    ptrSizes[0] = size;
    ptrSizes[1] = ptrSizes[0] << 1;
    ptrSizes[2] = ptrSizes[1] << 1;
    ptrSizes[3] = ptrSizes[2] << 1;
    ptrSizes[4] = ptrSizes[3] << 1;

    //embedded devices don't support long/ulong so skip over
    if (! gHasLong && strstr(type,"long"))
        return 0;

    for (src_flag_id=0; src_flag_id < NUM_FLAGS; src_flag_id++) {
        log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]);

        for ( i = 0; i < loops; i++ ){
            outptr[i] = align_malloc( ptrSizes[i] * num_elements, min_alignment);
            if ( ! outptr[i] ){
                log_error( " unable to allocate %d bytes of memory\n", (int)ptrSizes[i] * num_elements );
                return -1;
            }

            // *_HOST_PTR flags require the host pointer at creation time
            if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR) || (flag_set[src_flag_id] & CL_MEM_COPY_HOST_PTR))
                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id], ptrSizes[i] * num_elements, outptr[i], &err);
            else
                buffers[i] = clCreateBuffer(context, flag_set[src_flag_id], ptrSizes[i] * num_elements, NULL, &err);
            // Fix: the original tested "! buffers[i] | err" -- a bitwise OR of
            // a logical value and an error code; use an explicit logical test.
            if ( ! buffers[i] || err != CL_SUCCESS ){
                print_error(err, "clCreateBuffer failed\n" );
                align_free( outptr[i] );
                return -1;
            }

            err = create_single_kernel_helper(context, &program[i], &kernel[i], 1, &kernelCode[i], kernelName[i] );
            if ( err ){
                log_error( " Error creating program for %s\n", type );
                clReleaseMemObject( buffers[i] );
                align_free( outptr[i] );
                return -1;
            }

            err = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), (void *)&buffers[i] );
            if ( err != CL_SUCCESS ){
                print_error( err, "clSetKernelArg failed\n" );
                clReleaseKernel( kernel[i] );
                clReleaseProgram( program[i] );
                clReleaseMemObject( buffers[i] );
                align_free( outptr[i] );
                return -1;
            }

            err = get_max_common_work_group_size( context, kernel[i], threads[0], &localThreads[0] );
            test_error( err, "Unable to get work group size to use" );

            err = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, threads, localThreads, 0, NULL, NULL );
            if ( err != CL_SUCCESS ){
                print_error( err, "clEnqueueNDRangeKernel failed\n" );
                clReleaseKernel( kernel[i] );
                clReleaseProgram( program[i] );
                clReleaseMemObject( buffers[i] );
                align_free( outptr[i] );
                return -1;
            }

            // Blocking map: the contents are valid as soon as the call returns.
            mappedPtr = clEnqueueMapBuffer(queue, buffers[i], CL_TRUE, CL_MAP_READ, 0, ptrSizes[i]*num_elements, 0, NULL, NULL, &err);
            if ( err != CL_SUCCESS ){
                print_error( err, "clEnqueueMapBuffer failed" );
                clReleaseKernel( kernel[i] );
                clReleaseProgram( program[i] );
                clReleaseMemObject( buffers[i] );
                align_free( outptr[i] );
                return -1;
            }

            // verify the full element count for this vector width
            if (fn(mappedPtr, num_elements*(1<<i))){
                log_error(" %s%d test failed\n", type, 1<<i);
                total_errors++;
            }
            else{
                log_info(" %s%d test passed\n", type, 1<<i);
            }

            err = clEnqueueUnmapMemObject(queue, buffers[i], mappedPtr, 0, NULL, NULL);
            test_error(err, "clEnqueueUnmapMemObject failed");

            // cleanup
            clReleaseKernel( kernel[i] );
            clReleaseProgram( program[i] );
            clReleaseMemObject( buffers[i] );
            // If we are using the outptr[i] as backing via USE_HOST_PTR we need to make sure we are done before freeing.
            if ((flag_set[src_flag_id] & CL_MEM_USE_HOST_PTR)) {
                err = clFinish(queue);
                test_error(err, "clFinish failed");
            }
            align_free( outptr[i] );
        }
    } // cl_mem_flags

    return total_errors;
}   // end test_buffer_map_read()
// Generates the per-type harness entry points (test_buffer_map_read_<type>);
// each instantiation wires the matching kernel sources, kernel names and
// verify callback into test_buffer_map_read() above, covering all five
// vector widths (loops == 5).
#define DECLARE_LOCK_TEST(type, realType) \
int test_buffer_map_read_##type( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) \
{ \
return test_buffer_map_read( deviceID, context, queue, num_elements, sizeof( realType ), (char*)#type, 5, \
buffer_read_##type##_kernel_code, type##_kernel_name, verify_read_##type ); \
}
DECLARE_LOCK_TEST(int, cl_int)
DECLARE_LOCK_TEST(uint, cl_uint)
DECLARE_LOCK_TEST(long, cl_long)
DECLARE_LOCK_TEST(ulong, cl_ulong)
DECLARE_LOCK_TEST(short, cl_short)
DECLARE_LOCK_TEST(ushort, cl_ushort)
DECLARE_LOCK_TEST(char, cl_char)
DECLARE_LOCK_TEST(uchar, cl_uchar)
DECLARE_LOCK_TEST(float, cl_float)
// Entry point for the struct variant.  Only a scalar kernel exists for
// TestStruct, so a single loop iteration is requested (loops == 1).
int test_buffer_map_read_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    return test_buffer_map_read( deviceID, context, queue, num_elements, sizeof( TestStruct ), (char*)"struct", 1,
                                 buffer_read_struct_kernel_code, struct_kernel_name, verify_read_struct );
}   // end test_buffer_map_read_struct()

View File

@@ -0,0 +1,524 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "../../test_common/harness/compat.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "procs.h"
#ifndef uchar
typedef unsigned char uchar;
#endif
#define USE_LOCAL_WORK_GROUP 1
// Kernel sources for the cl_mem_flags tests below: each increments every
// element of an int buffer by one (in place, or src -> dst for the read test).
const char *mem_read_write_kernel_code =
"__kernel void test_mem_read_write(__global int *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = dst[tid]+1;\n"
"}\n";
const char *mem_read_kernel_code =
"__kernel void test_mem_read(__global int *src, __global int *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = src[tid]+1;\n"
"}\n";
const char *mem_write_kernel_code =
"__kernel void test_mem_write(__global int *dst)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
" dst[tid] = dst[tid]+1;\n"
"}\n";
// The kernels increment element i from its initial value i, so a correct
// result buffer holds i+1 at index i.  Returns 0 if all n elements match,
// -1 on the first mismatch.
static int verify_mem( int *outptr, int n )
{
    int idx = 0;
    while ( idx < n ){
        if ( outptr[idx] != idx + 1 )
            return -1;
        ++idx;
    }
    return 0;
}
// Creates a CL_MEM_READ_WRITE buffer, writes 0..n-1 into it, runs a kernel
// that increments each element in place, reads the result back, and verifies
// it with verify_mem().  Returns 0 on success, -1 on failure.
int test_mem_read_write_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    cl_mem      buffers[1];
    cl_int      *inptr, *outptr;
    cl_program  program[1];
    cl_kernel   kernel[1];
    size_t      global_work_size[3];
#ifdef USE_LOCAL_WORK_GROUP
    size_t      local_work_size[3];
#endif
    cl_int      err;
    int         i;
    size_t      min_alignment = get_min_alignment(context);

    global_work_size[0] = (cl_uint)num_elements;

    // Fix: these allocations were previously used without NULL checks,
    // unlike the sibling tests in this file.
    inptr = (cl_int*)align_malloc(sizeof(cl_int) * num_elements, min_alignment);
    if ( ! inptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(cl_int) * num_elements );
        return -1;
    }
    outptr = (cl_int*)align_malloc(sizeof(cl_int) * num_elements, min_alignment);
    if ( ! outptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(cl_int) * num_elements );
        align_free( (void *)inptr );
        return -1;
    }

    buffers[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * num_elements, NULL, &err);
    if (err != CL_SUCCESS) {
        print_error( err, "clCreateBuffer failed");
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    // seed the buffer with 0..n-1
    for (i=0; i<num_elements; i++)
        inptr[i] = i;

    err = clEnqueueWriteBuffer(queue, buffers[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)inptr, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        print_error( err, "clEnqueueWriteBuffer failed");
        clReleaseMemObject( buffers[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    err = create_single_kernel_helper( context, &program[0], &kernel[0], 1, &mem_read_write_kernel_code, "test_mem_read_write" );
    if (err){
        clReleaseMemObject( buffers[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = get_max_common_work_group_size( context, kernel[0], global_work_size[0], &local_work_size[0] );
    test_error( err, "Unable to get work group size to use" );
#endif

    err = clSetKernelArg( kernel[0], 0, sizeof( cl_mem ), (void *)&buffers[0] );
    if ( err != CL_SUCCESS ){
        print_error( err, "clSetKernelArg failed" );
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, local_work_size, 0, NULL, NULL );
#else
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
#endif
    if (err != CL_SUCCESS){
        log_error("clEnqueueNDRangeKernel failed\n");
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    err = clEnqueueReadBuffer( queue, buffers[0], true, 0, sizeof(cl_int)*num_elements, (void *)outptr, 0, NULL, NULL );
    if ( err != CL_SUCCESS ){
        print_error( err, "clEnqueueReadBuffer failed" );
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    if (verify_mem(outptr, num_elements)){
        log_error("buffer_MEM_READ_WRITE test failed\n");
        err = -1;
    }
    else{
        log_info("buffer_MEM_READ_WRITE test passed\n");
        err = 0;
    }

    // cleanup
    clReleaseMemObject( buffers[0] );
    clReleaseKernel( kernel[0] );
    clReleaseProgram( program[0] );
    align_free( (void *)outptr );
    align_free( (void *)inptr );

    return err;
}   // end test_mem_read_write_flags()
// Creates a CL_MEM_WRITE_ONLY buffer, pre-loads it with 0..n-1 from the host,
// runs a kernel that increments each element in place, and reads the result
// back.  Returns the final error code (0 on success, -1 on failure).
// NOTE(review): unlike the READ_WRITE variant, the read-back data is never
// verified here; the kernel also reads from a write-only buffer, so the
// result presumably isn't well-defined -- confirm intent before "fixing".
int test_mem_write_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    cl_mem      buffers[1];
    int         *inptr, *outptr;
    cl_program  program[1];
    cl_kernel   kernel[1];
    size_t      global_work_size[3];
#ifdef USE_LOCAL_WORK_GROUP
    size_t      local_work_size[3];
#endif
    cl_int      err;
    int         i;
    size_t      min_alignment = get_min_alignment(context);

    global_work_size[0] = (cl_uint)num_elements;

    inptr = (int *)align_malloc( sizeof(cl_int) * num_elements, min_alignment);
    if ( ! inptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(cl_int) * num_elements );
        return -1;
    }
    outptr = (int *)align_malloc( sizeof(cl_int) * num_elements, min_alignment);
    if ( ! outptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(cl_int) * num_elements );
        align_free( (void *)inptr );
        return -1;
    }

    buffers[0] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int) * num_elements, NULL, &err);
    if (err != CL_SUCCESS)
    {
        print_error(err, "clCreateBuffer failed\n");
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    // seed the buffer with 0..n-1 (a host write to a WRITE_ONLY buffer is legal)
    for (i=0; i<num_elements; i++)
        inptr[i] = i;

    err = clEnqueueWriteBuffer(queue, buffers[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)inptr, 0, NULL, NULL);
    if (err != CL_SUCCESS){
        print_error( err, "clEnqueueWriteBuffer failed" );
        clReleaseMemObject( buffers[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    err = create_single_kernel_helper( context, &program[0], &kernel[0], 1, &mem_write_kernel_code, "test_mem_write" );
    if (err){
        clReleaseMemObject( buffers[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = get_max_common_work_group_size( context, kernel[0], global_work_size[0], &local_work_size[0] );
    test_error( err, "Unable to get work group size to use" );
#endif

    err = clSetKernelArg( kernel[0], 0, sizeof( cl_mem ), (void *)&buffers[0] );
    if ( err != CL_SUCCESS ){
        print_error( err, "clSetKernelArg failed");
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, local_work_size, 0, NULL, NULL );
#else
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
#endif
    if ( err != CL_SUCCESS ){
        print_error( err, "clEnqueueNDRangeKernel failed" );
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    err = clEnqueueReadBuffer( queue, buffers[0], true, 0, sizeof(cl_int)*num_elements, (void *)outptr, 0, NULL, NULL );
    if ( err != CL_SUCCESS ){
        print_error( err, "Error reading array" );
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    // cleanup
    clReleaseMemObject( buffers[0] );
    clReleaseKernel( kernel[0] );
    clReleaseProgram( program[0] );
    align_free( (void *)outptr );
    align_free( (void *)inptr );

    return err;
}   // end test_mem_write()
// Creates a CL_MEM_READ_ONLY source buffer and a CL_MEM_READ_WRITE destination
// buffer, runs a kernel computing dst[i] = src[i]+1 with src seeded 0..n-1,
// then reads back and verifies the destination.  Returns 0 on success,
// -1 on failure.
int test_mem_read_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    cl_mem      buffers[2];
    int         *inptr, *outptr;
    cl_program  program[1];
    cl_kernel   kernel[1];
    size_t      global_work_size[3];
#ifdef USE_LOCAL_WORK_GROUP
    size_t      local_work_size[3];
#endif
    cl_int      err;
    int         i;
    size_t      min_alignment = get_min_alignment(context);

    global_work_size[0] = (cl_uint)num_elements;

    inptr = (int *)align_malloc( sizeof(cl_int) * num_elements, min_alignment);
    if ( ! inptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(cl_int) * num_elements );
        return -1;
    }
    outptr = (int *)align_malloc( sizeof(cl_int) * num_elements, min_alignment);
    if ( ! outptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(cl_int) * num_elements );
        align_free( (void *)inptr );
        return -1;
    }

    buffers[0] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * num_elements, NULL, &err);
    if ( err != CL_SUCCESS ){
        print_error(err, " clCreateBuffer failed to create READ_ONLY array\n" );
        align_free( (void *)outptr );
        align_free( (void *)inptr );
        return -1;
    }

    for (i=0; i<num_elements; i++)
        inptr[i] = i;

    buffers[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE), sizeof(cl_int) * num_elements, NULL, &err);
    if ( err != CL_SUCCESS ){
        // Fix: the old message referred to a nonexistent "MEM_ALLOC_GLOBAL_POOL"
        // flag; this buffer is plain CL_MEM_READ_WRITE.
        print_error(err, " clCreateBuffer failed to create READ_WRITE array\n" );
        clReleaseMemObject( buffers[0]) ;
        align_free( (void *)inptr );
        align_free( (void *)outptr );
        return -1;
    }

    err = clEnqueueWriteBuffer(queue, buffers[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)inptr, 0, NULL, NULL);
    if ( err != CL_SUCCESS ){
        print_error( err, "clEnqueueWriteBuffer() failed");
        clReleaseMemObject( buffers[1]) ;
        clReleaseMemObject( buffers[0]) ;
        align_free( (void *)inptr );
        align_free( (void *)outptr );
        return -1;
    }

    err = create_single_kernel_helper( context, &program[0], &kernel[0], 1, &mem_read_kernel_code, "test_mem_read" );
    if ( err ){
        clReleaseMemObject( buffers[1]) ;
        clReleaseMemObject( buffers[0]) ;
        align_free( (void *)inptr );
        align_free( (void *)outptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = get_max_common_work_group_size( context, kernel[0], global_work_size[0], &local_work_size[0] );
    test_error( err, "Unable to get work group size to use" );
#endif

    err = clSetKernelArg( kernel[0], 0, sizeof( cl_mem ), (void *)&buffers[0] );
    err |= clSetKernelArg( kernel[0], 1, sizeof( cl_mem ), (void *)&buffers[1] );
    if ( err != CL_SUCCESS ){
        print_error( err, "clSetKernelArgs failed" );
        clReleaseMemObject( buffers[1]) ;
        clReleaseMemObject( buffers[0]) ;
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)inptr );
        align_free( (void *)outptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, local_work_size, 0, NULL, NULL );
#else
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
#endif
    if (err != CL_SUCCESS){
        print_error( err, "clEnqueueNDRangeKernel failed" );
        clReleaseMemObject( buffers[1]) ;
        clReleaseMemObject( buffers[0]) ;
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)inptr );
        align_free( (void *)outptr );
        return -1;
    }

    err = clEnqueueReadBuffer( queue, buffers[1], true, 0, sizeof(cl_int)*num_elements, (void *)outptr, 0, NULL, NULL );
    if ( err != CL_SUCCESS ){
        print_error( err, "clEnqueueReadBuffer failed" );
        clReleaseMemObject( buffers[1]) ;
        clReleaseMemObject( buffers[0]) ;
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)inptr );
        align_free( (void *)outptr );
        return -1;
    }

    if (verify_mem(outptr, num_elements)){
        log_error( " CL_MEM_READ_ONLY test failed\n" );
        err = -1;
    }
    else{
        log_info( " CL_MEM_READ_ONLY test passed\n" );
        err = 0;
    }

    // cleanup
    clReleaseMemObject( buffers[1]) ;
    clReleaseMemObject( buffers[0]) ;
    clReleaseKernel( kernel[0] );
    clReleaseProgram( program[0] );
    align_free( (void *)inptr );
    align_free( (void *)outptr );

    return err;
}   // end test_mem_read()
// Creates a CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE buffer initialized from a
// host array holding 0..n-1, runs the in-place increment kernel, reads the
// result back into the same host array and verifies it.  Returns 0 on
// success, -1 on failure.
int test_mem_copy_host_flags( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    cl_mem      buffers[1];
    int         *ptr;
    cl_program  program[1];
    cl_kernel   kernel[1];
    size_t      global_work_size[3];
#ifdef USE_LOCAL_WORK_GROUP
    size_t      local_work_size[3];
#endif
    cl_int      err;
    int         i;
    size_t      min_alignment = get_min_alignment(context);

    global_work_size[0] = (cl_uint)num_elements;

    ptr = (int *)align_malloc( sizeof(cl_int) * num_elements, min_alignment);
    if ( ! ptr ){
        log_error( " unable to allocate %d bytes of memory\n", (int)sizeof(cl_int) * num_elements );
        return -1;
    }

    for (i=0; i<num_elements; i++)
        ptr[i] = i;

    // COPY_HOST_PTR: the implementation snapshots ptr at creation time
    buffers[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * num_elements, (void *)ptr, &err);
    if (err != CL_SUCCESS){
        print_error(err, "clCreateBuffer failed for CL_MEM_COPY_HOST_PTR\n");
        align_free( (void *)ptr );
        return -1;
    }

    err = create_single_kernel_helper( context, &program[0], &kernel[0], 1, &mem_read_write_kernel_code, "test_mem_read_write" );
    if (err){
        clReleaseMemObject( buffers[0] );
        align_free( (void *)ptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = get_max_common_work_group_size( context, kernel[0], global_work_size[0], &local_work_size[0] );
    test_error( err, "Unable to get work group size to use" );
#endif

    err = clSetKernelArg( kernel[0], 0, sizeof( cl_mem ), (void *)&buffers[0] );
    if (err != CL_SUCCESS){
        log_error("clSetKernelArgs failed\n");
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)ptr );
        return -1;
    }

#ifdef USE_LOCAL_WORK_GROUP
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, local_work_size, 0, NULL, NULL );
#else
    err = clEnqueueNDRangeKernel( queue, kernel[0], 1, NULL, global_work_size, NULL, 0, NULL, NULL );
#endif
    if (err != CL_SUCCESS){
        log_error("clEnqueueNDRangeKernel failed\n");
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)ptr );
        return -1;
    }

    err = clEnqueueReadBuffer( queue, buffers[0], true, 0, sizeof(cl_int)*num_elements, (void *)ptr, 0, NULL, NULL );
    if (err != CL_SUCCESS){
        // Fix: the old message referred to a nonexistent
        // "CL_MEM_ALLOC_CONSTANT_POOL" flag; report the failing call instead.
        log_error("clEnqueueReadBuffer failed for CL_MEM_COPY_HOST_PTR buffer.\n");
        clReleaseMemObject( buffers[0] );
        clReleaseKernel( kernel[0] );
        clReleaseProgram( program[0] );
        align_free( (void *)ptr );
        return -1;
    }

    if ( verify_mem( ptr, num_elements ) ){
        log_error("CL_MEM_COPY_HOST_PTR test failed\n");
        err = -1;
    }
    else{
        log_info("CL_MEM_COPY_HOST_PTR test passed\n");
        err = 0;
    }

    // cleanup
    clReleaseMemObject( buffers[0] );
    clReleaseKernel( kernel[0] );
    clReleaseProgram( program[0] );
    align_free( (void *)ptr );

    return err;
}   // end test_mem_copy_host_flags()

View File

@@ -0,0 +1,417 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include <stdio.h>
#include <stdlib.h>
#include "procs.h"
#include "../../test_common/harness/errorHelpers.h"
#include "../../test_common/harness/testHarness.h"
// NOTE(review): MAX_SUB_DEVICES is not referenced anywhere in this file's
// visible code (arrays are sized from CL_DEVICE_MAX_COMPUTE_UNITS instead);
// confirm before removing.
#define MAX_SUB_DEVICES 16 // Limit the sub-devices to ensure no out of resource errors.
// Number of cl_uint elements in each test buffer.
#define BUFFER_SIZE 1024
// Kernel source code: each work item XORs one element of the two input
// buffers with the scalar x and stores the result in dst.
static const char *buffer_migrate_kernel_code =
"__kernel void test_buffer_migrate(__global uint *dst, __global uint *src1, __global uint *src2, uint x)\n"
"{\n"
"  int tid = get_global_id(0);\n"
" dst[tid] = src1[tid] ^ src2[tid] ^ x;\n"
"}\n";
// Migration policies exercised by the test; see migrateMemObject().
enum migrations { MIGRATE_PREFERRED,       // migrate to the preferred sub-device
                  MIGRATE_NON_PREFERRED,   // migrate to a randomly chosen non-preferred sub-device
                  MIGRATE_RANDOM,          // migrate to a randomly chosen sub-device with randomly chosen flags
                  NUMBER_OF_MIGRATIONS };
// Blocking-write BUFFER_SIZE cl_uints of initialization data into `buffer`.
// A NULL buffer is passed through untouched, so this can wrap a possibly
// failed clCreateBuffer() call; the handle is returned unchanged either way.
static cl_mem init_buffer(cl_command_queue cmd_q, cl_mem buffer, cl_uint *data)
{
    cl_int status;

    if (buffer == NULL)
        return NULL;

    status = clEnqueueWriteBuffer(cmd_q, buffer, CL_TRUE, 0,
                                  BUFFER_SIZE * sizeof(cl_uint),
                                  data, 0, NULL, NULL);
    if (status != CL_SUCCESS)
        print_error(status, "Failed on enqueue write of buffer data.");

    return buffer;
}
// Enqueue a migration of each of the num_devices memory objects to a
// destination queue chosen per the requested policy:
//   MIGRATE_PREFERRED     - same-index (preferred) sub-device
//   MIGRATE_NON_PREFERRED - a different sub-device when more than one exists
//   MIGRATE_RANDOM        - random sub-device with random migration flags
// flags[i] records the flags used for object i so the caller can tell
// whether its content became undefined (see restoreBuffer()).
//
// Returns CL_SUCCESS if every migration was enqueued successfully, or the
// first error encountered. (Previously `err` was overwritten every
// iteration, so a later success could mask an earlier failure.)
static cl_int migrateMemObject(enum migrations migrate, cl_command_queue *queues, cl_mem *mem_objects, cl_uint num_devices, cl_mem_migration_flags *flags, MTdata d)
{
    cl_uint i, j;
    cl_int err;
    cl_int first_err = CL_SUCCESS;

    for (i=0; i<num_devices; i++) {
        // Start from a randomly chosen destination queue.
        j = genrand_int32(d) % num_devices;
        flags[i] = 0;
        switch (migrate) {
            case MIGRATE_PREFERRED:
                // Force the device to be preferred
                j = i;
                break;
            case MIGRATE_NON_PREFERRED:
                // Coerce the device to be non-preferred
                if ((j == i) && (num_devices > 1)) j = (j+1) % num_devices;
                break;
            case MIGRATE_RANDOM:
                // Choose a random set of flags
                flags[i] = (cl_mem_migration_flags)(genrand_int32(d) & (CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED));
                break;
        }
        err = clEnqueueMigrateMemObjects(queues[j], 1, (const cl_mem *)(&mem_objects[i]), flags[i], 0, NULL, NULL);
        if (err != CL_SUCCESS) {
            print_error(err, "Failed migrating memory object.");
            // Remember the first failure but keep going so every flags[i]
            // entry is initialized for the caller.
            if (first_err == CL_SUCCESS) first_err = err;
        }
    }
    return first_err;
}
// Re-initialize any buffer whose content became undefined.
//
// After a migration performed with CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED
// the buffer's contents are unspecified, so blocking-write the reference
// data in `buffer` back into each affected cl_mem before the next round.
// Returns CL_SUCCESS, or the first write error encountered.
// (The unused local `j` from the original declaration has been removed.)
static cl_int restoreBuffer(cl_command_queue *queues, cl_mem *buffers, cl_uint num_devices, cl_mem_migration_flags *flags, cl_uint *buffer)
{
    cl_uint i;
    cl_int err;

    // If the buffer was previously migrated with undefined content, reload the content.
    for (i=0; i<num_devices; i++) {
        if (flags[i] & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
            if ((err = clEnqueueWriteBuffer(queues[i], buffers[i], CL_TRUE, 0, sizeof(cl_uint)*BUFFER_SIZE, buffer, 0, NULL, NULL)) != CL_SUCCESS) {
                print_error(err, "Failed on restoration enqueue write of buffer data.");
                return err;
            }
        }
    }
    return CL_SUCCESS;
}
// Exercise clEnqueueMigrateMemObjects() on buffers shared between sub-devices.
//
// The device is partitioned once per supported affinity domain (and one
// final pass with domain == 0 runs on the unpartitioned device).  For each
// partitioning, three buffers per sub-device (two inputs A/B, one output C)
// are created; then for every combination of migration policies for A, C
// and B the buffers are migrated, a simple XOR kernel runs on each
// sub-device's queue, and the output is verified whenever neither input was
// migrated with CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED.  Buffers left
// undefined by a random migration are rewritten before the next round.
//
// Returns 0 on success, -1 on failure.
int test_buffer_migrate(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    int failed = 0;
    cl_uint i, j;
    cl_int err;
    cl_uint max_sub_devices = 0;
    cl_uint num_devices, num_devices_limited;
    cl_uint A[BUFFER_SIZE], B[BUFFER_SIZE], C[BUFFER_SIZE];
    cl_uint test_number = 1;
    cl_device_affinity_domain domain, domains;
    cl_device_id *devices;
    cl_command_queue *queues;
    cl_mem_migration_flags *flagsA, *flagsB, *flagsC;
    cl_device_partition_property property[] = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0, 0};
    cl_mem *bufferA, *bufferB, *bufferC;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_context ctx = NULL; // context for all sub-devices
    enum migrations migrateA, migrateB, migrateC;
    MTdata d = init_genrand(gRandomSeed);
    const size_t wgs[1] = {BUFFER_SIZE};

    /* Allocate arrays whose size varies according to the maximum number of sub-devices */
    // The compute-unit count serves as an upper bound on how many
    // sub-devices any partitioning can produce.
    if ((err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_sub_devices), &max_sub_devices, NULL)) != CL_SUCCESS) {
        print_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS) failed");
        return -1;
    }
    if (max_sub_devices < 1) {
        log_error("ERROR: Invalid number of compute units returned.\n");
        return -1;
    }

    devices = (cl_device_id *)malloc(max_sub_devices * sizeof(cl_device_id));
    queues = (cl_command_queue *)malloc(max_sub_devices * sizeof(cl_command_queue));
    flagsA = (cl_mem_migration_flags *)malloc(max_sub_devices * sizeof(cl_mem_migration_flags));
    flagsB = (cl_mem_migration_flags *)malloc(max_sub_devices * sizeof(cl_mem_migration_flags));
    flagsC = (cl_mem_migration_flags *)malloc(max_sub_devices * sizeof(cl_mem_migration_flags));
    bufferA = (cl_mem *)malloc(max_sub_devices * sizeof(cl_mem));
    bufferB = (cl_mem *)malloc(max_sub_devices * sizeof(cl_mem));
    bufferC = (cl_mem *)malloc(max_sub_devices * sizeof(cl_mem));

    if ((devices == NULL) || (queues == NULL) ||
        (flagsA == NULL) || (flagsB == NULL) || (flagsC == NULL) ||
        (bufferA == NULL) || (bufferB == NULL) || (bufferC == NULL)) {
        log_error("ERROR: Failed to successfully allocate required local buffers.\n");
        failed = -1;
        goto cleanup_allocations;
    }

    // NULL everything so the goto-based cleanup paths can safely release
    // only what was actually created.
    for (i=0; i<max_sub_devices; i++) {
        devices[i] = NULL;
        queues [i] = NULL;
        bufferA[i] = bufferB[i] = bufferC[i] = NULL;
    }

    // Reference input data for the two source buffers.
    for (i=0; i<BUFFER_SIZE; i++) {
        A[i] = genrand_int32(d);
        B[i] = genrand_int32(d);
    }

    // Attempt to partition the device along each of the allowed affinity domain.
    // NOTE(review): returning here leaks the eight arrays allocated above;
    // consider `failed = -1; goto cleanup_allocations;` instead.
    if ((err = clGetDeviceInfo(deviceID, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, sizeof(domains), &domains, NULL)) != CL_SUCCESS) {
        print_error(err, "clGetDeviceInfo(CL_PARTITION_AFFINITY_DOMAIN) failed");
        return -1;
    }

    domains &= (CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE | CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE |
                CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE | CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE | CL_DEVICE_AFFINITY_DOMAIN_NUMA);

    // One pass per supported affinity domain; the final pass (domain == 0)
    // exercises the APIs on the whole, unpartitioned device.
    do {
        if (domains) {
            // Extract the lowest remaining domain bit and clear it.
            for (domain = 1; (domain & domains) == 0; domain <<= 1) {};
            domains &= ~domain;
        } else {
            domain = 0;
        }

        // Determine the number of partitions for the device given the specific domain.
        if (domain) {
            property[1] = domain;
            err = clCreateSubDevices(deviceID, (const cl_device_partition_property *)property, -1, NULL, &num_devices);
            if ((err != CL_SUCCESS) || (num_devices == 0)) {
                print_error(err, "Obtaining the number of partions by affinity failed.");
                failed = 1;
                goto cleanup;
            }
        } else {
            num_devices = 1;
        }

        if (num_devices > 1) {
            // Create each of the sub-devices and a corresponding context.
            if ((err = clCreateSubDevices(deviceID, (const cl_device_partition_property *)property, num_devices, devices, &num_devices)) != CL_SUCCESS) {
                print_error(err, "Failed creating sub devices.");
                failed = 1;
                goto cleanup;
            }

            // Create a context containing all the sub-devices
            ctx = clCreateContext(NULL, num_devices, devices, notify_callback, NULL, &err);
            if (ctx == NULL) {
                print_error(err, "Failed creating context containing the sub-devices.");
                failed = 1;
                goto cleanup;
            }

            // Create a command queue for each sub-device
            for (i=0; i<num_devices; i++) {
                if (devices[i]) {
                    if ((queues[i] = clCreateCommandQueueWithProperties(ctx, devices[i], 0, &err)) == NULL) {
                        print_error(err, "Failed creating command queues.");
                        failed = 1;
                        goto cleanup;
                    }
                }
            }
        } else {
            // No partitioning available. Just exercise the APIs on a single device.
            devices[0] = deviceID;
            queues[0] = queue;
            ctx = context;
        }

        // Build the kernel program.
        // (The assignment inside the `if` is intentional: any non-zero
        // result from the helper is an error.)
        if (err = create_single_kernel_helper(ctx, &program, &kernel, 1, &buffer_migrate_kernel_code, "test_buffer_migrate")) {
            print_error(err, "Failed creating kernel.");
            failed = 1;
            goto cleanup;
        }

        num_devices_limited = num_devices;

        // Allocate memory buffers. 3 buffers (2 input, 1 output) for each sub-device.
        // If we run out of memory, then restrict the number of sub-devices to be tested.
        for (i=0; i<num_devices; i++) {
            bufferA[i] = init_buffer(queues[i], clCreateBuffer(ctx, (CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR), sizeof(cl_uint) * BUFFER_SIZE, NULL, &err), A);
            bufferB[i] = init_buffer(queues[i], clCreateBuffer(ctx, (CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR), sizeof(cl_uint) * BUFFER_SIZE, NULL, &err), B);
            bufferC[i] = clCreateBuffer(ctx, (CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR), sizeof(cl_uint) * BUFFER_SIZE, NULL, &err);

            if ((bufferA[i] == NULL) || (bufferB[i] == NULL) || (bufferC[i] == NULL)) {
                if (i == 0) {
                    log_error("Failed to allocate even 1 set of buffers.\n");
                    failed = 1;
                    goto cleanup;
                }
                num_devices_limited = i;
                break;
            }
        }

        // For each partition, we will execute the test kernel with each of the 3 buffers migrated to one of the migrate options
        for (migrateA=(enum migrations)(0); migrateA<NUMBER_OF_MIGRATIONS; migrateA = (enum migrations)((int)migrateA + 1)) {
            if (migrateMemObject(migrateA, queues, bufferA, num_devices_limited, flagsA, d) != CL_SUCCESS) {
                failed = 1;
                goto cleanup;
            }
            for (migrateC=(enum migrations)(0); migrateC<NUMBER_OF_MIGRATIONS; migrateC = (enum migrations)((int)migrateC + 1)) {
                if (migrateMemObject(migrateC, queues, bufferC, num_devices_limited, flagsC, d) != CL_SUCCESS) {
                    failed = 1;
                    goto cleanup;
                }
                for (migrateB=(enum migrations)(0); migrateB<NUMBER_OF_MIGRATIONS; migrateB = (enum migrations)((int)migrateB + 1)) {
                    if (migrateMemObject(migrateB, queues, bufferB, num_devices_limited, flagsB, d) != CL_SUCCESS) {
                        failed = 1;
                        goto cleanup;
                    }
                    // Run the test on each of the partitions.
                    for (i=0; i<num_devices_limited; i++) {
                        cl_uint x;
                        x = i + test_number;
                        if ((err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (const void *)&bufferC[i])) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 0.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (const void *)&bufferA[i])) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 1.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (const void *)&bufferB[i])) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 2.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clSetKernelArg(kernel, 3, sizeof(cl_uint), (const void *)&x)) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 3.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clEnqueueNDRangeKernel(queues[i], kernel, 1, NULL, wgs, NULL, 0, NULL, NULL)) != CL_SUCCESS) {
                            print_error(err, "Failed enqueueing the NDRange kernel.");
                            failed = 1;
                            goto cleanup;
                        }
                    }
                    // Verify the results as long as neither input is an undefined migration
                    for (i=0; i<num_devices_limited; i++, test_number++) {
                        if (((flagsA[i] | flagsB[i]) & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) == 0) {
                            if ((err = clEnqueueReadBuffer(queues[i], bufferC[i], CL_TRUE, 0, sizeof(cl_uint)*BUFFER_SIZE, C, 0, NULL, NULL)) != CL_SUCCESS) {
                                print_error(err, "Failed reading output buffer.");
                                failed = 1;
                                goto cleanup;
                            }
                            for (j=0; j<BUFFER_SIZE; j++) {
                                cl_uint expected;
                                expected = A[j] ^ B[j] ^ test_number;
                                if (C[j] != expected) {
                                    log_error("Failed on device %d, work item %4d, expected 0x%08x got 0x%08x (0x%08x ^ 0x%08x ^ 0x%08x)\n", i, j, expected, C[j], A[j], B[j], test_number);
                                    failed = 1;
                                }
                            }
                            if (failed) goto cleanup;
                        }
                    }
                    // Reload any B buffer whose content was left undefined by
                    // a random migration before the next iteration.
                    if (restoreBuffer(queues, bufferB, num_devices_limited, flagsB, B) != CL_SUCCESS) {
                        failed = 1;
                        goto cleanup;
                    }
                }
            }
            if (restoreBuffer(queues, bufferA, num_devices_limited, flagsA, A) != CL_SUCCESS) {
                failed = 1;
                goto cleanup;
            }
        }

    cleanup:
        // Clean up all the allocted resources create by the test. This includes sub-devices,
        // command queues, and memory buffers.
        // NOTE(review): released buffer/queue handles are not reset to NULL
        // here (only devices[i] is); if a later domain iteration creates
        // fewer objects, the stale handles would be released again on the
        // next pass through cleanup — confirm and consider NULLing them.
        for (i=0; i<max_sub_devices; i++) {
            // Memory buffer cleanup
            if (bufferA[i]) {
                if ((err = clReleaseMemObject(bufferA[i])) != CL_SUCCESS) {
                    print_error(err, "Failed releasing memory object.");
                    failed = 1;
                }
            }
            if (bufferB[i]) {
                if ((err = clReleaseMemObject(bufferB[i])) != CL_SUCCESS) {
                    print_error(err, "Failed releasing memory object.");
                    failed = 1;
                }
            }
            if (bufferC[i]) {
                if ((err = clReleaseMemObject(bufferC[i])) != CL_SUCCESS) {
                    print_error(err, "Failed releasing memory object.");
                    failed = 1;
                }
            }
            if (num_devices > 1) {
                // Command queue cleanup
                if (queues[i]) {
                    if ((err = clReleaseCommandQueue(queues[i])) != CL_SUCCESS) {
                        print_error(err, "Failed releasing command queue.");
                        failed = 1;
                    }
                }
                // Sub-device cleanup
                if (devices[i]) {
                    if ((err = clReleaseDevice(devices[i])) != CL_SUCCESS) {
                        print_error(err, "Failed releasing sub device.");
                        failed = 1;
                    }
                }
                devices[i] = 0;
            }
        }

        // Context, program, and kernel cleanup
        if (program) {
            if ((err = clReleaseProgram(program)) != CL_SUCCESS) {
                print_error(err, "Failed releasing program.");
                failed = 1;
            }
            program = NULL;
        }
        if (kernel) {
            if ((err = clReleaseKernel(kernel)) != CL_SUCCESS) {
                print_error(err, "Failed releasing kernel.");
                failed = 1;
            }
            kernel = NULL;
        }
        // Only release the context if it was created here; the caller owns
        // the original `context`.
        if (ctx && (ctx != context)) {
            if ((err = clReleaseContext(ctx)) != CL_SUCCESS) {
                print_error(err, "Failed releasing context.");
                failed = 1;
            }
        }
        ctx = NULL;

        if (failed) goto cleanup_allocations;
    } while (domains);

cleanup_allocations:
    if (devices) free(devices);
    if (queues) free(queues);
    if (flagsA) free(flagsA);
    if (flagsB) free(flagsB);
    if (flagsC) free(flagsC);
    if (bufferA) free(bufferA);
    if (bufferB) free(bufferB);
    if (bufferC) free(bufferC);
    return ((failed) ? -1 : 0);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,487 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include <stdio.h>
#include <stdlib.h>
#include "procs.h"
#include "../../test_common/harness/errorHelpers.h"
// NOTE(review): MAX_SUB_DEVICES is not referenced anywhere in this file's
// visible code (arrays are sized from CL_DEVICE_MAX_COMPUTE_UNITS instead);
// confirm before removing.
#define MAX_SUB_DEVICES 16 // Limit the sub-devices to ensure no out of resource errors.
// Per-image element count: IMAGE_DIM x IMAGE_DIM pixels, 4 cl_uint channels
// each (CL_RGBA / CL_UNSIGNED_INT32) -> 16*16*4 = 1024.
#define MEM_OBJ_SIZE    1024
#define IMAGE_DIM       16
// Kernel source code: each work item XORs the sampled texels of the two
// input images with the scalar x and writes the result to dst.
static const char *image_migrate_kernel_code =
"__kernel void test_image_migrate(write_only image2d_t dst, read_only image2d_t src1,\n"
" read_only image2d_t src2, sampler_t sampler, uint x)\n"
"{\n"
" int tidX = get_global_id(0), tidY = get_global_id(1);\n"
" int2 coords = (int2) {tidX, tidY};\n"
" uint4 val = read_imageui(src1, sampler, coords) ^\n"
" read_imageui(src2, sampler, coords) ^\n"
" x;\n"
" write_imageui(dst, coords, val);\n"
"}\n";
// Migration policies exercised by the test; see migrateMemObject().
enum migrations { MIGRATE_PREFERRED,       // migrate to the preferred sub-device
                  MIGRATE_NON_PREFERRED,   // migrate to a randomly chosen non-preferred sub-device
                  MIGRATE_RANDOM,          // migrate to a randomly chosen sub-device with randomly chosen flags
                  NUMBER_OF_MIGRATIONS };
// Blocking-write the full IMAGE_DIM x IMAGE_DIM initialization data into
// `image`.  A NULL image is passed through untouched, so this can wrap a
// possibly failed create_image_2d() call; the handle is returned unchanged.
static cl_mem init_image(cl_command_queue cmd_q, cl_mem image, cl_uint *data)
{
    size_t zero_origin[3] = {0, 0, 0};
    size_t full_region[3] = {IMAGE_DIM, IMAGE_DIM, 1};
    cl_int status;

    if (image == NULL)
        return NULL;

    status = clEnqueueWriteImage(cmd_q, image, CL_TRUE,
                                 zero_origin, full_region, 0, 0,
                                 data, 0, NULL, NULL);
    if (status != CL_SUCCESS)
        print_error(status, "Failed on enqueue write of image data.");

    return image;
}
// Enqueue a migration of each of the num_devices images to a destination
// queue chosen per the requested policy:
//   MIGRATE_PREFERRED     - same-index (preferred) sub-device
//   MIGRATE_NON_PREFERRED - a different sub-device when more than one exists
//   MIGRATE_RANDOM        - random sub-device with random migration flags
// flags[i] records the flags used for object i so the caller can tell
// whether its content became undefined (see restoreImage()).
//
// Returns CL_SUCCESS if every migration was enqueued successfully, or the
// first error encountered. (Previously `err` was overwritten every
// iteration, so a later success could mask an earlier failure.)
static cl_int migrateMemObject(enum migrations migrate, cl_command_queue *queues, cl_mem *mem_objects,
                               cl_uint num_devices, cl_mem_migration_flags *flags, MTdata d)
{
    cl_uint i, j;
    cl_int err;
    cl_int first_err = CL_SUCCESS;

    for (i=0; i<num_devices; i++) {
        // Start from a randomly chosen destination queue.
        j = genrand_int32(d) % num_devices;
        flags[i] = 0;
        switch (migrate) {
            case MIGRATE_PREFERRED:
                // Force the device to be preferred
                j = i;
                break;
            case MIGRATE_NON_PREFERRED:
                // Coerce the device to be non-preferred
                if ((j == i) && (num_devices > 1)) j = (j+1) % num_devices;
                break;
            case MIGRATE_RANDOM:
                // Choose a random set of flags
                flags[i] = (cl_mem_migration_flags)(genrand_int32(d) & (CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED));
                break;
        }
        err = clEnqueueMigrateMemObjects(queues[j], 1, (const cl_mem *)(&mem_objects[i]),
                                         flags[i], 0, NULL, NULL);
        if (err != CL_SUCCESS) {
            print_error(err, "Failed migrating memory object.");
            // Remember the first failure but keep going so every flags[i]
            // entry is initialized for the caller.
            if (first_err == CL_SUCCESS) first_err = err;
        }
    }
    return first_err;
}
// Re-initialize any image whose content became undefined.
//
// After a migration performed with CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED
// the image's contents are unspecified, so blocking-write the reference
// data in `buffer` back into each affected image before the next round.
// Returns CL_SUCCESS, or the first write error encountered.
static cl_int restoreImage(cl_command_queue *queues, cl_mem *mem_objects, cl_uint num_devices,
                           cl_mem_migration_flags *flags, cl_uint *buffer)
{
    const size_t zero_origin[3] = {0, 0, 0};
    const size_t full_region[3] = {IMAGE_DIM, IMAGE_DIM, 1};
    cl_uint dev;
    cl_int status;

    for (dev = 0; dev < num_devices; dev++) {
        // Only images migrated with undefined content need reloading.
        if ((flags[dev] & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) == 0)
            continue;

        status = clEnqueueWriteImage(queues[dev], mem_objects[dev], CL_TRUE,
                                     zero_origin, full_region, 0, 0,
                                     buffer, 0, NULL, NULL);
        if (status != CL_SUCCESS) {
            print_error(status, "Failed on restoration enqueue write of image data.");
            return status;
        }
    }
    return CL_SUCCESS;
}
// Declaration moved out of protected scope/goto
// Sampler creation properties used by test_image_migrate():
// unnormalized coordinates, clamp addressing, nearest filtering.
// Kept at file scope so the goto-based cleanup paths in the test do not
// jump over a local initialization.
cl_sampler_properties properties[] = {
    CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_CLAMP,
    CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
    0
};
// Exercise clEnqueueMigrateMemObjects() on 2D images shared between
// sub-devices.  Mirrors test_buffer_migrate: the device is partitioned once
// per supported affinity domain (plus one unpartitioned pass), three RGBA
// uint32 images per sub-device (two inputs A/B, one output C) are created,
// and for every combination of migration policies the images are migrated,
// an XOR kernel runs on each sub-device's queue, and the output is verified
// whenever neither input was migrated with
// CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED.
//
// Returns 0 on success (or if the device lacks image support), -1 on failure.
int test_image_migrate(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    int failed = 0;
    cl_uint i, j;
    cl_int err;
    cl_uint max_sub_devices = 0;
    cl_uint num_devices, num_devices_limited;
    cl_uint A[MEM_OBJ_SIZE], B[MEM_OBJ_SIZE], C[MEM_OBJ_SIZE];
    cl_uint test_number = 1;
    cl_device_affinity_domain domain, domains;
    cl_device_id *devices;
    cl_command_queue *queues;
    cl_mem_migration_flags *flagsA, *flagsB, *flagsC;
    cl_device_partition_property property[] = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0, 0};
    cl_mem *imageA, *imageB, *imageC;
    cl_mem_flags flags;
    cl_image_format format;
    cl_sampler sampler = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_context ctx = NULL; // context for all sub-devices
    enum migrations migrateA, migrateB, migrateC;
    MTdata d = init_genrand(gRandomSeed);
    const size_t wgs[2] = {IMAGE_DIM, IMAGE_DIM};
    const size_t wls[2] = {1, 1};

    // Check for image support.
    if(checkForImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED) {
        log_info("Device does not support images. Skipping test.\n");
        return 0;
    }

    // Allocate arrays whose size varies according to the maximum number of sub-devices.
    // The compute-unit count serves as an upper bound on how many
    // sub-devices any partitioning can produce.
    if ((err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_sub_devices), &max_sub_devices, NULL)) != CL_SUCCESS) {
        print_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS) failed");
        return -1;
    }
    if (max_sub_devices < 1) {
        log_error("ERROR: Invalid number of compute units returned.\n");
        return -1;
    }

    devices = (cl_device_id *)malloc(max_sub_devices * sizeof(cl_device_id));
    queues = (cl_command_queue *)malloc(max_sub_devices * sizeof(cl_command_queue));
    flagsA = (cl_mem_migration_flags *)malloc(max_sub_devices * sizeof(cl_mem_migration_flags));
    flagsB = (cl_mem_migration_flags *)malloc(max_sub_devices * sizeof(cl_mem_migration_flags));
    flagsC = (cl_mem_migration_flags *)malloc(max_sub_devices * sizeof(cl_mem_migration_flags));
    imageA = (cl_mem *)malloc(max_sub_devices * sizeof(cl_mem));
    imageB = (cl_mem *)malloc(max_sub_devices * sizeof(cl_mem));
    imageC = (cl_mem *)malloc(max_sub_devices * sizeof(cl_mem));

    if ((devices == NULL) || (queues == NULL) ||
        (flagsA == NULL) || (flagsB == NULL) || (flagsC == NULL) ||
        (imageA == NULL) || (imageB == NULL) || (imageC == NULL)) {
        log_error("ERROR: Failed to successfully allocate required local buffers.\n");
        failed = -1;
        goto cleanup_allocations;
    }

    // NULL everything so the goto-based cleanup paths can safely release
    // only what was actually created.
    for (i=0; i<max_sub_devices; i++) {
        devices[i] = NULL;
        queues [i] = NULL;
        imageA[i] = imageB[i] = imageC[i] = NULL;
    }

    // Reference input data for the two source images.
    for (i=0; i<MEM_OBJ_SIZE; i++) {
        A[i] = genrand_int32(d);
        B[i] = genrand_int32(d);
    }

    // Set image format.
    format.image_channel_order = CL_RGBA;
    format.image_channel_data_type = CL_UNSIGNED_INT32;

    // Attempt to partition the device along each of the allowed affinity domain.
    // NOTE(review): returning here leaks the eight arrays allocated above;
    // consider `failed = -1; goto cleanup_allocations;` instead.
    if ((err = clGetDeviceInfo(deviceID, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, sizeof(domains), &domains, NULL)) != CL_SUCCESS) {
        print_error(err, "clGetDeviceInfo(CL_PARTITION_AFFINITY_DOMAIN) failed");
        return -1;
    }

    domains &= (CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE | CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE |
                CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE | CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE | CL_DEVICE_AFFINITY_DOMAIN_NUMA);

    // One pass per supported affinity domain; the final pass (domain == 0)
    // exercises the APIs on the whole, unpartitioned device.
    do {
        if (domains) {
            // Extract the lowest remaining domain bit and clear it.
            for (domain = 1; (domain & domains) == 0; domain <<= 1) {};
            domains &= ~domain;
        } else {
            domain = 0;
        }

        // Determine the number of partitions for the device given the specific domain.
        if (domain) {
            property[1] = domain;
            err = clCreateSubDevices(deviceID, (const cl_device_partition_property *)property, -1, NULL, &num_devices);
            if ((err != CL_SUCCESS) || (num_devices == 0)) {
                print_error(err, "Obtaining the number of partions by affinity failed.");
                failed = 1;
                goto cleanup;
            }
        } else {
            num_devices = 1;
        }

        if (num_devices > 1) {
            // Create each of the sub-devices and a corresponding context.
            if ((err = clCreateSubDevices(deviceID, (const cl_device_partition_property *)property, num_devices, devices, &num_devices)) != CL_SUCCESS) {
                print_error(err, "Failed creating sub devices.");
                failed = 1;
                goto cleanup;
            }

            // Create a context containing all the sub-devices
            ctx = clCreateContext(NULL, num_devices, devices, notify_callback, NULL, &err);
            if (ctx == NULL) {
                print_error(err, "Failed creating context containing the sub-devices.");
                failed = 1;
                goto cleanup;
            }

            // Create a command queue for each sub-device
            for (i=0; i<num_devices; i++) {
                if (devices[i]) {
                    if ((queues[i] = clCreateCommandQueueWithProperties(ctx, devices[i], 0, &err)) == NULL) {
                        print_error(err, "Failed creating command queues.");
                        failed = 1;
                        goto cleanup;
                    }
                }
            }
        } else {
            // No partitioning available. Just exercise the APIs on a single device.
            devices[0] = deviceID;
            queues[0] = queue;
            ctx = context;
        }

        // Build the kernel program.
        // (The assignment inside the `if` is intentional: any non-zero
        // result from the helper is an error.)
        if (err = create_single_kernel_helper(ctx, &program, &kernel, 1, &image_migrate_kernel_code, "test_image_migrate")) {
            print_error(err, "Failed creating kernel.");
            failed = 1;
            goto cleanup;
        }

        // Create sampler.
        sampler = clCreateSamplerWithProperties(ctx, properties, &err );
        if ((err != CL_SUCCESS) || !sampler) {
            print_error(err, "Failed to create a sampler.");
            failed = 1;
            goto cleanup;
        }

        num_devices_limited = num_devices;

        // Allocate memory buffers. 3 buffers (2 input, 1 output) for each sub-device.
        // If we run out of memory, then restrict the number of sub-devices to be tested.
        for (i=0; i<num_devices; i++) {
            imageA[i] = init_image(queues[i], create_image_2d(ctx, (CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR),
                                                              &format, IMAGE_DIM, IMAGE_DIM, 0, NULL, &err), A);
            imageB[i] = init_image(queues[i], create_image_2d(ctx, (CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR),
                                                              &format, IMAGE_DIM, IMAGE_DIM, 0, NULL, &err), B);
            imageC[i] = create_image_2d(ctx, (CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR),
                                        &format, IMAGE_DIM, IMAGE_DIM, 0, NULL, &err);

            if ((imageA[i] == NULL) || (imageB[i] == NULL) || (imageC[i] == NULL)) {
                if (i == 0) {
                    log_error("Failed to allocate even 1 set of buffers.\n");
                    failed = 1;
                    goto cleanup;
                }
                num_devices_limited = i;
                break;
            }
        }

        // For each partition, we will execute the test kernel with each of the 3 buffers migrated to one of the migrate options
        for (migrateA=(enum migrations)(0); migrateA<NUMBER_OF_MIGRATIONS; migrateA = (enum migrations)((int)migrateA + 1)) {
            if (migrateMemObject(migrateA, queues, imageA, num_devices_limited, flagsA, d) != CL_SUCCESS) {
                failed = 1;
                goto cleanup;
            }
            for (migrateC=(enum migrations)(0); migrateC<NUMBER_OF_MIGRATIONS; migrateC = (enum migrations)((int)migrateC + 1)) {
                if (migrateMemObject(migrateC, queues, imageC, num_devices_limited, flagsC, d) != CL_SUCCESS) {
                    failed = 1;
                    goto cleanup;
                }
                for (migrateB=(enum migrations)(0); migrateB<NUMBER_OF_MIGRATIONS; migrateB = (enum migrations)((int)migrateB + 1)) {
                    if (migrateMemObject(migrateB, queues, imageB, num_devices_limited, flagsB, d) != CL_SUCCESS) {
                        failed = 1;
                        goto cleanup;
                    }
                    // Run the test on each of the partitions.
                    for (i=0; i<num_devices_limited; i++) {
                        cl_uint x;
                        x = i + test_number;
                        if ((err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (const void *)&imageC[i])) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 0.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (const void *)&imageA[i])) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 1.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (const void *)&imageB[i])) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 2.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clSetKernelArg(kernel, 3, sizeof(cl_sampler), (const void *)&sampler)) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 3.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clSetKernelArg(kernel, 4, sizeof(cl_uint), (const void *)&x)) != CL_SUCCESS) {
                            print_error(err, "Failed set kernel argument 4.");
                            failed = 1;
                            goto cleanup;
                        }
                        if ((err = clEnqueueNDRangeKernel(queues[i], kernel, 2, NULL, wgs, wls, 0, NULL, NULL)) != CL_SUCCESS) {
                            print_error(err, "Failed enqueueing the NDRange kernel.");
                            failed = 1;
                            goto cleanup;
                        }
                    }
                    // Verify the results as long as neither input is an undefined migration
                    const size_t origin[3] = {0, 0, 0};
                    const size_t region[3] = {IMAGE_DIM, IMAGE_DIM, 1};
                    for (i=0; i<num_devices_limited; i++, test_number++) {
                        if (((flagsA[i] | flagsB[i]) & CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) == 0) {
                            if ((err = clEnqueueReadImage(queues[i], imageC[i], CL_TRUE,
                                                          origin, region, 0, 0, C, 0, NULL, NULL)) != CL_SUCCESS) {
                                print_error(err, "Failed reading output buffer.");
                                failed = 1;
                                goto cleanup;
                            }
                            for (j=0; j<MEM_OBJ_SIZE; j++) {
                                cl_uint expected;
                                expected = A[j] ^ B[j] ^ test_number;
                                if (C[j] != expected) {
                                    log_error("Failed on device %d, work item %4d, expected 0x%08x got 0x%08x (0x%08x ^ 0x%08x ^ 0x%08x)\n", i, j, expected, C[j], A[j], B[j], test_number);
                                    failed = 1;
                                }
                            }
                            if (failed) goto cleanup;
                        }
                    }
                    // Reload any B image whose content was left undefined by
                    // a random migration before the next iteration.
                    if (restoreImage(queues, imageB, num_devices_limited, flagsB, B) != CL_SUCCESS) {
                        failed = 1;
                        goto cleanup;
                    }
                }
            }
            if (restoreImage(queues, imageA, num_devices_limited, flagsA, A) != CL_SUCCESS) {
                failed = 1;
                goto cleanup;
            }
        }

    cleanup:
        // Clean up all the allocted resources create by the test. This includes sub-devices,
        // command queues, and memory buffers.
        // NOTE(review): released image/queue handles are not reset to NULL
        // here (only devices[i] is); if a later domain iteration creates
        // fewer objects, the stale handles would be released again on the
        // next pass through cleanup — confirm and consider NULLing them.
        for (i=0; i<max_sub_devices; i++) {
            // Memory buffer cleanup
            if (imageA[i]) {
                if ((err = clReleaseMemObject(imageA[i])) != CL_SUCCESS) {
                    print_error(err, "Failed releasing memory object.");
                    failed = 1;
                }
            }
            if (imageB[i]) {
                if ((err = clReleaseMemObject(imageB[i])) != CL_SUCCESS) {
                    print_error(err, "Failed releasing memory object.");
                    failed = 1;
                }
            }
            if (imageC[i]) {
                if ((err = clReleaseMemObject(imageC[i])) != CL_SUCCESS) {
                    print_error(err, "Failed releasing memory object.");
                    failed = 1;
                }
            }
            if (num_devices > 1) {
                // Command queue cleanup
                if (queues[i]) {
                    if ((err = clReleaseCommandQueue(queues[i])) != CL_SUCCESS) {
                        print_error(err, "Failed releasing command queue.");
                        failed = 1;
                    }
                }
                // Sub-device cleanup
                if (devices[i]) {
                    if ((err = clReleaseDevice(devices[i])) != CL_SUCCESS) {
                        print_error(err, "Failed releasing sub device.");
                        failed = 1;
                    }
                }
                devices[i] = 0;
            }
        }

        // Sampler cleanup
        if (sampler) {
            if ((err = clReleaseSampler(sampler)) != CL_SUCCESS) {
                print_error(err, "Failed releasing sampler.");
                failed = 1;
            }
            sampler = NULL;
        }

        // Context, program, and kernel cleanup
        if (program) {
            if ((err = clReleaseProgram(program)) != CL_SUCCESS) {
                print_error(err, "Failed releasing program.");
                failed = 1;
            }
            program = NULL;
        }
        if (kernel) {
            if ((err = clReleaseKernel(kernel)) != CL_SUCCESS) {
                print_error(err, "Failed releasing kernel.");
                failed = 1;
            }
            kernel = NULL;
        }
        // Only release the context if it was created here; the caller owns
        // the original `context`.
        if (ctx && (ctx != context)) {
            if ((err = clReleaseContext(ctx)) != CL_SUCCESS) {
                print_error(err, "Failed releasing context.");
                failed = 1;
            }
        }
        ctx = NULL;

        if (failed) goto cleanup_allocations;
    } while (domains);

cleanup_allocations:
    if (devices) free(devices);
    if (queues) free(queues);
    if (flagsA) free(flagsA);
    if (flagsB) free(flagsB);
    if (flagsC) free(flagsC);
    if (imageA) free(imageA);
    if (imageB) free(imageB);
    if (imageC) free(imageC);
    return ((failed) ? -1 : 0);
}

View File

@@ -0,0 +1,631 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
// Design:
// To test sub buffers, we first create one main buffer. We then create several sub-buffers and
// queue Actions on each one. Each Action is encapsulated in a class so it can keep track of
// what results it expects, and so we can test scaling degrees of Actions on scaling numbers of
// sub-buffers.
// Wrapper (via clMemWrapper's RAII) around one sub-buffer, remembering the
// region of the parent buffer it covers so actions can mirror their effects
// into the host-side model of the parent.
class SubBufferWrapper : public clMemWrapper
{
public:
    cl_mem mParentBuffer;
    size_t mOrigin;
    size_t mSize;

    // Creates the sub-buffer over [origin, origin+size) of `parent` and
    // stores the handle in the base wrapper. Returns the OpenCL error code.
    cl_int Allocate( cl_mem parent, cl_mem_flags flags, size_t origin, size_t size )
    {
        mParentBuffer = parent;
        mOrigin = origin;
        mSize = size;

        cl_buffer_region region = { mOrigin, mSize };

        cl_int error;
        mMem = clCreateSubBuffer( mParentBuffer, flags, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
        return error;
    }
};
// Base class for one operation applied to sub-buffers of a shared parent
// buffer.  Each concrete Action both performs the operation through the
// OpenCL API and mirrors its expected effect into parentBufferState (a
// host-side model of the whole parent buffer) used for final verification.
class Action
{
public:
    virtual ~Action() {}

    // Perform the operation on buffer1 (and possibly buffer2), OR-ing `tag`
    // into every byte the action touches so each action leaves a
    // distinguishable mark.  Returns CL_SUCCESS or an OpenCL error code.
    virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState ) = 0;

    virtual const char * GetName( void ) const = 0;

    // Random state shared by all actions, created lazily on first use.
    static MTdata d;
    static MTdata GetRandSeed( void )
    {
        if ( d == 0 )
            d = init_genrand( gRandomSeed );
        return d;
    }
    static void FreeRandSeed() {
        if ( d != 0 ) {
            free_mtdata(d);
            d = 0;
        }
    }
};
// Out-of-class definition of the shared random state.
MTdata Action::d = 0;
// Reads the whole sub-buffer to host memory, ORs `tag` into a random
// sub-range (mirroring the change into parentBufferState), then writes the
// data back.  The scratch buffer is now freed on the error paths as well
// (the original test_error early returns leaked it).
class ReadWriteAction : public Action
{
public:
    virtual ~ReadWriteAction() {}

    virtual const char * GetName( void ) const { return "ReadWrite";}

    virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState )
    {
        cl_char *tempBuffer = (cl_char*)malloc(buffer1.mSize);
        if (!tempBuffer) {
            log_error("Out of memory\n");
            return -1;
        }

        cl_int error = clEnqueueReadBuffer( queue, buffer1, CL_TRUE, 0, buffer1.mSize, tempBuffer, 0, NULL, NULL );
        if ( error != CL_SUCCESS )
        {
            free( tempBuffer );   // don't leak the scratch buffer on failure
            print_error( error, "Unable to enqueue buffer read" );
            return error;
        }

        // Tag a random sub-range and mirror it into the host-side model of
        // the parent buffer.
        size_t start = get_random_size_t( 0, buffer1.mSize / 2, GetRandSeed() );
        size_t end = get_random_size_t( start, buffer1.mSize, GetRandSeed() );
        for ( size_t i = start; i < end; i++ )
        {
            tempBuffer[ i ] |= tag;
            parentBufferState[ i + buffer1.mOrigin ] |= tag;
        }

        error = clEnqueueWriteBuffer( queue, buffer1, CL_TRUE, 0, buffer1.mSize, tempBuffer, 0, NULL, NULL );
        if ( error != CL_SUCCESS )
        {
            free( tempBuffer );
            print_error( error, "Unable to enqueue buffer write" );
            return error;
        }

        free(tempBuffer);
        return CL_SUCCESS;
    }
};
// Classic max/min macros; note each argument is evaluated twice, so avoid
// side-effecting expressions as arguments.
#ifndef MAX
#define MAX( _a, _b ) ( (_a) > (_b) ? (_a) : (_b) )
#endif
#ifndef MIN
#define MIN( _a, _b ) ( (_a) < (_b) ? (_a) : (_b) )
#endif
// Copies a random-sized span from sub-buffer 1 to sub-buffer 2 at random
// offsets, mirroring the copy into the host-side model of the parent buffer.
class CopyAction : public Action
{
public:
    virtual ~CopyAction() {}

    virtual const char * GetName( void ) const { return "Copy";}

    virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState )
    {
        // Pick a span that fits in both sub-buffers, then source and
        // destination offsets that keep it in range.
        // (Order of the three random draws matches the original so the
        // generated sequence of operations is identical.)
        size_t copyBytes = get_random_size_t( 0, MIN( buffer1.mSize, buffer2.mSize ), GetRandSeed() );
        size_t srcOffset = get_random_size_t( 0, buffer1.mSize - copyBytes, GetRandSeed() );
        size_t dstOffset = get_random_size_t( 0, buffer2.mSize - copyBytes, GetRandSeed() );

        cl_int error = clEnqueueCopyBuffer( queue, buffer1, buffer2, srcOffset, dstOffset, copyBytes, 0, NULL, NULL );
        test_error( error, "Unable to enqueue buffer copy" );

        // Mirror the device-side copy in the host model of the parent buffer.
        memcpy( parentBufferState + buffer2.mOrigin + dstOffset, parentBufferState + buffer1.mOrigin + srcOffset, copyBytes );
        return CL_SUCCESS;
    }
};
// Maps a random sub-range of the sub-buffer for read/write access, ORs `tag`
// into every mapped byte, mirrors that into the host reference state, and
// unmaps.
class MapAction : public Action
{
public:
    virtual ~MapAction() {}
    virtual const char * GetName( void ) const { return "Map";}
    virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState )
    {
        // Random [mapStart, mapStart + mapLength) window within the sub-buffer.
        size_t mapLength = get_random_size_t( 0, buffer1.mSize, GetRandSeed() );
        size_t mapStart = get_random_size_t( 0, buffer1.mSize - mapLength, GetRandSeed() );

        cl_int error;
        void *mappedPtr = clEnqueueMapBuffer( queue, buffer1, CL_TRUE, (cl_map_flags)( CL_MAP_READ | CL_MAP_WRITE ),
                                              mapStart, mapLength, 0, NULL, NULL, &error );
        test_error( error, "Unable to map buffer" );

        cl_char *mappedBytes = (cl_char *)mappedPtr;
        for ( size_t idx = 0; idx < mapLength; idx++ )
        {
            mappedBytes[ idx ] |= tag;
            parentBufferState[ idx + mapStart + buffer1.mOrigin ] |= tag;
        }

        error = clEnqueueUnmapMemObject( queue, buffer1, mappedPtr, 0, NULL, NULL );
        test_error( error, "Unable to unmap buffer" );
        return CL_SUCCESS;
    }
};
// Runs a tiny kernel that ORs `tag` into every byte of the sub-buffer, then
// applies the same update to the host reference state.
class KernelReadWriteAction : public Action
{
public:
    virtual ~KernelReadWriteAction() {}
    virtual const char * GetName( void ) const { return "KernelReadWrite";}
    virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState )
    {
        const char *kernelCode[] = {
            "__kernel void readTest( __global char *inBuffer, char tag )\n"
            "{\n"
            "    int tid = get_global_id(0);\n"
            "    inBuffer[ tid ] |= tag;\n"
            "}\n" };

        clProgramWrapper program;
        clKernelWrapper kernel;
        cl_int error;

        if ( create_single_kernel_helper( context, &program, &kernel, 1, kernelCode, "readTest" ) )
            return -1;

        // One work-item per byte of the sub-buffer.
        size_t threads[1] = { buffer1.mSize };

        // NOTE(review): passing &buffer1 appears to rely on the wrapper
        // exposing a cl_mem* at that address (wrapper operator&/layout) —
        // confirm against typeWrappers.h.
        error = clSetKernelArg( kernel, 0, sizeof( cl_mem ), &buffer1 );
        test_error( error, "Unable to set kernel argument" );
        error = clSetKernelArg( kernel, 1, sizeof( tag ), &tag );
        test_error( error, "Unable to set kernel argument" );

        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL );
        test_error( error, "Unable to queue kernel" );

        // Mirror the kernel's effect on the host reference state.
        for ( size_t idx = 0; idx < buffer1.mSize; idx++ )
            parentBufferState[ idx + buffer1.mOrigin ] |= tag;

        return CL_SUCCESS;
    }
};
// Computes a main-buffer size that is big enough to be interesting but small
// enough to leave device memory headroom and keep the test runtime sane.
// Returns CL_SUCCESS and writes the size to outSize, or an OpenCL error.
cl_int get_reasonable_buffer_size( cl_device_id device, size_t &outSize )
{
    cl_ulong maxAllocSize;
    cl_int error;

    // Query the device's largest possible single allocation.
    error = clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
    test_error( error, "Unable to get max alloc size" );

    // Use a fifth of the maximum so other allocations still fit, capped at
    // 32MB so tests complete in a reasonable amount of time.
    outSize = (size_t)( maxAllocSize / 5 );
    if ( outSize > 32 << 20 )
        outSize = 32 << 20;

    return CL_SUCCESS;
}
// Returns the index of the sub-buffer whose [mOrigin, mOrigin + mSize) range
// contains parent-buffer byte `index`, or numSubBuffers if none does.
// Assumes subBuffers is sorted by mOrigin, which permits an early exit.
size_t find_subbuffer_by_index( SubBufferWrapper * subBuffers, size_t numSubBuffers, size_t index )
{
    for ( size_t sb = 0; sb < numSubBuffers; sb++ )
    {
        size_t begin = subBuffers[ sb ].mOrigin;
        if ( begin > index )
            break;  // Sorted by origin: no later sub-buffer can contain index
        if ( index < begin + subBuffers[ sb ].mSize )
            return sb;
    }
    return numSubBuffers;
}
// This tests the read/write capabilities of sub buffers (if we are read/write, the sub buffers
// can't overlap)
// Core of the read/write sub-buffer test: carves non-overlapping, aligned
// sub-buffers out of one parent buffer, applies random actions to them (split
// across queueA/queueB), and validates the parent buffer against a host-side
// reference copy. Returns the number of validation errors (0 = pass),
// or -1 / an OpenCL error code on setup failure.
int test_sub_buffers_read_write_core( cl_context context, cl_command_queue queueA, cl_command_queue queueB, size_t mainSize, size_t addressAlign )
{
    clMemWrapper mainBuffer;
    SubBufferWrapper subBuffers[ 8 ];
    size_t numSubBuffers;
    cl_int error;
    size_t i;
    MTdata m = init_genrand( 22 );
    // Host reference copy of the parent buffer plus a scratch buffer for
    // device read-backs. NOTE(review): these allocations are not null-checked
    // and are leaked on the test_error early-return paths below.
    cl_char * mainBufferContents = (cl_char*)calloc(1,mainSize);
    cl_char * actualResults = (cl_char*)calloc(1,mainSize);
    // Deterministic random fill (fixed seed 22), one cl_uint at a time; any
    // tail bytes past a multiple of 4 remain zero from calloc.
    for ( i = 0; i < mainSize / 4; i++ )
        ((cl_uint*) mainBufferContents)[i] = genrand_int32(m);
    free_mtdata( m );
    // Create the main buffer to test against
    mainBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mainSize, mainBufferContents, &error );
    test_error( error, "Unable to create test main buffer" );
    // Create some sub-buffers to use. Since this test uses read/write
    // sub-buffers, they must not overlap: each one starts at an aligned
    // offset at or beyond the end of the previous one.
    size_t toStartFrom = 0;
    for ( numSubBuffers = 0; numSubBuffers < 8; numSubBuffers++ )
    {
        size_t endRange = toStartFrom + ( mainSize / 4 );
        if ( endRange > mainSize )
            endRange = mainSize;
        size_t offset = get_random_size_t( toStartFrom / addressAlign, endRange / addressAlign, Action::GetRandSeed() ) * addressAlign;
        size_t size = get_random_size_t( 1, ( MIN( mainSize / 8, mainSize - offset ) ) / addressAlign, Action::GetRandSeed() ) * addressAlign;
        error = subBuffers[ numSubBuffers ].Allocate( mainBuffer, CL_MEM_READ_WRITE, offset, size );
        test_error( error, "Unable to allocate sub buffer" );
        toStartFrom = offset + size;
        // Stop early when insufficient room remains for another sub-buffer.
        // NOTE(review): on this break the just-allocated sub-buffer is NOT
        // counted in numSubBuffers (increment skipped), so it is never
        // exercised — looks like an off-by-one; confirm intent.
        if ( toStartFrom > ( mainSize - ( addressAlign * 256 ) ) )
            break;
    }
    ReadWriteAction rwAction;
    MapAction mapAction;
    CopyAction copyAction;
    KernelReadWriteAction kernelAction;
    Action * actions[] = { &rwAction, &mapAction, &copyAction, &kernelAction };
    int numErrors = 0;
    // Do the following steps twice, to make sure the parent gets updated *and* we can
    // still work on the sub-buffers
    cl_command_queue prev_queue = queueA;
    for ( int time = 0; time < 2; time++ )
    {
        // Randomly apply actions to the set of sub buffers
        size_t i;
        for ( i = 0; i < 64; i++ )
        {
            // Pick a random action, queue, and pair of distinct sub-buffers.
            int which = random_in_range( 0, 3, Action::GetRandSeed() );
            int whichQueue = random_in_range( 0, 1, Action::GetRandSeed() );
            int whichBufferA = random_in_range( 0, (int)numSubBuffers - 1, Action::GetRandSeed() );
            int whichBufferB;
            do
            {
                whichBufferB = random_in_range( 0, (int)numSubBuffers - 1, Action::GetRandSeed() );
            } while ( whichBufferB == whichBufferA );
            cl_command_queue queue = ( whichQueue == 1 ) ? queueB : queueA;
            // When switching queues, finish the previous one first so work on
            // the shared parent buffer is ordered between the two queues.
            if (queue != prev_queue) {
                error = clFinish( prev_queue );
                test_error( error, "Error finishing other queue." );
                prev_queue = queue;
            }
            // The iteration counter doubles as the tag ORed into touched bytes.
            error = actions[ which ]->Execute( context, queue, (cl_int)i, subBuffers[ whichBufferA ], subBuffers[ whichBufferB ], mainBufferContents );
            test_error( error, "Unable to execute action against sub buffers" );
        }
        error = clFinish( queueA );
        test_error( error, "Error finishing queueA." );
        error = clFinish( queueB );
        test_error( error, "Error finishing queueB." );
        // Validate by reading the final contents of the main buffer and
        // validating against our ref copy we generated.
        // Compare in 64K chunks; only scan byte-by-byte when a chunk differs.
        error = clEnqueueReadBuffer( queueA, mainBuffer, CL_TRUE, 0, mainSize, actualResults, 0, NULL, NULL );
        test_error( error, "Unable to enqueue buffer read" );
        for ( i = 0; i < mainSize; i += 65536 )
        {
            size_t left = 65536;
            if ( ( i + left ) > mainSize )
                left = mainSize - i;
            if ( memcmp( actualResults + i, mainBufferContents + i, left ) == 0 )
                continue;
            // The fast compare failed, so we need to determine where exactly the failure is
            for ( size_t j = 0; j < left; j++ )
            {
                if ( actualResults[ i + j ] != mainBufferContents[ i + j ] )
                {
                    // Hit a failure; report the subbuffer at this address as having failed
                    size_t sbThatFailed = find_subbuffer_by_index( subBuffers, numSubBuffers, i + j );
                    if ( sbThatFailed == numSubBuffers )
                    {
                        log_error( "ERROR: Validation failure outside of a sub-buffer! (Shouldn't be possible, but it happened at index %ld out of %ld...)\n", i + j, mainSize );
                        // Since this is nonsensical, don't bother continuing to check
                        // (we will, however, print our map of sub-buffers for comparison)
                        for ( size_t k = 0; k < numSubBuffers; k++ )
                        {
                            log_error( "\tBuffer %ld: %ld to %ld (length %ld)\n", k, subBuffers[ k ].mOrigin, subBuffers[ k ].mOrigin + subBuffers[ k ].mSize, subBuffers[ k ].mSize );
                        }
                        return -1;
                    }
                    log_error( "ERROR: Validation failure on sub-buffer %ld (start: %ld, length: %ld)\n", sbThatFailed, subBuffers[ sbThatFailed ].mOrigin, subBuffers[ sbThatFailed ].mSize );
                    // Report one error per failed sub-buffer: advance the
                    // outer chunk index and inner offset so scanning resumes
                    // at the last byte of the failed sub-buffer.
                    size_t newPos = subBuffers[ sbThatFailed ].mOrigin + subBuffers[ sbThatFailed ].mSize - 1;
                    i = newPos & ~65535;
                    j = newPos - i;
                    numErrors++;
                }
            }
        }
    }
    free(mainBufferContents);
    free(actualResults);
    Action::FreeRandSeed();
    return numErrors;
}
// Entry point: runs the read/write sub-buffer test on a single device, using
// the same command queue for both queue slots of the core test.
int test_sub_buffers_read_write( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    cl_int error;
    size_t mainSize;
    cl_uint addressAlignBits;

    // Pick a main buffer size appropriate for this device.
    error = get_reasonable_buffer_size( deviceID, mainSize );
    test_error( error, "Unable to get reasonable buffer size" );

    // The device reports its minimum base-address alignment in bits; convert
    // to bytes so sub-buffer origins can be kept valid.
    error = clGetDeviceInfo( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlignBits ), &addressAlignBits, NULL );
    test_error( error, "Unable to get device's address alignment" );
    size_t alignBytes = addressAlignBits / 8;

    return test_sub_buffers_read_write_core( context, queue, queue, mainSize, alignBytes );
}
// This test performs the same basic operations as sub_buffers_read_write, but instead of a single
// device, it creates a context and buffer shared between two devices, then executes commands
// on queues for each device to ensure that everything still operates as expected.
// Dual-device variant: creates a context and buffer shared between two
// devices and runs the core read/write test with one queue per device.
// Skips (returns 0) if only one device is available.
int test_sub_buffers_read_write_dual_devices( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    cl_int error;
    // First obtain the second device
    cl_device_id otherDevice = GetOpposingDevice( deviceID );
    if ( otherDevice == NULL )
    {
        log_error( "ERROR: Unable to obtain a second device for sub-buffer dual-device test.\n" );
        return -1;
    }
    if ( otherDevice == deviceID )
    {
        log_info( "Note: Unable to run dual-device sub-buffer test (only one device available). Skipping test (implicitly passing).\n" );
        return 0;
    }
    // Log the second device's name. The name length is unbounded, so size a
    // heap allocation from a first query. (The previous code used a
    // non-standard VLA on one path and _malloca without _freea on the MSVC
    // path; plain malloc/free is portable and leak-free.)
    size_t param_size;
    error = clGetDeviceInfo(otherDevice, CL_DEVICE_NAME, 0, NULL, &param_size );
    test_error( error, "Error obtaining device name" );
    char *device_name = (char *)malloc( param_size );
    if ( device_name == NULL )
    {
        log_error( "ERROR: Unable to allocate space for device name\n" );
        return -1;
    }
    error = clGetDeviceInfo(otherDevice, CL_DEVICE_NAME, param_size, device_name, NULL );
    if ( error != CL_SUCCESS )
    {
        // Free before test_error's early return.
        free( device_name );
        test_error( error, "Error obtaining device name" );
    }
    log_info( "\tOther device obtained for dual device test is type %s\n", device_name );
    free( device_name );
    // Create a shared context for these two devices
    cl_device_id devices[ 2 ] = { deviceID, otherDevice };
    clContextWrapper testingContext = clCreateContext( NULL, 2, devices, NULL, NULL, &error );
    test_error( error, "Unable to create shared context" );
    // Create two queues (can't use the existing one, because it's on the wrong context)
    clCommandQueueWrapper queue1 = clCreateCommandQueueWithProperties( testingContext, deviceID, 0, &error );
    test_error( error, "Unable to create command queue on main device" );
    clCommandQueueWrapper queue2 = clCreateCommandQueueWithProperties( testingContext, otherDevice, 0, &error );
    test_error( error, "Unable to create command queue on secondary device" );
    // Use the smaller of the two devices' reasonable buffer sizes and the
    // stricter (larger) of their base-address alignments so the core test's
    // sub-buffers are valid on BOTH devices.
    size_t maxBuffer1, maxBuffer2;
    error = get_reasonable_buffer_size( deviceID, maxBuffer1 );
    test_error( error, "Unable to get buffer size for main device" );
    error = get_reasonable_buffer_size( otherDevice, maxBuffer2 );
    test_error( error, "Unable to get buffer size for secondary device" );
    maxBuffer1 = MIN( maxBuffer1, maxBuffer2 );
    cl_uint addressAlign1Bits, addressAlign2Bits;
    error = clGetDeviceInfo( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign1Bits ), &addressAlign1Bits, NULL );
    test_error( error, "Unable to get main device's address alignment" );
    error = clGetDeviceInfo( otherDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign2Bits ), &addressAlign2Bits, NULL );
    test_error( error, "Unable to get secondary device's address alignment" );
    cl_uint addressAlign1 = MAX( addressAlign1Bits, addressAlign2Bits ) / 8;
    // Finally time to run!
    return test_sub_buffers_read_write_core( testingContext, queue1, queue2, maxBuffer1, addressAlign1 );
}
// Copies `length` bytes from `buffer` into a fresh device buffer via a simple
// kernel, then reads that device buffer back into outResults. Used to
// validate sub-buffer contents through the kernel path rather than through
// clEnqueueReadBuffer directly.
cl_int read_buffer_via_kernel( cl_context context, cl_command_queue queue, cl_mem buffer, size_t length, cl_char *outResults )
{
    const char *kernelCode[] = {
        "__kernel void readTest( __global char *inBuffer, __global char *outBuffer )\n"
        "{\n"
        "    int tid = get_global_id(0);\n"
        "    outBuffer[ tid ] = inBuffer[ tid ];\n"
        "}\n" };

    clProgramWrapper program;
    clKernelWrapper kernel;
    cl_int error;

    if ( create_single_kernel_helper( context, &program, &kernel, 1, kernelCode, "readTest" ) )
        return -1;

    // Destination buffer the kernel copies into.
    clMemWrapper outStream = clCreateBuffer( context, CL_MEM_READ_WRITE, length, NULL, &error );
    test_error( error, "Unable to create output stream" );

    error = clSetKernelArg( kernel, 0, sizeof( buffer ), &buffer );
    test_error( error, "Unable to set kernel argument" );
    error = clSetKernelArg( kernel, 1, sizeof( outStream ), &outStream );
    test_error( error, "Unable to set kernel argument" );

    // One work-item per byte, followed by a blocking read of the result.
    size_t threads[1] = { length };
    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, NULL, 0, NULL, NULL );
    test_error( error, "Unable to queue kernel" );
    error = clEnqueueReadBuffer( queue, outStream, CL_TRUE, 0, length, outResults, 0, NULL, NULL );
    test_error( error, "Unable to read results from kernel" );

    return CL_SUCCESS;
}
// Tests deliberately-overlapping read-only sub-buffers: fills the parent with
// random data, then reads each sub-buffer back (half the time via a kernel)
// and checks it matches the corresponding slice of the parent contents.
// Returns the number of sub-buffers that failed validation (0 = pass).
int test_sub_buffers_overlapping( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
{
    cl_int error;
    size_t mainSize;
    cl_uint addressAlign;
    clMemWrapper mainBuffer;
    SubBufferWrapper subBuffers[ 16 ];
    // Create the main buffer to test against
    error = get_reasonable_buffer_size( deviceID, mainSize );
    test_error( error, "Unable to get reasonable buffer size" );
    mainBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE, mainSize, NULL, &error );
    test_error( error, "Unable to create test main buffer" );
    // Determine the alignment of the device so we can make sure sub buffers are valid.
    // NOTE(review): CL_DEVICE_MEM_BASE_ADDR_ALIGN reports *bits*; this value is
    // used directly as a byte alignment (no /8 as in the read/write test). That is
    // stricter than required but still produces valid origins — confirm intent.
    error = clGetDeviceInfo( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign ), &addressAlign, NULL );
    test_error( error, "Unable to get device's address alignment" );
    // Create some sub-buffers to use. Note: they don't have to not overlap (we actually *want* them to overlap)
    for ( size_t i = 0; i < 16; i++ )
    {
        size_t offset = get_random_size_t( 0, mainSize / addressAlign, Action::GetRandSeed() ) * addressAlign;
        size_t size = get_random_size_t( 1, ( mainSize - offset ) / addressAlign, Action::GetRandSeed() ) * addressAlign;
        error = subBuffers[ i ].Allocate( mainBuffer, CL_MEM_READ_ONLY, offset, size );
        test_error( error, "Unable to allocate sub buffer" );
    }
    /// For logging, we determine the amount of overlap we just generated.
    // Build a fast in-out event map: each sub-buffer contributes two entries
    // kept sorted by position — entry j >= 0 means sub-buffer j starts at its
    // mOrigin; entry -(j+1) means sub-buffer j ends at mOrigin + mSize.
    int sbMap[ 32 ], mapSize = 0;
    for ( int i = 0; i < 16; i++ )
    {
        int j;
        // Insert the start event for sub-buffer i at its sorted position.
        for ( j = 0; j < mapSize; j++ )
        {
            // Position of existing entry j. BUGFIX: end events are stored as
            // -(idx+1), so decoding them must subtract 1; the previous code
            // indexed subBuffers[ -sbMap[j] ], reading the wrong sub-buffer's
            // extent and, for idx == 15, reading past the end of the array.
            size_t pt = ( sbMap[ j ] < 0 ) ? ( subBuffers[ -sbMap[ j ] - 1 ].mOrigin + subBuffers[ -sbMap[ j ] - 1 ].mSize )
                                           : subBuffers[ sbMap[ j ] ].mOrigin;
            if ( subBuffers[ i ].mOrigin < pt )
            {
                // Origin is before this part of the map, so move map forward so we can insert
                memmove( &sbMap[ j + 1 ], &sbMap[ j ], sizeof( int ) * ( mapSize - j ) );
                sbMap[ j ] = i;
                mapSize++;
                break;
            }
        }
        if ( j == mapSize )
        {
            sbMap[ j ] = i;
            mapSize++;
        }
        // Insert the end event for sub-buffer i at its sorted position.
        size_t endPt = subBuffers[ i ].mOrigin + subBuffers[ i ].mSize;
        for ( j = 0; j < mapSize; j++ )
        {
            // Same -(idx+1) decoding fix as above.
            size_t pt = ( sbMap[ j ] < 0 ) ? ( subBuffers[ -sbMap[ j ] - 1 ].mOrigin + subBuffers[ -sbMap[ j ] - 1 ].mSize )
                                           : subBuffers[ sbMap[ j ] ].mOrigin;
            if ( endPt < pt )
            {
                // End point is before this part of the map, so move map forward so we can insert
                memmove( &sbMap[ j + 1 ], &sbMap[ j ], sizeof( int ) * ( mapSize - j ) );
                sbMap[ j ] = -( i + 1 );
                mapSize++;
                break;
            }
        }
        if ( j == mapSize )
        {
            sbMap[ j ] = -( i + 1 );
            mapSize++;
        }
    }
    // Sweep the event map, tracking how many sub-buffers are open at once and
    // summing the total overlapped byte count into delta.
    long long delta = 0;
    size_t maxOverlap = 1, overlap = 0;
    for ( int i = 0; i < 32; i++ )
    {
        if ( sbMap[ i ] >= 0 )
        {
            overlap++;
            if ( overlap > 1 )
                delta -= (long long)( subBuffers[ sbMap[ i ] ].mOrigin );
            if ( overlap > maxOverlap )
                maxOverlap = overlap;
        }
        else
        {
            if ( overlap > 1 )
                delta += (long long)( subBuffers[ -sbMap[ i ] - 1 ].mOrigin + subBuffers[ -sbMap[ i ] - 1 ].mSize );
            overlap--;
        }
    }
    log_info( "\tTesting %d sub-buffers with %lld overlapping Kbytes (%d%%; as many as %ld buffers overlapping at once)\n",
              16, ( delta / 1024LL ), (int)( delta * 100LL / (long long)mainSize ), maxOverlap );
    // Write some random contents to the main buffer
    cl_char * contents = new cl_char[ mainSize ];
    generate_random_data( kChar, mainSize, Action::GetRandSeed(), contents );
    error = clEnqueueWriteBuffer( queue, mainBuffer, CL_TRUE, 0, mainSize, contents, 0, NULL, NULL );
    if ( error != CL_SUCCESS )
    {
        // Free before test_error's early return (previously leaked).
        delete [] contents;
        test_error( error, "Unable to write to main buffer" );
    }
    // Now read from each sub-buffer and check to make sure that they make sense w.r.t. the main contents
    cl_char * tempBuffer = new cl_char[ mainSize ];
    int numErrors = 0;
    for ( size_t i = 0; i < 16; i++ )
    {
        // Read from this buffer, randomly via a direct read or via a kernel
        int which = random_in_range( 0, 1, Action::GetRandSeed() );
        if ( which )
            error = clEnqueueReadBuffer( queue, subBuffers[ i ], CL_TRUE, 0, subBuffers[ i ].mSize, tempBuffer, 0, NULL, NULL );
        else
            error = read_buffer_via_kernel( context, queue, subBuffers[ i ], subBuffers[ i ].mSize, tempBuffer );
        if ( error != CL_SUCCESS )
        {
            // Free before test_error's early return (previously leaked).
            delete [] contents;
            delete [] tempBuffer;
            test_error( error, "Unable to read sub buffer contents" );
        }
        if ( memcmp( tempBuffer, contents + subBuffers[ i ].mOrigin, subBuffers[ i ].mSize ) != 0 )
        {
            log_error( "ERROR: Validation for sub-buffer %ld failed!\n", i );
            numErrors++;
        }
    }
    delete [] contents;
    delete [] tempBuffer;
    Action::FreeRandSeed();
    return numErrors;
}