diff --git a/CMakeLists.txt b/CMakeLists.txt
index a1f40261..6b8ee156 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,7 +17,7 @@ set(CLConform_VERSION "${CLConform_VERSION_MAJOR}.${CLConform_VERSION_MINOR}")
 set(CLConform_VERSION_FULL
     "${CLConform_VERSION}.${CLConform_VERSION_MICRO}${CLConform_VERSION_EXTRA}")
 
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.1)
 
 add_definitions(-DCL_USE_DEPRECATED_OPENCL_2_1_APIS=1)
 add_definitions(-DCL_USE_DEPRECATED_OPENCL_2_0_APIS=1)
diff --git a/build_android.py b/build_android.py
old mode 100644
new mode 100755
diff --git a/build_lnx.sh b/build_lnx.sh
old mode 100644
new mode 100755
diff --git a/clean_tests.py b/clean_tests.py
old mode 100644
new mode 100755
diff --git a/readme-spir-v-binaries.txt b/readme-spir-v-binaries.txt
index 91f29aba..1b6df9bc 100644
--- a/readme-spir-v-binaries.txt
+++ b/readme-spir-v-binaries.txt
@@ -1,17 +1,14 @@
-To run the 2.2 conformance tests test suite for the C++ features you need
+To run the 2.2 conformance tests test suite for the C++ features you need need 
 SPIR-V binaries.
 
 If you are using a conformance package then the binaries are included in the 
-package. If you are using conformance tests from git repositories then the
-binaries need to be picked up using LFS:
+package. If you are using conformance tests from gitlab repositories then the
+binaries need to be picked up from Khronos SVN URL mentioned below:
 
-1. Setup LFS by following instructions at https://git-lfs.github.com/
+https://cvs.khronos.org/svn/repos/OpenCL/trunk/Khronos/spirv/spirv10_2015.11.25.zip
 
-2. The SPIR-V binaries can then be picked up from test_conformance/clcpp/spirv*.7z
 
-Alternatively you can check out and build all of the below repositories
-manually or use https://github.com/KhronosGroup/OpenCL-CTS-Framework which will
-do it for you.
+Alternatively you can check out and build all of the below repositories.
 
 1. SPIRV-LLVM
 LLVM with support for SPIR-V (required by clang compiler)
@@ -42,4 +39,4 @@ Branch: opencl22
 5. OpenCL ICD (with 2.2 support)
 OpenCL ICD 
 Repository: https://gitlab.khronos.org/opencl/icd 
-Branch: dev_cl22 
+Branch: dev_cl22 
\ No newline at end of file
diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index 7606d1a3..a0cce7d4 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -479,16 +479,6 @@ struct AddressingTable
 
 static AddressingTable  sAddressingTable;
 
-bool alpha_is_x(cl_image_format *format){
-    switch (format->image_channel_order) {
-        case CL_RGBx:
-        case CL_sRGBx:
-            return true;
-        default:
-            return false;
-    }
-}
-
 bool is_sRGBA_order(cl_channel_order image_channel_order){
     switch (image_channel_order) {
         case CL_sRGB:
@@ -508,19 +498,21 @@ int has_alpha(cl_image_format *format) {
         case CL_R:
             return 0;
         case CL_A:
-        case CL_Rx:
             return 1;
+        case CL_Rx:
+            return 0;
         case CL_RG:
             return 0;
         case CL_RA:
-        case CL_RGx:
             return 1;
+        case CL_RGx:
+            return 0;
         case CL_RGB:
         case CL_sRGB:
             return 0;
         case CL_RGBx:
         case CL_sRGBx:
-            return 1;
+            return 0;
         case CL_RGBA:
             return 1;
         case CL_BGRA:
@@ -719,13 +711,6 @@ void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
   }
 }
 
-int issubnormal(float a)
-{
-    union { cl_int i; cl_float f; } u;
-    u.f = a;
-    return (u.i & 0x7f800000U) == 0;
-}
-
 float get_max_absolute_error( cl_image_format *format, image_sampler_data *sampler) {
     if (sampler->filter_mode == CL_FILTER_NEAREST)
         return 0.0f;
@@ -790,6 +775,7 @@ float get_max_relative_error( cl_image_format *format, image_sampler_data *sampl
     {
         if( sampler->filter_mode != CL_FILTER_NEAREST )
         {
+            extern cl_device_type   gDeviceType;
             // The maximum
             if( gDeviceType == CL_DEVICE_TYPE_GPU )
                 maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4);              // Some GPUs ain't so accurate
@@ -1419,7 +1405,7 @@ void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
         else {
             outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = outData[ 3 ] = 0;
             if (!has_alpha(imageInfo->format))
-                outData[3] = alpha_is_x(imageInfo->format) ? 0 : 1;
+                outData[3] = 1;
         }
         return;
     }
diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h
index 83ff1377..ba80ea77 100644
--- a/test_common/harness/imageHelpers.h
+++ b/test_common/harness/imageHelpers.h
@@ -136,8 +136,6 @@ extern void copy_image_data( image_descriptor *srcImageInfo, image_descriptor *d
 
 int has_alpha(cl_image_format *format);
 
-extern bool alpha_is_x(cl_image_format *format);
-
 extern bool is_sRGBA_order(cl_channel_order image_channel_order);
 
 inline float calculate_array_index( float coord, float extent );
@@ -594,7 +592,6 @@ extern char *create_random_image_data( ExplicitType dataType, image_descriptor *
 extern void get_sampler_kernel_code( image_sampler_data *imageSampler, char *outLine );
 extern float get_max_absolute_error( cl_image_format *format, image_sampler_data *sampler);
 extern float get_max_relative_error( cl_image_format *format, image_sampler_data *sampler, int is3D, int isLinearFilter );
-extern int issubnormal(float);
 
 
 #define errMax( _x , _y )       ( (_x) != (_x) ? (_x) : (_x) > (_y) ? (_x) : (_y) )
diff --git a/test_common/harness/kernelHelpers.c b/test_common/harness/kernelHelpers.c
index 591ab038..59143a3b 100644
--- a/test_common/harness/kernelHelpers.c
+++ b/test_common/harness/kernelHelpers.c
@@ -1070,14 +1070,14 @@ size_t get_pixel_bytes( const cl_image_format *fmt )
     return 0;
 }
 
-int verifyImageSupport( cl_device_id device )
+test_status verifyImageSupport( cl_device_id device )
 {
     if( checkForImageSupport( device ) )
     {
         log_error( "ERROR: Device does not supported images as required by this test!\n" );
-        return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+        return TEST_FAIL;
     }
-    return 0;
+    return TEST_PASS;
 }
 
 int checkForImageSupport( cl_device_id device )
diff --git a/test_common/harness/kernelHelpers.h b/test_common/harness/kernelHelpers.h
index b39113ec..5f4729c4 100644
--- a/test_common/harness/kernelHelpers.h
+++ b/test_common/harness/kernelHelpers.h
@@ -20,6 +20,7 @@
 #include "../config.hpp"
 
 #include "compat.h"
+#include "testHarness.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -128,8 +129,8 @@ extern int is_image_format_supported( cl_context context, cl_mem_flags flags, cl
 /* Helper to get pixel size for a pixel format */
 size_t get_pixel_bytes( const cl_image_format *fmt );
 
-/* Verify the given device supports images. 0 means you're good to go, otherwise an error */
-extern int verifyImageSupport( cl_device_id device );
+/* Verify the given device supports images. */
+extern test_status verifyImageSupport( cl_device_id device );
 
 /* Checks that the given device supports images. Same as verify, but doesn't print an error */
 extern int checkForImageSupport( cl_device_id device );
diff --git a/test_common/harness/testHarness.c b/test_common/harness/testHarness.c
index f513f787..5bc494a3 100644
--- a/test_common/harness/testHarness.c
+++ b/test_common/harness/testHarness.c
@@ -143,7 +143,7 @@ int runTestHarnessWithCheck( int argc, const char *argv[], unsigned int num_fns,
         log_info( "\tid<num>\t\tIndicates device at index <num> should be used (default 0).\n" );
         log_info( "\t<device_type>\tcpu|gpu|accelerator|<CL_DEVICE_TYPE_*> (default CL_DEVICE_TYPE_DEFAULT)\n" );
 
-        for( i = 0; i < num_fns - 1; i++ )
+        for( i = 0; i < num_fns; i++ )
         {
             log_info( "\t\t%s\n", fnNames[ i ] );
         }
@@ -439,10 +439,18 @@ int runTestHarnessWithCheck( int argc, const char *argv[], unsigned int num_fns,
 
 
     /* If we have a device checking function, run it */
-    if( ( deviceCheckFn != NULL ) && deviceCheckFn( device ) != CL_SUCCESS )
+    if( ( deviceCheckFn != NULL ) )
     {
-        test_finish();
-        return -1;
+        test_status status = deviceCheckFn( device );
+        switch (status)
+        {
+            case TEST_PASS:
+                break;
+            case TEST_FAIL:
+                return 1;
+            case TEST_SKIP:
+                return 0;
+        }
     }
 
     if (num_elements <= 0)
diff --git a/test_common/harness/testHarness.h b/test_common/harness/testHarness.h
index 2816b2a0..fc019aa3 100644
--- a/test_common/harness/testHarness.h
+++ b/test_common/harness/testHarness.h
@@ -25,6 +25,13 @@
 extern "C" {
 #endif
 
+typedef enum test_status
+{
+    TEST_PASS = 0,
+    TEST_FAIL = 1,
+    TEST_SKIP = 2,
+} test_status;
+
 extern cl_uint gReSeed;
 extern cl_uint gRandomSeed;
 
@@ -34,8 +41,8 @@ extern int runTestHarness( int argc, const char *argv[], unsigned int num_fns,
                             basefn fnList[], const char *fnNames[],
                             int imageSupportRequired, int forceNoContextCreation, cl_command_queue_properties queueProps );
 
-// Device checking function. See runTestHarnessWithCheck. If this function returns anything other than CL_SUCCESS (0), the harness exits.
-typedef int (*DeviceCheckFn)( cl_device_id device );
+// Device checking function. See runTestHarnessWithCheck. If this function returns anything other than TEST_PASS, the harness exits.
+typedef test_status (*DeviceCheckFn)( cl_device_id device );
 
 // Same as runTestHarness, but also supplies a function that checks the created device for required functionality.
 extern int runTestHarnessWithCheck( int argc, const char *argv[], unsigned int num_fns,
diff --git a/test_conformance/SVM/CMakeLists.txt b/test_conformance/SVM/CMakeLists.txt
index c7d0219e..ad74a829 100644
--- a/test_conformance/SVM/CMakeLists.txt
+++ b/test_conformance/SVM/CMakeLists.txt
@@ -1,3 +1,4 @@
+set(CMAKE_CXX_STANDARD 11)
 set(MODULE_NAME SVM)
 
 set(${MODULE_NAME}_SOURCES
diff --git a/test_conformance/SVM/test_enqueue_api.cpp b/test_conformance/SVM/test_enqueue_api.cpp
index 6a04e955..f92872d6 100644
--- a/test_conformance/SVM/test_enqueue_api.cpp
+++ b/test_conformance/SVM/test_enqueue_api.cpp
@@ -17,10 +17,15 @@
 #include "../../test_common/harness/mt19937.h"
 
 #include <vector>
+#include <atomic>
+
+#if !defined(_WIN32)
+#include <unistd.h>
+#endif 
 
 typedef struct
 {
-  cl_uint status;
+  std::atomic<cl_uint> status;
   cl_uint num_svm_pointers;
   std::vector<void *> svm_pointers;
 } CallbackData;
@@ -62,7 +67,7 @@ void CL_CALLBACK callback_svm_free(cl_command_queue queue, cl_uint num_svm_point
     clSVMFree(context, svm_pointers[i]);
   }
 
-  data->status = 1;
+  data->status.store(1, std::memory_order_release);
 }
 
 int test_enqueue_api(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
@@ -231,7 +236,9 @@ int test_enqueue_api(cl_device_id deviceID, cl_context c, cl_command_queue queue
   test_error(error, "clFinish failed");
 
   //wait for the callback
-  while(data.status == 0) { }
+  while(data.status.load(std::memory_order_acquire) == 0) { 
+    usleep(1);
+  }
 
   //check if number of SVM pointers returned in the callback matches with expected
   if (data.num_svm_pointers != buffers.size())
diff --git a/test_conformance/api/test_api_min_max.c b/test_conformance/api/test_api_min_max.c
index 57b0ab64..fa3bdb18 100644
--- a/test_conformance/api/test_api_min_max.c
+++ b/test_conformance/api/test_api_min_max.c
@@ -1307,7 +1307,6 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
     int error;
     clProgramWrapper program;
     clKernelWrapper kernel;
-    clMemWrapper            streams[3];
     size_t    threads[1], localThreads[1];
     cl_int *constantData, *resultData;
     cl_ulong maxSize, stepSize, currentSize;
@@ -1349,6 +1348,7 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context,
         for(i=0; i<(int)(numberOfInts); i++)
             constantData[i] = (int)genrand_int32(d);
 
+        clMemWrapper streams[3];
         streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), sizeToAllocate, constantData, &error);
         test_error( error, "Creating test array failed" );
         streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_READ_WRITE),  sizeToAllocate, NULL, &error);
diff --git a/test_conformance/api/test_zero_sized_enqueue.cpp b/test_conformance/api/test_zero_sized_enqueue.cpp
index 87aa0d21..dbe2af2e 100644
--- a/test_conformance/api/test_zero_sized_enqueue.cpp
+++ b/test_conformance/api/test_zero_sized_enqueue.cpp
@@ -190,7 +190,7 @@ int test_zero_sized_enqueue(cl_device_id deviceID, cl_context context, cl_comman
     cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL);
     test_error( error, "clGetDeviceInfo failed.");
 
-    if (props | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    if (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
     {
         // test out of order queue
         cl_queue_properties queue_prop_def[] =
diff --git a/test_conformance/basic/run_array b/test_conformance/basic/run_array
old mode 100644
new mode 100755
diff --git a/test_conformance/basic/run_array_image_copy b/test_conformance/basic/run_array_image_copy
old mode 100644
new mode 100755
diff --git a/test_conformance/basic/run_image b/test_conformance/basic/run_image
old mode 100644
new mode 100755
diff --git a/test_conformance/basic/run_multi_read_image b/test_conformance/basic/run_multi_read_image
old mode 100644
new mode 100755
diff --git a/test_conformance/basic/test_async_strided_copy.cpp b/test_conformance/basic/test_async_strided_copy.cpp
index ec65101d..ca657787 100644
--- a/test_conformance/basic/test_async_strided_copy.cpp
+++ b/test_conformance/basic/test_async_strided_copy.cpp
@@ -209,14 +209,13 @@ int test_strided_copy(cl_device_id deviceID, cl_context context, cl_command_queu
             log_error( "ERROR: Results of copy did not validate!\n" );
             sprintf(values + strlen( values), "%d -> [", i);
             for (int j=0; j<(int)elementSize; j++)
-                sprintf(values + strlen( values), "%2x ", inchar[i*elementSize+j]);
+                sprintf(values + strlen( values), "%2x ", inchar[j]);
             sprintf(values + strlen(values), "] != [");
             for (int j=0; j<(int)elementSize; j++)
-                sprintf(values + strlen( values), "%2x ", outchar[i*elementSize+j]);
+                sprintf(values + strlen( values), "%2x ", outchar[j]);
             sprintf(values + strlen(values), "]");
             log_error("%s\n", values);
-
-               return -1;
+            return -1;
         }
     }
 
diff --git a/test_conformance/basic/test_progvar.cpp b/test_conformance/basic/test_progvar.cpp
index 7445444e..9eb211c2 100644
--- a/test_conformance/basic/test_progvar.cpp
+++ b/test_conformance/basic/test_progvar.cpp
@@ -864,12 +864,14 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co
 
     // We need to create 5 random values of the given type,
     // and read 4 of them back.
-    cl_uchar CL_ALIGNED(ALIGNMENT) write_data[NUM_TESTED_VALUES * sizeof(cl_ulong16)];
-    cl_uchar CL_ALIGNED(ALIGNMENT) read_data[ (NUM_TESTED_VALUES-1) * sizeof(cl_ulong16)];
+    const size_t write_data_size = NUM_TESTED_VALUES * sizeof(cl_ulong16);
+    const size_t read_data_size = (NUM_TESTED_VALUES - 1) * sizeof(cl_ulong16);
+    cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT);
+    cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT);
 
-    clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(write_data), write_data, &status ) );
+    clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status ) );
     test_error_ret(status,"Failed to allocate write buffer",status);
-    clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(read_data), read_data, &status ) );
+    clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status ) );
     test_error_ret(status,"Failed to allocate read buffer",status);
 
     status = clSetKernelArg(writer,0,sizeof(cl_mem),&write_mem); test_error_ret(status,"set arg",status);
@@ -884,7 +886,7 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co
             // Generate new random data to push through.
             // Generate 5 * 128 bytes all the time, even though the test for many types use less than all that.
 
-            cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, sizeof(write_data), 0, 0, 0, 0);
+            cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0);
 
             if ( ti.is_bool() ) {
                 // For boolean, random data cast to bool isn't very random.
@@ -896,7 +898,7 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co
                 }
                 bool_iter++;
             } else {
-                l_set_randomly( write_data, sizeof(write_data), rand_state );
+                l_set_randomly( write_data, write_data_size, rand_state );
             }
             status = clSetKernelArg(writer,1,sizeof(cl_uint),&iptr_idx); test_error_ret(status,"set arg",status);
 
@@ -905,7 +907,7 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co
             status = clSetKernelArg(reader,1,ti.get_size(),write_data + (NUM_TESTED_VALUES-1)*ti.get_size()); test_error_ret(status,"set arg",status);
 
             // Determine the expected values.
-            cl_uchar expected[ (NUM_TESTED_VALUES-1) * sizeof(cl_ulong16)];
+            cl_uchar expected[read_data_size];
             memset( expected, -1, sizeof(expected) );
             l_copy( expected, 0, write_data, 0, ti );
             l_copy( expected, 1, write_data, 1, ti );
@@ -922,8 +924,8 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co
                 for ( unsigned i = 0; i <  NUM_TESTED_VALUES-1 ; i++ ) expected[i] = (bool)expected[i];
             }
 
-            cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(read_data), 0, 0, 0, 0);
-            memset( read_data, -1, sizeof(read_data) );
+            cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
+            memset(read_data, -1, read_data_size);
             clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0);
 
             // Now run the kernel
@@ -932,7 +934,7 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co
             status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status);
             status = clFinish(queue); test_error_ret(status,"finish",status);
 
-            read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(read_data), 0, 0, 0, 0);
+            read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
 
             if ( ti.is_bool() ) {
                 // Collapse down to one bit.
@@ -951,7 +953,8 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co
     }
 
     if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
-
+    align_free(write_data);
+    align_free(read_data);
     return err;
 }
 
@@ -1008,12 +1011,14 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context,
 
     // We need to create 5 random values of the given type,
     // and read 4 of them back.
-    cl_uchar CL_ALIGNED(ALIGNMENT) write_data[NUM_TESTED_VALUES * sizeof(cl_ulong16)];
-    cl_uchar CL_ALIGNED(ALIGNMENT) read_data[ (NUM_TESTED_VALUES-1) * sizeof(cl_ulong16)];
+    const size_t write_data_size = NUM_TESTED_VALUES * sizeof(cl_ulong16);
+    const size_t read_data_size = (NUM_TESTED_VALUES-1) * sizeof(cl_ulong16);
 
-    clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(write_data), write_data, &status ) );
+    cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT);
+    cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT);
+    clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status ) );
     test_error_ret(status,"Failed to allocate write buffer",status);
-    clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(read_data), read_data, &status ) );
+    clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status ) );
     test_error_ret(status,"Failed to allocate read buffer",status);
 
     status = clSetKernelArg(writer,0,sizeof(cl_mem),&write_mem); test_error_ret(status,"set arg",status);
@@ -1033,7 +1038,7 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context,
             // Generate new random data to push through.
             // Generate 5 * 128 bytes all the time, even though the test for many types use less than all that.
 
-            cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, sizeof(write_data), 0, 0, 0, 0);
+            cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0);
 
             if ( ti.is_bool() ) {
                 // For boolean, random data cast to bool isn't very random.
@@ -1045,7 +1050,7 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context,
                 }
                 bool_iter++;
             } else {
-                l_set_randomly( write_data, sizeof(write_data), rand_state );
+                l_set_randomly( write_data, write_data_size, rand_state );
             }
             status = clSetKernelArg(writer,1,sizeof(cl_uint),&iptr_idx); test_error_ret(status,"set arg",status);
 
@@ -1061,7 +1066,7 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context,
             status = clSetKernelArg(reader,1,ti.get_size(),write_data + (NUM_TESTED_VALUES-1)*ti.get_size()); test_error_ret(status,"set arg",status);
 
             // Determine the expected values.
-            cl_uchar expected[ (NUM_TESTED_VALUES-1) * sizeof(cl_ulong16)];
+            cl_uchar expected[read_data_size];
             memset( expected, -1, sizeof(expected) );
             if ( iteration ) {
                 l_copy( expected, 0, write_data, 0, ti );
@@ -1092,8 +1097,8 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context,
 
             clEnqueueUnmapMemObject(queue, write_mem, write_ptr, 0, 0, 0);
 
-            cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(read_data), 0, 0, 0, 0);
-            memset( read_data, -1, sizeof(read_data) );
+            cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
+            memset( read_data, -1, read_data_size );
             clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0);
 
             // Now run the kernel
@@ -1107,7 +1112,7 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context,
             status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status);
             status = clFinish(queue); test_error_ret(status,"finish",status);
 
-            read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(read_data), 0, 0, 0, 0);
+            read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0);
 
             if ( ti.is_bool() ) {
                 // Collapse down to one bit.
@@ -1129,6 +1134,8 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context,
     }
 
     if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
+    align_free(write_data);
+    align_free(read_data);
 
     return err;
 }
@@ -1340,6 +1347,13 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu
             print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), OPTIONS);
             return status;
         }
+
+        status = clBuildProgram(program, 1, &device, OPTIONS, 0, 0);
+        if(check_error(status, "Failed to compile program for user type test (%s)", IGetErrorString(status)))
+        {
+            print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), OPTIONS);
+            return status;
+        }
     }
 
 
@@ -1360,12 +1374,12 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu
     test_error_ret(status,"Failed to create reader kernel for user type test",status);
 
     // Set up data.
-    cl_uchar CL_ALIGNED(ALIGNMENT) uchar_data;
-    cl_uint CL_ALIGNED(ALIGNMENT) uint_data;
+    cl_uchar* uchar_data = (cl_uchar*)align_malloc(sizeof(cl_uchar), ALIGNMENT);
+    cl_uint* uint_data = (cl_uint*)align_malloc(sizeof(cl_uint), ALIGNMENT);
 
-    clMemWrapper uchar_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(uchar_data), &uchar_data, &status ) );
+    clMemWrapper uchar_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar), uchar_data, &status ) );
     test_error_ret(status,"Failed to allocate uchar buffer",status);
-    clMemWrapper uint_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(uint_data), &uint_data, &status ) );
+    clMemWrapper uint_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(cl_uint), uint_data, &status ) );
     test_error_ret(status,"Failed to allocate uint buffer",status);
 
     status = clSetKernelArg(reader,0,sizeof(cl_mem),&uchar_mem); test_error_ret(status,"set arg",status);
@@ -1375,18 +1389,18 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu
     cl_uint expected_uint = 42;
     for ( unsigned iter = 0; iter < 5 ; iter++ ) { // Must go around at least twice
         // Read back data
-        uchar_data = -1;
-        uint_data = -1;
+        *uchar_data = -1;
+        *uint_data = -1;
         const size_t one = 1;
         status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status);
         status = clFinish(queue); test_error_ret(status,"finish",status);
 
-        cl_uchar *uint_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uint_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(uint_data), 0, 0, 0, 0);
-        cl_uchar *uchar_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uchar_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(uchar_data), 0, 0, 0, 0);
+        cl_uchar *uint_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uint_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint), 0, 0, 0, 0);
+        cl_uchar *uchar_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uchar_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uchar), 0, 0, 0, 0);
 
-        if ( expected_uchar != uchar_data || expected_uint != uint_data ) {
+        if ( expected_uchar != *uchar_data || expected_uint != *uint_data ) {
             log_error("FAILED: Iteration %d Got (0x%2x,%d) but expected (0x%2x,%d)\n",
-                    iter, (int)uchar_data, uint_data, (int)expected_uchar, expected_uint );
+                    iter, (int)*uchar_data, *uint_data, (int)expected_uchar, expected_uint );
             err |= 1;
         }
 
@@ -1398,16 +1412,17 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu
         expected_uint++;
 
         // Write the new values into persistent store.
-        uchar_data = expected_uchar;
-        uint_data = expected_uint;
-        status = clSetKernelArg(writer,0,sizeof(uchar_data),&uchar_data); test_error_ret(status,"set arg",status);
-        status = clSetKernelArg(writer,1,sizeof(uint_data),&uint_data); test_error_ret(status,"set arg",status);
+        *uchar_data = expected_uchar;
+        *uint_data = expected_uint;
+        status = clSetKernelArg(writer,0,sizeof(cl_uchar),uchar_data); test_error_ret(status,"set arg",status);
+        status = clSetKernelArg(writer,1,sizeof(cl_uint),uint_data); test_error_ret(status,"set arg",status);
         status = clEnqueueNDRangeKernel(queue,writer,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue writer",status);
         status = clFinish(queue); test_error_ret(status,"finish",status);
     }
 
     if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; }
-
+    align_free(uchar_data);
+    align_free(uint_data);
     return err;
 }
 
diff --git a/test_conformance/basic/test_sizeof.c b/test_conformance/basic/test_sizeof.c
index 166cf206..bdbc40fe 100644
--- a/test_conformance/basic/test_sizeof.c
+++ b/test_conformance/basic/test_sizeof.c
@@ -50,7 +50,7 @@ cl_int get_type_size( cl_context context, cl_command_queue queue, const char *ty
         sizeof_kernel_code[0] = "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
     }
 
-    cl_int err = create_single_kernel_helper( context, &p, &k, 4, sizeof_kernel_code, "test_sizeof" );
+    cl_int err = create_single_kernel_helper_with_build_options(context, &p, &k, 4, sizeof_kernel_code, "test_sizeof", "-cl-std=CL2.0");
     if( err )
         return err;
 
diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp
index 542e893f..bafb3a02 100644
--- a/test_conformance/c11_atomics/test_atomics.cpp
+++ b/test_conformance/c11_atomics/test_atomics.cpp
@@ -1566,6 +1566,7 @@ public:
   using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
   using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
   using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr;
+  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM;
   using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
   CBasicTestFlag(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
   {
@@ -1606,7 +1607,7 @@ public:
       program += "    atomic_work_item_fence(" +
                  std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") +
                  "memory_order_acquire," +
-                 std::string(LocalMemory() ? "memory_scope_work_group" : "memory_scope_device") +
+                 std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? "memory_scope_all_svm_devices" : "memory_scope_device") ) +
                  ");\n";
 
     program +=
@@ -1632,7 +1633,7 @@ public:
       program += "      atomic_work_item_fence(" +
                  std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") +
                  "memory_order_release," +
-                 std::string(LocalMemory() ? "memory_scope_work_group" : "memory_scope_device") +
+                 std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? "memory_scope_all_svm_devices" : "memory_scope_device") ) +
                  ");\n";
 
     program +=
diff --git a/test_conformance/compatibility/test_common/harness/compat.h b/test_conformance/compatibility/test_common/harness/compat.h
index 22119c46..d177f87a 100644
--- a/test_conformance/compatibility/test_common/harness/compat.h
+++ b/test_conformance/compatibility/test_common/harness/compat.h
@@ -13,32 +13,63 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+/*
+    Header compat.h should be used instead of stdlib.h, stdbool.h, stdint.h, float.h, fenv.h,
+    math.h. It provides workarounds if these headers are not available or not complete.
+
+    Important: It should be included before math.h, directly or indirectly, because Intel mathimf.h
+    is not compatible with Microsoft math.h. Including math.h before mathimf.h causes compile-time
+    error.
+*/
 #ifndef _COMPAT_H_
 #define _COMPAT_H_
 
 #if defined(_WIN32) && defined (_MSC_VER)
-
 #include <Windows.h>
-#include <Winbase.h>
-#include <CL/cl.h>
-#include <float.h>
-#include <xmmintrin.h>
-#include <math.h>
-
-#define MAKE_HEX_FLOAT(x,y,z)  ((float)ldexp( (float)(y), z))
-#define MAKE_HEX_DOUBLE(x,y,z) ldexp( (double)(y), z)
-#define MAKE_HEX_LONG(x,y,z)   ((long double) ldexp( (long double)(y), z))
-
-#define isfinite(x) _finite(x)
-
-#if !defined(__cplusplus)
-typedef char bool;
-#define inline
-
-#else
-extern "C" {
 #endif
 
+#ifdef __cplusplus
+    #define EXTERN_C extern "C"
+#else
+    #define EXTERN_C
+#endif
+
+
+//
+// stdlib.h
+//
+
+#include <stdlib.h>     // On Windows, _MAX_PATH defined there.
+
+// llabs appeared in MS C v16 (VS 10/2010).
+#if defined( _MSC_VER ) && _MSC_VER <= 1500
+    EXTERN_C inline long long llabs(long long __x) { return __x >= 0 ? __x : -__x; }
+#endif
+
+
+//
+// stdbool.h
+//
+
+// stdbool.h appeared in MS C v18 (VS 12/2013).
+#if defined( _MSC_VER ) && MSC_VER <= 1700
+#if !defined(__cplusplus)
+typedef char bool;
+        #define true  1
+        #define false 0
+    #endif
+#else
+    #include <stdbool.h>
+#endif
+
+
+
+//
+// stdint.h
+//
+
+// stdint.h appeared in MS C v16 (VS 10/2010) and Intel C v12.
+#if defined( _MSC_VER ) && ( ! defined( __INTEL_COMPILER ) && _MSC_VER <= 1500 || defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 1200 )
 typedef unsigned char       uint8_t;
 typedef char                int8_t;
 typedef unsigned short      uint16_t;
@@ -47,25 +78,83 @@ typedef unsigned int        uint32_t;
 typedef int                 int32_t;
 typedef unsigned long long  uint64_t;
 typedef long long           int64_t;
-
-#define MAXPATHLEN MAX_PATH
-
-typedef unsigned short ushort;
-typedef unsigned int   uint;
-typedef unsigned long  ulong;
+#else
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS
+#endif
+    #include <stdint.h>
+#endif
 
 
-#define INFINITY    (FLT_MAX + FLT_MAX)
-//#define NAN (INFINITY | 1)
-//const static int PINFBITPATT_SP32  = INFINITY;
+
+//
+// float.h
+//
+
+#include <float.h>
+
+
+
+//
+// fenv.h
+//
+
+// fenv.h appeared in MS C v18 (VS 12/2013).
+#if defined( _MSC_VER ) && _MSC_VER <= 1700 && ! defined( __INTEL_COMPILER )
+    // reimplement fenv.h because windows doesn't have it
+    #define FE_INEXACT          0x0020
+    #define FE_UNDERFLOW        0x0010
+    #define FE_OVERFLOW         0x0008
+    #define FE_DIVBYZERO        0x0004
+    #define FE_INVALID          0x0001
+    #define FE_ALL_EXCEPT       0x003D
+    int fetestexcept(int excepts);
+    int feclearexcept(int excepts);
+#else
+    #include <fenv.h>
+#endif
+
+
+//
+// math.h
+//
+
+#if defined( __INTEL_COMPILER )
+    #include <mathimf.h>
+#else
+    #include <math.h>
+#endif
+
+#if defined( _MSC_VER )
+
+    #ifdef __cplusplus
+        extern "C" {
+    #endif
 
 #ifndef M_PI
     #define M_PI    3.14159265358979323846264338327950288
 #endif
 
+    #if ! defined( __INTEL_COMPILER )
 
+        #ifndef NAN
+            #define NAN  (INFINITY - INFINITY)
+        #endif
+        #ifndef HUGE_VALF
+            #define HUGE_VALF (float)HUGE_VAL
+        #endif
+        #ifndef INFINITY
+            #define INFINITY    (FLT_MAX + FLT_MAX)
+        #endif
+        #ifndef isfinite
+            #define isfinite(x) _finite(x)
+        #endif
+        #ifndef isnan
 #define    isnan( x )       ((x) != (x))
+        #endif
+        #ifndef isinf
 #define     isinf( _x)      ((_x) == INFINITY || (_x) == -INFINITY)
+        #endif
 
 double rint( double x);
 float  rintf( float x);
@@ -99,30 +188,6 @@ long double remquol( long double x, long double y, int *quo);
 
 long double scalblnl(long double x, long n);
 
-inline long long
-llabs(long long __x) { return __x >= 0 ? __x : -__x; }
-
-
-// end of math functions
-
-uint64_t ReadTime( void );
-double SubtractTime( uint64_t endTime, uint64_t startTime );
-
-#define sleep(X)   Sleep(1000*X)
-// snprintf added in _MSC_VER == 1900 (Visual Studio 2015)
-#if _MSC_VER < 1900
-	#define snprintf   sprintf_s
-#endif
-//#define hypotl     _hypot
-
-float   make_nan();
-float nanf( const char* str);
-double  nan( const char* str);
-long double nanl( const char* str);
-
-//#if defined USE_BOOST
-//#include <boost/math/tr1.hpp>
-//double hypot(double x, double y);
 float hypotf(float x, float y);
 long double hypotl(long double x, long double y) ;
 double lgamma(double x);
@@ -147,70 +212,179 @@ double round(double x);
 float  roundf(float x);
 long double roundl(long double x);
 
-// Added in _MSC_VER == 1800 (Visual Studio 2013)
-#if _MSC_VER < 1800
-	int signbit(double x);
-#endif
-int signbitf(float x);
+        int cf_signbit(double x);
+        int cf_signbitf(float x);
 
-//bool signbitl(long double x)         { return boost::math::tr1::signbit<long double>(x); }
-//#endif // USE_BOOST
+        static int signbit(double x) { return  cf_signbit(x); }
+        static int signbitf(float x) { return cf_signbitf(x); }
 
 long int lrint (double flt);
 long int lrintf (float flt);
 
-
 float   int2float (int32_t ix);
 int32_t float2int (float   fx);
 
+    #endif
+
+    #if ! defined( __INTEL_COMPILER ) || __INTEL_COMPILER < 1300
+        // These functions appeared in Intel C v13.
+        float  nanf( const char* str);
+        double nan( const char* str);
+        long double nanl( const char* str);
+    #endif
+
+    #ifdef __cplusplus
+        }
+    #endif
+
+#endif
+
+#if defined( __ANDROID__ )
+    #define log2(X)  (log(X)/log(2))
+#endif
+
+
+
+//
+// stdio.h
+//
+
+#if defined( _MSC_VER )
+    #define snprintf   sprintf_s
+#endif
+
+
+
+//
+// unistd.h
+//
+
+#if defined( _MSC_VER )
+    EXTERN_C unsigned int sleep( unsigned int sec );
+    EXTERN_C int usleep( int usec );
+#endif
+
+
+
+//
+// syscall.h
+//
+
+#if defined( __ANDROID__ )
+    // Android bionic's isn't providing SYS_sysctl wrappers.
+    #define SYS__sysctl  __NR__sysctl
+#endif
+
+
+
+// Some tests use _malloca which defined in malloc.h.
+#if !defined (__APPLE__)
+#include <malloc.h>
+#endif
+
+
+//
+// ???
+//
+
+#if defined( _MSC_VER )
+
+    #define MAXPATHLEN _MAX_PATH
+
+    EXTERN_C uint64_t ReadTime( void );
+    EXTERN_C double SubtractTime( uint64_t endTime, uint64_t startTime );
+
 /** Returns the number of leading 0-bits in x,
     starting at the most significant bit position.
     If x is 0, the result is undefined.
 */
-int __builtin_clz(unsigned int pattern);
-
-
-static const double zero=  0.00000000000000000000e+00;
-#define NAN  (INFINITY - INFINITY)
-#define HUGE_VALF (float)HUGE_VAL
-
-int usleep(int usec);
-
-// reimplement fenv.h because windows doesn't have it
-#define FE_INEXACT          0x0020
-#define FE_UNDERFLOW        0x0010
-#define FE_OVERFLOW         0x0008
-#define FE_DIVBYZERO        0x0004
-#define FE_INVALID          0x0001
-#define FE_ALL_EXCEPT       0x003D
-
-int fetestexcept(int excepts);
-int feclearexcept(int excepts);
-
-#ifdef __cplusplus
-}
-#endif
-
-#else // !((defined(_WIN32) && defined(_MSC_VER)
-#if defined(__MINGW32__)
-#include <windows.h>
-#define sleep(X)   Sleep(1000*X)
+    EXTERN_C int __builtin_clz(unsigned int pattern);
 
 #endif
-#if defined(__linux__) || defined(__MINGW32__) || defined(__APPLE__)
-#ifndef __STDC_LIMIT_MACROS
-#define __STDC_LIMIT_MACROS
+
+#ifndef MIN
+    #define MIN(x,y) (((x)<(y))?(x):(y))
 #endif
-#include <fenv.h>
-#include <math.h>
-#include <float.h>
-#include <stdint.h>
+#ifndef MAX
+    #define MAX(x,y) (((x)>(y))?(x):(y))
 #endif
+
+
+/*
+    ------------------------------------------------------------------------------------------------
+    WARNING: DO NOT USE THESE MACROS: MAKE_HEX_FLOAT, MAKE_HEX_DOUBLE, MAKE_HEX_LONG.
+
+    This is a typical usage of the macros:
+
+        double yhi = MAKE_HEX_DOUBLE(0x1.5555555555555p-2,0x15555555555555LL,-2);
+
+     (taken from math_brute_force/reference_math.c). There are two problems:
+
+        1.  There is an error here. On Windows in will produce incorrect result
+            `0x1.5555555555555p+50'. To have a correct result it should be written as
+            `MAKE_HEX_DOUBLE(0x1.5555555555555p-2,0x15555555555555LL,-54)'. A proper value of the
+            third argument is not obvious -- sometimes it should be the same as exponent of the
+            first argument, but sometimes not.
+
+        2.  Information is duplicated. It is easy to make a mistake.
+
+    Use HEX_FLT, HEX_DBL, HEX_LDBL macros instead (see them in the bottom of the file).
+    ------------------------------------------------------------------------------------------------
+*/
+#if defined ( _MSC_VER ) && ! defined( __INTEL_COMPILER )
+
+    #define MAKE_HEX_FLOAT(x,y,z)  ((float)ldexp( (float)(y), z))
+    #define MAKE_HEX_DOUBLE(x,y,z) ldexp( (double)(y), z)
+    #define MAKE_HEX_LONG(x,y,z)   ((long double) ldexp( (long double)(y), z))
+
+#else
+
+// Do not use these macros in new code, use HEX_FLT, HEX_DBL, HEX_LDBL instead.
 #define MAKE_HEX_FLOAT(x,y,z) x
 #define MAKE_HEX_DOUBLE(x,y,z) x
 #define MAKE_HEX_LONG(x,y,z) x
 
-#endif // !((defined(_WIN32) && defined(_MSC_VER)
+#endif
 
 
+/*
+    ------------------------------------------------------------------------------------------------
+    HEX_FLT, HEXT_DBL, HEX_LDBL -- Create hex floating point literal of type float, double, long
+    double respectively. Arguments:
+
+        sm    -- sign of number,
+        int   -- integer part of mantissa (without `0x' prefix),
+        fract -- fractional part of mantissa (without decimal point and `L' or `LL' suffixes),
+        se    -- sign of exponent,
+        exp   -- absolute value of (binary) exponent.
+
+    Example:
+
+        double yhi = HEX_DBL( +, 1, 5555555555555, -, 2 ); // == 0x1.5555555555555p-2
+
+    Note:
+
+        We have to pass signs as separate arguments because gcc pass negative integer values
+        (e. g. `-2') into a macro as two separate tokens, so `HEX_FLT( 1, 0, -2 )' produces result
+        `0x1.0p- 2' (note a space between minus and two) which is not a correct floating point
+        literal.
+    ------------------------------------------------------------------------------------------------
+*/
+#if defined ( _MSC_VER ) && ! defined( __INTEL_COMPILER )
+    // If compiler does not support hex floating point literals:
+    #define HEX_FLT(  sm, int, fract, se, exp ) sm ldexpf(       (float)( 0x ## int ## fract ## UL  ), se exp + ilogbf(       (float) 0x ## int ) - ilogbf(       ( float )( 0x ## int ## fract ## UL  ) ) )
+    #define HEX_DBL(  sm, int, fract, se, exp ) sm ldexp(       (double)( 0x ## int ## fract ## ULL ), se exp + ilogb(       (double) 0x ## int ) - ilogb(       ( double )( 0x ## int ## fract ## ULL ) ) )
+    #define HEX_LDBL( sm, int, fract, se, exp ) sm ldexpl( (long double)( 0x ## int ## fract ## ULL ), se exp + ilogbl( (long double) 0x ## int ) - ilogbl( ( long double )( 0x ## int ## fract ## ULL ) ) )
+#else
+    // If compiler supports hex floating point literals: just concatenate all the parts into a literal.
+    #define HEX_FLT(  sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp ## F
+    #define HEX_DBL(  sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp
+    #define HEX_LDBL( sm, int, fract, se, exp ) sm 0x ## int ## . ## fract ## p ## se ## exp ## L
+#endif
+
+#if defined(__MINGW32__)
+    #include <Windows.h>
+    #define sleep(sec) Sleep((sec) * 1000)
+#endif
+
 #endif // _COMPAT_H_
diff --git a/test_conformance/compatibility/test_common/harness/kernelHelpers.c b/test_conformance/compatibility/test_common/harness/kernelHelpers.c
index 3680c06e..42f41aa4 100644
--- a/test_conformance/compatibility/test_common/harness/kernelHelpers.c
+++ b/test_conformance/compatibility/test_common/harness/kernelHelpers.c
@@ -475,14 +475,14 @@ size_t get_pixel_bytes( const cl_image_format *fmt )
     return 0;
 }
 
-int verifyImageSupport( cl_device_id device )
+test_status verifyImageSupport( cl_device_id device )
 {
     if( checkForImageSupport( device ) )
     {
         log_error( "ERROR: Device does not supported images as required by this test!\n" );
-        return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+        return TEST_FAIL;
     }
-    return 0;
+    return TEST_PASS;
 }
 
 int checkForImageSupport( cl_device_id device )
diff --git a/test_conformance/compatibility/test_common/harness/kernelHelpers.h b/test_conformance/compatibility/test_common/harness/kernelHelpers.h
index 59c01b5f..09515e28 100644
--- a/test_conformance/compatibility/test_common/harness/kernelHelpers.h
+++ b/test_conformance/compatibility/test_common/harness/kernelHelpers.h
@@ -17,6 +17,7 @@
 #define _kernelHelpers_h
 
 #include "compat.h"
+#include "testHarness.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -84,8 +85,8 @@ extern int is_image_format_supported( cl_context context, cl_mem_flags flags, cl
 /* Helper to get pixel size for a pixel format */
 size_t get_pixel_bytes( const cl_image_format *fmt );
 
-/* Verify the given device supports images. 0 means you're good to go, otherwise an error */
-extern int verifyImageSupport( cl_device_id device );
+/* Verify the given device supports images. */
+extern test_status verifyImageSupport( cl_device_id device );
 
 /* Checks that the given device supports images. Same as verify, but doesn't print an error */
 extern int checkForImageSupport( cl_device_id device );
diff --git a/test_conformance/compatibility/test_common/harness/msvc9.c b/test_conformance/compatibility/test_common/harness/msvc9.c
index 811c6654..093bb978 100644
--- a/test_conformance/compatibility/test_common/harness/msvc9.c
+++ b/test_conformance/compatibility/test_common/harness/msvc9.c
@@ -13,15 +13,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#if defined(_WIN32) && defined (_MSC_VER)
-
 #include "compat.h"
-#include <math.h>
-#include <float.h>
-#include <assert.h>
-#include <CL/cl_platform.h>
 
+#if defined ( _MSC_VER )
 
+#include <limits.h>
+#include <stdlib.h>
+
+#include <CL/cl.h>
+
+#include <windows.h>
+
+#if ! defined( __INTEL_COMPILER )
 
 ///////////////////////////////////////////////////////////////////
 //
@@ -387,86 +390,6 @@ long double log2l(long double x)
     return 1.44269504088896340735992468100189214L * log(x);
 }
 
-///////////////////////////////////////////////////////////////////
-//
-//                  misc functions
-//
-///////////////////////////////////////////////////////////////////
-
-/*
-// This function is commented out because the Windows implementation should never call munmap.
-// If it is calling it, we have a bug. Please file a bugzilla.
-int munmap(void *addr, size_t len)
-{
-// FIXME: this is not correct.  munmap is like free()    http://www.opengroup.org/onlinepubs/7990989775/xsh/munmap.html
-
-    return (int)VirtualAlloc( (LPVOID)addr, len,
-                  MEM_COMMIT|MEM_RESERVE, PAGE_NOACCESS );
-}
-*/
-
-uint64_t ReadTime( void )
-{
-    LARGE_INTEGER current;
-    QueryPerformanceCounter(&current);
-    return (uint64_t)current.QuadPart;
-}
-
-double SubtractTime( uint64_t endTime, uint64_t startTime )
-{
-    static double PerformanceFrequency = 0.0;
-
-    if (PerformanceFrequency == 0.0) {
-        LARGE_INTEGER frequency;
-        QueryPerformanceFrequency(&frequency);
-        PerformanceFrequency = (double) frequency.QuadPart;
-    }
-
-    return (double)(endTime - startTime) / PerformanceFrequency * 1e9;
-}
-
-float make_nan()
-{
-/* This is the IEEE 754 single-precision format:
-    unsigned int mantissa:  22;
-    unsigned int quiet_nan:  1;
-    unsigned int exponent:   8;
-    unsigned int negative:   1;
-*/
-     //const static unsigned
-     static const int32_t _nan = 0x7fc00000;
-     return *(const float*)(&_nan);
-}
-
-float nanf( const char* str)
-{
-    cl_uint u = atoi( str );
-    u |= 0x7fc00000U;
-    return *( float*)(&u);
-}
-
-
-double nan( const char* str)
-{
-    cl_ulong u = atoi( str );
-    u |= 0x7ff8000000000000ULL;
-    return *( double*)(&u);
-}
-
-// double check this implementatation
-long double nanl( const char* str)
-{
-    union
-    {
-        long double f;
-        struct { cl_ulong m; cl_ushort sexp; }u;
-    }u;
-    u.u.sexp = 0x7fff;
-    u.u.m = 0x8000000000000000ULL | atoi( str );
-
-    return u.f;
-}
-
 double trunc(double x)
 {
     double absx = fabs(x);
@@ -589,31 +512,6 @@ long double roundl(long double x)
     return x;
 }
 
-// Added in _MSC_VER == 1800 (Visual Studio 2013)
-#if _MSC_VER < 1800
-int signbit(double x)
-{
-    union
-    {
-        double f;
-        cl_ulong u;
-    }u;
-    u.f = x;
-    return u.u >> 63;
-}
-#endif
-
-int signbitf(float x)
-{
-    union
-    {
-        float f;
-        cl_uint u;
-    }u;
-    u.f = x;
-    return u.u >> 31;
-}
-
 float cbrtf( float x )
 {
     float z = pow( fabs((double) x), 1.0 / 3.0 );
@@ -625,6 +523,175 @@ double cbrt( double x )
     return copysign( pow( fabs( x ), 1.0 / 3.0 ), x );
 }
 
+long int lrint (double x)
+{
+    double absx = fabs(x);
+
+    if( x >= (double) LONG_MAX )
+        return LONG_MAX;
+
+    if( absx < 4503599627370496.0 /* 0x1.0p52 */ )
+    {
+        double magic = copysign( 4503599627370496.0 /* 0x1.0p52 */, x );
+        double rounded = x + magic;
+        rounded -= magic;
+        return (long int) rounded;
+    }
+
+    return (long int) x;
+}
+
+long int lrintf (float x)
+{
+    float absx = fabsf(x);
+
+    if( x >= (float) LONG_MAX )
+        return LONG_MAX;
+
+    if( absx < 8388608.0f /* 0x1.0p23f */ )
+    {
+        float magic = copysignf( 8388608.0f /* 0x1.0p23f */, x );
+        float rounded = x + magic;
+        rounded -= magic;
+        return (long int) rounded;
+    }
+
+    return (long int) x;
+}
+
+
+///////////////////////////////////////////////////////////////////
+//
+//                  fenv functions
+//
+///////////////////////////////////////////////////////////////////
+
+int fetestexcept(int excepts)
+{
+    unsigned int status = _statusfp();
+    return excepts & (
+        ((status & _SW_INEXACT) ? FE_INEXACT : 0)      |
+        ((status & _SW_UNDERFLOW) ? FE_UNDERFLOW : 0)  |
+        ((status & _SW_OVERFLOW) ? FE_OVERFLOW : 0)    |
+        ((status & _SW_ZERODIVIDE) ? FE_DIVBYZERO : 0) |
+        ((status & _SW_INVALID) ? FE_INVALID : 0)
+    );
+}
+
+int feclearexcept(int excepts)
+{
+    _clearfp();
+    return 0;
+}
+
+#endif // __INTEL_COMPILER
+
+#if ! defined( __INTEL_COMPILER ) || __INTEL_COMPILER < 1300
+
+float make_nan()
+{
+/* This is the IEEE 754 single-precision format:
+    unsigned int mantissa:  22;
+    unsigned int quiet_nan:  1;
+    unsigned int exponent:   8;
+    unsigned int negative:   1;
+*/
+     //const static unsigned
+     static const int32_t _nan = 0x7fc00000;
+     return *(const float*)(&_nan);
+}
+
+float nanf( const char* str)
+{
+    cl_uint u = atoi( str );
+    u |= 0x7fc00000U;
+    return *( float*)(&u);
+}
+
+
+double nan( const char* str)
+{
+    cl_ulong u = atoi( str );
+    u |= 0x7ff8000000000000ULL;
+    return *( double*)(&u);
+}
+
+// double check this implementatation
+long double nanl( const char* str)
+{
+    union
+    {
+        long double f;
+        struct { cl_ulong m; cl_ushort sexp; }u;
+    }u;
+    u.u.sexp = 0x7fff;
+    u.u.m = 0x8000000000000000ULL | atoi( str );
+
+    return u.f;
+}
+
+#endif
+
+///////////////////////////////////////////////////////////////////
+//
+//                  misc functions
+//
+///////////////////////////////////////////////////////////////////
+
+/*
+// This function is commented out because the Windows implementation should never call munmap.
+// If it is calling it, we have a bug. Please file a bugzilla.
+int munmap(void *addr, size_t len)
+{
+// FIXME: this is not correct.  munmap is like free()    http://www.opengroup.org/onlinepubs/7990989775/xsh/munmap.html
+
+    return (int)VirtualAlloc( (LPVOID)addr, len,
+                  MEM_COMMIT|MEM_RESERVE, PAGE_NOACCESS );
+}
+*/
+
+uint64_t ReadTime( void )
+{
+    LARGE_INTEGER current;
+    QueryPerformanceCounter(&current);
+    return (uint64_t)current.QuadPart;
+}
+
+double SubtractTime( uint64_t endTime, uint64_t startTime )
+{
+    static double PerformanceFrequency = 0.0;
+
+    if (PerformanceFrequency == 0.0) {
+        LARGE_INTEGER frequency;
+        QueryPerformanceFrequency(&frequency);
+        PerformanceFrequency = (double) frequency.QuadPart;
+    }
+
+    return (double)(endTime - startTime) / PerformanceFrequency * 1e9;
+}
+
+int cf_signbit(double x)
+{
+    union
+    {
+        double f;
+        cl_ulong u;
+    }u;
+    u.f = x;
+    return u.u >> 63;
+}
+
+int cf_signbitf(float x)
+{
+    union
+    {
+        float f;
+        cl_uint u;
+    }u;
+    u.f = x;
+    return u.u >> 31;
+}
+
 float int2float (int32_t ix)
 {
     union {
@@ -645,7 +712,7 @@ int32_t float2int (float   fx)
     return u.i;
 }
 
-#if defined(_MSC_VER) && !defined(_WIN64)
+#if !defined(_WIN64)
 /** Returns the number of leading 0-bits in x,
     starting at the most significant bit position.
     If x is 0, the result is undefined.
@@ -685,45 +752,10 @@ int __builtin_clz(unsigned int pattern)
    return count;
 }
 
-#endif //defined(_MSC_VER) && !defined(_WIN64)
+#endif // !defined(_WIN64)
 
 #include <intrin.h>
 #include <emmintrin.h>
-long int lrint (double x)
-{
-    double absx = fabs(x);
-
-    if( x >= (double) LONG_MAX )
-        return LONG_MAX;
-
-    if( absx < 4503599627370496.0 /* 0x1.0p52 */ )
-    {
-        double magic = copysign( 4503599627370496.0 /* 0x1.0p52 */, x );
-        double rounded = x + magic;
-        rounded -= magic;
-        return (long int) rounded;
-    }
-
-    return (long int) x;
-}
-
-long int lrintf (float x)
-{
-    float absx = fabsf(x);
-
-    if( x >= (float) LONG_MAX )
-        return LONG_MAX;
-
-    if( absx < 8388608.0f /* 0x1.0p23f */ )
-    {
-        float magic = copysignf( 8388608.0f /* 0x1.0p23f */, x );
-        float rounded = x + magic;
-        rounded -= magic;
-        return (long int) rounded;
-    }
-
-    return (long int) x;
-}
 
 int usleep(int usec)
 {
@@ -731,24 +763,10 @@ int usleep(int usec)
     return 0;
 }
 
-#if _MSC_VER < 1900
-int fetestexcept(int excepts)
+unsigned int sleep( unsigned int sec )
 {
-    unsigned int status = _statusfp();
-    return excepts & (
-        ((status & _SW_INEXACT) ? FE_INEXACT : 0)      |
-        ((status & _SW_UNDERFLOW) ? FE_UNDERFLOW : 0)  |
-        ((status & _SW_OVERFLOW) ? FE_OVERFLOW : 0)    |
-        ((status & _SW_ZERODIVIDE) ? FE_DIVBYZERO : 0) |
-        ((status & _SW_INVALID) ? FE_INVALID : 0)
-    );
-}
-
-int feclearexcept(int excepts)
-{
-    _clearfp();
+    Sleep( sec * 1000 );
     return 0;
 }
-#endif
 
-#endif //defined(_WIN32)
+#endif // defined( _MSC_VER )
diff --git a/test_conformance/compatibility/test_common/harness/testHarness.c b/test_conformance/compatibility/test_common/harness/testHarness.c
index 1994399d..af05c8d5 100644
--- a/test_conformance/compatibility/test_common/harness/testHarness.c
+++ b/test_conformance/compatibility/test_common/harness/testHarness.c
@@ -439,10 +439,18 @@ int runTestHarnessWithCheck( int argc, const char *argv[], unsigned int num_fns,
 
 
     /* If we have a device checking function, run it */
-    if( ( deviceCheckFn != NULL ) && deviceCheckFn( device ) != CL_SUCCESS )
+    if( ( deviceCheckFn != NULL ) )
     {
-        test_finish();
-        return -1;
+        test_status status = deviceCheckFn( device );
+        switch (status)
+        {
+            case TEST_PASS:
+                break;
+            case TEST_FAIL:
+                return 1;
+            case TEST_SKIP:
+                return 0;
+        }
     }
 
     if (num_elements <= 0)
diff --git a/test_conformance/compatibility/test_common/harness/testHarness.h b/test_conformance/compatibility/test_common/harness/testHarness.h
index c2620647..7c1d9cb2 100644
--- a/test_conformance/compatibility/test_common/harness/testHarness.h
+++ b/test_conformance/compatibility/test_common/harness/testHarness.h
@@ -23,6 +23,13 @@
 extern "C" {
 #endif
 
+typedef enum test_status
+{
+    TEST_PASS = 0,
+    TEST_FAIL = 1,
+    TEST_SKIP = 2,
+} test_status;
+
 extern cl_uint gReSeed;
 extern cl_uint gRandomSeed;
 
@@ -32,8 +39,8 @@ extern int runTestHarness( int argc, const char *argv[], unsigned int num_fns,
                             basefn fnList[], const char *fnNames[],
                             int imageSupportRequired, int forceNoContextCreation, cl_command_queue_properties queueProps );
 
-// Device checking function. See runTestHarnessWithCheck. If this function returns anything other than CL_SUCCESS (0), the harness exits.
-typedef int (*DeviceCheckFn)( cl_device_id device );
+// Device checking function. See runTestHarnessWithCheck. If this function returns anything other than TEST_PASS, the harness exits.
+typedef test_status (*DeviceCheckFn)( cl_device_id device );
 
 // Same as runTestHarness, but also supplies a function that checks the created device for required functionality.
 extern int runTestHarnessWithCheck( int argc, const char *argv[], unsigned int num_fns,
diff --git a/test_conformance/compatibility/test_conformance/api/CMakeLists.txt b/test_conformance/compatibility/test_conformance/api/CMakeLists.txt
index 50dd64ef..4af76989 100644
--- a/test_conformance/compatibility/test_conformance/api/CMakeLists.txt
+++ b/test_conformance/compatibility/test_conformance/api/CMakeLists.txt
@@ -20,6 +20,7 @@ set(${MODULE_NAME}_SOURCES
          test_mem_object_info.cpp
          test_null_buffer_arg.c
          test_kernel_arg_info.c
+         test_queue_properties.cpp
          ../../test_common/harness/errorHelpers.c
          ../../test_common/harness/threadTesting.c
          ../../test_common/harness/testHarness.c
diff --git a/test_conformance/compatibility/test_conformance/api/main.c b/test_conformance/compatibility/test_conformance/api/main.c
index 1870bf8f..a8e9e993 100644
--- a/test_conformance/compatibility/test_conformance/api/main.c
+++ b/test_conformance/compatibility/test_conformance/api/main.c
@@ -113,6 +113,7 @@ basefn    basefn_list[] = {
     test_get_image1d_info,
     test_get_image1d_array_info,
     test_get_image2d_array_info,
+    test_queue_properties,
 };
 
 
@@ -200,6 +201,7 @@ const char    *basefn_names[] = {
     "get_image1d_info",
     "get_image1d_array_info",
     "get_image2d_array_info",
+    "queue_properties",
 };
 
 ct_assert((sizeof(basefn_names) / sizeof(basefn_names[0])) == (sizeof(basefn_list) / sizeof(basefn_list[0])));
diff --git a/test_conformance/compatibility/test_conformance/api/procs.h b/test_conformance/compatibility/test_conformance/api/procs.h
index ff796a38..e0a85356 100644
--- a/test_conformance/compatibility/test_conformance/api/procs.h
+++ b/test_conformance/compatibility/test_conformance/api/procs.h
@@ -105,4 +105,5 @@ extern int      test_get_image1d_info( cl_device_id deviceID, cl_context context
 extern int      test_get_image1d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
 extern int      test_get_image2d_array_info( cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements );
 extern int      test_get_kernel_arg_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
+extern int      test_queue_properties( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements );
 
diff --git a/test_conformance/compatibility/test_conformance/api/test_queue_properties.cpp b/test_conformance/compatibility/test_conformance/api/test_queue_properties.cpp
new file mode 100644
index 00000000..3b368077
--- /dev/null
+++ b/test_conformance/compatibility/test_conformance/api/test_queue_properties.cpp
@@ -0,0 +1,174 @@
+//
+// Copyright (c) 2018 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+
+#include "testBase.h"
+#include "../../test_common/harness/typeWrappers.h"
+#include "../../test_common/harness/conversions.h"
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace std;
+/*
+The test against cl_khr_create_command_queue extension. It validates if devices with Opencl 1.X can use clCreateCommandQueueWithPropertiesKHR function.
+Based on device capabilities test will create queue with NULL properties, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE property and
+CL_QUEUE_PROFILING_ENABLE property. Finally simple kernel will be executed on such queue.
+*/
+
+const char *queue_test_kernel[] = {
+"__kernel void vec_cpy(__global int *src, __global int *dst)\n"
+"{\n"
+"    int  tid = get_global_id(0);\n"
+"\n"
+"    dst[tid] = src[tid];\n"
+"\n"
+"}\n" };
+
+int enqueue_kernel(cl_context context, const cl_queue_properties_khr *queue_prop_def, cl_device_id deviceID, clKernelWrapper& kernel, size_t num_elements)
+{
+    clMemWrapper streams[2];
+    int error;
+    std::vector<int> buf(num_elements);
+    clCreateCommandQueueWithPropertiesKHR_fn clCreateCommandQueueWithPropertiesKHR = NULL;
+    cl_platform_id platform;
+    clEventWrapper event;
+
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
+
+    clCreateCommandQueueWithPropertiesKHR = (clCreateCommandQueueWithPropertiesKHR_fn) clGetExtensionFunctionAddressForPlatform(platform, "clCreateCommandQueueWithPropertiesKHR");
+    if (clCreateCommandQueueWithPropertiesKHR == NULL)
+    {
+        log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed\n");
+        return -1;
+    }
+
+    clCommandQueueWrapper queue = clCreateCommandQueueWithPropertiesKHR(context, deviceID, queue_prop_def, &error);
+    test_error(error, "clCreateCommandQueueWithPropertiesKHR failed");
+
+    for (int i = 0; i < num_elements; ++i)
+    {
+        buf[i] = i;
+    }
+
+    streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, num_elements * sizeof(int), buf.data(), &error);
+    test_error( error, "clCreateBuffer failed." );
+    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, num_elements * sizeof(int), NULL, &error);
+    test_error( error, "clCreateBuffer failed." );
+
+    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
+    test_error( error, "clSetKernelArg failed." );
+
+    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
+    test_error( error, "clSetKernelArg failed." );
+
+    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &num_elements, NULL, 0, NULL, &event);
+    test_error( error, "clEnqueueNDRangeKernel failed." );
+    
+    error = clWaitForEvents(1, &event);
+    test_error(error, "clWaitForEvents failed.");
+    
+    error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, num_elements, buf.data(), 0, NULL, NULL);
+    test_error( error, "clEnqueueReadBuffer failed." );
+
+    for (int i = 0; i < num_elements; ++i)
+    {
+        if (buf[i] != i)
+        {
+            log_error("ERROR: Incorrect vector copy result.");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+int test_queue_properties(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+{
+    if (num_elements <= 0)
+    {
+        num_elements = 128;
+    }
+    int error = 0;
+
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    size_t strSize;
+    std::string strExt(0, '\0');
+    cl_queue_properties_khr device_props = NULL;
+    cl_queue_properties_khr queue_prop_def[] = { CL_QUEUE_PROPERTIES, 0, 0 };
+
+    // Query extension
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_EXTENSIONS, 0, NULL, &strSize);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_EXTENSIONS failed");
+    strExt.resize(strSize);
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_EXTENSIONS, strExt.size(), &strExt[0], NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_EXTENSIONS failed");
+    log_info("CL_DEVICE_EXTENSIONS:\n%s\n\n", strExt.c_str());
+
+    if (strExt.find("cl_khr_create_command_queue") == string::npos)
+    {
+        log_info("extension cl_khr_create_command_queue is not supported.\n");
+        return 0;
+    }
+
+    error = create_single_kernel_helper(context, &program, &kernel, 1, queue_test_kernel, "vec_cpy");
+    test_error(error, "create_single_kernel_helper failed");
+
+    log_info("Queue property NULL. Testing ... \n");
+    error = enqueue_kernel(context, NULL,deviceID, kernel, (size_t)num_elements);
+    test_error(error, "enqueue_kernel failed");
+
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_PROPERTIES, sizeof(device_props), &device_props, NULL);
+    test_error(error, "clGetDeviceInfo for CL_DEVICE_QUEUE_PROPERTIES failed");
+
+    if (device_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
+    {
+        log_info("Queue property CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE supported. Testing ... \n");
+        queue_prop_def[1] = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+        error = enqueue_kernel(context, queue_prop_def, deviceID, kernel, (size_t)num_elements);
+        test_error(error, "enqueue_kernel failed");
+    } else
+    {
+        log_info("Queue property CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE not supported \n");
+    }
+
+    if (device_props & CL_QUEUE_PROFILING_ENABLE)
+    {
+        log_info("Queue property CL_QUEUE_PROFILING_ENABLE supported. Testing ... \n");
+        queue_prop_def[1] = CL_QUEUE_PROFILING_ENABLE;
+        error = enqueue_kernel(context, queue_prop_def, deviceID, kernel, (size_t)num_elements);
+        test_error(error, "enqueue_kernel failed");
+    } else
+    {
+        log_info("Queue property CL_QUEUE_PROFILING_ENABLE not supported \n");
+    }
+
+    if (device_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE && device_props & CL_QUEUE_PROFILING_ENABLE)
+    {
+        log_info("Queue property CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE & CL_QUEUE_PROFILING_ENABLE supported. Testing ... \n");
+        queue_prop_def[1] = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_PROFILING_ENABLE;
+        error = enqueue_kernel(context, queue_prop_def, deviceID, kernel, (size_t)num_elements);
+        test_error(error, "enqueue_kernel failed");
+    }
+    else
+    {
+        log_info("Queue property CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE or CL_QUEUE_PROFILING_ENABLE not supported \n");
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/test_conformance/compatibility/test_conformance/basic/run_array b/test_conformance/compatibility/test_conformance/basic/run_array
old mode 100644
new mode 100755
diff --git a/test_conformance/compatibility/test_conformance/basic/run_array_image_copy b/test_conformance/compatibility/test_conformance/basic/run_array_image_copy
old mode 100644
new mode 100755
diff --git a/test_conformance/compatibility/test_conformance/basic/run_image b/test_conformance/compatibility/test_conformance/basic/run_image
old mode 100644
new mode 100755
diff --git a/test_conformance/compatibility/test_conformance/basic/run_multi_read_image b/test_conformance/compatibility/test_conformance/basic/run_multi_read_image
old mode 100644
new mode 100755
diff --git a/test_conformance/compatibility/test_conformance/basic/test_async_strided_copy.cpp b/test_conformance/compatibility/test_conformance/basic/test_async_strided_copy.cpp
index 6db06398..dca52b5e 100644
--- a/test_conformance/compatibility/test_conformance/basic/test_async_strided_copy.cpp
+++ b/test_conformance/compatibility/test_conformance/basic/test_async_strided_copy.cpp
@@ -202,10 +202,10 @@ int test_strided_copy(cl_device_id deviceID, cl_context context, cl_command_queu
             log_error( "ERROR: Results of copy did not validate!\n" );
             sprintf(values + strlen( values), "%d -> [", i);
             for (int j=0; j<(int)elementSize; j++)
-                sprintf(values + strlen( values), "%2x ", inchar[i*elementSize+j]);
+                sprintf(values + strlen( values), "%2x ", inchar[j]);
             sprintf(values + strlen(values), "] != [");
             for (int j=0; j<(int)elementSize; j++)
-                sprintf(values + strlen( values), "%2x ", outchar[i*elementSize+j]);
+                sprintf(values + strlen( values), "%2x ", outchar[j]);
             sprintf(values + strlen(values), "]");
             log_error("%s\n", values);
 
diff --git a/test_conformance/compatibility/test_conformance/basic/test_sizeof.c b/test_conformance/compatibility/test_conformance/basic/test_sizeof.c
index bece1b94..9c56bb58 100644
--- a/test_conformance/compatibility/test_conformance/basic/test_sizeof.c
+++ b/test_conformance/compatibility/test_conformance/basic/test_sizeof.c
@@ -133,7 +133,7 @@ const size_table  vector_table[] =
 
 const char  *ptr_table[] =
 {
-    "void*",
+    "global void*",
     "size_t",
     "sizeof(int)",      // check return type of sizeof
     "ptrdiff_t"
diff --git a/test_conformance/compatibility/test_conformance/images/image_helpers.h b/test_conformance/compatibility/test_conformance/images/image_helpers.h
index d2132c7e..cd0eff6c 100644
--- a/test_conformance/compatibility/test_conformance/images/image_helpers.h
+++ b/test_conformance/compatibility/test_conformance/images/image_helpers.h
@@ -481,7 +481,6 @@ extern char *create_random_image_data( ExplicitType dataType, image_descriptor *
 extern void get_sampler_kernel_code( image_sampler_data *imageSampler, char *outLine );
 extern float get_max_absolute_error( cl_image_format *format, image_sampler_data *sampler);
 extern float get_max_relative_error( cl_image_format *format, image_sampler_data *sampler, int is3D, int isLinearFilter );
-extern int issubnormal(float);
 
 
 #define errMax( _x , _y )       ( (_x) != (_x) ? (_x) : (_x) > (_y) ? (_x) : (_y) )
diff --git a/test_conformance/compiler/test_async_build.c b/test_conformance/compiler/test_async_build.c
index e32c64ab..b4c08b5e 100644
--- a/test_conformance/compiler/test_async_build.c
+++ b/test_conformance/compiler/test_async_build.c
@@ -86,9 +86,8 @@ int test_async_build_pieces(cl_device_id deviceID, cl_context context, cl_comman
         return -1;
     }
 
-    clReleaseProgram( program );
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
     return 0;
 }
-
-
diff --git a/test_conformance/compiler/test_build_helpers.c b/test_conformance/compiler/test_build_helpers.c
index 0fcb7103..35875e04 100644
--- a/test_conformance/compiler/test_build_helpers.c
+++ b/test_conformance/compiler/test_build_helpers.c
@@ -172,7 +172,8 @@ int test_load_two_kernel_source(cl_device_id deviceID, cl_context context, cl_co
     /* Should probably check binary here to verify the same results... */
 
     /* All done! */
-    clReleaseProgram( program );
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
     return 0;
 }
@@ -198,7 +199,8 @@ int test_load_null_terminated_source(cl_device_id deviceID, cl_context context,
     /* Should probably check binary here to verify the same results... */
 
     /* All done! */
-    clReleaseProgram( program );
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
     return 0;
 }
@@ -224,7 +226,8 @@ int test_load_null_terminated_multi_line_source(cl_device_id deviceID, cl_contex
     /* Should probably check binary here to verify the same results... */
 
     /* All done! */
-    clReleaseProgram( program );
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
     return 0;
 }
@@ -262,7 +265,8 @@ int test_load_discreet_length_source(cl_device_id deviceID, cl_context context,
     /* Should probably check binary here to verify the same results... */
 
     /* All done! */
-    clReleaseProgram( program );
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
     return 0;
 }
@@ -297,7 +301,8 @@ int test_load_null_terminated_partial_multi_line_source(cl_device_id deviceID, c
     /* Should probably check binary here to verify the same results... */
 
     /* All done! */
-    clReleaseProgram( program );
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
     return 0;
 }
@@ -393,7 +398,9 @@ int test_get_program_info(cl_device_id deviceID, cl_context context, cl_command_
         return -1;
     }
 
-    clReleaseProgram( program );
+    /* All done! */
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
     return 0;
 }
@@ -445,8 +452,10 @@ int test_get_program_source(cl_device_id deviceID, cl_context context, cl_comman
         return -1;
     }
 
-    /* if we got here, everything passed */
-    clReleaseProgram( program );
+    /* All done! */
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
+
     return 0;
 }
 
@@ -493,26 +502,25 @@ int test_get_program_build_info(cl_device_id deviceID, cl_context context, cl_co
     error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
     test_error( error, "Unable to get program build log length" );
 
-  log_info("Build log is %ld long.\n", length);
+    log_info("Build log is %ld long.\n", length);
 
-  buffer = (char*)malloc(length);
+    buffer = (char*)malloc(length);
 
     /* Try normal source */
     error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_LOG, length, buffer, NULL );
     test_error( error, "Unable to get program build log" );
 
-  if( buffer[length-1] != '\0' )
-  {
+    if( buffer[length-1] != '\0' )
+    {
         log_error( "clGetProgramBuildInfo overwrote allocated space for build log! '%c'\n", buffer[length-1]  );
         return -1;
-  }
+    }
 
     /* Try both at once */
     error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_LOG, length, buffer, &newLength );
     test_error( error, "Unable to get program build log" );
 
-  free(buffer);
-
+    free(buffer);
 
     /***** Build options *****/
     error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_OPTIONS, 0, NULL, &length );
@@ -530,8 +538,10 @@ int test_get_program_build_info(cl_device_id deviceID, cl_context context, cl_co
 
     free(buffer);
 
-  /* Try with a valid option */
-  clReleaseProgram( program );
+    /* Try with a valid option */
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
+
     program = clCreateProgramWithSource( context, 1, sample_kernel_code_single_line, NULL, &error );
     if( program == NULL )
     {
@@ -546,10 +556,10 @@ int test_get_program_build_info(cl_device_id deviceID, cl_context context, cl_co
         return -1;
     }
 
-  error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_OPTIONS, NULL, NULL, &length );
+    error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_OPTIONS, NULL, NULL, &length );
     test_error( error, "Unable to get program build options" );
 
-  buffer = (char*)malloc(length);
+    buffer = (char*)malloc(length);
 
     error = clGetProgramBuildInfo( program, deviceID, CL_PROGRAM_BUILD_OPTIONS, length, buffer, NULL );
     test_error( error, "Unable to get program build options" );
@@ -559,13 +569,11 @@ int test_get_program_build_info(cl_device_id deviceID, cl_context context, cl_co
         return -1;
     }
 
-  free(buffer);
+    /* All done */
+    free( buffer );
+
+    error = clReleaseProgram( program );
+    test_error( error, "Unable to release program object" );
 
-    /* if we got here, everything passed */
-    clReleaseProgram( program );
     return 0;
 }
-
-
-
-
diff --git a/test_conformance/compiler/test_compile.c b/test_conformance/compiler/test_compile.c
index 926d69bd..dc0d0972 100644
--- a/test_conformance/compiler/test_compile.c
+++ b/test_conformance/compiler/test_compile.c
@@ -558,6 +558,9 @@ int test_large_multiple_embedded_headers(cl_context context, cl_device_id device
     free( simple_kernels );
     free( headers );
 
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( my_newly_minted_library );
     test_error( error, "Unable to release program object" );
 
@@ -728,6 +731,9 @@ int test_large_multiple_libraries(cl_context context, cl_device_id deviceID, cl_
     }
     free( simple_kernels );
 
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( my_newly_linked_program );
     test_error( error, "Unable to release program object" );
 
@@ -895,6 +901,9 @@ int test_large_multiple_files_multiple_libraries(cl_context context, cl_device_i
     }
     free( simple_kernels );
 
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( my_newly_linked_program );
     test_error( error, "Unable to release program object" );
 
@@ -1032,6 +1041,9 @@ int test_large_multiple_files(cl_context context, cl_device_id deviceID, cl_comm
     }
     free( lines );
 
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( my_newly_linked_program );
     test_error( error, "Unable to release program object" );
 
@@ -1311,6 +1323,9 @@ int test_simple_embedded_header_compile(cl_device_id deviceID, cl_context contex
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
+    error = clReleaseProgram( header );
+    test_error( error, "Unable to release program object" );
+
     return 0;
 }
 
@@ -1517,6 +1532,9 @@ int test_simple_embedded_header_link(cl_device_id deviceID, cl_context context,
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
+    error = clReleaseProgram( header );
+    test_error( error, "Unable to release program object" );
+
     error = clReleaseProgram( simple_program );
     test_error( error, "Unable to release program object" );
 
@@ -1571,7 +1589,7 @@ int test_simple_link_with_callback(cl_device_id deviceID, cl_context context, cl
 
     simple_user_data simple_link_user_data = {when_i_pondered_weak_and_weary, link_program_completion_event};
 
-    clLinkProgram(context, 1, &deviceID, NULL, 1, &program, simple_link_callback, (void*)&simple_link_user_data, &error);
+    cl_program my_linked_library = clLinkProgram(context, 1, &deviceID, NULL, 1, &program, simple_link_callback, (void*)&simple_link_user_data, &error);
     test_error( error, "Unable to link a simple program" );
 
     error = clWaitForEvents(1, &link_program_completion_event);
@@ -1584,6 +1602,9 @@ int test_simple_link_with_callback(cl_device_id deviceID, cl_context context, cl
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
+    error = clReleaseProgram( my_linked_library );
+    test_error( error, "Unable to release program object" );
+
     return 0;
 }
 
@@ -1738,6 +1759,9 @@ int test_execute_after_simple_compile_and_link(cl_device_id deviceID, cl_context
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -1774,6 +1798,9 @@ int test_execute_after_simple_compile_and_link_no_device_info(cl_device_id devic
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -1810,6 +1837,9 @@ int test_execute_after_simple_compile_and_link_with_defines(cl_device_id deviceI
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -1887,6 +1917,9 @@ int test_execute_after_serialize_reload_object(cl_device_id deviceID, cl_context
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -1991,6 +2024,12 @@ int test_execute_after_serialize_reload_library(cl_device_id deviceID, cl_contex
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
+    error = clReleaseKernel( another_kernel );
+    test_error( error, "Unable to release another kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -2065,7 +2104,8 @@ int test_execute_after_simple_compile_and_link_with_callbacks(cl_device_id devic
     error = clWaitForEvents(1, &compile_program_completion_event);
     test_error( error, "clWaitForEvents failed when waiting on compile_program_completion_event");
 
-    clReleaseEvent(compile_program_completion_event);
+    error = clReleaseEvent(compile_program_completion_event);
+    test_error( error, "Unable to release event object" );
 
     link_program_completion_event = clCreateUserEvent(context, &error);
     test_error( error, "Unable to create a user event");
@@ -2077,7 +2117,8 @@ int test_execute_after_simple_compile_and_link_with_callbacks(cl_device_id devic
     error = clWaitForEvents(1, &link_program_completion_event);
     test_error( error, "clWaitForEvents failed when waiting on link_program_completion_event");
 
-    clReleaseEvent(link_program_completion_event);
+    error = clReleaseEvent(link_program_completion_event);
+    test_error( error, "Unable to release event object" );
 
     cl_kernel kernel = clCreateKernel(my_newly_linked_program, "CopyBuffer", &error);
     test_error( error, "Unable to create a simple kernel" );
@@ -2087,6 +2128,9 @@ int test_execute_after_simple_compile_and_link_with_callbacks(cl_device_id devic
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -2264,6 +2308,12 @@ int test_execute_after_simple_library_with_link(cl_device_id deviceID, cl_contex
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
+    error = clReleaseKernel( another_kernel );
+    test_error( error, "Unable to release another kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -2368,6 +2418,12 @@ int test_execute_after_two_file_link(cl_device_id deviceID, cl_context context,
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
+    error = clReleaseKernel( another_kernel );
+    test_error( error, "Unable to release another kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -2433,9 +2489,18 @@ int test_execute_after_embedded_header_link(cl_device_id deviceID, cl_context co
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
+    error = clReleaseKernel( another_kernel );
+    test_error( error, "Unable to release another kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
+    error = clReleaseProgram( header );
+    test_error( error, "Unable to release program object" );
+
     error = clReleaseProgram( simple_program );
     test_error( error, "Unable to release program object" );
 
@@ -2563,6 +2628,12 @@ int test_execute_after_included_header_link(cl_device_id deviceID, cl_context co
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
+    error = clReleaseKernel( another_kernel );
+    test_error( error, "Unable to release another kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -2803,6 +2874,12 @@ int test_program_binary_type(cl_device_id deviceID, cl_context context, cl_comma
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
+    error = clReleaseKernel( another_kernel );
+    test_error( error, "Unable to release another kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
@@ -3107,6 +3184,9 @@ int test_large_compile_and_link_status_options_log(cl_context context, cl_device
         return error;
 
     /* All done! */
+    error = clReleaseKernel( kernel );
+    test_error( error, "Unable to release kernel object" );
+
     error = clReleaseProgram( program );
     test_error( error, "Unable to release program object" );
 
diff --git a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
index a262d019..81e05232 100644
--- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
+++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp
@@ -55,6 +55,8 @@ const char *known_extensions[] = {
     "cl_khr_egl_event",
     "cl_khr_throttle_hints",
     "cl_khr_priority_hints",
+    "cl_khr_create_command_queue",
+    "cl_khr_il_program",
 };
 
 size_t num_known_extensions = sizeof(known_extensions)/sizeof(char*);
@@ -413,6 +415,10 @@ int test_compiler_defines_for_extensions(cl_device_id device, cl_context context
       free(extensions_supported[i]);
     }
     free(extensions);
+    if( defines ) {
+        error = clReleaseMemObject( defines );
+        test_error( error, "Unable to release memory object" );
+    }
 
     if (total_errors)
         return -1;
diff --git a/test_conformance/compiler/test_image_macro.c b/test_conformance/compiler/test_image_macro.c
index 329a017c..f85f092f 100644
--- a/test_conformance/compiler/test_image_macro.c
+++ b/test_conformance/compiler/test_image_macro.c
@@ -89,7 +89,13 @@ int test_image_macro(cl_device_id deviceID, cl_context context, cl_command_queue
             log_info("CL_DEVICE_IMAGE_SUPPORT not set, __IMAGE_SUPPORT__ macro not set \n");
     }
 
-    clReleaseProgram( program );
+    status = clReleaseProgram( program );
+    if( status )
+    {
+        log_error ("Unable to release program object, [%d] \n", status );
+        return status;
+    }
+
     return status;
 }
 
diff --git a/test_conformance/computeinfo/main.c b/test_conformance/computeinfo/main.c
index 8126c95a..71bb1978 100644
--- a/test_conformance/computeinfo/main.c
+++ b/test_conformance/computeinfo/main.c
@@ -227,6 +227,10 @@ config_info config_infos[] =
     CONFIG_INFO( 2, 0, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT , cl_uint),
 
     CONFIG_INFO( 2, 0, CL_DEVICE_SVM_CAPABILITIES, cl_device_svm_capabilities),
+
+    CONFIG_INFO( 2, 1, CL_DEVICE_IL_VERSION, string),
+    CONFIG_INFO( 2, 1, CL_DEVICE_MAX_NUM_SUB_GROUPS, cl_uint),
+    CONFIG_INFO( 2, 1, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, cl_uint),
 };
 
 #define ENTRY(T) { T, #T }
diff --git a/test_conformance/contractions/contractions.c b/test_conformance/contractions/contractions.c
index 164fde6c..64502edb 100644
--- a/test_conformance/contractions/contractions.c
+++ b/test_conformance/contractions/contractions.c
@@ -401,6 +401,8 @@ static void PrintArch( void )
     vlog( "\tARCH:\tx86_64\n" );
 #elif defined( __arm__ )
     vlog( "\tARCH:\tarm\n" );
+#elif defined( __aarch64__ )
+    vlog( "\tARCH:\taarch64\n" );
 #else
     vlog( "\tARCH:\tunknown\n" );
 #endif
diff --git a/test_conformance/conversions/basic_test_conversions.c b/test_conformance/conversions/basic_test_conversions.c
index 73f38633..bfd31b27 100644
--- a/test_conformance/conversions/basic_test_conversions.c
+++ b/test_conformance/conversions/basic_test_conversions.c
@@ -751,16 +751,11 @@ static void ulong2uint( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) ((
 static void ulong2int( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) ((cl_ulong*) in)[0]; }
 static void ulong2float( void *out, void *in)
 {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && defined(_M_X64)
     cl_ulong l = ((cl_ulong*) in)[0];
     float result;
-
     cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1)) : (cl_long)l;
-#if defined(_M_X64)
     _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), sl));
-#else
-    result = sl;
-#endif
     ((float*) out)[0] = (l == 0 ? 0.0f : (((cl_long)l < 0) ? result * 2.0f : result));
 #else
     cl_ulong l = ((cl_ulong*) in)[0];
diff --git a/test_conformance/conversions/test_conversions.c b/test_conformance/conversions/test_conversions.c
index fc6f6315..62c709e0 100644
--- a/test_conformance/conversions/test_conversions.c
+++ b/test_conformance/conversions/test_conversions.c
@@ -98,6 +98,7 @@ cl_mem          gOutBuffers[ kCallStyleCount ];
 size_t          gComputeDevices = 0;
 uint32_t        gDeviceFrequency = 0;
 int             gWimpyMode = 0;
+int             gWimpyReductionFactor = 128;
 int             gSkipTesting = 0;
 int             gForceFTZ = 0;
 int             gMultithread = 1;
@@ -418,6 +419,27 @@ static int ParseArgs( int argc, const char **argv )
                     case 'w':
                         gWimpyMode ^= 1;
                         break;
+                    case '[':
+                        // wimpy reduction factor can be set with the option -[2^n]
+                        // Default factor is 128, and n practically can be from 1 to 12
+                        {
+                            const char *arg_temp = strchr(&arg[1], ']');
+                            if (arg_temp != 0)
+                            {
+                                int new_factor = atoi(&arg[1]);
+                                arg = arg_temp; // Advance until ']'
+                                if (new_factor && !(new_factor & (new_factor - 1)))
+                                {
+                                    vlog(" WimpyReduction factor changed from %d to %d \n", gWimpyReductionFactor, new_factor);
+                                    gWimpyReductionFactor = new_factor;
+                                }
+                                else
+                                {
+                                    vlog(" Error in WimpyReduction factor, must be power of 2 \n");
+                                }
+                            }
+                        }
+                        break;
                     case 'z':
                         gForceFTZ ^= 1;
                         break;
@@ -520,6 +542,7 @@ static int ParseArgs( int argc, const char **argv )
         vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
         vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
         vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" );
+        vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor);
     }
 
     return 0;
@@ -546,6 +569,7 @@ static void PrintUsage( void )
     vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" );
     vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" );
     vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" );
+    vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
     vlog( "\t\t-z\tToggle flush to zero mode  (Default: per device)\n" );
     vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" );
     vlog( "\n" );
@@ -1245,15 +1269,12 @@ static int DoTest( Type outType, Type inType, SaturationMode sat, RoundingMode r
     if ( !gWimpyMode && gIsEmbedded )
       step = blockCount * EMBEDDED_REDUCTION_FACTOR;
 
+    if ( gWimpyMode )
+        step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
     vlog( "Testing... " );
     fflush(stdout);
     for( i = 0; i < (uint64_t)lastCase; i += step )
     {
-        if (gWimpyMode) {
-            uint64_t blockIndex = (i / blockCount) & 0xFF;
-            if (blockIndex != 0 && blockIndex != 0xFF)
-                continue;
-        }
 
         if( 0 == ( i & ((lastCase >> 3) -1))) {
             vlog(".");
diff --git a/test_conformance/d3d10/harness.cpp b/test_conformance/d3d10/harness.cpp
index e93cee94..016d9e72 100644
--- a/test_conformance/d3d10/harness.cpp
+++ b/test_conformance/d3d10/harness.cpp
@@ -200,7 +200,7 @@ cl_int HarnessD3D10_CreateDevice(IDXGIAdapter* pAdapter, ID3D10Device **ppDevice
         pAdapter,
         D3D10_DRIVER_TYPE_HARDWARE,
         NULL,
-        D3D10_CREATE_DEVICE_DEBUG,
+        0,
         D3D10_SDK_VERSION,
         &sd,
         &HarnessD3D10_pSwapChain,
diff --git a/test_conformance/device_partition/Jamfile b/test_conformance/device_partition/Jamfile
index 0e1d4d6a..3bf4b29f 100644
--- a/test_conformance/device_partition/Jamfile
+++ b/test_conformance/device_partition/Jamfile
@@ -1,32 +1,32 @@
-project
-    : requirements
-      <toolset>gcc:<cflags>-xc++
-      <toolset>msvc:<cflags>"/TP"
-    ;
-
-exe test_device_partition
-    : main.c
-      test_device_partition.cpp
-      ../../test_common/harness/errorHelpers.c
-      ../../test_common/harness/threadTesting.c
-      ../../test_common/harness/testHarness.c
-      ../../test_common/harness/kernelHelpers.c
-      ../../test_common/harness/genericThread.cpp
-      ../../test_common/harness/mt19937.c
-      ../../test_common/harness/conversions.c
-      ../../test_common/harness/typeWrappers.cpp
-    : <target-os>windows:<source>../../test_common/harness/msvc9.c
-    ;
-
-install dist
-    : test_device_partition
-    : <variant>debug:<location>$(DIST)/debug/tests/conformance/1.2/x86/device_partition
-      <variant>release:<location>$(DIST)/release/tests/conformance/1.2/x86/device_partition
-    ;
-
-install dist
-    : test_device_partition
-    : <variant>debug:<location>$(DIST)/debug/tests/conformance/1.2/x86_64/device_partition
-      <variant>release:<location>$(DIST)/release/tests/conformance/1.2/x86_64/device_partition
-      <address-model>64
-    ;
+project
+    : requirements
+      <toolset>gcc:<cflags>-xc++
+      <toolset>msvc:<cflags>"/TP"
+    ;
+
+exe test_device_partition
+    : main.c
+      test_device_partition.cpp
+      ../../test_common/harness/errorHelpers.c
+      ../../test_common/harness/threadTesting.c
+      ../../test_common/harness/testHarness.c
+      ../../test_common/harness/kernelHelpers.c
+      ../../test_common/harness/genericThread.cpp
+      ../../test_common/harness/mt19937.c
+      ../../test_common/harness/conversions.c
+      ../../test_common/harness/typeWrappers.cpp
+    : <target-os>windows:<source>../../test_common/harness/msvc9.c
+    ;
+
+install dist
+    : test_device_partition
+    : <variant>debug:<location>$(DIST)/debug/tests/conformance/1.2/x86/device_partition
+      <variant>release:<location>$(DIST)/release/tests/conformance/1.2/x86/device_partition
+    ;
+
+install dist
+    : test_device_partition
+    : <variant>debug:<location>$(DIST)/debug/tests/conformance/1.2/x86_64/device_partition
+      <variant>release:<location>$(DIST)/release/tests/conformance/1.2/x86_64/device_partition
+      <address-model>64
+    ;
diff --git a/test_conformance/device_timer/main.c b/test_conformance/device_timer/main.c
index c3cb2611..7ddeadad 100644
--- a/test_conformance/device_timer/main.c
+++ b/test_conformance/device_timer/main.c
@@ -32,8 +32,7 @@ basefn basefn_list[] = {
 
 const char *basefn_names[] = {
     "test_timer_resolution_queries",
-    "test_device_and_host_timers",
-    "all"
+    "test_device_and_host_timers"
 };
 
 size_t num_fns = sizeof(basefn_names)/sizeof(basefn_names[0]);
diff --git a/test_conformance/geometrics/test_geometrics_double.cpp b/test_conformance/geometrics/test_geometrics_double.cpp
index 2d258d5d..34bd1933 100644
--- a/test_conformance/geometrics/test_geometrics_double.cpp
+++ b/test_conformance/geometrics/test_geometrics_double.cpp
@@ -203,7 +203,7 @@ int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command
             return -1;
 
         /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
-        for( i = 0; i < TEST_SIZE * vecsize; i++ )
+        for( i = 0; i < size * vecsize; i++ )
         {
             inDataA[ i ] = get_random_double( -512.f, 512.f, d );
             inDataB[ i ] = get_random_double( -512.f, 512.f, d );
@@ -237,7 +237,7 @@ int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command
         }
 
         /* Run the kernel */
-        threads[0] = TEST_SIZE;
+        threads[0] = size;
 
         error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
         test_error( error, "Unable to get work group size to use" );
diff --git a/test_conformance/gl/test_image_methods.cpp b/test_conformance/gl/test_image_methods.cpp
index fa0c00a6..0d0e5c7e 100644
--- a/test_conformance/gl/test_image_methods.cpp
+++ b/test_conformance/gl/test_image_methods.cpp
@@ -34,7 +34,8 @@ typedef struct image_kernel_data
   cl_int numSamples;
 };
 
-static const char *methodTestKernelPattern =
+static const char *methodTestKernelPattern = 
+"%s"
 "typedef struct {\n"
 "    int width;\n"
 "    int height;\n"
@@ -75,6 +76,8 @@ static const char *channelOrderConstLine =
 "   outData->expectedChannelOrder = CLK_%s;\n";
 static const char *numSamplesKernelLine =
 "   outData->numSamples = get_image_num_samples( input );\n";
+static const char *enableMSAAKernelLine =
+"#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable\n";
 
 static int verify(cl_int input, cl_int kernelOutput, const char * description)
 {
@@ -185,6 +188,7 @@ int test_image_format_methods( cl_device_id device, cl_context context, cl_comma
   bool doImageChannelOrder = false;
   bool doImageDim = false;
   bool doNumSamples = false;
+  bool doMSAA = false;
   switch(target) {
     case GL_TEXTURE_2D:
       imageType = "image2d_depth_t";
@@ -206,6 +210,7 @@ int test_image_format_methods( cl_device_id device, cl_context context, cl_comma
       break;
     case GL_TEXTURE_2D_MULTISAMPLE:
       doNumSamples = true;
+      doMSAA = true;
       if(format.formattype == GL_DEPTH_COMPONENT) {
         doImageWidth = true;
         imageType = "image2d_msaa_depth_t";
@@ -214,6 +219,7 @@ int test_image_format_methods( cl_device_id device, cl_context context, cl_comma
       }
       break;
     case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      doMSAA = true;
       if(format.formattype == GL_DEPTH_COMPONENT) {
         doImageWidth = true;
         imageType = "image2d_msaa_array_depth_t";
@@ -244,9 +250,11 @@ int test_image_format_methods( cl_device_id device, cl_context context, cl_comma
     }
   }
 
-    // Create a program to run against
-    sprintf( programSrc, methodTestKernelPattern,
-            imageType,
+	// Create a program to run against
+	sprintf(programSrc, 
+          methodTestKernelPattern, 
+          ( doMSAA ) ? enableMSAAKernelLine : "",
+	        imageType,
           ( doArraySize ) ? arraySizeKernelLine : "",
           ( doImageWidth ) ? imageWidthKernelLine : "",
           ( doImageHeight ) ? imageHeightKernelLine : "",
diff --git a/test_conformance/gl/test_images_read_common.cpp b/test_conformance/gl/test_images_read_common.cpp
index 068b9a29..1d0529d5 100644
--- a/test_conformance/gl/test_images_read_common.cpp
+++ b/test_conformance/gl/test_images_read_common.cpp
@@ -107,6 +107,7 @@ static const char *kernelpattern_image_read_2darray_depth =
 "}\n";
 
 static const char *kernelpattern_image_multisample_read_2d =
+"#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable\n"
 "__kernel void sample_test( read_only image2d_msaa_t source, sampler_t sampler, __global %s4 *results )\n"
 "{\n"
 "    int  tidX = get_global_id(0);\n"
@@ -121,6 +122,7 @@ static const char *kernelpattern_image_multisample_read_2d =
 "}\n";
 
 static const char *kernelpattern_image_multisample_read_2d_depth =
+  "#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable\n"
   "__kernel void sample_test( read_only image2d_msaa_depth_t source, sampler_t sampler, __global %s *results )\n"
 "{\n"
 "    int  tidX = get_global_id(0);\n"
@@ -135,6 +137,7 @@ static const char *kernelpattern_image_multisample_read_2d_depth =
 "}\n";
 
 static const char *kernelpattern_image_multisample_read_2darray =
+"#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable\n"
 "__kernel void sample_test( read_only image2d_array_msaa_t source, sampler_t sampler, __global %s4 *results )\n"
 "{\n"
 "    int  tidX = get_global_id(0);\n"
@@ -151,6 +154,7 @@ static const char *kernelpattern_image_multisample_read_2darray =
 "}\n";
 
 static const char *kernelpattern_image_multisample_read_2darray_depth =
+  "#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable\n"
   "__kernel void sample_test( read_only image2d_array_msaa_depth_t source, sampler_t sampler, __global %s *results )\n"
 "{\n"
 "    int  tidX = get_global_id(0);\n"
diff --git a/test_conformance/gl/test_images_write_common.cpp b/test_conformance/gl/test_images_write_common.cpp
index 7b91f585..1345f46f 100644
--- a/test_conformance/gl/test_images_write_common.cpp
+++ b/test_conformance/gl/test_images_write_common.cpp
@@ -333,8 +333,8 @@ int test_cl_image_write( cl_context context, cl_command_queue queue,
     get_explicit_type_name( *outType ), suffix, convert);
 
   programPtr = kernelSource;
-  if( create_single_kernel_helper( context, &program, &kernel, 1,
-    (const char **)&programPtr, "sample_test" ) )
+  if( create_single_kernel_helper_with_build_options( context, &program, &kernel, 1,
+    (const char **)&programPtr, "sample_test", "-cl-std=CL2.0" ) )
   {
       return -1;
   }
diff --git a/test_conformance/gles/main.cpp b/test_conformance/gles/main.cpp
index e7aeb938..92d71238 100644
--- a/test_conformance/gles/main.cpp
+++ b/test_conformance/gles/main.cpp
@@ -386,17 +386,17 @@ int main(int argc, const char *argv[])
 // Intentional falling through
 cleanup:
 
-    // Cleanup EGL
-    glEnv->terminate_egl_display();
-
     // Always make sure that OpenCL context is released properly when the test exit
     if(sCurrentContext)
     {
         clReleaseContext( sCurrentContext );
         sCurrentContext = NULL;
     }
+
+    // Cleanup EGL
+    glEnv->terminate_egl_display();
+
     delete glEnv;
 
-
     return error;
-}
\ No newline at end of file
+}
diff --git a/test_conformance/half/Test_roundTrip.c b/test_conformance/half/Test_roundTrip.c
index 18eb5f9f..0d257b60 100644
--- a/test_conformance/half/Test_roundTrip.c
+++ b/test_conformance/half/Test_roundTrip.c
@@ -162,7 +162,7 @@ int Test_roundTrip( void )
 
     // Figure out how many elements are in a work block
     size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
-    size_t blockCount = (size_t)getBufferSize(gDevice) / elementSize; //elementSize is a power of two
+    size_t blockCount = (size_t)gBufferSize / elementSize; //elementSize is a power of two
     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
     size_t stride = blockCount;
 
diff --git a/test_conformance/half/Test_vLoadHalf.c b/test_conformance/half/Test_vLoadHalf.c
index 3ca8a203..d067fcdc 100644
--- a/test_conformance/half/Test_vLoadHalf.c
+++ b/test_conformance/half/Test_vLoadHalf.c
@@ -453,7 +453,7 @@ int Test_vLoadHalf_private( bool aligned )
 
     // Figure out how many elements are in a work block
     size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float));
-    size_t blockCount = getBufferSize(gDevice) / elementSize; // elementSize is power of 2
+    size_t blockCount = gBufferSize / elementSize; // elementSize is power of 2
     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
 
     // we handle 64-bit types a bit differently.
@@ -503,7 +503,7 @@ int Test_vLoadHalf_private( bool aligned )
                  continue;
                  }
                  */
-                memset_pattern4( gOut_single, &pattern, getBufferSize(gDevice));
+                memset_pattern4( gOut_single, &pattern, gBufferSize);
                 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
                 {
                     vlog_error( "Failure in clWriteArray\n" );
diff --git a/test_conformance/half/Test_vStoreHalf.c b/test_conformance/half/Test_vStoreHalf.c
index a3117bcb..4b9193d4 100644
--- a/test_conformance/half/Test_vStoreHalf.c
+++ b/test_conformance/half/Test_vStoreHalf.c
@@ -1044,7 +1044,7 @@ int Test_vStoreHalf_private( f2h referenceFunc, d2h doubleReferenceFunc, const c
     size_t stride = blockCount;
 
     if (gWimpyMode)
-        stride = 0x10000000U;
+        stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
 
     // we handle 64-bit types a bit differently.
     if( lastCase == 0 )
@@ -1654,7 +1654,7 @@ int Test_vStoreaHalf_private( f2h referenceFunc, d2h doubleReferenceFunc, const
     size_t stride = blockCount;
 
     if (gWimpyMode)
-        stride = 0x10000000U;
+        stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
 
     // we handle 64-bit types a bit differently.
     if( lastCase == 0 )
diff --git a/test_conformance/half/cl_utils.c b/test_conformance/half/cl_utils.c
index 939cddf3..74408678 100644
--- a/test_conformance/half/cl_utils.c
+++ b/test_conformance/half/cl_utils.c
@@ -61,9 +61,11 @@ size_t          gWorkGroupSize = 0;
 int             gTestCount = 0;
 int             gFailCount = 0;
 bool            gWimpyMode = false;
+int             gWimpyReductionFactor = 512;
 int             gTestDouble = 0;
 uint32_t        gDeviceIndex = 0;
 int             gIsEmbedded = 0;
+size_t          gBufferSize = 0;
 
 #if defined( __APPLE__ )
 int             gReportTimes = 1;
@@ -184,17 +186,19 @@ int InitCL( void )
 #if defined( __APPLE__ )
     // FIXME: use clProtectedArray
 #endif
+    gBufferSize = getBufferSize(gDevice);
+
     //Allocate buffers
-    gIn_half   = malloc( getBufferSize(gDevice)/2  );
+    gIn_half   = malloc( gBufferSize/2  );
     gOut_half = malloc( BUFFER_SIZE/2  );
     gOut_half_reference = malloc( BUFFER_SIZE/2  );
     gOut_half_reference_double = malloc( BUFFER_SIZE/2  );
     gIn_single   = malloc( BUFFER_SIZE );
-    gOut_single = malloc( getBufferSize(gDevice)  );
-    gOut_single_reference = malloc( getBufferSize(gDevice)  );
+    gOut_single = malloc( gBufferSize  );
+    gOut_single_reference = malloc( gBufferSize  );
     gIn_double   = malloc( 2*BUFFER_SIZE  );
-    // gOut_double = malloc( (2*getBufferSize(gDevice))  );
-    // gOut_double_reference = malloc( (2*getBufferSize(gDevice))  );
+    // gOut_double = malloc( (2*gBufferSize)  );
+    // gOut_double_reference = malloc( (2*gBufferSize)  );
 
     if ( NULL == gIn_half ||
      NULL == gOut_half ||
@@ -207,7 +211,7 @@ int InitCL( void )
          )
         return -3;
 
-    gInBuffer_half = clCreateBuffer(gContext, CL_MEM_READ_ONLY, getBufferSize(gDevice) / 2, NULL, &error);
+    gInBuffer_half = clCreateBuffer(gContext, CL_MEM_READ_ONLY, gBufferSize / 2, NULL, &error);
     if( gInBuffer_half == NULL )
     {
         vlog_error( "clCreateArray failed for input (%d)\n", error );
@@ -235,7 +239,7 @@ int InitCL( void )
         return -5;
     }
 
-    gOutBuffer_single = clCreateBuffer(gContext, CL_MEM_WRITE_ONLY, getBufferSize(gDevice), NULL, &error );
+    gOutBuffer_single = clCreateBuffer(gContext, CL_MEM_WRITE_ONLY, gBufferSize, NULL, &error );
     if( gOutBuffer_single == NULL )
     {
         vlog_error( "clCreateArray failed for output (%d)\n", error );
@@ -243,7 +247,7 @@ int InitCL( void )
     }
 
 #if 0
-    gOutBuffer_double = clCreateBuffer(gContext, CL_MEM_WRITE_ONLY, (size_t)(2*getBufferSize(gDevice)), NULL, &error );
+    gOutBuffer_double = clCreateBuffer(gContext, CL_MEM_WRITE_ONLY, (size_t)(2*gBufferSize), NULL, &error );
     if( gOutBuffer_double == NULL )
     {
         vlog_error( "clCreateArray failed for output (%d)\n", error );
@@ -317,6 +321,15 @@ void ReleaseCL(void)
     // clReleaseMemObject(gOutBuffer_double);
     clReleaseCommandQueue(gQueue);
     clReleaseContext(gContext);
+
+    free(gIn_half);
+    free(gOut_half);
+    free(gOut_half_reference);
+    free(gOut_half_reference_double);
+    free(gIn_single);
+    free(gOut_single);
+    free(gOut_single_reference);
+    free(gIn_double);
 }
 
 cl_uint numVecs(cl_uint count, int vectorSizeIdx, bool aligned) {
@@ -453,19 +466,30 @@ size_t getBufferSize(cl_device_id device_id)
 
     if(s_initialized == 0 || s_device_id != device_id)
     {
-        cl_ulong result;
+        cl_ulong result, maxGlobalSize;
         cl_int err = clGetDeviceInfo (device_id,
                                       CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                                       sizeof(result), (void *)&result,
                                       NULL);
         if(err)
         {
-            vlog_error("clGetDeviceInfo() failed\n");
+            vlog_error("clGetDeviceInfo(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed\n");
             s_result = 64*1024;
             goto exit;
         }
-        result = result / 2;
         log_info("Const buffer size is %llx (%llu)\n", result, result);
+        err = clGetDeviceInfo (device_id,
+                               CL_DEVICE_GLOBAL_MEM_SIZE,
+                               sizeof(maxGlobalSize), (void *)&maxGlobalSize,
+                               NULL);
+        if(err)
+        {
+            vlog_error("clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_SIZE) failed\n");
+            goto exit;
+        }
+        result = result / 2;
+        if(maxGlobalSize < result * 10)
+            result = result / 10;
         s_initialized = 1;
         s_device_id = device_id;
         s_result = result;
diff --git a/test_conformance/half/cl_utils.h b/test_conformance/half/cl_utils.h
index 9a720efd..cba58072 100644
--- a/test_conformance/half/cl_utils.h
+++ b/test_conformance/half/cl_utils.h
@@ -73,11 +73,13 @@ extern int             gFailCount;
 extern int             gTestDouble;
 extern int             gReportTimes;
 extern int             gIsEmbedded;
+extern size_t          gBufferSize;
 
 // gWimpyMode indicates if we run the test in wimpy mode where we limit the
 // size of 32 bit ranges to a much smaller set.  This is meant to be used
 // as a smoke test
 extern bool            gWimpyMode;
+extern int             gWimpyReductionFactor;
 
 uint64_t ReadTime( void );
 double SubtractTime( uint64_t endTime, uint64_t startTime );
diff --git a/test_conformance/half/main.c b/test_conformance/half/main.c
index fe3b6f17..718153b1 100644
--- a/test_conformance/half/main.c
+++ b/test_conformance/half/main.c
@@ -236,7 +236,27 @@ static int ParseArgs( int argc, const char **argv )
                     case 'w':  // Wimpy mode
                         gWimpyMode = true;
                         break;
-
+                    case '[':
+                        // wimpy reduction factor can be set with the option -[2^n]
+                        // Default factor is 512, and n practically can be from 1 to 12
+                        {
+                            const char *arg_temp = strchr(&arg[1], ']');
+                            if (arg_temp != 0)
+                            {
+                                int new_factor = atoi(&arg[1]);
+                                arg = arg_temp; // Advance until ']'
+                                if (new_factor && !(new_factor & (new_factor - 1)))
+                                {
+                                    vlog(" WimpyReduction factor changed from %d to %d \n", gWimpyReductionFactor, new_factor);
+                                    gWimpyReductionFactor = new_factor;
+                                }
+                               else
+                                {
+                                   vlog(" Error in WimpyReduction factor, must be power of 2 \n");
+                                }
+                            }
+                        }
+                        break;
                     default:
                         vlog_error( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
                         PrintUsage();
@@ -278,6 +298,7 @@ static int ParseArgs( int argc, const char **argv )
         vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
         vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
         vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" );
+        vlog( "*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor);
     }
     return 0;
 }
@@ -288,6 +309,7 @@ static void PrintUsage( void )
     vlog( "\t\t-d\tToggle double precision testing (default: on if double supported)\n" );
     vlog( "\t\t-t\tToggle reporting performance data.\n" );
     vlog( "\t\t-w\tRun in wimpy mode\n" );
+    vlog( "\t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
     vlog( "\t\t-h\tHelp\n" );
     vlog( "\n" );
 }
@@ -307,6 +329,8 @@ static void PrintArch( void )
     vlog( "ARCH:\tx86_64\n" );
 #elif defined( __arm__ )
     vlog( "ARCH:\tarm\n" );
+#elif defined( __aarch64__ )
+    vlog( "\tARCH:\taarch64\n" );
 #else
 #error unknown arch
 #endif
diff --git a/test_conformance/headers/CMakeLists.txt b/test_conformance/headers/CMakeLists.txt
index eff8e286..4e7a1d86 100644
--- a/test_conformance/headers/CMakeLists.txt
+++ b/test_conformance/headers/CMakeLists.txt
@@ -19,6 +19,8 @@ set(CL_H_SOURCES
     test_cl.h.c
 )
 
+set_source_files_properties(${CL_H_SOURCES} PROPERTIES LANGUAGE CXX)
+
 set(CL_H_OUT ${CONFORMANCE_PREFIX}cl_h${CONFORMANCE_SUFFIX})
 
 add_executable(
@@ -36,6 +38,8 @@ set(CL_PLATFORM_H_SOURCES
     test_cl_platform.h.c
 )
 
+set_source_files_properties(${CL_PLATFORM_H_SOURCES} PROPERTIES LANGUAGE CXX)
+
 set(CL_PLATFORM_H_OUT ${CONFORMANCE_PREFIX}cl_platform_h${CONFORMANCE_SUFFIX})
 
 add_executable(
@@ -53,6 +57,8 @@ set(CL_GL_H_SOURCES
     test_cl_gl.h.c
 )
 
+set_source_files_properties(${CL_GL_H_SOURCES} PROPERTIES LANGUAGE CXX)
+
 set(CL_GL_H_OUT ${CONFORMANCE_PREFIX}cl_gl_h${CONFORMANCE_SUFFIX})
 
 add_executable(
@@ -70,6 +76,8 @@ set(OPENCL_H_SOURCES
     test_opencl.h.c
 )
 
+set_source_files_properties(${OPENCL_H_SOURCES} PROPERTIES LANGUAGE CXX)
+
 set(OPENCL_H_OUT ${CONFORMANCE_PREFIX}opencl_h${CONFORMANCE_SUFFIX})
 
 add_executable(
@@ -83,5 +91,4 @@ TARGET_LINK_LIBRARIES(${OPENCL_H_OUT} ${CLConform_LIBRARIES})
 
 ########################################################################################
 
-
 # end of file #
diff --git a/test_conformance/math_brute_force/binary.c b/test_conformance/math_brute_force/binary.c
index 5800aaf5..042076ec 100644
--- a/test_conformance/math_brute_force/binary.c
+++ b/test_conformance/math_brute_force/binary.c
@@ -233,6 +233,7 @@ typedef struct TestInfo
     cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
     ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
     cl_uint     threadCount;                        // Number of worker threads
+    cl_uint     jobCount;                           // Number of jobs
     cl_uint     step;                               // step between each chunk and the next.
     cl_uint     scale;                              // stride between individual test values
     float       ulps;                               // max_allowed ulps
@@ -268,6 +269,16 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter)
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
     test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
@@ -341,7 +352,7 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter)
     // Run the kernels
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
 
         // Accumulate the arithmetic errors
         for( i = 0; i < test_info.threadCount; i++ )
@@ -991,6 +1002,16 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d, int isNextafte
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
@@ -1063,7 +1084,7 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d, int isNextafte
 
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
 
         // Accumulate the arithmetic errors
         for( i = 0; i < test_info.threadCount; i++ )
diff --git a/test_conformance/math_brute_force/binaryOperator.c b/test_conformance/math_brute_force/binaryOperator.c
index 718afd3d..0368aafc 100644
--- a/test_conformance/math_brute_force/binaryOperator.c
+++ b/test_conformance/math_brute_force/binaryOperator.c
@@ -207,6 +207,7 @@ typedef struct TestInfo
     cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
     ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
     cl_uint     threadCount;                        // Number of worker threads
+    cl_uint     jobCount;                           // Number of jobs
     cl_uint     step;                               // step between each chunk and the next.
     cl_uint     scale;                              // stride between individual test values
     float       ulps;                               // max_allowed ulps
@@ -260,6 +261,16 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d)
     }
 
     test_info.step = test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
     test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
@@ -329,7 +340,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d)
 
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
 
         // Accumulate the arithmetic errors
         for( i = 0; i < test_info.threadCount; i++ )
@@ -501,63 +512,51 @@ static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
     int totalSpecialValueCount = specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-    if( job_id <= (cl_uint)indx )
-    { // test edge cases
-        float *fp = (float *)p;
-        float *fp2 = (float *)p2;
+
+    if( job_id <= (cl_uint)indx ) {
+        // Insert special values
         uint32_t x, y;
 
-    x = (job_id * buffer_elements) % specialValuesFloatCount;
-    y = (job_id * buffer_elements) / specialValuesFloatCount;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
 
-        for( ; j < buffer_elements; j++ )
-        {
-            fp[j] = specialValuesFloat[x];
-            fp2[j] = specialValuesFloat[y];
-            if( ++x >= specialValuesFloatCount )
-            {
+        for( ; j < buffer_elements; j++ ) {
+            p[j] = ((cl_uint *)specialValuesFloat)[x];
+            p2[j] = ((cl_uint *)specialValuesFloat)[y];
+            ++x;
+            if (x >= specialValuesFloatCount) {
                 x = 0;
                 y++;
-                if( y >= specialValuesFloatCount )
+                if (y >= specialValuesFloatCount)
                     break;
             }
-            if(gTestFastRelaxed && strcmp(name,"divide") == 0 )
-            {
-              float fpj = *(float*)&fp[j];
-              float fpj2 = *(float*)&fp2[j];
-              if(fabs(fpj) > 0x5E800000 ) //[2^-62,2^62]
-              {
-                fp[j] = NAN;
-              }
-              if( fabs(fpj2) > 0x5E800000 ) //[2^-62,2^62]
-              {
-                fp2[j] = NAN;
-              }
+            if (gTestFastRelaxed && strcmp(name,"divide") == 0) {
+                cl_uint pj = p[j] & 0x7fffffff;
+                cl_uint p2j = p2[j] & 0x7fffffff;
+                // Replace values outside [2^-62, 2^62] with QNaN
+                if (pj < 0x20800000 || pj > 0x5e800000)
+                    p[j] = 0x7fc00000;
+                if (p2j < 0x20800000 || p2j > 0x5e800000)
+                    p2[j] = 0x7fc00000;
+            }
         }
     }
-    }
 
-    //Init any remaining values.
+    // Init any remaining values.
     for( ; j < buffer_elements; j++ )
     {
         p[j] = genrand_int32(d);
         p2[j] = genrand_int32(d);
 
-        if(gTestFastRelaxed)
-        {
-          if( strcmp(name,"divide")==0){
-            float pj = *(float*)&p[j];
-            float pj2 = *(float*)&p2[j];
-            if(fabs(pj) > 0x5E800000 ) //[2^-62,2^62]
-            {
-              p[j] = NAN;
-            }
-            if( fabs(pj2) > 0x5E800000 ) //[2^-62,2^62]
-            {
-              p2[j] = NAN;
-            }
-          }
-    }
+        if (gTestFastRelaxed && strcmp(name,"divide") == 0) {
+            cl_uint pj = p[j] & 0x7fffffff;
+            cl_uint p2j = p2[j] & 0x7fffffff;
+            // Replace values outside [2^-62, 2^62] with QNaN
+            if (pj < 0x20800000 || pj > 0x5e800000)
+                p[j] = 0x7fc00000;
+            if (p2j < 0x20800000 || p2j > 0x5e800000)
+                p2[j] = 0x7fc00000;
+        }
     }
 
     if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
@@ -950,6 +949,16 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d)
     }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
@@ -1020,7 +1029,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d)
 
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
 
         // Accumulate the arithmetic errors
         for( i = 0; i < test_info.threadCount; i++ )
diff --git a/test_conformance/math_brute_force/binary_i.c b/test_conformance/math_brute_force/binary_i.c
index b72d117f..5cea1bc6 100644
--- a/test_conformance/math_brute_force/binary_i.c
+++ b/test_conformance/math_brute_force/binary_i.c
@@ -230,6 +230,7 @@ typedef struct TestInfo
     cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
     ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
     cl_uint     threadCount;                        // Number of worker threads
+    cl_uint     jobCount;                           // Number of jobs
     cl_uint     step;                               // step between each chunk and the next.
     cl_uint     scale;                              // stride between individual test values
     float       ulps;                               // max_allowed ulps
@@ -262,6 +263,16 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d)
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
     test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
@@ -330,7 +341,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d)
     }
 
     // Run the kernels
-    error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+    error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
 
 
     // Accumulate the arithmetic errors
@@ -758,6 +769,16 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d)
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ulps = f->double_ulps;
     test_info.ftz = f->ftz || gForceFTZ;
@@ -831,7 +852,7 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d)
 
     // Run the kernels
     if( !gSkipCorrectnessTesting )
-        error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
 
 
     // Accumulate the arithmetic errors
diff --git a/test_conformance/math_brute_force/macro_binary.c b/test_conformance/math_brute_force/macro_binary.c
index e489c598..b65948ee 100644
--- a/test_conformance/math_brute_force/macro_binary.c
+++ b/test_conformance/math_brute_force/macro_binary.c
@@ -222,6 +222,7 @@ typedef struct TestInfo
     cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
     ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
     cl_uint     threadCount;                        // Number of worker threads
+    cl_uint     jobCount;                           // Number of jobs
     cl_uint     step;                               // step between each chunk and the next.
     cl_uint     scale;                              // stride between individual test values
     int         ftz;                                // non-zero if running in flush to zero mode
@@ -249,6 +250,16 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d)
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
 
@@ -319,7 +330,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d)
     // Run the kernels
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
 
         if( error )
             goto exit;
@@ -749,6 +760,16 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d)
     }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ;
 
@@ -820,7 +841,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d)
 
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
 
         if( error )
             goto exit;
diff --git a/test_conformance/math_brute_force/macro_unary.c b/test_conformance/math_brute_force/macro_unary.c
index 74e1dece..95af882a 100644
--- a/test_conformance/math_brute_force/macro_unary.c
+++ b/test_conformance/math_brute_force/macro_unary.c
@@ -193,6 +193,7 @@ typedef struct TestInfo
     cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
     ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
     cl_uint     threadCount;                        // Number of worker threads
+    cl_uint     jobCount;                           // Number of jobs
     cl_uint     step;                               // step between each chunk and the next.
     cl_uint     scale;                              // stride between individual test values
     int         ftz;                                // non-zero if running in flush to zero mode
@@ -220,6 +221,16 @@ int TestMacro_Int_Float(const Func *f, MTdata d)
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     // cl_kernels aren't thread safe, so we make one for each vector size for every thread
@@ -279,7 +290,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d)
 
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
 
         if( error )
             goto exit;
@@ -602,6 +613,16 @@ int TestMacro_Int_Double(const Func *f, MTdata d)
     }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ;
 
@@ -664,7 +685,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d)
 
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
 
         if( error )
             goto exit;
diff --git a/test_conformance/math_brute_force/main.c b/test_conformance/math_brute_force/main.c
index 147e3af2..1cb87b4f 100644
--- a/test_conformance/math_brute_force/main.c
+++ b/test_conformance/math_brute_force/main.c
@@ -699,6 +699,8 @@ static void PrintArch( void )
         vlog( "\tARCH:\tx86_64\n" );
     #elif defined( __arm__ )
         vlog( "\tARCH:\tarm\n" );
+    #elif defined( __aarch64__ )
+        vlog( "\tARCH:\taarch64\n" );
     #else
         vlog( "\tARCH:\tunknown\n" );
     #endif
diff --git a/test_conformance/math_brute_force/reference_math.c b/test_conformance/math_brute_force/reference_math.c
index 791375bc..6e8f79aa 100644
--- a/test_conformance/math_brute_force/reference_math.c
+++ b/test_conformance/math_brute_force/reference_math.c
@@ -1790,7 +1790,7 @@ static const double //two52 = 4.50359962737049600000e+15, /* 0x43300000, 0x00000
 //    *signgamp = 1;
     ix = hx&0x7fffffff;
     if(ix>=0x7ff00000) return x*x;
-    if((ix|lx)==0) return (double)(one)/(double)(zero);
+    if((ix|lx)==0) return INFINITY;
     if(ix<0x3b900000) {    /* |x|<2**-70, return -log(|x|) */
         if(hx<0) {
 //            *signgamp = -1;
@@ -1799,9 +1799,9 @@ static const double //two52 = 4.50359962737049600000e+15, /* 0x43300000, 0x00000
     }
     if(hx<0) {
         if(ix>=0x43300000)     /* |x|>=2**52, must be -integer */
-        return (double)(one)/(double)(zero);
+        return INFINITY;
         t = reference_sinpi(x);
-        if(t==zero) (double)(one)/(double)(zero); /* -integer */
+        if(t==zero) return INFINITY; /* -integer */
         nadj = reference_log(pi/reference_fabs(t*x));
 //        if(t<zero) *signgamp = -1;
         x = -x;
diff --git a/test_conformance/math_brute_force/run_math_brute_force_in_parallel.py b/test_conformance/math_brute_force/run_math_brute_force_in_parallel.py
old mode 100644
new mode 100755
diff --git a/test_conformance/math_brute_force/unary.c b/test_conformance/math_brute_force/unary.c
index d8b3dbae..05dc7764 100644
--- a/test_conformance/math_brute_force/unary.c
+++ b/test_conformance/math_brute_force/unary.c
@@ -200,6 +200,7 @@ typedef struct TestInfo
     cl_kernel   *k[VECTOR_SIZE_COUNT ];             // arrays of thread-specific kernels for each worker thread:  k[vector_size][thread_id]
     ThreadInfo  *tinfo;                             // An array of thread specific information for each worker thread
     cl_uint     threadCount;                        // Number of worker threads
+    cl_uint     jobCount;                           // Number of jobs
     cl_uint     step;                               // step between each chunk and the next.
     cl_uint     scale;                              // stride between individual test values
     float       ulps;                               // max_allowed ulps
@@ -234,6 +235,16 @@ int TestFunc_Float_Float(const Func *f, MTdata d)
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
     test_info.f = f;
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
     test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
@@ -309,7 +320,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d)
 
     if( !gSkipCorrectnessTesting || skipTestingRelaxed)
     {
-        error = ThreadPool_Do( TestFloat, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
 
         // Accumulate the arithmetic errors
         for( i = 0; i < test_info.threadCount; i++ )
@@ -997,7 +1008,16 @@ int TestFunc_Double_Double(const Func *f, MTdata d)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
-   test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        //there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
 
     test_info.f = f;
     test_info.ulps = f->double_ulps;
@@ -1062,7 +1082,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d)
 
     if( !gSkipCorrectnessTesting )
     {
-        error = ThreadPool_Do( TestDouble, (cl_uint) ((1ULL<<32) / test_info.step), &test_info );
+        error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
 
         // Accumulate the arithmetic errors
         for( i = 0; i < test_info.threadCount; i++ )
diff --git a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp
index 381d29bc..58478a1e 100644
--- a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp
+++ b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.cpp
@@ -605,6 +605,28 @@ void TestNonUniformWorkGroup::showTestInfo () {
   }
 }
 
+size_t TestNonUniformWorkGroup::adjustLocalArraySize (size_t localArraySize) {
+  // In case if localArraySize is too big, sometimes we can not run kernel because of lack
+  // of resources due to kernel itself requires some local memory to run
+  int err;
+
+  cl_ulong kernelLocalMemSize = 0;
+  err = clGetKernelWorkGroupInfo(_testKernel, _device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernelLocalMemSize), &kernelLocalMemSize, NULL);
+  test_error(err, "clGetKernelWorkGroupInfo failed");
+
+  cl_ulong deviceLocalMemSize = 0;
+  err = clGetDeviceInfo(_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(deviceLocalMemSize), &deviceLocalMemSize, NULL);
+  test_error(err, "clGetDeviceInfo failed");
+
+  if (kernelLocalMemSize + localArraySize > deviceLocalMemSize) {
+    size_t adjustedLocalArraySize = deviceLocalMemSize - kernelLocalMemSize;
+    log_info("localArraySize was adjusted from %lu to %lu\n", localArraySize, adjustedLocalArraySize);
+    localArraySize = adjustedLocalArraySize;
+  }
+
+  return localArraySize;
+}
+
 int TestNonUniformWorkGroup::runKernel () {
   int err;
 
@@ -622,7 +644,8 @@ int TestNonUniformWorkGroup::runKernel () {
   test_error(err, "clSetKernelArg failed");
 
   //creating local buffer
-  err = clSetKernelArg(_testKernel, 1, localArraySize*sizeof(unsigned int), NULL);
+  localArraySize = adjustLocalArraySize(localArraySize*sizeof(unsigned int));
+  err = clSetKernelArg(_testKernel, 1, localArraySize, NULL);
   test_error(err, "clSetKernelArg failed");
 
   clMemWrapper testGlobalArray = clCreateBuffer(_context, CL_MEM_READ_WRITE, _numOfGlobalWorkItems*sizeof(cl_uint), NULL, &err);
diff --git a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
index e7f261b5..a077296d 100644
--- a/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
+++ b/test_conformance/non_uniform_work_group/TestNonUniformWorkGroup.h
@@ -116,6 +116,7 @@ private:
   void verifyData (DataContainerAttrib * reference, DataContainerAttrib * results, short regionNumber);
   void calculateExpectedValues ();
   void showTestInfo ();
+  size_t adjustLocalArraySize(size_t localArraySize);
 };
 
 // Class responsible for running subtest scenarios in test function
diff --git a/test_conformance/opencl_conformance_tests_21_legacy_wimpy.csv b/test_conformance/opencl_conformance_tests_21_legacy_wimpy.csv
index ee7ab99e..2415d745 100644
--- a/test_conformance/opencl_conformance_tests_21_legacy_wimpy.csv
+++ b/test_conformance/opencl_conformance_tests_21_legacy_wimpy.csv
@@ -93,6 +93,12 @@ Pipes,pipes/test_pipes
 SVM,SVM/test_SVM
 Workgroups,workgroups/test_workgroups
 
+#####################################
+# OpenCL 2.1 tests
+#####################################
+Device timer,device_timer/test_device_timer
+SPIRV new,spirv_new/test_spirv_new -ILPath spirv_bin
+
 #########################################
 # Extensions
 #########################################
diff --git a/test_conformance/printf/test_printf.c b/test_conformance/printf/test_printf.c
index fcb6d69e..15e6d41d 100644
--- a/test_conformance/printf/test_printf.c
+++ b/test_conformance/printf/test_printf.c
@@ -17,6 +17,7 @@
 
 #include <string.h>
 #include <errno.h>
+#include <memory>
 
 #if ! defined( _WIN32)
 #if ! defined( __ANDROID__ )
@@ -304,9 +305,6 @@ static cl_program makePrintfProgram(cl_kernel *kernel_ptr, const cl_context cont
 //-----------------------------------------
 static bool isLongSupported(cl_device_id device_id)
 {
-    //profile type && device extention for long support checking
-    char *profileType = NULL,*devExt = NULL;
-
     size_t tempSize = 0;
     cl_int status;
     bool extSupport = true;
@@ -325,7 +323,7 @@ static bool isLongSupported(cl_device_id device_id)
         return false;
     }
 
-    profileType = new char[tempSize];
+    std::unique_ptr<char[]> profileType(new char[tempSize]);
     if(profileType == NULL)
     {
         log_error("Failed to allocate memory(profileType)");
@@ -336,11 +334,11 @@ static bool isLongSupported(cl_device_id device_id)
         device_id,
         CL_DEVICE_PROFILE,
         sizeof(char) * tempSize,
-        profileType,
+        profileType.get(),
         NULL);
 
 
-    if(!strcmp("EMBEDDED_PROFILE",profileType))
+    if(!strcmp("EMBEDDED_PROFILE",profileType.get()))
     {
         // Device extention
         status = clGetDeviceInfo(
@@ -356,7 +354,7 @@ static bool isLongSupported(cl_device_id device_id)
             return false;
         }
 
-        devExt = new char[tempSize];
+        std::unique_ptr<char[]> devExt(new char[tempSize]);
         if(devExt == NULL)
         {
             log_error("Failed to allocate memory(devExt)");
@@ -367,13 +365,10 @@ static bool isLongSupported(cl_device_id device_id)
             device_id,
             CL_DEVICE_EXTENSIONS,
             sizeof(char) * tempSize,
-            devExt,
+            devExt.get(),
             NULL);
 
-        extSupport  = (strstr(devExt,"cles_khr_int64") != NULL);
-
-        delete devExt;
-        delete profileType;
+        extSupport  = (strstr(devExt.get(),"cles_khr_int64") != NULL);
     }
     return extSupport;
 }
@@ -460,10 +455,12 @@ static int doTest(cl_command_queue queue, cl_context context, const unsigned int
         }
     }
 
+    int fd = acquireOutputStream();
     globalWorkSize[0] = 1;
     cl_event ndrEvt;
     err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL,&ndrEvt);
     if (err != CL_SUCCESS) {
+        releaseOutputStream(fd);
         log_error("\n clEnqueueNDRangeKernel failed errcode:%d\n", err);
         ++s_test_fail;
         goto exit;
@@ -473,6 +470,7 @@ static int doTest(cl_command_queue queue, cl_context context, const unsigned int
     err = clFlush(queue);
     if(err != CL_SUCCESS)
     {
+        releaseOutputStream(fd);
         log_error("clFlush failed\n");
         goto exit;
     }
@@ -480,6 +478,8 @@ static int doTest(cl_command_queue queue, cl_context context, const unsigned int
     //is immidatly printed
     err = waitForEvent(&ndrEvt);
 
+    releaseOutputStream(fd);
+
     if(err != CL_SUCCESS)
     {
         log_error("waitforEvent failed\n");
@@ -553,6 +553,8 @@ static void printArch( void )
     log_info( "ARCH:\tx86_64\n" );
 #elif defined( __arm__ )
     log_info( "ARCH:\tarm\n" );
+#elif defined( __aarch64__ )
+    log_info( "ARCH:\taarch64\n" );
 #else
 #error unknown arch
 #endif
@@ -669,7 +671,6 @@ int main(int argc, const char* argv[]) {
 
 
     int err;
-    int fd = acquireOutputStream();
 
     // Get platform
     err = clGetPlatformIDs(1, &platform_id, NULL);
@@ -695,8 +696,6 @@ int main(int argc, const char* argv[]) {
     if((err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, config_size, &device_frequency, NULL )))
         device_frequency = 1;
 
-    releaseOutputStream(fd);
-
     log_info( "\nCompute Device info:\n" );
     log_info( "\tProcessing with %d devices\n", compute_devices );
     log_info( "\tDevice Frequency: %d MHz\n", device_frequency );
@@ -716,8 +715,6 @@ int main(int argc, const char* argv[]) {
 
     log_info( "Test binary built %s %s\n", __DATE__, __TIME__ );
 
-    fd = acquireOutputStream();
-
     cl_context context = clCreateContext(NULL, 1, &device_id, notify_callback, NULL, NULL);
     checkNull(context, "clCreateContext");
 
@@ -727,17 +724,13 @@ int main(int argc, const char* argv[]) {
     // Forall types
     for (int testId = 0; testId < TYPE_COUNT; ++testId) {
         if (test_filter_num && (testId != test_filter_num)) {
-            releaseOutputStream(fd);
             log_info("\n*** Skipping printf  for %s ***\n",strType[testId]);
-            fd = acquireOutputStream();
         }
         else {
-            releaseOutputStream(fd);
             log_info("\n*** Testing printf for %s ***\n",strType[testId]);
-            fd = acquireOutputStream();
+
             //For all formats
             for(unsigned int testNum = 0;testNum < allTestCase[testId]->_testNum;++testNum){
-                releaseOutputStream(fd);
                 if(allTestCase[testId]->_type == TYPE_VECTOR)
                     log_info("%d)testing printf(\"%sv%s%s\",%s)\n",testNum,allTestCase[testId]->_genParameters[testNum].vectorFormatFlag,allTestCase[testId]->_genParameters[testNum].vectorSize,
                     allTestCase[testId]->_genParameters[testNum].vectorFormatSpecifier,allTestCase[testId]->_genParameters[testNum].dataRepresentation);
@@ -752,7 +745,6 @@ int main(int argc, const char* argv[]) {
                 }
                 else
                     log_info("%d)testing printf(\"%s\",%s)\n",testNum,allTestCase[testId]->_genParameters[testNum].genericFormat,allTestCase[testId]->_genParameters[testNum].dataRepresentation);
-                fd = acquireOutputStream();
 
                 // Long support for varible type
                 if(allTestCase[testId]->_type == TYPE_VECTOR && !strcmp(allTestCase[testId]->_genParameters[testNum].dataType,"long") && !isLongSupported(device_id))
@@ -766,15 +758,11 @@ int main(int argc, const char* argv[]) {
                 // Perform the test
                 if (doTest(queue, context,testId,testNum,device_id,isLongSupport) != 0)
                 {
-                    releaseOutputStream(fd);
                     log_error("*** FAILED ***\n\n");
-                    fd = acquireOutputStream();
                 }
                 else
                 {
-                    releaseOutputStream(fd);
                     log_info("Passed\n");
-                    fd = acquireOutputStream();
                 }
             }
         }
@@ -790,8 +778,6 @@ int main(int argc, const char* argv[]) {
     if(clReleaseContext(context)!= CL_SUCCESS)
         log_error("clReleaseContext\n");
 
-    releaseOutputStream(fd);
-
 
     if (s_test_fail == 0) {
         if (s_test_cnt > 1)
diff --git a/test_conformance/printf/util_printf.c b/test_conformance/printf/util_printf.c
index 5bd9be1a..842a0f74 100644
--- a/test_conformance/printf/util_printf.c
+++ b/test_conformance/printf/util_printf.c
@@ -212,8 +212,6 @@ struct printDataGenParameters printFloatGenParameters[] = {
 
     {"%+#21.15E","789456123.0"},
 
-#if ! defined( __ANDROID__ )
-
     //Double argument representing floating-point,in [-]xh.hhhhpAd style
 
     {"%.6a","0.1"},
@@ -222,8 +220,6 @@ struct printDataGenParameters printFloatGenParameters[] = {
 
     {"%10.2a","9990.235"},
 
-#endif
-
     //Infinity (1.0/0.0)
 
     {"%f","1.0f/0.0f"},
@@ -275,14 +271,10 @@ const char* correctBufferFloat[] = {
 
     "+7.894561230000000E+8",
 
-#if ! defined( __ANDROID__ )
-
     "0x1.99999ap-4",
 
     "0x1.38p+13",
 
-#endif
-
     "inf",
 
     "-nan",
@@ -621,11 +613,6 @@ struct printDataGenParameters printStringGenParameters[] = {
     //%% specification
 
     {"%s","\"%%\""},
-
-    //null string
-
-    {"%s","(void*)0"}
-
 };
 
 //---------------------------------------------------------
@@ -641,9 +628,6 @@ const char * correctBufferString[] = {
     "f",
 
     "%%",
-
-    "(null)"
-
 };
 
 //---------------------------------------------------------
@@ -887,7 +871,7 @@ size_t verifyOutputBuffer(char *analysisBuffer,testCase* pTestCase,size_t testId
     if(!strcmp(pTestCase->_correctBuffer[testId],"inf"))
     return strcmp(analysisBuffer,"inf")&&strcmp(analysisBuffer,"infinity")&&strcmp(analysisBuffer,"1.#INF00")&&strcmp(analysisBuffer,"Inf");
     if(!strcmp(pTestCase->_correctBuffer[testId],"nan") || !strcmp(pTestCase->_correctBuffer[testId],"-nan")) {
-      return strcmp(analysisBuffer,"nan")&&strcmp(analysisBuffer,"-nan")&&strcmp(analysisBuffer,"1.#IND00")&&strcmp(analysisBuffer,"-1.#IND00")&&strcmp(analysisBuffer,"NaN")&&strcmp(analysisBuffer,"nan(ind)")&&strcmp(analysisBuffer,"nan(snan)");
+       return strcmp(analysisBuffer,"nan")&&strcmp(analysisBuffer,"-nan")&&strcmp(analysisBuffer,"1.#IND00")&&strcmp(analysisBuffer,"-1.#IND00")&&strcmp(analysisBuffer,"NaN")&&strcmp(analysisBuffer,"nan(ind)")&&strcmp(analysisBuffer,"nan(snan)")&&strcmp(analysisBuffer,"-nan(ind)");
     }
     return strcmp(analysisBuffer,pTestCase->_correctBuffer[testId]);
 }
diff --git a/test_conformance/run_conformance.py b/test_conformance/run_conformance.py
old mode 100644
new mode 100755
diff --git a/test_conformance/select/test_select.c b/test_conformance/select/test_select.c
index 5eec6a3d..4b9f5f8a 100644
--- a/test_conformance/select/test_select.c
+++ b/test_conformance/select/test_select.c
@@ -72,6 +72,7 @@ static int doTest(cl_command_queue queue, cl_context context,
 // range.  Otherwise, we test a subset of the range
 // [-min_short, min_short]
 static bool  s_wimpy_mode = false;
+static int s_wimpy_reduction_factor = 256;
 
 // Tests are broken into the major test which is based on the
 // src and cmp type and their corresponding vector types and
@@ -304,7 +305,7 @@ static int doTest(cl_command_queue queue, cl_context context, Type stype, Type c
 
     cl_ulong blocks = type_size[stype] * 0x100000000ULL / BUFFER_SIZE;
     size_t block_elements = BUFFER_SIZE / type_size[stype];
-    size_t step = s_wimpy_mode ? 256 : 1;
+    size_t step = s_wimpy_mode ? s_wimpy_reduction_factor : 1;
     cl_ulong cmp_stride = block_elements * step;
 
     // It is more efficient to create the tests all at once since we
@@ -471,6 +472,7 @@ static void printUsage( void )
     log_info("test_select:  [-cghw] [test_name|start_test_num] \n");
     log_info("  default is to run the full test on the default device\n");
     log_info("  -w run in wimpy mode (smoke test)\n");
+    log_info("  -[2^n] Set wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", s_wimpy_reduction_factor);
     log_info("  test_name will run only one test of that name\n");
     log_info("  start_test_num will start running from that num\n");
 }
@@ -491,6 +493,8 @@ static void printArch( void )
     log_info( "ARCH:\tx86_64\n" );
 #elif defined( __arm__ )
     log_info( "ARCH:\tarm\n" );
+#elif defined( __aarch64__ )
+    log_info( "ARCH:\taarch64\n" );
 #else
 #error unknown arch
 #endif
@@ -582,6 +586,27 @@ int main(int argc, const char* argv[]) {
                     case 'w':  // Wimpy mode
                         s_wimpy_mode = true;
                         break;
+                    case '[':
+                        // wimpy reduction factor can be set with the option -[2^n]
+                        // Default factor is 256, and n practically can be from 1 to 12
+                        {
+                            const char *arg_temp = strchr(&arg[1], ']');
+                            if (arg_temp != 0)
+                            {
+                                int new_factor = atoi(&arg[1]);
+                                arg = arg_temp; // Advance until ']'
+                                if (new_factor && !(new_factor & (new_factor - 1)))
+                                {
+                                    vlog(" WimpyReduction factor changed from %d to %d \n", s_wimpy_reduction_factor, new_factor);
+                                    s_wimpy_reduction_factor = new_factor;
+                                }
+                                else
+                                {
+                                vlog(" Error in WimpyReduction factor must be power of 2 \n");
+                                }
+                            }
+                        }
+                        break;
                     default:
                         log_error( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
                         printUsage();
@@ -659,6 +684,7 @@ int main(int argc, const char* argv[]) {
         log_info("*** WARNING: Testing in Wimpy mode!                     ***\n");
         log_info("*** Wimpy mode is not sufficient to verify correctness. ***\n");
         log_info("*** It gives warm fuzzy feelings and then nevers calls. ***\n\n");
+        log_info("*** Wimpy Reduction Factor: %-27u ***\n\n", s_wimpy_reduction_factor);
     }
 
     cl_context context = clCreateContext(NULL, 1, &device_id, notify_callback, NULL, NULL);
diff --git a/test_conformance/spir/basic.zip b/test_conformance/spir/basic.zip
index d60636fb..14c7a3f4 100644
Binary files a/test_conformance/spir/basic.zip and b/test_conformance/spir/basic.zip differ
diff --git a/test_conformance/spir/half.zip b/test_conformance/spir/half.zip
index f0d3e8ff..0b1deca1 100644
Binary files a/test_conformance/spir/half.zip and b/test_conformance/spir/half.zip differ
diff --git a/test_conformance/spir/images_kernel_read_write.zip b/test_conformance/spir/images_kernel_read_write.zip
index 65fa8728..dc701137 100644
Binary files a/test_conformance/spir/images_kernel_read_write.zip and b/test_conformance/spir/images_kernel_read_write.zip differ
diff --git a/test_conformance/spir/main.cpp b/test_conformance/spir/main.cpp
index b0fba6fc..95fcd8bf 100644
--- a/test_conformance/spir/main.cpp
+++ b/test_conformance/spir/main.cpp
@@ -840,7 +840,6 @@ bool test_basic (cl_device_id device, cl_uint size_t_width, const char *folder)
         "test_sizeof.sizeof_uintptr_t",
         "test_sizeof.sizeof_image2d_t",
         "test_sizeof.sizeof_image3d_t",
-        "test_sizeof.sizeof_sampler_t",
         "test_sizeof.sizeof_double",
         "test_sizeof.sizeof_double2",
         "test_sizeof.sizeof_double4",
diff --git a/test_conformance/spir/profiling.zip b/test_conformance/spir/profiling.zip
index 33ac3396..c792bbec 100644
Binary files a/test_conformance/spir/profiling.zip and b/test_conformance/spir/profiling.zip differ
diff --git a/test_conformance/spir/typeinfo.h b/test_conformance/spir/typeinfo.h
index ea1f6b66..a9dc7124 100644
--- a/test_conformance/spir/typeinfo.h
+++ b/test_conformance/spir/typeinfo.h
@@ -93,6 +93,42 @@ TYPE_HNDL("image3d_int",             false, 0,  1,                    0x0,
 TYPE_HNDL("image3d_uint",            false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage3d<CL_UNSIGNED_INT32>)
 TYPE_HNDL("image_kernel_data",       false, 0,  1,                    0x0,         0xffffffff,    KernelStructTypeArgGenerator<image_kernel_data>)             //image_kernel_data defines as 5 X int
 TYPE_HNDL("image_kernel_data*",      true,  0,  1,                    0x0,         0xffffffff,    KernelStructTypeArgGenerator<image_kernel_data>)             //image_kernel_data defines as 5 X int
+TYPE_HNDL("read_only_image1d_array_float",       false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dArray<CL_FLOAT>)
+TYPE_HNDL("read_only_image1d_array_int",         false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dArray<CL_SIGNED_INT32>)
+TYPE_HNDL("read_only_image1d_array_uint",        false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dArray<CL_UNSIGNED_INT32>)
+TYPE_HNDL("read_only_image1d_buffer_float",      false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dBuffer<CL_FLOAT>)
+TYPE_HNDL("read_only_image1d_buffer_int",        false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dBuffer<CL_SIGNED_INT32>)
+TYPE_HNDL("read_only_image1d_buffer_uint",       false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dBuffer<CL_UNSIGNED_INT32>)
+TYPE_HNDL("read_only_image1d_float",             false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1d<CL_FLOAT>)
+TYPE_HNDL("read_only_image1d_int",               false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1d<CL_SIGNED_INT32>)
+TYPE_HNDL("read_only_image1d_uint",              false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1d<CL_UNSIGNED_INT32>)
+TYPE_HNDL("read_only_image2d_array_float",       false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2dArray<CL_FLOAT>)
+TYPE_HNDL("read_only_image2d_array_int",         false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2dArray<CL_SIGNED_INT32>)
+TYPE_HNDL("read_only_image2d_array_uint",        false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2dArray<CL_UNSIGNED_INT32>)
+TYPE_HNDL("read_only_image2d_float",             false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2d<CL_FLOAT>)
+TYPE_HNDL("read_only_image2d_int",               false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2d<CL_SIGNED_INT32>)
+TYPE_HNDL("read_only_image2d_uint",              false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2d<CL_UNSIGNED_INT32>)
+TYPE_HNDL("read_only_image3d_float",             false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage3d<CL_FLOAT>)
+TYPE_HNDL("read_only_image3d_int",               false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage3d<CL_SIGNED_INT32>)
+TYPE_HNDL("read_only_image3d_uint",              false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage3d<CL_UNSIGNED_INT32>)
+TYPE_HNDL("write_only_image1d_array_float",       false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dArray<CL_FLOAT>)
+TYPE_HNDL("write_only_image1d_array_int",         false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dArray<CL_SIGNED_INT32>)
+TYPE_HNDL("write_only_image1d_array_uint",        false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dArray<CL_UNSIGNED_INT32>)
+TYPE_HNDL("write_only_image1d_buffer_float",      false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dBuffer<CL_FLOAT>)
+TYPE_HNDL("write_only_image1d_buffer_int",        false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dBuffer<CL_SIGNED_INT32>)
+TYPE_HNDL("write_only_image1d_buffer_uint",       false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1dBuffer<CL_UNSIGNED_INT32>)
+TYPE_HNDL("write_only_image1d_float",             false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1d<CL_FLOAT>)
+TYPE_HNDL("write_only_image1d_int",               false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1d<CL_SIGNED_INT32>)
+TYPE_HNDL("write_only_image1d_uint",              false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage1d<CL_UNSIGNED_INT32>)
+TYPE_HNDL("write_only_image2d_array_float",       false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2dArray<CL_FLOAT>)
+TYPE_HNDL("write_only_image2d_array_int",         false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2dArray<CL_SIGNED_INT32>)
+TYPE_HNDL("write_only_image2d_array_uint",        false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2dArray<CL_UNSIGNED_INT32>)
+TYPE_HNDL("write_only_image2d_float",             false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2d<CL_FLOAT>)
+TYPE_HNDL("write_only_image2d_int",               false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2d<CL_SIGNED_INT32>)
+TYPE_HNDL("write_only_image2d_uint",              false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage2d<CL_UNSIGNED_INT32>)
+TYPE_HNDL("write_only_image3d_float",             false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage3d<CL_FLOAT>)
+TYPE_HNDL("write_only_image3d_int",               false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage3d<CL_SIGNED_INT32>)
+TYPE_HNDL("write_only_image3d_uint",              false, 0,  1,                    0x0,               0x7f,    KernelArgGeneratorImage3d<CL_UNSIGNED_INT32>)
 TYPE_HNDL("int",                     false, 0,  1,                    0x0,         0x7fffffff,    KernelArgGeneratorT<cl_int>)
 TYPE_HNDL("int*",                    true,  0, 16,                    0x0,         0x7fffffff,    KernelArgGeneratorT<cl_int>)
 TYPE_HNDL("int16",                   false, 0, 16,                    0x0,         0x7fffffff,    KernelArgGeneratorT<cl_int>)
diff --git a/test_conformance/subgroups/main.cpp b/test_conformance/subgroups/main.cpp
index 5a289126..efb7fe8e 100644
--- a/test_conformance/subgroups/main.cpp
+++ b/test_conformance/subgroups/main.cpp
@@ -40,15 +40,15 @@ ct_assert((sizeof(basefn_names) / sizeof(basefn_names[0])) == (sizeof(basefn_lis
 
 static const int num_fns = sizeof(basefn_names) / sizeof(char *);
 
-static int
+static test_status
 checkSubGroupsExtension(cl_device_id device)
 {
     if (!is_extension_available(device, "cl_khr_subgroups")) {
-        log_info("Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
-        return CL_INVALID_DEVICE;
+        log_error("'cl_khr_subgroups' is a required extension, failing.\n");
+        return TEST_FAIL;
     }
 
-    return CL_SUCCESS;
+    return TEST_PASS;
 }
 
 int