diff --git a/test_conformance/math_brute_force/Utility.h b/test_conformance/math_brute_force/Utility.h
index 31256358..7be12874 100644
--- a/test_conformance/math_brute_force/Utility.h
+++ b/test_conformance/math_brute_force/Utility.h
@@ -31,6 +31,7 @@
 #include "harness/conversions.h"
 
 #define BUFFER_SIZE         (1024*1024*2)
+#define EMBEDDED_REDUCTION_FACTOR (64)
 
 #if defined( __GNUC__ )
     #define UNUSED  __attribute__ ((unused))
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index eb5007c0..2e61d86e 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -283,6 +283,11 @@ int TestFunc_Float_Float_Float_common(const Func *f, MTdata d, int isNextafter,
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -1021,6 +1026,11 @@ int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index 0742964d..26c7cc4c 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -274,6 +274,10 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
 
     test_info.step = test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
@@ -969,6 +973,10 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index 6ba0eb58..1f2e9be3 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -272,6 +272,11 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -786,6 +791,11 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index c5577b9e..5d794010 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -297,6 +297,10 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     if(gWimpyMode ){
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(float)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
     if( gIsEmbedded )
         float_ulps = f->float_embedded_ulps;
@@ -722,6 +726,10 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     if(gWimpyMode ){
        step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(double)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
 #if defined PARALLEL_REFERENCE
     cl_uint threadCount = GetThreadCount();
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index 379d8e35..50e1f1af 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -199,6 +199,10 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(cl_float)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
     // This test is not using ThreadPool so we need to disable FTZ here
     // for reference computations
@@ -420,6 +424,11 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(cl_double)) * EMBEDDED_REDUCTION_FACTOR;
+    }
+
     // This test is not using ThreadPool so we need to disable FTZ here
     // for reference computations
     FPU_mode_type oldMode;
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index b590f50a..a4e4f97b 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -259,6 +259,11 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -771,6 +776,10 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
          test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index 872007f1..773bf0b4 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -230,6 +230,11 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -629,6 +634,10 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
 
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index 0737afbc..314caaeb 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -213,6 +213,11 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(float)) * EMBEDDED_REDUCTION_FACTOR;
+    }
+
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
@@ -680,6 +685,11 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(double)) * EMBEDDED_REDUCTION_FACTOR;
+    }
+
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
                                    f->nameInCode, relaxedMode };
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index 2c4b503e..4ca38327 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -238,6 +238,10 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(float)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
     if( gIsEmbedded )
         float_ulps = f->float_embedded_ulps;
@@ -879,6 +883,10 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(double)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index 91bc92d9..4605e34f 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -246,6 +246,11 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
@@ -1033,6 +1038,11 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
         test_info.scale =  (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
     }
+    else if (gIsEmbedded)
+    {
+        test_info.scale *= EMBEDDED_REDUCTION_FACTOR;
+    }
+
     test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index a86277f1..7ea7d859 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -215,6 +215,10 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(float)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
     // Init the kernels
     BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
@@ -674,6 +678,10 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(double)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index 108be6a4..8d0ecda9 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -218,6 +218,11 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(float)) * EMBEDDED_REDUCTION_FACTOR;
+    }
+
     if( gIsEmbedded )
         float_ulps = f->float_embedded_ulps;
     else
@@ -521,6 +526,10 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(double)) * EMBEDDED_REDUCTION_FACTOR;
+    }
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 87fcae32..bb9b3995 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -207,6 +207,11 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(float)) * EMBEDDED_REDUCTION_FACTOR;
+    }
+
     if( gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
@@ -480,6 +485,11 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     {
         step = (1ULL<<32) * gWimpyReductionFactor / (512);
     }
+    else if (gIsEmbedded)
+    {
+        step = (BUFFER_SIZE / sizeof(double)) * EMBEDDED_REDUCTION_FACTOR;
+    }
+
     Force64BitFPUPrecision();
 
     // Init the kernels