From 6958f2ba24012ccd16b1a9820a1a999b94673382 Mon Sep 17 00:00:00 2001 From: Alexander Stoyakin Date: Fri, 8 Nov 2019 10:25:44 +0200 Subject: [PATCH 01/15] [WIP] Fix compilation after nd4j changes (#37) * Fix compilation. * Some tests fixed * Disable tests temporarily. * Restored test * Tests restored. * Test restored. --- nd4s/build.sbt | 2 +- nd4s/src/main/scala/org/nd4s/Implicits.scala | 2 +- .../org/nd4s/samediff/ConstructionTest.scala | 6 +++--- .../scala/org/nd4s/samediff/MathTest.scala | 14 ++++++------- .../org/nd4s/samediff/SameDiffTest.scala | 21 +++++++++++-------- 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/nd4s/build.sbt b/nd4s/build.sbt index 1fbac5ae6..d523b754e 100644 --- a/nd4s/build.sbt +++ b/nd4s/build.sbt @@ -61,7 +61,7 @@ lazy val commonSettings = Seq( lazy val publishNexus = Seq( publishTo := { - val nexus = "https://nexus.ci.skymind.io/" + val nexus = "https://packages.konduit.ai/" if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/maven-snapshots") else diff --git a/nd4s/src/main/scala/org/nd4s/Implicits.scala b/nd4s/src/main/scala/org/nd4s/Implicits.scala index c1fa63e11..35df5a0cb 100644 --- a/nd4s/src/main/scala/org/nd4s/Implicits.scala +++ b/nd4s/src/main/scala/org/nd4s/Implicits.scala @@ -80,7 +80,7 @@ object Implicits { class IntArray2INDArray(val underlying: Array[Int]) extends AnyVal { def mkNDArray(shape: Array[Int], ord: NDOrdering = NDOrdering(Nd4j.order()), offset: Int = 0): INDArray = { val strides = Nd4j.getStrides(shape, ord.value) - Nd4j.create(underlying, shape.map(_.toLong), strides.map(_.toLong), ord.value, DataType.INT) + Nd4j.create(underlying.map(_.toInt), shape.map(_.toLong), strides.map(_.toLong), ord.value, DataType.INT) } def toNDArray: INDArray = Nd4j.createFromArray(underlying: _*) diff --git a/nd4s/src/test/scala/org/nd4s/samediff/ConstructionTest.scala b/nd4s/src/test/scala/org/nd4s/samediff/ConstructionTest.scala index 95715ecb1..25e8f374f 100644 --- 
a/nd4s/src/test/scala/org/nd4s/samediff/ConstructionTest.scala +++ b/nd4s/src/test/scala/org/nd4s/samediff/ConstructionTest.scala @@ -170,9 +170,9 @@ class ConstructionTest extends FlatSpec with Matchers { sd.setTrainingConfig(conf) sd.fit(new SingletonMultiDataSetIterator(mds), 1) - w.eval.toDoubleVector.head shouldBe (0.0629 +- 0.0001) - w.eval.toDoubleVector.tail.head shouldBe (0.3128 +- 0.0001) - w.eval.toDoubleVector.tail.tail.head shouldBe (0.2503 +- 0.0001) + w.getArr.get(0) shouldBe (0.0629 +- 0.0001) + w.getArr.get(1) shouldBe (0.3128 +- 0.0001) + w.getArr.get(2) shouldBe (0.2503 +- 0.0001) //Console.println(w.eval) } } diff --git a/nd4s/src/test/scala/org/nd4s/samediff/MathTest.scala b/nd4s/src/test/scala/org/nd4s/samediff/MathTest.scala index dc41b31f6..5eec9f237 100644 --- a/nd4s/src/test/scala/org/nd4s/samediff/MathTest.scala +++ b/nd4s/src/test/scala/org/nd4s/samediff/MathTest.scala @@ -209,7 +209,7 @@ class MathTest extends FlatSpec with Matchers { val x = sd.bind(arr) val y = new SDVariableWrapper(x) - x.get(SDIndex.point(0)).getArr shouldBe y(0).getArr + x.get(SDIndex.point(0)).eval shouldBe y(0).eval } "SDVariable " should "be indexable in 2d" in { @@ -221,7 +221,7 @@ class MathTest extends FlatSpec with Matchers { x(0, ---).eval shouldBe x(SDIndex.point(0), SDIndex.all()).eval - val slice1 = x.get(SDIndex.interval(0, 2), SDIndex.all()).eval + val slice1 = x.get(SDIndex.interval(0L, 2L), SDIndex.all()).eval val slice2 = x(0 :: 2, ---).eval slice1 shouldBe slice2 } @@ -237,10 +237,10 @@ class MathTest extends FlatSpec with Matchers { x.get(SDIndex.point(0), SDIndex.point(0), SDIndex.all()).eval shouldBe x(0, 0, ---).eval x.get(SDIndex.point(0), SDIndex.point(0), SDIndex.point(0)).eval shouldBe x(0, 0, 0).eval - x.get(SDIndex.interval(0, 2), SDIndex.point(0), SDIndex.point(0)).eval shouldBe x(0 :: 2, 0, 0).eval - x.get(SDIndex.interval(0, 2), SDIndex.interval(0, 1), SDIndex.interval(0, 2)).eval shouldBe x(0 :: 2, - 0 :: 1, - 0 :: 2).eval - 
x.get(SDIndex.interval(0, 2), SDIndex.interval(0, 1), SDIndex.all()).eval shouldBe x(0 :: 2, 0 :: 1, ---).eval + x.get(SDIndex.interval(0L, 2L), SDIndex.point(0), SDIndex.point(0)).eval shouldBe x(0 :: 2, 0, 0).eval + x.get(SDIndex.interval(0L, 2L), SDIndex.interval(0L, 1L), SDIndex.interval(0L, 2L)).eval shouldBe x(0 :: 2, + 0 :: 1, + 0 :: 2).eval + x.get(SDIndex.interval(0L, 2L), SDIndex.interval(0L, 1L), SDIndex.all()).eval shouldBe x(0 :: 2, 0 :: 1, ---).eval } } diff --git a/nd4s/src/test/scala/org/nd4s/samediff/SameDiffTest.scala b/nd4s/src/test/scala/org/nd4s/samediff/SameDiffTest.scala index d0efee304..a12a8752e 100644 --- a/nd4s/src/test/scala/org/nd4s/samediff/SameDiffTest.scala +++ b/nd4s/src/test/scala/org/nd4s/samediff/SameDiffTest.scala @@ -60,11 +60,11 @@ class SameDiffTest extends FlatSpec with Matchers { sd.associateArrayWithVariable(inputArr, input) sd.associateArrayWithVariable(labelArr, label) - val result: INDArray = sd.execAndEndResult - assertEquals(1, result.length) + val result = sd.output(null: java.util.Map[String, org.nd4j.linalg.api.ndarray.INDArray], "loss") + assertEquals(1, result.values().size()) val emptyMap = new HashMap[String, INDArray]() - sd.execBackwards(emptyMap) + sd.output(emptyMap, "loss") } "SameDiff" should "run test dense layer forward pass" in { @@ -84,7 +84,7 @@ class SameDiffTest extends FlatSpec with Matchers { val expMmul = iInput.mmul(iWeights) val expZ = expMmul.addRowVector(iBias) val expOut = Transforms.sigmoid(expZ, true) - sd.exec(new HashMap[String, INDArray](), sd.outputs) + sd.output(new HashMap[String, INDArray](), "mmul", "out", "bias", "add") assertEquals(expMmul, mmul.getArr) assertEquals(expZ, z.getArr) assertEquals(expOut, out.getArr) @@ -109,15 +109,18 @@ class SameDiffTest extends FlatSpec with Matchers { .dataSetFeatureMapping("in", "in2") .skipBuilderValidation(true) .build - sd.setTrainingConfig(c) - sd.fit(new SingletonMultiDataSetIterator(new MultiDataSet(Array[INDArray](inArr, inArr2), 
null)), 1) - val out = tanh.eval + + val data = new HashMap[String, INDArray]() + data.put("in", Nd4j.randn(1, 3)) + data.put("in2", Nd4j.randn(3, 4)) in.convertToConstant - val out2 = tanh.eval + val out = sd.output(data, "tanh") + val out2 = sd.output(data, "tanh") assertEquals(out, out2) assertEquals(VariableType.CONSTANT, in.getVariableType) assertEquals(inArr, in.getArr) //Sanity check on fitting: - sd.fit(new SingletonMultiDataSetIterator(new MultiDataSet(Array[INDArray](inArr2), null)), 1) + sd.setTrainingConfig(c) + sd.fit(new SingletonMultiDataSetIterator(new MultiDataSet(Array[INDArray](inArr, inArr2), null)), 1) } } From cd961727bbd55e88d2dbca51b76dee98c826a759 Mon Sep 17 00:00:00 2001 From: raver119 Date: Mon, 11 Nov 2019 17:45:59 +0300 Subject: [PATCH 02/15] [WIP] perf tests (#40) * special maxpool test Signed-off-by: raver119 * special maxpool test Signed-off-by: raver119 --- .../layers_tests/PerformanceTests.cpp | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp diff --git a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp new file mode 100644 index 000000000..6ea2ba081 --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace nd4j; +using namespace nd4j::graph; + +class PerformanceTests : public testing::Test { +public: + int numIterations = 100; + + PerformanceTests() { + // + } +}; + +#ifdef RELEASE_BUILD + +TEST_F(PerformanceTests, test_maxpooling2d_1) { + std::vector valuesX; + auto x = NDArrayFactory::create('c', {32, 3, 224, 224}); + auto z = NDArrayFactory::create('c', {32, 3, 224, 224}); + x.linspace(1.0f); + Nd4jLong k = 5; + + Nd4jLong iArgs[] {k,k, 1,1, 0,0, 1,1, 1}; + Context ctx(1); + ctx.setInputArray(0, &x); + ctx.setOutputArray(0, &z); + ctx.setIArguments(iArgs, 9); + + nd4j::ops::maxpool2d op; + + for (int i = 0; i < numIterations; i++) { + auto timeStart = std::chrono::system_clock::now(); + + op.execute(&ctx); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesX.emplace_back(outerTime); + } + + std::sort(valuesX.begin(), valuesX.end()); + nd4j_printf("Execution time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]); +} + +#endif \ No newline at end of file From 0eda1e733e698055e401ed9832f6efbef2e7f3c4 Mon Sep 17 00:00:00 2001 From: Yurii Shyrma Date: Tue, 12 Nov 2019 10:58:48 +0200 Subject: [PATCH 03/15] Shyrma bnorm bp (#41) Batchnorm backprop mkldnn --- .../ops/declarable/generic/nn/batchnorm.cpp | 164 +++++++++++++----- .../declarable/platform/mkldnn/batchnorm.cpp | 130 +++++++++++--- .../layers_tests/DeclarableOpsTests9.cpp | 107 ++++++------ 3 files changed, 284 insertions(+), 117 deletions(-) diff 
--git a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp index 5641bab43..8b6bd24bc 100644 --- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. * * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -88,8 +89,27 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0); // formula: output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta + // auto v = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // auto m = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + helpers::batchnorm(input, mean, variance, gamma, beta, output, axes, epsilon); + // NDArray stdInv = *v + epsilon; + // stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + // stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + // if(applyScale) + // stdInv *= *gamma; + + // // empty array with same shape as input + // input->applyBroadcast(nd4j::broadcast::Subtract, axes, m, output); + // output->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); + + // if(applyOffset) + // output->applyBroadcast(nd4j::broadcast::Add, axes, beta); + + // delete v; + // delete m; + return Status::OK(); } @@ -113,10 +133,9 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { NDArray* input = INPUT_VARIABLE(0); NDArray* mean = INPUT_VARIABLE(1); NDArray* variance = INPUT_VARIABLE(2); - NDArray* dLdO = INPUT_VARIABLE(3); // next epsilon NDArray* gamma = nullptr; NDArray* beta = nullptr; - + NDArray* 
dLdO = INPUT_VARIABLE(block.width() - 1); // next epsilon NDArray* dLdI = OUTPUT_VARIABLE(0); NDArray* dLdM = OUTPUT_VARIABLE(1); @@ -129,11 +148,11 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { const float epsilon = T_ARG(0); if(applyScale) { - gamma = INPUT_VARIABLE(4); + gamma = INPUT_VARIABLE(3); dLdG = OUTPUT_VARIABLE(3); } if(applyOffset) { - beta = INPUT_VARIABLE(4 + (int)applyScale); + beta = INPUT_VARIABLE(3 + (int)applyScale); dLdB = OUTPUT_VARIABLE(3 + (int)applyScale); } @@ -172,67 +191,120 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str()); // types of all input arrays should be the same (except dLdO) - for(int i = 1; i < block.width() - 1; ++i) - if(i != 3) - REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); + for(int i = 1; i < block.width() - 2; ++i) + REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); // ***** calculations ***** // - // formula for forward step: output = gamma * ((input - mean) / sqrt(variance + epsilon)) + beta + // notations: + // f = g * (gamma * ((x - m) / (v + eps)^0.5) + beta) -> means dLdO * ff_output + // g = dLdO + // stdInv = 1 / (v + eps)^0.5 + // N - batch size (product of spatial dimensions) - // consider mean and variance as constants (since we get them as inputs and don't calculate them) - // dLdI = (dLdO * gamma) / (variance + epsilon)^0.5 - // dLdV = (-0.5 * gamma * (dLdO * (x - mean))_sum) / (variance + epsilon)^1.5 - // dLdM = - (dLdO_sum * gamma) / (variance + epsilon)^0.5 - // dLdG = (dLdO * (x - mean))_sum / (variance + epsilon)^0.5 - // 
dLdB = dLdO_sum + // derivatives: + // dLdI = dfdx + dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx) + + // dfdx = gamma*stdInv*g; + // dfdm = -gamma*stdInv*g_sum; + // dmdx = 1/N; + // dvdx = 2 * (x - m) / N + // dvdm = -2 * [(x - m)]_sum / N + // dfdv = -0.5 * [g*(x - m)]_sum * stdInv^3, drop gamma here for calc convenience + + // finally: + // dLdI = gamma * ( stdInv * (g - g_sum/N) + (2/N) * dfdv * (dvdm/2 + (x - m)) ) + + // dLdG = (g * (x - m))_sum * stdInv + // dLdB = g_sum + + // variance = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // mean = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); const auto excludedAxes = ShapeUtils::evalDimsToExclude(inRank, axes); - - NDArray temp1 = *variance + epsilon; - temp1.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) - auto temp2 = temp1.transform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 - if(applyScale) - temp2 *= *gamma; // gamma / (variance + epsilon)^0.5 - - NDArray temp3(input); // empty array with same shape as input - input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &temp3); // input - mean - temp3 *= *dLdO; // (input - mean) * dLdO - const bool keepUnitiesInShape = inRank == mean->rankOf(); - // dLdI - dLdO->applyBroadcast(nd4j::broadcast::Multiply, axes, &temp2, dLdI); + // inverse batch size 1/N + const float Ninv = 1.f * shape::tadLength(input->getShapeInfo(), axes.data(), axes.size()) / input->lengthOf(); - // dLdM - dLdO->reduceAlongDimension(reduce::Sum, dLdM, excludedAxes, keepUnitiesInShape); // dLdO sum over excluded axes + // input - mean + NDArray xMinusMean(input); // empty array with same shape as input + input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMinusMean); + + // stdInv + NDArray stdInv = *variance + epsilon; + stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + 
stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + + // dvdm (use dLdM as storage for dvdm) + xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, dLdM, excludedAxes, keepUnitiesInShape); + *dLdM *= -Ninv; + + // g_sum + auto gSum = dLdO->reduceAlongDims(nd4j::reduce::Sum, excludedAxes, keepUnitiesInShape); // dLdB if(applyOffset) - dLdB->assign(dLdM); + dLdB->assign(gSum); - // dLdM - // dLdM->applyPairwiseTransform(nd4j::pairwise::Multiply, temp2); - // dLdM->applyTransform(nd4j::transform::Neg); - *dLdM = 0; // put zeros so far + // stdInv * (g - g_sum/N) (use dLdI as storage for this expression) + gSum *= Ninv; + dLdO->applyBroadcast(nd4j::broadcast::Subtract, axes, &gSum, dLdI); + dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); - //dLdV - temp3.reduceAlongDimension(reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); // ((input - mean) * dLdO)_sum + // dLdV <- [g*(x - m)]_sum + (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); // dLdG - if(applyScale) { - dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, &temp2, dLdG); - // dLdV->assign(dLdG); - dLdG->applyPairwiseTransform(nd4j::pairwise::Divide, *gamma); - } - else - // dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, temp2); + *dLdV *= stdInv; + if(applyScale) + dLdG->assign(dLdV); - // dLdV - // dLdV->applyPairwiseTransform(nd4j::pairwise::Multiply, temp1); - // *dLdV *= -0.5; + // (2 / N) * dfdv (use dLdV as storage for dfdv) + *dLdV *= stdInv*stdInv; // dLdV*stdInv * stdInv^2 + *dLdV *= -Ninv; // -0.5f * (2 / N); + + // dfdv * (dvdm + (x - m)) (use xMinusMean as storage for this expression) + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, dLdM); + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); + + // dLdI + *dLdI += xMinusMean; + if(applyScale) + dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma); + + *dLdM = 0; // put zeros so far *dLdV = 0; // put zeros so far + // 
java code + // NDArray std = *variance + epsilon; + // std.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + // std.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + // NDArray xMu(input); + // input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMu); + // NDArray xHat(input); + // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, &std, &xHat); + // NDArray dxhat(input); + // dLdO->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma, &dxhat); + // NDArray temp = dxhat*xMu; + // temp.reduceAlongDimension(reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); + // *dLdV *= -0.5f * std*std*std; + // NDArray* dxmu1 = dxhat.reduceAlongDimension(reduce::Sum, excludedAxes, keepUnitiesInShape); + // *dxmu1 *= -std; + // NDArray* dxmu2 = xMu.reduceAlongDimension(reduce::Sum, excludedAxes, keepUnitiesInShape); + // *dxmu2 *= *dLdV * (-2.f/N); + // NDArray dLdmu = *dxmu1 + *dxmu2; + // dLdmu *= (1.f /N); + // *dLdV *= (2.f/N); + // dxhat.applyBroadcast(nd4j::broadcast::Multiply, axes, &std); + // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); + // dxhat += xMu; + // dxhat.applyBroadcast(nd4j::broadcast::Add, axes, &dLdmu, dLdI); + // delete dxmu1; + // delete dxmu2; + // xHat *= *dLdO; + // xHat.reduceAlongDimension(reduce::Sum, dLdG, excludedAxes, keepUnitiesInShape); + return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp index 13e1cfe11..27f836a0e 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -55,7 +56,7 @@ static void batchnormMKLDNN(const NDArray* x, const NDArray* mean, const NDArray mkldnn::memory::data_type type = mkldnn::memory::data_type::f32; // indicate whether gamma or/and beta are given - auto flags = mkldnn::normalization_flags::use_global_stats; + auto flags = mkldnn::normalization_flags::use_global_stats; // don't calculate the mean and variance for each mini-batch if (weights != nullptr) flags |= mkldnn::normalization_flags::use_scale_shift; @@ -182,7 +183,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const mkldnn::memory::data_type type = mkldnn::memory::data_type::f32; // indicate whether gamma or/and beta are given - auto flags = mkldnn::normalization_flags::use_global_stats; + auto flags = mkldnn::normalization_flags::use_global_stats; // don't calculate the mean and variance for each mini-batch if (weights != nullptr) flags |= mkldnn::normalization_flags::use_scale_shift; @@ -308,6 +309,70 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const stream.wait(); // shape::printArray(dLdI_mkl_mem.map_data(),8); + + // notations: + // f = g * (gamma * ((x - m) / (v + eps)^0.5) + beta) -> means dLdO * ff_output + // g = dLdO + // stdInv = 1 / (v + eps)^0.5 + // N - batch size (product of spatial dimensions) + + // formula for full derivative with respect to input (x) + // dLdI = dfdx + dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx) + + // !!! MKL CALCULATES ONLY FIRST TERM dfdx, SO WE SHOULD CALCULATE TERM (dfdm*dmdx + dfdv*(dvdm*dmdx + dvdx)) BY OURSELF !!! 
+ + // dfdm = -gamma*stdInv*g_sum; + // dmdx = 1/N; + // dvdx = 2 * (x - m) / N + // dvdm = -2 * [(x - m)]_sum / N + // dfdv = -0.5 * [g*(x - m)]_sum * stdInv^3, drop gamma here for calc convenience + + // finally: + // dLdI = dfdm / N + (2/N) * dfdv * (dvdm/2 + (x - m)) + // dLdI = gamma * ( stdInv * -g_sum/N + (2/N) * dfdv * (dvdm/2 + (x - m)) ) + + std::vector axes = {1}; + const auto excludedAxes = ShapeUtils::evalDimsToExclude(x->rankOf(), axes); + + // inversed batch size 1 / N + const auto Ninv = 1.f * mean->lengthOf() / x->lengthOf(); + + // x - mean + NDArray xMinusMean(x); // empty array with same shape as x + const_cast(x)->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMinusMean); + + // stdInv + NDArray stdInv = *variance + epsilon; + stdInv.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) + stdInv.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 + + // dfdm / N + auto dfdm = dLdO->reduceAlongDims(nd4j::reduce::Sum, excludedAxes); + dfdm *= stdInv; + dfdm *= -Ninv; + + // dvdm / 2 + NDArray dvdm(mean); // empty array with same shape as mean + xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, &dvdm, excludedAxes); + dvdm *= -Ninv; + + // (2/N)*dfdv + NDArray dfdv(variance); // empty array with same shape as variance + (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, &dfdv, excludedAxes); + dfdv *= stdInv*stdInv*stdInv; + dfdv *= -Ninv; + + // dvdm/2 + (x - m) + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, &dvdm); + // dfdv * (dvdm/2 + (x - m)) + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, &dfdv); + // add dfdm / N + xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, &dfdm); + // * gamma + auto gamma = (*weights)({0,1, 0,0}); + xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, &gamma); + + *dLdI += xMinusMean; } PLATFORM_IMPL(batchnorm) { @@ -371,10 +436,21 @@ PLATFORM_IMPL(batchnorm) { (*weights)({1,2, 0,0}).assign(0); } + if(axes[0] == inRank - 1 && inRank > 2) 
{ // if nhwc or ndhwc + std::vector permut = inRank == 4 ? std::vector({0,3,1,2}) : std::vector({0,4,1,2,3}); + input = new NDArray(input->permute(permut)); + output = new NDArray(output->permute(permut)); + } + batchnormMKLDNN(input, mean, variance, weights, epsilon, output); delete weights; + if(axes[0] == inRank - 1 && inRank > 2) { + delete input; + delete output; + } + return Status::OK(); } @@ -418,7 +494,7 @@ PLATFORM_CHECK(batchnorm) { const int inRank = input->rankOf(); - return block.isUseMKLDNN() && axes.size() == 1 && axes[0] == 1 && (inRank == 2 || inRank == 4 || inRank == 5) && + return block.isUseMKLDNN() && axes.size() == 1 && (axes[0] == 1 || axes[0] == inRank - 1) && (inRank == 2 || inRank == 4 || inRank == 5) && (inputType == DataType::FLOAT32 && meanType == DataType::FLOAT32 && varType == DataType::FLOAT32 && gammaType == DataType::FLOAT32 && betaType == DataType::FLOAT32 && outType == DataType::FLOAT32); } @@ -558,29 +634,29 @@ PLATFORM_CHECK(batchnorm) { ////////////////////////////////////////////////////////////////////////// PLATFORM_IMPL(batchnorm_bp) { - NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw - NDArray* mean = INPUT_VARIABLE(1); // [c] - NDArray* variance = INPUT_VARIABLE(2); // [c] - NDArray* dLdO = INPUT_VARIABLE(3); // same as input - NDArray* gamma = nullptr; // [c] - NDArray* beta = nullptr; // [c] + NDArray* input = INPUT_VARIABLE(0); // 2D:nc, 4D:nchw, 5D:ncdhw + NDArray* mean = INPUT_VARIABLE(1); // [c] + NDArray* variance = INPUT_VARIABLE(2); // [c] + NDArray* gamma = nullptr; // [c] + NDArray* beta = nullptr; // [c] + NDArray* dLdO = INPUT_VARIABLE(block.width() - 1); // same as input - NDArray* dLdI = OUTPUT_VARIABLE(0); // same as input - NDArray* dLdM = OUTPUT_VARIABLE(1); // [c] - NDArray* dLdV = OUTPUT_VARIABLE(2); // [c] - NDArray* dLdG = nullptr; // [c] - NDArray* dLdB = nullptr; // [c] + NDArray* dLdI = OUTPUT_VARIABLE(0); // same as input + NDArray* dLdM = OUTPUT_VARIABLE(1); // [c] + NDArray* 
dLdV = OUTPUT_VARIABLE(2); // [c] + NDArray* dLdG = nullptr; // [c] + NDArray* dLdB = nullptr; // [c] const bool applyScale = (bool)INT_ARG(0); const bool applyOffset = (bool)INT_ARG(1); const float epsilon = T_ARG(0); if(applyScale) { - gamma = INPUT_VARIABLE(4); + gamma = INPUT_VARIABLE(3); dLdG = OUTPUT_VARIABLE(3); } if(applyOffset) { - beta = INPUT_VARIABLE(4 + (int)applyScale); + beta = INPUT_VARIABLE(3 + (int)applyScale); dLdB = OUTPUT_VARIABLE(3 + (int)applyScale); } @@ -606,7 +682,7 @@ PLATFORM_IMPL(batchnorm_bp) { if(beta != nullptr) REQUIRE_TRUE(beta->rankOf() == 1 && beta->sizeAt(0) == input->sizeAt(axes[0]), 0, "BATCHNORM_BP_MKLDNN op: wrong shape of beta array, expected is [%lld], but got %s instead !", input->sizeAt(axes[0]), ShapeUtils::shapeAsString(beta).c_str()); - // types of all input arrays should be the same (except dLdO) + // types of all input arrays should be the same for(int i = 1; i < block.width() - 1; ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP_MKLDNN op: types of all input arrays should be the same !"); @@ -626,11 +702,19 @@ PLATFORM_IMPL(batchnorm_bp) { (*weights)({1,2, 0,0}).assign(0); } - *dLdM = 0; - *dLdV = 0; + + if(axes[0] == inRank - 1 && inRank > 2) { // if nhwc or ndhwc + std::vector permut = inRank == 4 ? 
std::vector({0,3,1,2}) : std::vector({0,4,1,2,3}); + input = new NDArray(input->permute(permut)); + dLdO = new NDArray(dLdO->permute(permut)); + dLdI = new NDArray(dLdI->permute(permut)); + } batchnormBackPropMKLDNN(input, mean, variance, dLdO, weights, epsilon, dLdI, dLdW); + *dLdM = 0; + *dLdV = 0; + if(applyScale || applyOffset) { if(applyScale) dLdG->assign((*dLdW)({0,1, 0,0})); @@ -641,6 +725,12 @@ PLATFORM_IMPL(batchnorm_bp) { delete dLdW; } + if(axes[0] == inRank - 1 && inRank > 2) { + delete input; + delete dLdO; + delete dLdI; + } + return Status::OK(); } @@ -696,7 +786,7 @@ PLATFORM_CHECK(batchnorm_bp) { const int inRank = input->rankOf(); - return block.isUseMKLDNN() && axes.size() == 1 && axes[0] == 1 && (inRank == 2 || inRank == 4 || inRank == 5) && + return block.isUseMKLDNN() && axes.size() == 1 && (axes[0] == 1 || axes[0] == inRank - 1) && (inRank == 2 || inRank == 4 || inRank == 5) && (inputType == DataType::FLOAT32 && meanType == DataType::FLOAT32 && varType == DataType::FLOAT32 && dLdOType == DataType::FLOAT32 && gammaType == DataType::FLOAT32 && betaType == DataType::FLOAT32 && dLdIType == DataType::FLOAT32 && dLdGType == DataType::FLOAT32 && dLdBType == DataType::FLOAT32); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 4871c12e4..654d4bf2c 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -2901,31 +2901,29 @@ TEST_F(DeclarableOpsTests9, Floormod_BP_Test_4) { delete result; } -//////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test1) { NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.1, 1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); NDArray variance('c', {4}, 
nd4j::DataType::FLOAT32); NDArray gamma ('c', {4}, nd4j::DataType::FLOAT32); NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.272779, -1.018224, -0.763668,-0.509112, -0.254556, 0., 0.254556,0.509112, 0.763668, 1.018224, 1.272779, - 1.527335, 1.781891, 2.036447, 2.291003,2.545559, 2.800115, 3.054671, 3.309227,3.563783, 3.818338, 4.072894, 4.32745}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {6.448749, 7.212417, 8.230641, 9.50342 }, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,3,4}, {-0.000056, -0.000056, -0.000056, -0.000056, -0.000034, -0.000034, -0.000034, -0.000034, -0.000011, -0.000011, -0.000011, -0.000011, 0.000011, 0.000011, 0.000011, 0.000011, 0.000034, 0.000034, 0.000034, 0.000034, 0.000056, 0.000056, 0.000056, 0.000056}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {6.148104, 6.148104, 6.148105, 6.148105}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {3.6, 4.5, 5.4, 6.3}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); - mean.assign(1.); - variance.assign(0.5); + variance.assign(0.46666667); gamma.assign(1.2); - // beta.assign(1.); // has no effect on gradient calculations + beta.assign(1.); // has no effect on gradient calculations gradO.linspace(-0.9, 0.15); nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2945,20 +2943,22 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test1) { delete results; } + //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { - NDArray input ('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray mean ('c', {3}, {1.05, 1.1, 1.15}); - NDArray variance('c', {3}, {0.5, 0.6, 0.7}); - NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}); 
- NDArray beta ('c', {3}, nd4j::DataType::DOUBLE); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {3}, {1.05, 1.1, 1.15}, nd4j::DataType::FLOAT32); + NDArray variance('c', {3}, {0.5, 0.6, 0.7}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {3}, nd4j::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.272779, -1.018224, -0.763668,-0.503484, -0.251742, 0., 0.251742,0.501992, 0.752989, 1.003985, 1.254981, - 1.527335, 1.781891, 2.036447, 2.291003,2.517418, 2.76916 , 3.020902, 3.272644,3.513947, 3.764943, 4.015939, 4.266936}); - NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}); - NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}); + NDArray expdLdI('c', {2,3,4}, {-0.601415, -0.521226, -0.441037, -0.360849, -0.456306, -0.395465, -0.334624, -0.273784, 0.396631, 0.343747, + 0.290863, 0.237978, 0.360849, 0.441037, 0.521226, 0.601415, 0.273784, 0.334625, 0.395465, 0.456306, -0.237978, + -0.290863, -0.343746, -0.396631}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}, nd4j::DataType::FLOAT32); + NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations @@ -2966,7 +2966,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2989,17 +2989,18 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, batchnorm_bp_test3) { - NDArray input ('c', {2,3,4}, 
nd4j::DataType::DOUBLE); - NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}); - NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}); - NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9}); - NDArray beta ('c', {2,1,4}, nd4j::DataType::DOUBLE); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}, nd4j::DataType::FLOAT32); + NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}, nd4j::DataType::FLOAT32); + NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9}, nd4j::DataType::FLOAT32); + NDArray beta ('c', {2,1,4}, nd4j::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-1.527335, -1.258709, -1.003985, -0.754668,-0.509112, -0.251742, 0., 0.251556,0.509112, 0.755225, 1.003985, 1.25778 , - 1.517885, 1.784991, 2.05947 , 2.341504,2.529808, 2.804986, 3.089205, 3.382173,3.541731, 3.824981, 4.11894 , 4.422841}); - NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }); - NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. , 0.45, 4.5 , 4.95, 5.4 , 5.85}); + NDArray expdLdI('c', {2,3,4}, {-0.577002, -0.744041, -0.850999, -0.922373, -0.000000, -0.000000, -0.000000, -0.000000, 0.577002, + 0.744041, 0.850999, 0.922373, -0.386037, -0.350205, -0.312047, -0.271737, -0.000000, -0.000000, + -0.000000, -0.000000, 0.386037, 0.350205, 0.312047, 0.271736}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }, nd4j::DataType::FLOAT32); + NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. 
, 0.45, 4.5 , 4.95, 5.4 , 5.85}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations @@ -3007,7 +3008,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test3) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,0,2}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3037,8 +3038,8 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test4) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4}, {1.527335, -1.16534 , 0.885433, -0.643584, 0.509112, -0.233068, -0., 0.214528}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {1.442483, 0.9502 , 0.569207, 0.314641}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4}, {0.162923, -0.289673, 0.354174, -0.386151, -0.162923, 0.289673, -0.354174, 0.386151}, nd4j::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {1.442483, 0.950200, 0.569207, 0.314641}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {-1.2, -0.9, -0.6, -0.3}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); @@ -3046,7 +3047,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test4) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3076,8 +3077,9 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test5) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4,2,2}, {1.527335, 1.272779,1.018224, 0.763668,-0.466136, -0.233068,0., 0.233068,-0.442716, -0.664075,-0.885433, -1.106791,1.287169, 1.501697,1.716225, 1.930753, - -2.545559, -2.800115,-3.054671, -3.309227,3.262951, 
3.496019,3.729087, 3.962155,-3.984448, -4.205806,-4.427164, -4.648522,4.719618, 4.934146,5.148675, 5.363203}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4,2,2}, {-0.737512, -0.659880, -0.582247, -0.504614, 0.561404, 0.502309, 0.443214, 0.384118, -1.168243, + -1.045270, -0.922297, -0.799324, 1.899026, 1.699128, 1.499231, 1.299333, 0.504614, 0.582247, 0.659880, 0.737512, -0.384118, + -0.443214, -0.502308, -0.561404, 0.799324, 0.922297, 1.045270, 1.168243, -1.299334, -1.499231, -1.699129, -1.899026}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {11.073181, 12.585667, 17.708657, 24.313186}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {4.2, 9. , 13.8, 18.6}, nd4j::DataType::FLOAT32); @@ -3086,7 +3088,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test5) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3116,8 +3118,9 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test6) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,2,2,4}, {1.527335, -1.16534 , 0.885433, -0.643584, 0.509112, -0.233068, -0., 0.214528, -0.509112, 0.699204, -0.885433, 1.072641, -1.527335, 1.631475, -1.770866, 1.930753, - -2.545559, 2.563747, -2.656298, 2.788865, -3.563783, 3.496019, -3.541731, 3.646978, -4.582006, 4.42829 , -4.427164, 4.50509 , -5.60023 , 5.360562, -5.312597, 5.363203}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,2,2,4}, {-4.989124, 2.540357, -1.515022, 0.791769, -3.563660, 1.814540, -1.082159, 0.565549, -2.138196, 1.088724, -0.649295, + 0.339329, -0.712732, 0.362908, -0.216432, 0.113110, 0.712732, -0.362908, 0.216432, -0.113110, 2.138195, -1.088724, 0.649295, + -0.339330, 3.563660,-1.814540, 1.082159, -0.565549, 4.989125, -2.540356, 1.515022, -0.791770}, 
nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {20.364472, 17.856588, 16.949714, 15.903684}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {9.6, 10.8, 12. , 13.2}, nd4j::DataType::FLOAT32); @@ -3126,7 +3129,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test6) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,3}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3156,20 +3159,21 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test7) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,2,2,2,4}, {1.527335, -1.16534 , 0.885433, -0.643584,0.509112, -0.233068, -0., 0.214528,-0.509112, 0.699204, -0.885433, 1.072641,-1.527335, 1.631475, -1.770866, - 1.930753,-2.545559, 2.563747, -2.656298, 2.788865,-3.563783, 3.496019, -3.541731, 3.646978,-4.582006, 4.42829 , -4.427164, - 4.50509 ,-5.60023 , 5.360562, -5.312597, 5.363203, -6.618453, 6.292834, -6.19803 , 6.221315,-7.636677, 7.225105, -7.083463, - 7.079428,-8.6549 , 8.157377, -7.968895, 7.93754 ,-9.673124, 9.089649, -8.854328, 8.795652, -10.691348, 10.02192 , -9.739761, - 9.653765,-11.709571, 10.954192, -10.625194, 10.511877,-12.727795, 11.886464, -11.510627, 11.36999 ,-13.746018, 12.818735, -12.39606 , 12.228102}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,2,2,2,4}, {-119.435059, 78.159744, -58.732986, 46.630123, -103.510391, 67.738441, -50.901920, 40.412773, -87.585716, 57.317142, + -43.070854, 34.195419, -71.661041, 46.895844, -35.239792, 27.978071, -55.736359, 36.474548, -27.408726, 21.760721, -39.811687, 26.053242, -19.577662, + 15.543370, -23.887009, 15.631950, -11.746595, 9.326023, -7.962326, 5.210644, -3.915531, 3.108671, 7.962341, -5.210655, 3.915535, -3.108677, 23.887032, + -15.631958, 11.746601, -9.326031, 39.811691, -26.053246, 19.577671, 
-15.543377, 55.736382, -36.474548, 27.408726, -21.760731, 71.661064, -46.895851, 35.239788, + -27.978077, 87.585732, -57.317154, 43.070866, -34.195431, 103.510384, -67.738464, 50.901920, -40.412777, 119.435097, -78.159744, 58.732998, -46.630131}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {282.38734 , 244.542027, 224.140995, 207.548793}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {57.6, 60. , 62.4, 64.8}, nd4j::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); + nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,4}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,4}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3201,10 +3205,11 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test8) { NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); NDArray gradO ('c', {2,4,2,2,2}, nd4j::DataType::FLOAT32); - NDArray expdLdI('c', {2,4,2,2,2}, {1.527335, 1.272779, 1.018224, 0.763668, 0.509112, 0.254556, -0. 
, -0.254556, 0.466136, 0.699204, 0.932272, 1.16534 , 1.398407, 1.631475, 1.864543, 2.097611, - -2.213582, -2.43494 , -2.656298, -2.877657, -3.099015, -3.320373, -3.541731, -3.76309 , 3.861506, 4.076034, 4.290562, 4.50509 , 4.719618, 4.934146, 5.148675, 5.363203, - -6.618453, -6.873009, -7.127565, -7.382121, -7.636677, -7.891233, -8.145789, -8.400345, 7.924309, 8.157377, 8.390445, 8.623513, 8.856581, 9.089649, 9.322717, 9.555784, - -9.297045, -9.518403, -9.739761, -9.961119, -10.182477, -10.403836, -10.625194, -10.846552, 10.726405, 10.940933, 11.155462, 11.36999 , 11.584518, 11.799046, 12.013574, 12.228102}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4,2,2,2}, {-34.373802, -32.611046, -30.848286, -29.085529, -27.322769, -25.560009, -23.797251, -22.034491, 36.146996, 34.293301, + 32.439610, 30.585917, 28.732227, 26.878534, 25.024841, 23.171150, -42.876553, -40.677757, -38.478958, -36.280159, -34.081367, -31.882565, -29.683767, + -27.484968, 50.674446, 48.075760, 45.477066, 42.878380, 40.279686, 37.681000, 35.082310, 32.483616, 22.034489, 23.797249, 25.560009, 27.322765, 29.085526, + 30.848286, 32.611046, 34.373802, -23.171146, -25.024837, -26.878536, -28.732231, -30.585918, -32.439613, -34.293297, -36.146996, 27.484982, 29.683773, + 31.882572, 34.081364, 36.280178, 38.478970, 40.677776, 42.876560, -32.483627, -35.082329, -37.681023, -40.279701, -42.878403, -45.477081, -48.075775, -50.674484}, nd4j::DataType::FLOAT32); NDArray expdLdG('c', {4}, {134.490365, 179.785003, 248.933114, 330.087248}, nd4j::DataType::FLOAT32); NDArray expdLdB('c', {4}, {32.4, 51.6, 70.8, 90.}, nd4j::DataType::FLOAT32); @@ -3213,7 +3218,7 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test8) { nd4j::ops::batchnorm_bp op; - auto results = op.execute({&input, &mean, &variance, &gradO, &gamma, &beta}, {1e-5}, {1,1,1}); + auto results = op.execute({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); From 
18c01f5bdc4b2349869b861aaebfbc4c7c9e9c21 Mon Sep 17 00:00:00 2001 From: Alex Black Date: Tue, 12 Nov 2019 21:15:44 +1100 Subject: [PATCH 04/15] Add SameDiff memory reuse memory manager (array cache) (#39) * Attention op comments Signed-off-by: AlexDBlack * ArrayCacheMemoryMgr - first pass Signed-off-by: AlexDBlack * Tweak array cache for use with SameDiff identity arrays Signed-off-by: AlexDBlack * ArrayCacheMemoryMgr javadoc and properly get max memory Signed-off-by: AlexDBlack * LRU cache policy + add tests Signed-off-by: AlexDBlack * Fixes Signed-off-by: AlexDBlack * Resize arrays internally if required for ArrayCacheMemoryMgr Signed-off-by: AlexDBlack * Test improvement Signed-off-by: AlexDBlack * Small polish Signed-off-by: AlexDBlack --- .../CompareTrainingImplementations.java | 1 + .../include/helpers/impl/AttentionHelper.cpp | 10 +- .../nn/multi_head_dot_product_attention.cpp | 21 +- .../samediff/internal/InferenceSession.java | 42 +-- .../internal/memory/ArrayCacheMemoryMgr.java | 292 ++++++++++++++++++ .../nd4j/autodiff/samediff/MemoryMgrTest.java | 119 +++++++ 6 files changed, 442 insertions(+), 43 deletions(-) create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java create mode 100644 nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java index 12564f01a..fa0fc335f 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/samediff/CompareTrainingImplementations.java @@ -98,6 +98,7 @@ public class CompareTrainingImplementations extends BaseDL4JTest { 
SDVariable diff = sd.f().squaredDifference(a1, label); SDVariable lossMse = diff.mean(); + lossMse.markAsLoss(); IUpdater updater; double lr; diff --git a/libnd4j/include/helpers/impl/AttentionHelper.cpp b/libnd4j/include/helpers/impl/AttentionHelper.cpp index 4e7393a8e..3cfee1c08 100644 --- a/libnd4j/include/helpers/impl/AttentionHelper.cpp +++ b/libnd4j/include/helpers/impl/AttentionHelper.cpp @@ -34,16 +34,16 @@ namespace nd4j { auto numHeads = projectionMatrix->sizeAt(0); auto projectedSize = projectionMatrix->sizeAt(1); - auto inputPerm = input->permute({1, 0, 2}); - auto inputPrep = inputPerm.reshape('c', {input->sizeAt(1), (miniBatchSize * seqLength)}); - auto projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); + auto inputPerm = input->permute({1, 0, 2}); //[batch, nIn, timeSteps] -> [nIn, batch, timeSteps] + auto inputPrep = inputPerm.reshape('c', {input->sizeAt(1), (miniBatchSize * seqLength)}); //[nIn, batch*timeSteps] + auto projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); //[nHeads, hS, nIn] -> [nHeads*hS, nIn] - NDArray projected('c', {numHeads * projectionMatrix->sizeAt(1), (miniBatchSize * seqLength)}, input->dataType(), context); + NDArray projected('c', {numHeads * projectionMatrix->sizeAt(1), (miniBatchSize * seqLength)}, input->dataType(), context); //[nHeads*hS, batch*timeSteps] nd4j::ops::matmul mmul; mmul.execute({&projectionPrep, &inputPrep}, {&projected}, {}, {}, {}); projected.reshapei({numHeads, projectedSize, miniBatchSize, seqLength}); - projected.permutei({2, 0, 1, 3}); + projected.permutei({2, 0, 1, 3}); //[minibatch, numHeads, projectedSize, seqLength] return projected; } diff --git a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp index 45324300d..2123317b5 100644 --- 
a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp @@ -28,13 +28,13 @@ namespace nd4j { namespace ops { CUSTOM_OP_IMPL(multi_head_dot_product_attention, 7, -1, false, 0, 2) { - auto queries = INPUT_VARIABLE(0); - auto keys = INPUT_VARIABLE(1); - auto values = INPUT_VARIABLE(2); - auto Wq = INPUT_VARIABLE(3); - auto Wk = INPUT_VARIABLE(4); - auto Wv = INPUT_VARIABLE(5); - auto Wo = INPUT_VARIABLE(6); + auto queries = INPUT_VARIABLE(0); //[batch, nIn, timeSteps] + auto keys = INPUT_VARIABLE(1); //[batch, nIn, timeSteps] + auto values = INPUT_VARIABLE(2); //[batch, nIn, timeSteps] + auto Wq = INPUT_VARIABLE(3); //[nHeads, headSize, nIn] + auto Wk = INPUT_VARIABLE(4); //[nHeads, headSize, nIn] + auto Wv = INPUT_VARIABLE(5); //[nHeads, headSize, nIn] + auto Wo = INPUT_VARIABLE(6); //[nHeads * headSize, nOut] auto mask = block.width() > 7 ? INPUT_VARIABLE(7) : nullptr; @@ -93,11 +93,12 @@ namespace ops { // Project queries, keys, values - auto projectedQueries = AttentionHelper::multiHeadProject(queries, Wq, block.launchContext()); - auto projectedKeys = AttentionHelper::multiHeadProject(keys, Wk, block.launchContext()); - auto projectedValues = AttentionHelper::multiHeadProject(values, Wv, block.launchContext()); + auto projectedQueries = AttentionHelper::multiHeadProject(queries, Wq, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] + auto projectedKeys = AttentionHelper::multiHeadProject(keys, Wk, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] + auto projectedValues = AttentionHelper::multiHeadProject(values, Wv, block.launchContext()); //[minibatch, numHeads, projectedSize, seqLength] // Apply Attention + // attnResults = [minibatch, numHeads, projectedSize, seqLenth NDArray attnResults('c', {projectedQueries.sizeAt(0), projectedValues.sizeAt(1), projectedValues.sizeAt(2), projectedQueries.sizeAt(3)}, 
projectedValues.dataType(), block.launchContext()); nd4j::ops::dot_product_attention attention; attention.execute({&projectedQueries, &projectedKeys, &projectedValues, mask}, {&attnResults, weights ? OUTPUT_VARIABLE(1) : nullptr}, {}, {normalization, weights}, {}); diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java index 55165b530..32a1cc362 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/InferenceSession.java @@ -24,7 +24,7 @@ import org.nd4j.autodiff.listeners.Listener; import org.nd4j.autodiff.samediff.SDVariable; import org.nd4j.autodiff.samediff.SameDiff; import org.nd4j.autodiff.samediff.VariableType; -import org.nd4j.autodiff.samediff.internal.memory.ArrayCloseMemoryMgr; +import org.nd4j.autodiff.samediff.internal.memory.ArrayCacheMemoryMgr; import org.nd4j.base.Preconditions; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.memory.MemoryWorkspace; @@ -84,8 +84,7 @@ public class InferenceSession extends AbstractSession { public InferenceSession(@NonNull SameDiff sameDiff) { super(sameDiff); - - mmgr = new ArrayCloseMemoryMgr(); //TODO replace this with new (planned) array reuse memory manager + mmgr = new ArrayCacheMemoryMgr(); } @Override @@ -215,7 +214,6 @@ public class InferenceSession extends AbstractSession { } INDArray[] out = doExec(op.getOp(), outputFrameIter, opInputs, allIterInputs, constAndPhInputs); - op.getOp().clearArrays(); if (log.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); @@ -254,6 +252,7 @@ public class InferenceSession extends AbstractSession { } } } + op.getOp().clearArrays(); //Record array uses for memory management/deallocation 
@@ -842,11 +841,10 @@ public class InferenceSession extends AbstractSession { reqShape = reqShape.asDataType(dt); } - if (currOutput == null || currOutput.wasClosed() || !currOutput.shapeDescriptor().equals(reqShape) || currOutput.isEmpty() != reqShape.isEmpty() || isLoop) { - boolean isOutput = allReqVariables.contains(outNames[i]); - INDArray out = mmgr.allocate(isOutput, reqShape); - customOp.setOutputArgument(i, out); - } + //Always allocate new output array, rely on memory manager for efficient memory management and array reuse etc + boolean isOutput = allReqVariables.contains(outNames[i]); + INDArray out = mmgr.allocate(isOutput, reqShape); + customOp.setOutputArgument(i, out); } } else if (df instanceof Op) { @@ -893,29 +891,17 @@ public class InferenceSession extends AbstractSession { //Check output shape; allocate a new Z if required //For example, if minibatch size has changed since last op execution + boolean isOutput = allReqVariables.contains(((BaseOp) op).outputVariablesNames()[0]); if (emptyReduce) { - INDArray z = op.z(); - if (z == null || !op.x().equalShapes(z) || isLoop) { - //Note: edge case: [x,y].sum(empty) = [x,y] for TF import compatibility. - z = mmgr.allocate(false, op.x().dataType(), op.x().shape()); - op.setZ(z); - } + //Always allocate new output array, rely on memory manager for efficient memory management and array reuse etc + INDArray z = mmgr.allocate(false, op.x().dataType(), op.x().shape()); + op.setZ(z); } else { List outputShape = ((BaseOp) op).calculateOutputShape(); Preconditions.checkState(outputShape != null && outputShape.size() == 1, "Could not calculate output shape for op: %s", op.getClass()); - INDArray z = op.z(); - if (z == null || z.wasClosed() || !outputShape.get(0).equals(z.shapeDescriptor()) || isLoop) { - if (log.isTraceEnabled()) { - log.trace("Existing op result (z) array shape for op {} was {}, allocating new array of shape {}", - op.getClass().getSimpleName(), (z == null ? 
null : Arrays.toString(z.shape())), outputShape.get(0).toString()); - } - - LongShapeDescriptor lsd = outputShape.get(0); - - boolean isOutput = allReqVariables.contains(((BaseOp) op).outputVariablesNames()[0]); - z = mmgr.allocate(isOutput, lsd); - op.setZ(z); - } + LongShapeDescriptor lsd = outputShape.get(0); + INDArray z = mmgr.allocate(isOutput, lsd); + op.setZ(z); } } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java new file mode 100644 index 000000000..c802dd4e2 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/internal/memory/ArrayCacheMemoryMgr.java @@ -0,0 +1,292 @@ +package org.nd4j.autodiff.samediff.internal.memory; + +import lombok.*; +import org.bytedeco.javacpp.Pointer; +import org.nd4j.base.Preconditions; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.shape.LongShapeDescriptor; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.util.ArrayUtil; + +import java.util.*; + +/** + * ArrayCacheMemoryMgr reuses arrays to reduce the number of memory allocations and deallocations.
+ * Memory allocations and deallocations can be quite expensive, especially on GPUs.
+ * Note that when arrays are reused, they are reused for the same datatype only.
+ * If caching a released array would result in the the maximum cache size being is exceeded, the oldest arrays will + * be deallocated first, until the new array can in the cache. + *

+ * By default, the following parameters are used for the cache: + *
    + *
  • Maximum cache size: 0.25 x max memory, where:
  • + *
      + *
    • CPU: max memory is determined using {@link Pointer#maxBytes()}
    • + *
    • GPU: max memory is determined using GPU 0 total memory
    • + *
    + *
  • Larger array max multiple: 2.0
  • + *
      + *
    • This means: if an exact array size can't be provided from the cache, use the next smallest array with a buffer up to 2.0x larger than requested
    • + *
    • If no cached arrays of size < 2x requested exists, allocate a new array
    • + *
    + *
  • Small array threshold: 1024 elements
  • + *
      + *
    • This means: the "larger array max multiple" doesn't apply below this level. For example, we might return a size 1 array backed by a size 1023 buffer
    • + *
    + *
+ * + * @author Alex Black + */ +@Getter +public class ArrayCacheMemoryMgr extends AbstractMemoryMgr { + + private final double maxMemFrac; + private final long smallArrayThreshold; + private final double largerArrayMaxMultiple; + + private final long maxCacheBytes; + private final long totalMemBytes; + + private long currentCacheSize = 0; + private Map arrayStores = new HashMap<>(); + + private LinkedHashSet lruCache = new LinkedHashSet<>(); + private Map lruCacheValues = new HashMap<>(); + + /** + * Create an ArrayCacheMemoryMgr with default settings as per {@link ArrayCacheMemoryMgr} + */ + public ArrayCacheMemoryMgr() { + this(0.25, 1024, 2.0); + } + + /** + * @param maxMemFrac Maximum memory fraciton to use as cache + * @param smallArrayThreshold Below this size (elements), don't apply the "largerArrayMaxMultiple" rule + * @param largerArrayMaxMultiple Maximum multiple of the requested size to return from the cache. If an array of size + * 1024 is requested, and largerArrayMaxMultiple is 2.0, then we'll return from the cache + * the array with the smallest data buffer up to 2.0*1024 elements; otherwise we'll return + * a new array + */ + public ArrayCacheMemoryMgr(double maxMemFrac, long smallArrayThreshold, double largerArrayMaxMultiple) { + Preconditions.checkArgument(maxMemFrac > 0 && maxMemFrac < 1, "Maximum memory fraction for cache must be between 0.0 and 1.0, got %s", maxMemFrac); + Preconditions.checkArgument(smallArrayThreshold >= 0, "Small array threshould must be >= 0, got %s", smallArrayThreshold); + Preconditions.checkArgument(largerArrayMaxMultiple >= 1.0, "Larger array max multiple must be >= 1.0, got %s", largerArrayMaxMultiple); + this.maxMemFrac = maxMemFrac; + this.smallArrayThreshold = smallArrayThreshold; + this.largerArrayMaxMultiple = largerArrayMaxMultiple; + + if(isCpu()){ + totalMemBytes = Pointer.maxBytes(); + } else { + Properties p = Nd4j.getExecutioner().getEnvironmentInformation(); + List devList = (List) 
p.get("cuda.devicesInformation"); + Map m = (Map) devList.get(0); + totalMemBytes = (Long)m.get("cuda.totalMemory"); + } + maxCacheBytes = (long)(maxMemFrac * totalMemBytes); + } + + private boolean isCpu(){ + String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend"); + return !"CUDA".equalsIgnoreCase(backend); + } + + @Override + public INDArray allocate(boolean detached, DataType dataType, long... shape) { + if (arrayStores.containsKey(dataType)) { + INDArray arr = arrayStores.get(dataType).get(shape); + if (arr != null) { + //Decrement cache size + currentCacheSize -= dataType.width() * arr.data().length(); + + return arr; //Allocated from cache + } + } + + //Allocation failed, allocate new array + return Nd4j.createUninitializedDetached(dataType, shape); + } + + @Override + public INDArray allocate(boolean detached, LongShapeDescriptor descriptor) { + return allocate(detached, descriptor.dataType(), descriptor.getShape()); + } + + @Override + public void release(@NonNull INDArray array) { + //Check for multiple releases of the array + long id = array.getId(); + Preconditions.checkState(!lruCache.contains(id), "Array was released multiple times: id=%s, shape=%ndShape", id, array); + + + DataType dt = array.dataType(); + long thisBytes = array.data().length() * dt.width(); + if(array.dataType() == DataType.UTF8) { + //Don't cache string arrays due to variable length buffers + if(array.closeable()) + array.close(); + } else if (currentCacheSize + thisBytes > maxCacheBytes) { + if(thisBytes > maxCacheBytes){ + //Can't store even if we clear everything - too large + if(array.closeable()) + array.close(); + return; + } + + //Need to deallocate some arrays to stay under limit - do in "oldest first" order + Iterator iter = lruCache.iterator(); + while(currentCacheSize + thisBytes > maxCacheBytes){ + long next = iter.next(); + iter.remove(); + INDArray nextOldest = lruCacheValues.remove(next); + DataType ndt = nextOldest.dataType(); + 
long nextBytes = ndt.width() * nextOldest.data().length(); + arrayStores.get(ndt).removeObject(nextOldest); + currentCacheSize -= nextBytes; + + if(nextOldest.closeable()) + nextOldest.close(); + } + + //After clearing space - can now cache + cacheArray(array); + } else { + //OK to cache + cacheArray(array); + } + + //Store in LRU cache for "last used" removal if we exceed cache size + lruCache.add(array.getId()); + lruCacheValues.put(array.getId(), array); + } + + private void cacheArray(INDArray array){ + DataType dt = array.dataType(); + if (!arrayStores.containsKey(dt)) + arrayStores.put(dt, new ArrayStore()); + arrayStores.get(dt).add(array); + currentCacheSize += array.data().length() * dt.width(); + + lruCache.add(array.getId()); + lruCacheValues.put(array.getId(), array); + } + + @Override + public void close() { + for (ArrayStore as : arrayStores.values()) { + as.close(); + } + } + + + @Getter + public class ArrayStore { + private INDArray[] sorted = new INDArray[1000]; //TODO resizing, don't hardcode + private long[] lengths = new long[1000]; + private long lengthSum; + private long bytesSum; + private int size; + + private void add(@NonNull INDArray array) { + //Resize arrays + if(size == sorted.length){ + sorted = Arrays.copyOf(sorted, 2*sorted.length); + lengths = Arrays.copyOf(lengths, 2*lengths.length); + } + + long length = array.data().length(); + int idx = Arrays.binarySearch(lengths, 0, size, length); + if (idx < 0) { + idx = -idx - 1; //See binarySearch javadoc + } + for (int i = size - 1; i >= idx; i--) { + sorted[i + 1] = sorted[i]; + lengths[i + 1] = lengths[i]; + } + sorted[idx] = array; + lengths[idx] = length; + size++; + lengthSum += length; + bytesSum += length * array.dataType().width(); + } + + private INDArray get(long[] shape) { + if (size == 0) + return null; + + long length = shape.length == 0 ? 
1 : ArrayUtil.prod(shape); + + int idx = Arrays.binarySearch(lengths, 0, size, length); + if (idx < 0) { + idx = -idx - 1; + if (idx >= size) { + //Largest array is smaller than required -> can't return from cache + return null; + } + INDArray nextSmallest = sorted[idx]; + long nextSmallestLength = nextSmallest.data().length(); + long nextSmallestLengthBytes = nextSmallestLength * nextSmallest.dataType().width(); + + boolean tooLarge = (length > (long) (nextSmallestLength * largerArrayMaxMultiple)); + + if (nextSmallestLengthBytes > smallArrayThreshold && tooLarge) { + return null; + } // If less than smallArrayThreshold, ok, return as is + } + + //Remove + INDArray arr = removeIdx(idx); + + lruCache.remove(arr.getId()); + lruCacheValues.remove(arr.getId()); + + //Create a new array with the specified buffer. This is for 2 reasons: + //(a) the cached array and requested array sizes may differ (though this is easy to check for) + //(b) Some SameDiff array use tracking uses *object identity* - so we want different objects when reusing arrays + // to avoid issues there + return Nd4j.create(arr.data(), shape); + } + + private void removeObject(INDArray array){ + long length = array.data().length(); + int idx = Arrays.binarySearch(lengths, 0, size, length); + Preconditions.checkState(idx > 0, "Cannot remove array from ArrayStore: no array with this length exists in the cache"); + boolean found = false; + int i = 0; + while(!found && i <= size && lengths[i] == length){ + found = sorted[i++] == array; //Object equality + } + Preconditions.checkState(found, "Cannot remove array: not found in ArrayCache"); + removeIdx(i - 1); + } + + private INDArray removeIdx(int idx){ + INDArray arr = sorted[idx]; + for (int i = idx; i < size; i++) { + sorted[i] = sorted[i + 1]; + lengths[i] = lengths[i + 1]; + } + sorted[size] = null; + lengths[size] = 0; + size--; + + bytesSum -= (arr.data().length() * arr.dataType().width()); + lengthSum -= arr.data().length(); + + return arr; + } + + 
private void close() { + for (int i = 0; i < size; i++) { + if (sorted[i].closeable()) + sorted[i].close(); + lengths[i] = 0; + } + lengthSum = 0; + bytesSum = 0; + size = 0; + } + } +} diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java new file mode 100644 index 000000000..6505bee20 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/autodiff/samediff/MemoryMgrTest.java @@ -0,0 +1,119 @@ +package org.nd4j.autodiff.samediff; + +import org.junit.Test; +import org.nd4j.autodiff.samediff.internal.memory.ArrayCacheMemoryMgr; +import org.nd4j.linalg.BaseNd4jTest; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.factory.Nd4jBackend; + +import java.lang.reflect.Field; + +import static org.junit.Assert.*; + +public class MemoryMgrTest extends BaseNd4jTest { + + public MemoryMgrTest(Nd4jBackend b){ + super(b); + } + + @Override + public char ordering(){ + return 'c'; + } + + @Test + public void testArrayReuseTooLarge() throws Exception { + + ArrayCacheMemoryMgr mmgr = new ArrayCacheMemoryMgr(); + Field f = ArrayCacheMemoryMgr.class.getDeclaredField("maxCacheBytes"); + f.setAccessible(true); + f.set(mmgr, 1000); + + assertEquals(1000, mmgr.getMaxCacheBytes()); + + INDArray[] arrays = new INDArray[100]; + for( int i=0; i Date: Tue, 12 Nov 2019 22:51:09 +1100 Subject: [PATCH 05/15] SameDiff op runtime benchmarking listener (#42) Signed-off-by: AlexDBlack --- .../functions/DifferentialFunction.java | 2 +- .../debugging/OpBenchmarkListener.java | 189 ++++++++++++++++++ 2 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java diff --git 
a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java index 32df3e69d..8c80e3bb4 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/functions/DifferentialFunction.java @@ -509,7 +509,7 @@ public abstract class DifferentialFunction { * @return the arguments for a given function */ public SDVariable[] args() { - return sameDiff.getInputVariablesForOp(this); + return sameDiff == null ? null : sameDiff.getInputVariablesForOp(this); } /** diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java new file mode 100644 index 000000000..103b0f960 --- /dev/null +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/listeners/debugging/OpBenchmarkListener.java @@ -0,0 +1,189 @@ +package org.nd4j.autodiff.listeners.debugging; + +import lombok.*; +import org.nd4j.autodiff.listeners.At; +import org.nd4j.autodiff.listeners.BaseListener; +import org.nd4j.autodiff.listeners.Operation; +import org.nd4j.autodiff.samediff.SameDiff; +import org.nd4j.autodiff.samediff.internal.SameDiffOp; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.api.ops.Op; +import org.nd4j.linalg.dataset.api.MultiDataSet; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.util.ArrayUtil; + +import java.text.DecimalFormat; +import java.util.*; + +/** + * A simple listener for benchmarking single operations in SameDiff
+ * Supports 2 modes:
+ * - SINGLE_ITER_PRINT: Print the runtime of the first iteration
+ * - AGGREGATE: Collect statistics for multiple runs, that can be accessed (by op name) via {@link #getAggregateModeMap()} + * + * @author Alex Black + */ +@Getter +public class OpBenchmarkListener extends BaseListener { + + public enum Mode {SINGLE_ITER_PRINT, AGGREGATE} + + private final Operation operation; + private final Mode mode; + private final long minRuntime; + private Map aggregateModeMap; + + @Getter(AccessLevel.PRIVATE) + private long start; + @Getter(AccessLevel.PRIVATE) + private boolean printActive; + private boolean printDone; + + public OpBenchmarkListener(Operation operation, @NonNull Mode mode) { + this(operation, mode, 0); + } + + /** + * @param operation Operation to collect stats for + * @param mode Mode - see {@link OpBenchmarkListener} + * @param minRuntime Minimum runtime - only applies to Mode.SINGLE_ITER_PRINT. If op runtime below this: don't print + */ + public OpBenchmarkListener(Operation operation, @NonNull Mode mode, long minRuntime) { + this.operation = operation; + this.mode = mode; + this.minRuntime = minRuntime; + } + + @Override + public boolean isActive(Operation operation) { + return this.operation == null || this.operation == operation; + } + + @Override + public void operationStart(SameDiff sd, Operation op) { + if(printDone) + return; + if(this.operation == null || this.operation == op) + printActive = true; + } + + @Override + public void operationEnd(SameDiff sd, Operation op) { + if(printDone) + return; + if(this.operation == null || this.operation == op) { + printActive = false; + printDone = true; + } + } + + @Override + public void preOpExecution(SameDiff sd, At at, SameDiffOp op) { + start = System.currentTimeMillis(); + } + + @Override + public void opExecution(SameDiff sd, At at, MultiDataSet batch, SameDiffOp op, INDArray[] outputs) { + long now = System.currentTimeMillis(); + + if (mode == Mode.SINGLE_ITER_PRINT && printActive && (now-start) > this.minRuntime) { + System.out.println(getOpString(op, now)); + } 
else if (mode == Mode.AGGREGATE) { + if(aggregateModeMap == null) + aggregateModeMap = new LinkedHashMap<>(); + + if(!aggregateModeMap.containsKey(op.getName())){ + String s = getOpString(op, null); + OpExec oe = new OpExec(op.getName(), op.getOp().opName(), op.getOp().getClass(), + new ArrayList(), s); + aggregateModeMap.put(op.getName(), oe); + } + + aggregateModeMap.get(op.getName()).getRuntimeMs().add(now-start); + } + } + + private String getOpString(SameDiffOp op, Long now){ + StringBuilder sb = new StringBuilder(); + sb.append(op.getName()).append(" - ").append(op.getOp().getClass().getSimpleName()) + .append("(").append(op.getOp().opName()).append(") - "); + if(now != null) { + sb.append(now - start).append(" ms\n"); + } + + if (op.getOp() instanceof DynamicCustomOp) { + DynamicCustomOp dco = (DynamicCustomOp) op.getOp(); + int x = 0; + + for (INDArray i : dco.inputArguments()) { + sb.append(" in ").append(x++).append(": ").append(i.shapeInfoToString()).append("\n"); + } + x = 0; + for (INDArray o : dco.outputArguments()) { + sb.append(" out ").append(x++).append(": ").append(o.shapeInfoToString()).append("\n"); + } + long[] iargs = dco.iArgs(); + boolean[] bargs = dco.bArgs(); + double[] targs = dco.tArgs(); + if (iargs != null && iargs.length > 0) { + sb.append(" iargs: ").append(Arrays.toString(iargs)).append("\n"); + } + if (bargs != null && bargs.length > 0) { + sb.append(" bargs: ").append(Arrays.toString(bargs)).append("\n"); + } + if (targs != null && targs.length > 0) { + sb.append(" targs: ").append(Arrays.toString(targs)).append("\n"); + } + } else { + Op o = (Op) op.getOp(); + if (o.x() != null) + sb.append(" x: ").append(o.x().shapeInfoToString()); + if (o.y() != null) + sb.append(" y: ").append(o.y().shapeInfoToString()); + if (o.z() != null) + sb.append(" z: ").append(o.z().shapeInfoToString()); + } + return sb.toString(); + } + + + @AllArgsConstructor + @Data + public static class OpExec { + private final String opOwnName; + private final 
String opName; + private final Class opClass; + private List runtimeMs; + private String firstIter; + + @Override + public String toString(){ + DecimalFormat df = new DecimalFormat("0.000"); + + return opOwnName + " - op class: " + opClass.getSimpleName() + " (op name: " + opName + ")\n" + + "count: " + runtimeMs.size() + ", mean: " + df.format(avgMs()) + "ms, std: " + df.format(stdMs()) + "ms, min: " + minMs() + "ms, max: " + maxMs() + "ms\n" + + firstIter; + } + + public double avgMs() { + long sum = 0; + for (Long l : runtimeMs) { + sum += l; + } + return sum / (double) runtimeMs.size(); + } + + public double stdMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).stdNumber().doubleValue(); + } + + public long minMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).minNumber().longValue(); + } + + public long maxMs() { + return Nd4j.createFromArray(ArrayUtil.toArrayLong(runtimeMs)).maxNumber().longValue(); + } + } +} From f05c6ee13922d3fbca8d73c8007b403dd3d3cbef Mon Sep 17 00:00:00 2001 From: raver119 Date: Tue, 12 Nov 2019 15:12:31 +0300 Subject: [PATCH 06/15] INLINE_LOOPS for windows Signed-off-by: raver119 --- libnd4j/CMakeLists.txt | 4 ++-- .../include/helpers/cpu/loops/Reduction3Loops_0.cpp | 12 ++++++++++-- .../include/helpers/cpu/loops/Reduction3Loops_1.cpp | 12 ++++++++++-- .../include/helpers/cpu/loops/Reduction3Loops_2.cpp | 12 ++++++++++-- .../include/helpers/cpu/loops/Reduction3Loops_3.cpp | 12 ++++++++++-- libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp | 1 + .../helpers/cpu/loops/ReductionLoops_bool.cpp | 4 ++++ .../helpers/cpu/loops/ReductionLoops_float_0.cpp | 5 ++++- .../helpers/cpu/loops/ReductionLoops_float_1.cpp | 5 ++++- .../helpers/cpu/loops/ReductionLoops_float_2.cpp | 5 ++++- .../helpers/cpu/loops/ReductionLoops_float_3.cpp | 5 ++++- .../helpers/cpu/loops/ReductionLoops_long.cpp | 5 ++++- .../helpers/cpu/loops/ReductionLoops_same.cpp | 4 ++++ 13 files changed, 71 insertions(+), 15 deletions(-) diff 
--git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt index 949dbd542..a16c9eaf7 100755 --- a/libnd4j/CMakeLists.txt +++ b/libnd4j/CMakeLists.txt @@ -22,8 +22,8 @@ if (APPLE) elseif(WIN32) set(X86_BUILD true) if (NOT CUDA_BLAS) - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fmax-errors=2 -D_RELEASE=true") - set(CMAKE_CXX_FLAGS_DEBUG " -g -fPIC -std=c++11 -fmax-errors=2") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fmax-errors=2 -DINLINE_LOOPS -D_RELEASE=true") + set(CMAKE_CXX_FLAGS_DEBUG " -g -fPIC -std=c++11 -DINLINE_LOOPS -fmax-errors=2") else() set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true /wd4804") set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc /wd4661 /wd4804 /wd4267 /wd4244 /wd4251 /wd4305") diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp index 895afccfd..16bf3b08b 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp @@ -29,23 +29,31 @@ namespace nd4j { template template void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#endif } template template void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, 
zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#endif } template void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); +#endif } template void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_0); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp index d8c24e096..4e350ce15 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp @@ -29,23 +29,31 @@ namespace nd4j { template template void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#endif } template template void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* 
yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#endif } template void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); +#endif } template void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_1); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp index 4ecc0e370..e869793a8 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp @@ -29,23 +29,31 @@ namespace nd4j { template template void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, 
yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#endif } template template void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#endif } template void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); +#endif } template void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_2); diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp index 218c335ca..474443fd3 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp @@ -29,23 +29,31 @@ namespace nd4j { template template void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, 
Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); +#endif } template template void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#ifndef INLINE_LOOPS + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); +#endif } template void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); +#endif } template void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT Reduction3Loops, , LIBND4J_TYPES, FLOAT_TYPES_3); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp index 
4a223a0f2..0709e5f3c 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp @@ -19,3 +19,4 @@ // #include +#include diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp index 35ae99afb..3d7a85eff 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp @@ -27,7 +27,9 @@ namespace nd4j { template template void ReductionBoolLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { +#ifndef INLINE_LOOPS ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); +#endif } template @@ -35,7 +37,9 @@ namespace nd4j { Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, X *extraParams) { +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_BOOL_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionBoolLoops, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp index c7b1f6ff8..f545c8c83 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp @@ -29,15 +29,18 @@ namespace nd4j { template template void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { +#ifndef INLINE_LOOPS ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong 
*zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Y *extraParams) { - +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_0); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp index 76c1141bf..fa52015ca 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp @@ -29,15 +29,18 @@ namespace nd4j { template template void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { +#ifndef INLINE_LOOPS ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Y *extraParams) { - +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_1); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp index 7288816ad..eb144fcc6 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp @@ -29,15 +29,18 @@ namespace nd4j { template template void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { +#ifndef 
INLINE_LOOPS ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Y *extraParams) { - +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_2); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp index 251624076..d2991b51b 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp @@ -29,15 +29,18 @@ namespace nd4j { template template void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { +#ifndef INLINE_LOOPS ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); +#endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Y *extraParams) { - +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionFloatLoops, , LIBND4J_TYPES, FLOAT_TYPES_3); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp index a6dd992c6..04a3d8559 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp @@ -34,15 
+34,18 @@ namespace nd4j { template template void ReductionLongLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z *z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { +#ifndef INLINE_LOOPS ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); +#endif } template void ReductionLongLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, X *extraParams) { - +#ifndef INLINE_LOOPS DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_LONG_OPS); +#endif } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReductionLongLoops, , LIBND4J_TYPES, LONG_TYPES); diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp index 623d97e79..9932b04c5 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp @@ -27,7 +27,9 @@ namespace nd4j { template template void ReductionSameLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { +#ifndef INLINE_LOOPS ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); +#endif } template @@ -35,11 +37,13 @@ namespace nd4j { Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, X *vextraParams) { +#ifndef INLINE_LOOPS auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_SAME_OPS); +#endif } BUILD_SINGLE_TEMPLATE(template class ReductionSameLoops, , LIBND4J_TYPES); From 48df1acdfbcb86c403d66215b6f4f0f27676a82d Mon Sep 17 00:00:00 2001 From: 
raver119 Date: Wed, 13 Nov 2019 17:04:59 +0300 Subject: [PATCH 07/15] [WIP] ThreadPool (#8) This PR removes OpenMP use in 95% of cases --- .../deeplearning4j/util/ConvolutionUtils.java | 14 +- libnd4j/CMakeLists.txt | 22 +- libnd4j/CMakeLists.txt.mkldnn.in | 2 +- libnd4j/blas/CMakeLists.txt | 27 +- libnd4j/blas/Environment.cpp | 7 +- libnd4j/blas/NDArray.h | 2 +- libnd4j/blas/NDArray.hpp | 5 +- libnd4j/blas/NativeOpExecutioner.h | 6 +- libnd4j/blas/NativeOps.h | 5 +- libnd4j/blas/cpu/NDArray.cpp | 122 +- libnd4j/blas/cpu/NDArrayLambda.hpp | 191 +- libnd4j/blas/cpu/NativeOpExecutioner.cpp | 441 ++-- libnd4j/blas/cpu/NativeOps.cpp | 349 +-- libnd4j/blas/cuda/NativeOps.cu | 43 +- libnd4j/buildnativeoperations.sh | 38 +- libnd4j/include/array/DataTypeConversions.h | 37 +- libnd4j/include/buffer.h | 1 + libnd4j/include/cnpy/cnpy.h | 30 +- libnd4j/include/dll.h | 3 + libnd4j/include/execution/BlockingQueue.h | 52 + libnd4j/include/execution/CallableInterface.h | 94 + .../include/execution/CallableWithArguments.h | 92 + libnd4j/include/execution/ThreadPool.h | 71 + libnd4j/include/execution/Threads.h | 160 ++ libnd4j/include/execution/Ticket.h | 67 + .../include/execution/impl/BlockingQueue.cpp | 73 + .../execution/impl/CallableInterface.cpp | 213 ++ .../execution/impl/CallableWithArguments.cpp | 103 + libnd4j/include/execution/impl/ThreadPool.cpp | 194 ++ libnd4j/include/execution/impl/Threads.cpp | 641 +++++ libnd4j/include/execution/impl/Ticket.cpp | 94 + libnd4j/include/graph/Node.h | 1 + libnd4j/include/graph/impl/Graph.cpp | 3 +- libnd4j/include/graph/impl/Node.cpp | 69 +- libnd4j/include/helpers/Loops.h | 924 +++---- libnd4j/include/helpers/TAD.h | 2 +- .../helpers/benchmark/MatrixBenchmark.h | 1 - libnd4j/include/helpers/cpu/MmulHelper.cpp | 76 +- .../helpers/cpu/TrueBroadcastHelper.cpp | 1 + .../helpers/cpu/loops/IndexReductionLoops.cpp | 266 +- .../helpers/cpu/loops/Reduction3Loops_0.cpp | 16 +- .../helpers/cpu/loops/Reduction3Loops_1.cpp | 16 +- 
.../helpers/cpu/loops/Reduction3Loops_2.cpp | 16 +- .../helpers/cpu/loops/Reduction3Loops_3.cpp | 16 +- .../helpers/cpu/loops/ReductionLoops_bool.cpp | 8 +- .../cpu/loops/ReductionLoops_float_0.cpp | 8 +- .../cpu/loops/ReductionLoops_float_1.cpp | 8 +- .../cpu/loops/ReductionLoops_float_2.cpp | 8 +- .../cpu/loops/ReductionLoops_float_3.cpp | 8 +- .../helpers/cpu/loops/ReductionLoops_long.cpp | 8 +- .../helpers/cpu/loops/ReductionLoops_same.cpp | 8 +- .../helpers/cuda/TrueBroadcastHelper.cu | 1 + libnd4j/include/helpers/impl/BlasHelper.cpp | 24 +- libnd4j/include/helpers/impl/DebugHelper.cpp | 18 +- libnd4j/include/helpers/impl/GradCheck.cpp | 2 - .../include/helpers/impl/OmpLaunchHelper.cpp | 6 +- libnd4j/include/loops/aggregates.h | 66 - libnd4j/include/loops/broadcasting.h | 19 +- libnd4j/include/loops/broadcasting_bool.h | 19 +- libnd4j/include/loops/broadcasting_int.h | 19 +- libnd4j/include/loops/cpu/broadcasting.hpp | 130 +- .../include/loops/cpu/broadcasting_bool.cpp | 117 +- .../include/loops/cpu/broadcasting_int.cpp | 137 +- libnd4j/include/loops/cpu/indexreduce.cpp | 62 +- libnd4j/include/loops/cpu/pairwise.hpp | 200 +- libnd4j/include/loops/cpu/pairwise2.hpp | 106 - libnd4j/include/loops/cpu/pairwise_bool.cpp | 201 +- libnd4j/include/loops/cpu/pairwise_int.cpp | 201 +- libnd4j/include/loops/cpu/random.cpp | 139 +- .../include/loops/cpu/reduce/reduce_bool.cpp | 104 +- .../include/loops/cpu/reduce/reduce_float.cpp | 121 +- .../include/loops/cpu/reduce/reduce_long.cpp | 117 +- .../include/loops/cpu/reduce/reduce_same.cpp | 123 +- libnd4j/include/loops/cpu/reduce3.cpp | 101 +- libnd4j/include/loops/cpu/scalar.hpp | 112 +- libnd4j/include/loops/cpu/scalar_bool.cpp | 116 +- libnd4j/include/loops/cpu/scalar_int.cpp | 118 +- .../include/loops/cpu/summarystatsreduce.cpp | 54 +- .../loops/cpu/transform/transform_any.cpp | 18 +- .../loops/cpu/transform/transform_bool.cpp | 18 +- .../loops/cpu/transform/transform_float.cpp | 16 +- 
.../loops/cpu/transform/transform_same.cpp | 14 +- .../loops/cpu/transform/transform_strict.cpp | 17 +- libnd4j/include/loops/cuda/aggregates.cu | 145 -- libnd4j/include/loops/cuda/broadcasting.cu | 78 - .../include/loops/cuda/broadcasting_bool.cu | 70 - .../include/loops/cuda/broadcasting_int.cu | 69 - libnd4j/include/loops/cuda/indexreduce.cu | 26 - libnd4j/include/loops/cuda/pairwise.cu | 52 - libnd4j/include/loops/cuda/pairwise_bool.cu | 57 - libnd4j/include/loops/cuda/pairwise_int.cu | 57 - libnd4j/include/loops/cuda/random.cu | 33 - libnd4j/include/loops/cuda/reduce3.chpp | 2 +- libnd4j/include/loops/cuda/reduce3.cu | 49 - libnd4j/include/loops/cuda/scalar_bool.cu | 35 - libnd4j/include/loops/cuda/scalar_int.cu | 34 - .../include/loops/cuda/summarystatsreduce.cu | 67 - .../loops/cuda/transform/transform_any.cu | 11 - .../loops/cuda/transform/transform_bool.cu | 11 - .../loops/cuda/transform/transform_float.cu | 12 - .../loops/cuda/transform/transform_same.cu | 11 - .../loops/cuda/transform/transform_strict.cu | 11 - .../include/loops/impl/type_conversions.cpp | 42 +- libnd4j/include/loops/indexreduce.h | 7 +- libnd4j/include/loops/legacy_ops.h | 3 - libnd4j/include/loops/pairwise_bool.h | 25 +- libnd4j/include/loops/pairwise_int.h | 24 +- libnd4j/include/loops/pairwise_transform.h | 22 +- libnd4j/include/loops/random.h | 3 +- libnd4j/include/loops/reduce3.h | 20 +- libnd4j/include/loops/reduce_bool.h | 13 +- libnd4j/include/loops/reduce_float.h | 13 +- libnd4j/include/loops/reduce_long.h | 13 +- libnd4j/include/loops/reduce_same.h | 14 +- libnd4j/include/loops/scalar.h | 15 +- libnd4j/include/loops/scalar_bool.h | 15 +- libnd4j/include/loops/scalar_int.h | 18 +- libnd4j/include/loops/summarystatsreduce.h | 4 +- libnd4j/include/loops/transform_any.h | 15 +- libnd4j/include/loops/transform_bool.h | 15 +- libnd4j/include/loops/transform_float.h | 14 +- libnd4j/include/loops/transform_same.h | 14 +- libnd4j/include/loops/transform_strict.h | 17 +- 
libnd4j/include/msvc.h | 39 + libnd4j/include/op_boilerplate.h | 3 +- libnd4j/include/openmp_pragmas.h | 40 +- libnd4j/include/ops/aggregate_ops.h | 996 ------- libnd4j/include/ops/declarable/BooleanOp.h | 1 - .../include/ops/declarable/BroadcastableOp.h | 1 - .../ops/declarable/DeclarableCustomOp.h | 1 - .../include/ops/declarable/DeclarableListOp.h | 3 +- libnd4j/include/ops/declarable/DeclarableOp.h | 2 +- .../ops/declarable/DeclarableReductionOp.h | 1 - libnd4j/include/ops/declarable/LegacyOp.h | 1 + libnd4j/include/ops/declarable/LogicOp.h | 1 - libnd4j/include/ops/declarable/OpTuple.h | 2 +- .../ops/declarable/generic/blas/axpy.cpp | 20 +- .../ops/declarable/generic/datatypes/cast.cpp | 8 - .../declarable/generic/parity_ops/argmax.cpp | 2 +- .../declarable/generic/parity_ops/argmin.cpp | 2 +- .../recurrent/dynamicBidirectionalRNN.cpp | 6 +- .../generic/transforms/reverseSequence.cpp | 16 +- .../declarable/helpers/cpu/BarnesHutTsne.cpp | 39 +- .../declarable/helpers/cpu/activations.cpp | 103 +- .../ops/declarable/helpers/cpu/addBias.cpp | 65 +- .../ops/declarable/helpers/cpu/adjust_hue.cpp | 58 +- .../helpers/cpu/adjust_saturation.cpp | 62 +- .../declarable/helpers/cpu/batched_gemm.cpp | 34 +- .../ops/declarable/helpers/cpu/batchnorm.cpp | 20 +- .../ops/declarable/helpers/cpu/betaInc.cpp | 12 +- .../ops/declarable/helpers/cpu/col2im.cpp | 90 +- .../declarable/helpers/cpu/compare_elem.cpp | 43 +- .../ops/declarable/helpers/cpu/confusion.cpp | 18 +- .../declarable/helpers/cpu/convolutions.cpp | 1419 +++++----- .../ops/declarable/helpers/cpu/cross.cpp | 17 +- .../ops/declarable/helpers/cpu/d_t_s.cpp | 67 +- .../ops/declarable/helpers/cpu/diag.cpp | 1 - .../ops/declarable/helpers/cpu/dilation2d.cpp | 40 +- .../ops/declarable/helpers/cpu/dropout.cpp | 34 +- .../ops/declarable/helpers/cpu/dynamic.cpp | 39 +- .../helpers/cpu/extract_patches.cpp | 65 +- .../ops/declarable/helpers/cpu/gather.cpp | 33 +- .../ops/declarable/helpers/cpu/hamming.cpp | 47 +- 
.../ops/declarable/helpers/cpu/hashcode.cpp | 45 +- .../helpers/cpu/histogramFixedWidth.cpp | 20 +- .../ops/declarable/helpers/cpu/im2col.cpp | 76 +- .../declarable/helpers/cpu/image_resize.cpp | 149 +- .../helpers/cpu/image_suppression.cpp | 3 +- .../ops/declarable/helpers/cpu/ismax.cpp | 15 +- .../declarable/helpers/cpu/legacy_helper.cpp | 1 + .../ops/declarable/helpers/cpu/lrn.cpp | 378 +-- .../ops/declarable/helpers/cpu/lstm.cpp | 14 +- .../declarable/helpers/cpu/matrixSetDiag.cpp | 29 +- .../helpers/cpu/matrix_diag_part.cpp | 13 +- .../declarable/helpers/cpu/nth_element.cpp | 14 +- .../ops/declarable/helpers/cpu/one_hot.cpp | 63 +- .../ops/declarable/helpers/cpu/percentile.cpp | 2 +- .../ops/declarable/helpers/cpu/polyGamma.cpp | 11 +- .../ops/declarable/helpers/cpu/range.cpp | 10 +- .../ops/declarable/helpers/cpu/reverse.cpp | 117 +- .../ops/declarable/helpers/cpu/s_t_b.cpp | 112 +- .../ops/declarable/helpers/cpu/s_t_d.cpp | 73 +- .../ops/declarable/helpers/cpu/scatter.cpp | 105 +- .../ops/declarable/helpers/cpu/segment.cpp | 323 ++- .../declarable/helpers/cpu/sequence_mask.cpp | 14 +- .../ops/declarable/helpers/cpu/sg_cb.cpp | 310 +-- .../ops/declarable/helpers/cpu/sru.cpp | 206 +- .../ops/declarable/helpers/cpu/stack.cpp | 18 +- .../ops/declarable/helpers/cpu/top_k.cpp | 25 +- .../ops/declarable/helpers/cpu/transforms.cpp | 519 ++-- .../ops/declarable/helpers/cpu/zeta.cpp | 10 +- .../include/ops/declarable/helpers/cross.h | 18 +- .../ops/declarable/helpers/cuda/col2im.cppc | 138 - .../ops/declarable/helpers/cuda/im2col.cppc | 129 - .../declarable/helpers/cuda/legacy/relu.cu | 1 + .../declarable/helpers/cuda/legacy/tanh.cu | 1 + .../declarable/helpers/cuda/legacy_helper.cu | 1 + .../ops/declarable/helpers/cuda/transforms.cu | 3 +- .../include/ops/declarable/helpers/helpers.h | 1 + .../ops/declarable/helpers/impl/choose.cpp | 1 + .../ops/declarable/helpers/impl/unique.cpp | 15 +- .../include/ops/declarable/helpers/matmul.h | 1 - 
.../include/ops/declarable/impl/BooleanOp.cpp | 4 - .../ops/declarable/impl/BroadcastableOp.cpp | 4 - .../declarable/impl/DeclarableCustomOp.cpp | 4 - .../ops/declarable/impl/DeclarableListOp.cpp | 4 - .../declarable/impl/DeclarableReductionOp.cpp | 8 +- .../ops/declarable/impl/LegacyReduce3Op.cpp | 5 +- .../declarable/impl/LegacyReduceBoolOp.cpp | 5 +- .../declarable/impl/LegacyReduceFloatOp.cpp | 5 +- .../declarable/impl/LegacyReduceLongOp.cpp | 5 +- .../declarable/impl/LegacyReduceSameOp.cpp | 3 +- .../ops/declarable/impl/LegacyStatsOp.cpp | 5 +- .../ops/declarable/platform/mkldnn/conv3d.cpp | 3 + libnd4j/include/ops/impl/gemm.cpp | 87 +- libnd4j/include/ops/impl/specials.cpp | 252 +- libnd4j/include/ops/ops.h | 36 - .../include/ops/special_accumulation_ops.h | 213 -- libnd4j/include/ops/special_ops.h | 2293 ----------------- libnd4j/include/ops/special_random_ops.h | 176 +- libnd4j/include/ops/specials.h | 7 +- .../benchmarking/impl/FullBenchmarkSuit.cpp | 3 +- .../benchmarking/impl/LightBenchmarkSuit.cpp | 19 +- libnd4j/include/pointercast.h | 1 + libnd4j/include/templatemath.h | 43 +- libnd4j/pom.xml | 2 + .../layers_tests/BooleanOpsTests.cpp | 2 +- .../layers_tests/BroadcastableOpsTests.cpp | 8 +- .../tests_cpu/layers_tests/BrodcastTests.cpp | 2 +- libnd4j/tests_cpu/layers_tests/CMakeLists.txt | 21 +- .../layers_tests/ConditionalTests.cpp | 1 - .../layers_tests/ConstantShapeHelperTests.cpp | 4 +- .../layers_tests/ConvolutionTests1.cpp | 180 +- .../layers_tests/DataTypesValidationTests.cpp | 4 +- .../layers_tests/DeclarableOpsTests1.cpp | 53 +- .../layers_tests/DeclarableOpsTests10.cpp | 42 +- .../layers_tests/DeclarableOpsTests11.cpp | 11 - .../layers_tests/DeclarableOpsTests12.cpp | 18 - .../layers_tests/DeclarableOpsTests13.cpp | 1 - .../layers_tests/DeclarableOpsTests14.cpp | 12 +- .../layers_tests/DeclarableOpsTests15.cpp | 1 - .../layers_tests/DeclarableOpsTests16.cpp | 11 + .../layers_tests/DeclarableOpsTests2.cpp | 2 - 
.../layers_tests/DeclarableOpsTests4.cpp | 8 - .../layers_tests/DeclarableOpsTests5.cpp | 72 +- .../layers_tests/DeclarableOpsTests6.cpp | 49 +- .../layers_tests/DeclarableOpsTests7.cpp | 83 +- .../layers_tests/DeclarableOpsTests8.cpp | 1008 ++++---- .../layers_tests/DeclarableOpsTests9.cpp | 8 +- libnd4j/tests_cpu/layers_tests/EmptyTests.cpp | 3 - .../tests_cpu/layers_tests/HelpersTests1.cpp | 5 +- .../tests_cpu/layers_tests/IndexingTests.cpp | 5 - .../layers_tests/JavaInteropCudaTests.cu | 2 - .../layers_tests/JavaInteropTests.cpp | 25 +- libnd4j/tests_cpu/layers_tests/LambdaTests.cu | 9 - .../tests_cpu/layers_tests/LegacyOpsTests.cpp | 24 +- .../layers_tests/NDArrayCudaBasicsTests.cu | 42 +- .../tests_cpu/layers_tests/NDArrayTests.cpp | 5 - .../tests_cpu/layers_tests/NDArrayTests2.cpp | 19 - .../tests_cpu/layers_tests/NativeOpsTests.cpp | 7 +- .../layers_tests/OmpLaunchHelperTests.cpp | 28 - libnd4j/tests_cpu/layers_tests/OpsArena.cpp | 200 -- .../tests_cpu/layers_tests/ParityOpsTests.cpp | 6 +- .../layers_tests/PerformanceTests.cpp | 7 +- .../layers_tests/PlaygroundTests.cpp | 191 +- libnd4j/tests_cpu/layers_tests/RNGTests.cpp | 1 - .../tests_cpu/layers_tests/ReduceTests.cpp | 6 +- .../tests_cpu/layers_tests/ShapeTests2.cpp | 1 - libnd4j/tests_cpu/layers_tests/TadTests.cpp | 7 - .../tests_cpu/layers_tests/ThreadsTests.cpp | 233 ++ .../tests_cpu/layers_tests/WorkspaceTests.cpp | 2 - .../tests_cpu/libnd4j_tests/CMakeLists.txt | 6 +- libnd4j/tests_cpu/run_tests.sh | 25 +- .../api/ops/impl/reduce3/EqualsWithEps.java | 2 +- .../java/org/nd4j/nativeblas/Nd4jCuda.java | 2 +- .../cpu/nativecpu/CpuMemoryManager.java | 2 +- .../java/org/nd4j/nativeblas/Nd4jCpu.java | 66 +- 277 files changed, 8610 insertions(+), 11878 deletions(-) create mode 100644 libnd4j/include/execution/BlockingQueue.h create mode 100644 libnd4j/include/execution/CallableInterface.h create mode 100644 libnd4j/include/execution/CallableWithArguments.h create mode 100644 
libnd4j/include/execution/ThreadPool.h create mode 100644 libnd4j/include/execution/Threads.h create mode 100644 libnd4j/include/execution/Ticket.h create mode 100644 libnd4j/include/execution/impl/BlockingQueue.cpp create mode 100644 libnd4j/include/execution/impl/CallableInterface.cpp create mode 100644 libnd4j/include/execution/impl/CallableWithArguments.cpp create mode 100644 libnd4j/include/execution/impl/ThreadPool.cpp create mode 100644 libnd4j/include/execution/impl/Threads.cpp create mode 100644 libnd4j/include/execution/impl/Ticket.cpp delete mode 100644 libnd4j/include/loops/aggregates.h delete mode 100644 libnd4j/include/loops/cpu/pairwise2.hpp delete mode 100644 libnd4j/include/loops/cuda/aggregates.cu create mode 100644 libnd4j/include/msvc.h delete mode 100644 libnd4j/include/ops/aggregate_ops.h delete mode 100644 libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc delete mode 100644 libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc delete mode 100644 libnd4j/include/ops/special_accumulation_ops.h delete mode 100644 libnd4j/include/ops/special_ops.h delete mode 100644 libnd4j/tests_cpu/layers_tests/OpsArena.cpp create mode 100644 libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java index d5c8ee1f6..56421bc00 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/util/ConvolutionUtils.java @@ -35,6 +35,7 @@ import org.nd4j.linalg.api.ops.Op; import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastCopyOp; import org.nd4j.linalg.api.ops.impl.layers.convolution.MaxPooling2D; import org.nd4j.linalg.api.ops.impl.layers.convolution.config.Pooling2DConfig; +import org.nd4j.linalg.api.ops.impl.transforms.custom.Assign; import 
org.nd4j.linalg.api.shape.Shape; import org.nd4j.linalg.exception.ND4JArraySizeException; import org.nd4j.linalg.factory.NDArrayFactory; @@ -482,23 +483,12 @@ public class ConvolutionUtils { return reshape5dTo2d(format, mask, workspaceMgr, type); } else { //Need to broadcast first - IntArrayList broadcastDims = new IntArrayList(); - for(int i=0; i) endif() + #if(WIN32) + # message("CPU on Windows: enabling /EHsc") + # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14") + # SET_TARGET_PROPERTIES(${LIBND4J_NAME} PROPERTIES COMPILER_FLAGS "/EHsc /bigobj /std:c++14") + #endif() + # we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. same applies to BLAS + if (NOT BLAS_LIBRARIES) + set(BLAS_LIBRARIES "") + endif() target_link_libraries(${LIBND4J_NAME} ${MKLDNN} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES}) if ("${LIBND4J_ALL_OPS}" AND "${LIBND4J_BUILD_MINIFIER}") diff --git a/libnd4j/blas/Environment.cpp b/libnd4j/blas/Environment.cpp index 0c23f61be..90c391cf1 100644 --- a/libnd4j/blas/Environment.cpp +++ b/libnd4j/blas/Environment.cpp @@ -24,6 +24,8 @@ #include #include "Environment.h" #include +#include +#include #ifdef _OPENMP @@ -49,6 +51,7 @@ namespace nd4j { _precBoost.store(false); _leaks.store(false); _dataType.store(nd4j::DataType::FLOAT32); + _maxThreads = std::thread::hardware_concurrency(); #ifndef ANDROID const char* omp_threads = std::getenv("OMP_NUM_THREADS"); @@ -86,9 +89,7 @@ namespace nd4j { cudaSetDevice(0); delete[] devProperties; #else -#ifdef _OPENMP - omp_set_nested(1); -#endif + #endif } diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 10847f882..de2488f9d 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1678,7 +1679,6 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// size_t 
NDArray::sizeOfT() const { - return DataTypeUtils::sizeOfElement(_dataType); } diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 2a601033a..c4a631cf5 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -2478,7 +2478,6 @@ double NDArray::getTrace() const { double sum = 0.; -PRAGMA_OMP_PARALLEL_FOR_ARGS(reduction(OMP_SUMT:sum) OMP_IF(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) for(int i = 0; i < minDim; ++i) sum += e(i * offset); @@ -3275,7 +3274,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { // regular numeric types NDArray tmp(nd4j::DataType::FLOAT32, getContext()); // scalar = 0 - ExtraArguments extras({eps}); + ExtraArguments extras({0.0, 0.0, eps}); NDArray::prepareSpecialUse({&tmp}, {this, other}); NativeOpExecutioner::execReduce3Scalar(getContext(), reduce3::EqualsWithEps, getBuffer(), getShapeInfo(), @@ -3288,7 +3287,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { synchronize("NDArray::equalsTo"); - if (tmp.e(0) > 0) + if (tmp.e(0) != 0) return false; return true; diff --git a/libnd4j/blas/NativeOpExecutioner.h b/libnd4j/blas/NativeOpExecutioner.h index cae7a4e56..fb2ca58f0 100644 --- a/libnd4j/blas/NativeOpExecutioner.h +++ b/libnd4j/blas/NativeOpExecutioner.h @@ -24,10 +24,10 @@ #include #include -#include #include #include #include +#include /** * Native op executioner: @@ -624,10 +624,6 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *vrealArguments, int numRealArguments) { - auto arguments = reinterpret_cast(varguments); - auto realArguments = reinterpret_cast(vrealArguments); - - functions::aggregate::AggregatedFunction::exec(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); } diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index b2679f537..b10b3807a 100755 --- a/libnd4j/blas/NativeOps.h +++ 
b/libnd4j/blas/NativeOps.h @@ -55,7 +55,6 @@ #define ND4J_EXPORT #endif #include -#include /* int tad_threshold = 1; @@ -1430,7 +1429,11 @@ static const char* getNpyArrayNameFromMap(void *map, int index){ for(; it != end; ++it, ++cnt){ if (cnt == index){ // FIXME: @fariz, this is a leak! +#ifdef _MSC_VER + return const_cast(_strdup(it->first.c_str())); +#else return const_cast(strdup(it->first.c_str())); +#endif } } throw std::runtime_error("No array at index."); diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index 03c7c53e1..dc9d09231 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -98,24 +98,27 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target->getShapeInfo()); - std::vector coords(zRank); - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, target->getShapeInfo(), coords); + const auto zOffset = shape::getOffset(target->getShapeInfo(), coords); - shape::index2coords(i, target->getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(target->getShapeInfo(), coords.data()); + // if( (row + upper < col) || (row + lower > col) ) + if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) + z[zOffset] = value; + else if (this != target) { // when this and target are different arrays + if (xRank != zRank) + coords[0] = coords[1]; - // if( (row + upper < col) || (row + lower > col) ) - if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) - z[zOffset] = value; - else if(this != target) { // when this and target are different arrays - if(xRank != 
zRank) - coords[0] = coords[1]; - const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords.data()); - z[zOffset] = x[xOffset]; + const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords); + z[zOffset] = x[xOffset]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void NDArray::fillAsTriangular, (const float val, int lower, int upper, const char direction, NDArray* target), LIBND4J_TYPES); @@ -140,7 +143,7 @@ void NDArray::setIdentity() { minDim = shape[i]; float v = 1.0f; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(minDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) + for(int i = 0; i < minDim; ++i) templatedSet(buffer(), i*offset, this->dataType(), &v); } @@ -151,12 +154,15 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) { auto x = reinterpret_cast(xBuffer); auto y = reinterpret_cast(yBuffer); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(static)) - for (Nd4jLong i = 0; i < length; ++i) { - auto temp = x[i]; - x[i] = y[i]; - y[i] = temp; - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto temp = x[i]; + x[i] = y[i]; + y[i] = temp; + } + }; + + samediff::Threads::parallel_for(func, 0, length); } BUILD_SINGLE_TEMPLATE(template void templatedSwap, (void *xBuffer, void *yBuffer, Nd4jLong length), LIBND4J_TYPES); @@ -262,21 +268,26 @@ NDArray NDArray::tile(const std::vector& reps) const { auto xType = this->dataType(); if(result.ordering() == 'c') { // ews == 1 always here - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < resultLen; ++i) { - auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); - BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign, (result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto yOffset = 
shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); + BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES); + } + }; - } + samediff::Threads::parallel_for(func, 0, resultLen); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i=0; itemplate templatedAssign, (result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto xOffset = result.getOffset(i); + auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo()); + BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES); + } + }; + + samediff::Threads::parallel_for(func, 0, resultLen); } result.tickWriteHost(); return result; @@ -337,14 +348,7 @@ void NDArray::tile(NDArray& target) const { // looping through _buffer goes automatically by means of getSubArrayIndex applying const auto ews = target.ews(); const auto targetLen = target.lengthOf(); - if(target.ordering() == 'c' && ews == 1) { // ews == 1 always here - - for (Nd4jLong i = 0; i < targetLen; ++i) { - auto yOffset = shape::subArrayOffset(i, target.getShapeInfo(), getShapeInfo()); - BUILD_DOUBLE_SELECTOR(target.dataType(), dataType(), templatedDoubleAssign, (target.getBuffer(), i, getBuffer(), yOffset), LIBND4J_TYPES, LIBND4J_TYPES); - } - } - else if(target.ordering() == 'c' && ews > 1) { + if(target.ordering() == 'c' && ews >= 1) { for(Nd4jLong i=0; i coords(rank); - // loop through input array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = 
shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); - - if(repSize > 1) { - for (uint j = 0; j < repSize; ++j) { - coords[axis] -= repeats[j]; - if (coords[axis] < 0) { - coords[axis] = j; - break; + if (repSize > 1) { + for (uint j = 0; j < repSize; ++j) { + coords[axis] -= repeats[j]; + if (coords[axis] < 0) { + coords[axis] = j; + break; + } } - } - } - else - coords[axis] /= repeats[0]; + } else + coords[axis] /= repeats[0]; - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - } + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + } + }; + + samediff::Threads::parallel_for(func, 0, zLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/blas/cpu/NDArrayLambda.hpp b/libnd4j/blas/cpu/NDArrayLambda.hpp index ecf2aa9ed..6ce8e6823 100644 --- a/libnd4j/blas/cpu/NDArrayLambda.hpp +++ b/libnd4j/blas/cpu/NDArrayLambda.hpp @@ -32,33 +32,40 @@ void NDArray::applyTriplewiseLambda(NDArray* second, NDArray *third, const std:: if (this->ordering() == second->ordering() && this->ordering() == third->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == second->ews() && this->ews() == third->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(f[e], s[e], t[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e], s[e], t[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto tOffset = this->getOffset(e); + auto uOffset = second->getOffset(e); + auto vOffset = third->getOffset(e); - auto tOffset = this->getOffset(e); - auto uOffset = second->getOffset(e); - auto vOffset = 
third->getOffset(e); + f[tOffset] = func(f[tOffset], s[uOffset], t[vOffset]); + } + }; - f[tOffset] = func(f[tOffset], s[uOffset], t[vOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto tOffset = this->getOffset(e); + auto uOffset = second->getOffset(e); + auto vOffset = third->getOffset(e); + auto zOffset = target->getOffset(e); - auto tOffset = this->getOffset(e); - auto uOffset = second->getOffset(e); - auto vOffset = third->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[tOffset], s[uOffset], t[vOffset]); + } + }; - z[zOffset] = func(f[tOffset], s[uOffset], t[vOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -103,31 +110,38 @@ void NDArray::applyPairwiseLambda(const NDArray* other, const std::functionordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(f[e], s[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e], s[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); + f[xOffset] = func(f[xOffset], s[yOffset]); + } + }; - f[xOffset] = func(f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto 
e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[xOffset], s[yOffset]); + } + }; - z[zOffset] = func(f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -161,29 +175,36 @@ void NDArray::applyLambda(const std::function& func, NDArray* target) { if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) - z[e] = func(f[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(f[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); - auto xOffset = this->getOffset(e); + f[xOffset] = func(f[xOffset]); + } + }; - f[xOffset] = func(f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(f[xOffset]); + } + }; - z[zOffset] = func(f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -217,29 +238,36 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr if (this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func(e, f[e]); + auto loop = 
PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func(e, f[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); - auto xOffset = this->getOffset(e); + f[xOffset] = func(e, f[xOffset]); + } + }; - f[xOffset] = func(e, f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func(e, f[xOffset]); + } + }; - z[zOffset] = func(e, f[xOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } @@ -282,31 +310,38 @@ void NDArray::applyIndexedPairwiseLambda(NDArray* other, const std::functionordering() == other->ordering() && this->ordering() == target->ordering() && (this->ews() == 1 && target->ews() == 1) && this->ews() == other->ews()) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < _length; e++) - z[e] = func((Nd4jLong) e, f[e], s[e]); + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + z[e] = func((Nd4jLong) e, f[e], s[e]); + }; + + samediff::Threads::parallel_for(loop, 0, _length); } else { if (f == z) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); + f[xOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); + } + }; - f[xOffset] = func((Nd4jLong) e, 
f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < _length; e++) { + auto loop = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto xOffset = this->getOffset(e); + auto yOffset = other->getOffset(e); + auto zOffset = target->getOffset(e); - auto xOffset = this->getOffset(e); - auto yOffset = other->getOffset(e); - auto zOffset = target->getOffset(e); + z[zOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); + } + }; - z[zOffset] = func((Nd4jLong) e, f[xOffset], s[yOffset]); - } + samediff::Threads::parallel_for(loop, 0, _length); } } } diff --git a/libnd4j/blas/cpu/NativeOpExecutioner.cpp b/libnd4j/blas/cpu/NativeOpExecutioner.cpp index 22fd9eca4..dc27c1cce 100644 --- a/libnd4j/blas/cpu/NativeOpExecutioner.cpp +++ b/libnd4j/blas/cpu/NativeOpExecutioner.cpp @@ -20,6 +20,8 @@ #include "NativeOpExecutioner.h" #include +#include + #include #include #include @@ -50,11 +52,14 @@ #include #include #include +#include +#include #ifdef _OPENMP #include +#include #endif @@ -78,9 +83,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, int op void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -111,9 +114,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -149,9 +150,7 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - 
omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -160,7 +159,16 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); #endif } @@ -179,9 +187,7 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); -#ifdef _OPENMP - omp_set_nested(1); -#endif + if (!nd4j::Environment::getInstance()->isExperimentalBuild()) if ((yType != xType && yType != nd4j::DataType::BOOL) || xType != zType) @@ -190,7 +196,15 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), 
LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); #endif } @@ -208,15 +222,21 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } void 
NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, @@ -231,9 +251,7 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -243,7 +261,15 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, if (yType != xType || nd4j::DataType::BOOL != zType) throw nd4j::datatype_exception::build("NativeOps::execInverseBroadcastBool both operands must have same data type", xType, yType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } @@ -260,9 +286,7 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -274,7 +298,15 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw 
nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = xLen / yLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, @@ -289,21 +321,27 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform", zType, xType, yType); + throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt", zType, xType, yType); if (!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); + throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, 
dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt,::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto xLen = shape::length(hXShapeInfo); + auto yLen = shape::length(hYShapeInfo); + auto numTads = yLen / xLen; + + samediff::Threads::parallel_tad(func, 0, numTads); } //////////////////////////////////////////////////////////////////////// @@ -328,9 +366,7 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -339,7 +375,15 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::pairwise_transforms::PairWiseTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::pairwise_transforms::PairWiseTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::pairwise_transforms::PairWiseTransform, + ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), + LIBND4J_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + #endif } @@ -353,9 +397,7 @@ void 
NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -367,7 +409,13 @@ void NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::pairwise_transforms::PairWiseBoolTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::pairwise_transforms::PairWiseBoolTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -380,9 +428,7 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); @@ -394,7 +440,13 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execSPairwiseInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::exec(opNum, hX, 
hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), INTEGER_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -417,14 +469,22 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -437,14 +497,22 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -457,14 +525,22 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -477,14 +553,22 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, LONG_TYPES); + // nothing to do here if result is empty + if (shape::isEmpty(hZShapeInfo)) + return; + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, LONG_TYPES); + }; + + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } //////////////////////////////////////////////////////////////////////// @@ -503,9 +587,7 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -521,9 +603,7 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); @@ -539,9 +619,7 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -557,9 +635,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, void *extraParams, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -591,10 +667,6 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, void *dY, Nd4jLong *dYShapeInfo, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -623,15 +695,13 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, void *dY, Nd4jLong *dYShapeInfo, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType 
= nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, nullptr, 1), LIBND4J_TYPES, FLOAT_TYPES); - + //BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, nullptr, 0), LIBND4J_TYPES, FLOAT_TYPES); + NativeOpExecutioner::execReduce3Scalar(lc, opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo); } //////////////////////////////////////////////////////////////////////// @@ -647,14 +717,31 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *xTadOnlyShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *yTadOnlyShapeInfo, Nd4jLong *yTadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength), LIBND4J_TYPES, FLOAT_TYPES); + const auto xLen = shape::length(hXShapeInfo); + const auto yLen = shape::length(hYShapeInfo); + + nd4j::TadPack tadPack; + + if(xLen == yLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + else if(yLen > xLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + } + else { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, 
extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } @@ -671,15 +758,19 @@ void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES); -// BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES); + auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + + // TODO: make it 2d + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } //////////////////////////////////////////////////////////////////////// @@ -696,15 +787,31 @@ void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = 
nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); -// BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + const auto xLen = shape::length(hXShapeInfo); + const auto yLen = shape::length(hYShapeInfo); + + nd4j::TadPack tadPack; + + if(xLen == yLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + else if(yLen > xLen) { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + } + else { + tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + } + + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_tad(func, 0, tadPack.numberOfTads()); } @@ -729,9 +836,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, void *hScalar, Nd4jLong *hScalarShapeInfo, void *dScalar, Nd4jLong *dScalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -743,7 +848,13 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, if (xType != yType || xType != zType) 
throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, allowParallelism), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform,::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + #endif } @@ -760,9 +871,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -774,7 +883,13 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, if (xType != yType || xType != zType) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, 
nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); + #endif } @@ -789,9 +904,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); @@ -803,7 +916,13 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -819,9 +938,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -833,7 +950,12 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, if (zType != nd4j::DataType::BOOL) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); } //////////////////////////////////////////////////////////////////////// @@ -847,9 +969,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); @@ -861,7 +981,13 @@ 
void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt", nd4j::DataType::INT32, zType); - BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), INTEGER_TYPES); + }; + + auto zLen = shape::length(hZShapeInfo); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxThreads()))); + } //////////////////////////////////////////////////////////////////////// @@ -877,9 +1003,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); @@ -891,7 +1015,12 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, if (!nd4j::DataTypeUtils::isZ(zType)) throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt requires integer data type", zType); - BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), INTEGER_TYPES); + auto func = PRAGMA_THREADS_FOR { + BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, 
tadShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); + }; + + auto yLen = shape::length(hScalarShapeInfo); + samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxThreads())); } //////////////////////////////////////////////////////////////////////// @@ -912,9 +1041,7 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -940,9 +1067,7 @@ void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -972,10 +1097,6 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1002,14 +1123,14 @@ void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformFloat, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES, FLOAT_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, 
functions::transform::TransformFloat, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, FLOAT_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1021,14 +1142,14 @@ void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformBool, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES, BOOL_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformBool, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, BOOL_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1040,14 +1161,14 @@ void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformAny, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets, allowParallelism), LIBND4J_TYPES, 
LIBND4J_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformAny, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, LIBND4J_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1059,14 +1180,14 @@ void NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformSame, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), LIBND4J_TYPES); + auto func = PRAGMA_THREADS_DO { + BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformSame, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1078,14 +1199,14 @@ void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { -#ifdef _OPENMP - omp_set_nested(1); -#endif - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformStrict, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, tadShapeInfo, tadOffsets), FLOAT_TYPES); + 
auto func = PRAGMA_THREADS_DO { + BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformStrict, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), FLOAT_TYPES); + }; + + samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxThreads()))); } //////////////////////////////////////////////////////////////////////// @@ -1095,9 +1216,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1116,9 +1235,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); @@ -1139,9 +1256,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo, void *extraArguments) { -#ifdef _OPENMP - omp_set_nested(1); -#endif + auto xType = nd4j::ArrayOptions::dataType(hZShapeInfo); diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index 7449bb022..151f5c883 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -36,6 +35,7 @@ #include #include #include +#include #include @@ -75,6 +75,7 @@ bool experimentalSupport = false; #include #include #include +#include #ifdef CPU_FEATURES #include @@ -1152,10 +1153,7 @@ void initializeFunctions(Nd4jPointer *functions) { * @param flags optional parameter */ Nd4jPointer mallocHost(Nd4jLong memorySize, int flags) { - Nd4jPointer pointer = (Nd4jPointer) malloc(memorySize); - if (pointer == 0) - return 0L; - 
return pointer; + return reinterpret_cast(new int8_t[memorySize]); } /** @@ -1179,7 +1177,7 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) { * @param pointer pointer that'll be freed */ int freeHost(Nd4jPointer pointer) { - free(reinterpret_cast(pointer)); + delete[] reinterpret_cast(pointer); return 1L; } @@ -1364,37 +1362,37 @@ void pullRowsGeneric(void *vx, int elementsPerThread = n / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (int idx = 0; idx < n; idx++) { - auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; - auto zTadOffsetForBlock = zTadOffsets[idx]; + auto func = PRAGMA_THREADS_FOR { + for (auto idx = start; idx < stop; idx += increment) { + auto xTadOffsetForBlock = tadOffsets[indexes[idx]]; + auto zTadOffsetForBlock = zTadOffsets[idx]; - auto rX = hX + xTadOffsetForBlock; - auto rZ = hZ + zTadOffsetForBlock; + auto rX = hX + xTadOffsetForBlock; + auto rZ = hZ + zTadOffsetForBlock; - if (xEWS == 1 && zEWS == 1) { - - PRAGMA_OMP_SIMD - for (int i = 0; i < tadLength; i++ ) { - rZ[i] = rX[i]; - } - } else if (xEWS >= 1 && zEWS >= 1) { - - PRAGMA_OMP_SIMD - for (int i = 0; i < tadLength; i++ ) { - rZ[i * zEWS] = rX[i * xEWS]; + if (xEWS == 1 && zEWS == 1) { + PRAGMA_OMP_SIMD + for (int i = 0; i < tadLength; i++) { + rZ[i] = rX[i]; + } + } else if (xEWS >= 1 && zEWS >= 1) { + PRAGMA_OMP_SIMD + for (int i = 0; i < tadLength; i++) { + rZ[i * zEWS] = rX[i * xEWS]; + } + } else { + for (int i = 0; i < tadLength; i++) { + auto xOffset = xTadOffsetForBlock + shape::getIndexOffset(i, tadShapeInfo); + auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo); + hZ[zOffset] = hX[xOffset]; + } } } - else { - for (int i = 0; i < tadLength; i++) { - auto xOffset = xTadOffsetForBlock + 
shape::getIndexOffset(i, tadShapeInfo); - auto zOffset = zTadOffsetForBlock + shape::getIndexOffset(i, zTadShapeInfo); - hZ[zOffset] = hX[xOffset]; - } - } - } + }; + + samediff::Threads::parallel_tad(func, 0, n, 1, _threads); } void pullRows(Nd4jPointer *extraPointers, @@ -1433,30 +1431,29 @@ void tearGeneric(void *vx, auto zEWS = shape::elementWiseStride(hZShapeInfo); auto numTads = shape::length(hXShapeInfo) / tadLength; - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < numTads; i++) { - auto hZ = reinterpret_cast(targets[i]); - auto s = hX + tadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto hZ = reinterpret_cast(targets[i]); + auto s = hX + tadOffsets[i]; - if (zEWS == 1 && tadEWS == 1) { - - PRAGMA_OMP_SIMD - for (Nd4jLong j = 0; j < tadLength; j++) { - hZ[j] = s[j]; - } - } else if (zEWS > 0 && tadEWS > 0) { - - PRAGMA_OMP_SIMD - for (Nd4jLong j = 0; j < tadLength; j++) { - hZ[j * zEWS] = s[j * tadEWS]; + if (zEWS == 1 && tadEWS == 1) { + PRAGMA_OMP_SIMD + for (Nd4jLong j = 0; j < tadLength; j++) { + hZ[j] = s[j]; + } + } else if (zEWS > 0 && tadEWS > 0) { + PRAGMA_OMP_SIMD + for (Nd4jLong j = 0; j < tadLength; j++) { + hZ[j * zEWS] = s[j * tadEWS]; + } + } else { + for (Nd4jLong j = 0; j < tadLength; j++) + hZ[shape::getIndexOffset(j, hZShapeInfo)] = s[shape::getIndexOffset(j, tadShapeInfo)]; } } - else { + }; - for (Nd4jLong j = 0; j < tadLength; j++) - hZ[shape::getIndexOffset(j, hZShapeInfo)] = s[shape::getIndexOffset(j, tadShapeInfo)]; - } - } + samediff::Threads::parallel_tad(func,0, numTads); } void tear(Nd4jPointer *extraPointers, @@ -1557,57 +1554,60 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS auto dX = reinterpret_cast(hX); auto dZ = reinterpret_cast(dz); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(N) - for (int f = 0; f < N; f++) { - auto hX = reinterpret_cast(dX[f]); - //auto hZ = reinterpret_cast(dZ[f]); + auto func = PRAGMA_THREADS_FOR { + for (auto f 
= start; f < stop; f += increment) { + auto hX = reinterpret_cast(dX[f]); + //auto hZ = reinterpret_cast(dZ[f]); - auto xShapeInfo = hXShapeInfo[f]; - auto tadOffset = reinterpret_cast(tadOffsets[f]); + auto xShapeInfo = hXShapeInfo[f]; + auto tadOffset = reinterpret_cast(tadOffsets[f]); - const auto tadLength = shape::length(tadOnlyShapeInfo[f]); - auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo[f]); - auto tadRank = shape::rank(tadOnlyShapeInfo[f]); - auto numTads = shape::length(hXShapeInfo[f]) / tadLength; + const auto tadLength = shape::length(tadOnlyShapeInfo[f]); + auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo[f]); + auto tadRank = shape::rank(tadOnlyShapeInfo[f]); + auto numTads = shape::length(hXShapeInfo[f]) / tadLength; - auto tadShape = shape::shapeOf(tadOnlyShapeInfo[f]); - auto tadStride = shape::stride(tadOnlyShapeInfo[f]); + auto tadShape = shape::shapeOf(tadOnlyShapeInfo[f]); + auto tadStride = shape::stride(tadOnlyShapeInfo[f]); - if (shape::rank(xShapeInfo) == 1) { - auto xLength = shape::length(xShapeInfo); - auto ews = shape::elementWiseStride(xShapeInfo); - for (Nd4jLong r = 0; r < xLength; r++) { - auto swapIdx = shuffleMap[r]; - if (swapIdx < 0) - continue; + if (shape::rank(xShapeInfo) == 1) { + auto xLength = shape::length(xShapeInfo); + auto ews = shape::elementWiseStride(xShapeInfo); + for (Nd4jLong r = 0; r < xLength; r++) { + auto swapIdx = shuffleMap[r]; + if (swapIdx < 0) + continue; - nd4j::math::nd4j_swap(hX[r*ews], hX[swapIdx*ews]); - } - } else { - for (Nd4jLong r = 0; r < numTads; r++) { - if (shuffleMap[r] < 0) - continue; + nd4j::math::nd4j_swap(hX[r * ews], hX[swapIdx * ews]); + } + } else { + for (Nd4jLong r = 0; r < numTads; r++) { + if (shuffleMap[r] < 0) + continue; - auto oldOffset = tadOffset[r]; - auto newOffset = tadOffset[shuffleMap[r]]; + auto oldOffset = tadOffset[r]; + auto newOffset = tadOffset[shuffleMap[r]]; - auto rX = hX + oldOffset; - auto rY = hX + newOffset; + auto rX = hX + oldOffset; + 
auto rY = hX + newOffset; - if (tadEWS == 1) { - for (Nd4jLong i = 0; i < tadLength; i++) { - nd4j::math::nd4j_swap(rX[i], rY[i]); - } - } else { - for (Nd4jLong i = 0; i < tadLength; i++) { - auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); - nd4j::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); + if (tadEWS == 1) { + for (Nd4jLong i = 0; i < tadLength; i++) { + nd4j::math::nd4j_swap(rX[i], rY[i]); + } + } else { + for (Nd4jLong i = 0; i < tadLength; i++) { + auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); + nd4j::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); + } } } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, N); } void shuffle(Nd4jPointer *extras, @@ -1772,72 +1772,9 @@ void execAggregate(Nd4jPointer *extraPointers,int opNum, void *realArguments, int numRealArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, NativeOpExecutioner::execAggregate, (nullptr, opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } -template -void _batchExecutor(Nd4jPointer *extraPointers, - int numAggregates, - int opNum, - int maxArgs, - int maxShapes, - int maxIntArrays, - int maxIntArraySize, - int maxIdx, - int maxReals, - void *ptrToArguments, - nd4j::DataType dtype) { - // probably, we don't want too much threads as usually - int _threads = nd4j::math::nd4j_min(numAggregates, omp_get_max_threads()); - - nd4j::PointersHelper helper(ptrToArguments, - numAggregates, - maxArgs, - maxShapes, - maxIntArrays, - maxIntArraySize, - maxIdx, - maxReals); - - // special case here, we prefer spread arrangement here, all threads are detached from each other - 
PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (int i = 0; i < numAggregates; i++) { - auto intArrays = new int *[maxIntArrays]; - - auto arguments = helper.getArguments(i); - auto shapes = helper.getShapeArguments(i); - auto idxArg = helper.getIndexArguments(i); - auto realArg = helper.getRealArguments(i); - - for (int e = 0; e < maxIntArrays; e++) { - intArrays[e] = helper.getIntArrayArguments(i, e); - } - - execAggregate(extraPointers, - opNum, - reinterpret_cast(arguments), - helper.getNumArguments(i), - shapes, - helper.getNumShapeArguments(i), - idxArg, - helper.getNumIndexArguments(i), - intArrays, - helper.getNumIntArrayArguments(i), - realArg, - helper.getNumRealArguments(i), - dtype); - - delete [] intArrays; - } -} -BUILD_SINGLE_TEMPLATE(template void _batchExecutor, (Nd4jPointer *extraPointers, int numAggregates, int opNum, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments, nd4j::DataType dtype), FLOAT_TYPES); - void batchExecutor(Nd4jPointer *extraPointers, int numAggregates, int opNum, @@ -1849,12 +1786,7 @@ void batchExecutor(Nd4jPointer *extraPointers, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, _batchExecutor, (extraPointers, numAggregates, opNum, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments, dtype), FLOAT_TYPES); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } + } void execAggregateBatch(Nd4jPointer *extraPointers, @@ -1868,12 +1800,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - BUILD_SINGLE_SELECTOR(dtype, _batchExecutor, (extraPointers, numAggregates, opNum, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments, dtype), FLOAT_TYPES); 
- } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } + } @@ -2094,27 +2021,21 @@ const char* getAllCustomOps() { template FORCEINLINE int estimateThresholdGeneric(Nd4jPointer *extraPointers, Nd4jPointer hX, int N, T threshold) { auto buffer = reinterpret_cast(hX); - int span = (N / 6) + 8; - int cnt = 0; - - PRAGMA_OMP_PARALLEL_REDUCTION(+:cnt) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int stop = span * (tid + 1); - if (stop > N) - stop = N; + auto func = PRAGMA_REDUCE_LONG { + int64_t cnt = 0; PRAGMA_OMP_SIMD - for (int e = start; e < stop; e++) { + for (auto e = start; e < stop; e++) { auto v = nd4j::math::nd4j_abs(buffer[e]); if (v >= threshold) cnt++; } - } - return cnt; + return cnt; + }; + + return samediff::Threads::parallel_long(func, LAMBDA_AL { return _old + _new; }, 0, N); } @@ -2776,58 +2697,51 @@ static void _scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSub void* vIindexes, Nd4jLong* hIndicesShapeInfo, void* dIindexes, Nd4jLong* dIndicesShapeInfo) { auto hIindexes = reinterpret_cast(vIindexes); - - int numThreads = omp_get_max_threads(); - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { - for (int i = 0; i < numOfSubArrs; ++i) { - - int threadIndex = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + for (int i = 0; i < numOfSubArrs; ++i) { + int threadIndex = thread_id; const auto xIndex = hIindexes[i]; const bool isOwner = xIndex < numThreads ? 
threadIndex == xIndex : threadIndex == xIndex % numThreads; if (!isOwner) continue; - NDArray inSubArr( - reinterpret_cast(hX) + (hXOffsets[hIindexes[i]] * DataTypeUtils::sizeOf(hXShapeInfo)), - hXShapeInfo); - NDArray updSubArr(reinterpret_cast(hY) + (hYOffsets[i] * DataTypeUtils::sizeOf(hXShapeInfo)), - hYShapeInfo); + NDArray inSubArr(reinterpret_cast(hX) + (hXOffsets[hIindexes[i]] * DataTypeUtils::sizeOf(hXShapeInfo)), hXShapeInfo); + NDArray updSubArr(reinterpret_cast(hY) + (hYOffsets[i] * DataTypeUtils::sizeOf(hXShapeInfo)), hYShapeInfo); if (inSubArr.lengthOf() != updSubArr.lengthOf()) { continue; } - switch (opCode) { - case 0: - inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); - break; - case 1: - inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); - break; - case 2: - inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); - break; - case 3: - inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); - break; - case 4: - inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); - break; - case 5: - inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); - break; - case 6: - inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); - break; - default: - continue; + switch (opCode) { + case 0: + inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); + break; + case 1: + inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); + break; + case 2: + inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); + break; + case 3: + inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); + break; + case 4: + inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); + break; + case 5: + 
inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); + break; + case 6: + inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); + break; + default: + continue; + } } - } - } + }; + samediff::Threads::parallel_do(func); } //////////////////////////////////////////////////////////////////////// @@ -2847,6 +2761,7 @@ void scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSubArrs, } } + void inspectArray(Nd4jPointer *extraPointers, Nd4jPointer buffer, Nd4jLong *shapeInfo, Nd4jPointer specialBuffer, Nd4jLong *specialShapeInfo, Nd4jPointer debugInfo) { try { auto p = reinterpret_cast(debugInfo); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 2db1aa128..2af0e3783 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,8 @@ #include #include #include -// FIXME: we need cuda-specific implementations #include +#include #include #include #include @@ -1723,11 +1724,7 @@ void execScalarTad(Nd4jPointer *extraPointers, #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, - ::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, - dZShapeInfo, dScalars, extraParams, dimension, - dimensionLength, tadShapeInfo, tadOffsets, - tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::executeCudaAlongDimension(launchDims, stream, opNum, dX, dXShapeInfo, dZ, dZShapeInfo, dScalars, extraParams, dimension, 
dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES); #endif DEBUG_KERNEL(stream, opNum); @@ -1750,23 +1747,7 @@ void execAggregate(Nd4jPointer *extraPointers, void *realArguments, int numRealArguments, nd4j::DataType dtype) { - try { - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - int numBlocks = getDeviceId(extraPointers[2]); - int numThreads = getDeviceId(extraPointers[3]); - int shmem = getDeviceId(extraPointers[4]); - dim3 launchDims = dim3(numBlocks, numThreads, shmem); - - BUILD_SINGLE_SELECTOR(dtype, functions::aggregate::AggregatedFunction, - ::aggregateKernelGeneric(launchDims, stream, opNum, arguments, numArguments, shapes, - numShapes, indexArguments, numIndexArguments, intArrays, - numIntArrays, realArguments, numRealArguments), FLOAT_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "execAggregateFloat(...) failed"); - } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } void batchExecutor(Nd4jPointer *extraPointers, @@ -1788,25 +1769,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments, nd4j::DataType dtype) { - try { - // not implemented yet - cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - int numBlocks = getDeviceId(extraPointers[2]); - int numThreads = getDeviceId(extraPointers[3]); - int shmem = getDeviceId(extraPointers[4]); - dim3 launchDims = dim3(numAggregates, numThreads, shmem); - - BUILD_SINGLE_SELECTOR(dtype, functions::aggregate::AggregatedFunction, - ::aggregateBatchKernelGeneric(launchDims, stream, opNum, numAggregates, maxArgs, - maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, - ptrToArguments), FLOAT_TYPES); - - DEBUG_KERNEL(stream, opNum); - } catch (std::exception &e) { - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); - } } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/buildnativeoperations.sh b/libnd4j/buildnativeoperations.sh index 599c4f250..56e225a5d 100755 --- a/libnd4j/buildnativeoperations.sh +++ b/libnd4j/buildnativeoperations.sh @@ -53,6 +53,7 @@ CLEAN="false" MINIFIER="false" TESTS="false" VERBOSE="false" +VERBOSE_ARG="VERBOSE=1" HELPER= NAME= while [[ $# > 0 ]] @@ -291,38 +292,37 @@ case "$OS" in macosx*) # Do something under Mac OS X platform - if [ "$CHIP" == "cuda" ]; then + #if [ "$CHIP" == "cuda" ]; then export CC=clang export CXX=clang++ - PARALLEL="false" - else - export CC="$(ls -1 /usr/local/bin/gcc-? | head -n 1)" - export CXX="$(ls -1 /usr/local/bin/g++-? | head -n 1)" PARALLEL="true" - fi + #else + # export CC="$(ls -1 /usr/local/bin/gcc-? | head -n 1)" + # export CXX="$(ls -1 /usr/local/bin/g++-? | head -n 1)" + # PARALLEL="true" + #fi export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_MACOSX_RPATH=ON -DAPPLE_BUILD=true" ;; windows*) - # Do something under Windows NT platform - if [ "$CHIP" == "cuda" ]; then + # Do something under Windows NT platform + if [ "$CHIP" == "cuda" ]; then export CMAKE_COMMAND="cmake -G \"Ninja\"" export MAKE_COMMAND="ninja" export CC="cl.exe" export CXX="cl.exe" PARALLEL="true" - else + VERBOSE_ARG="-v" + else export CMAKE_COMMAND="cmake -G \"MSYS Makefiles\"" export MAKE_COMMAND="make" - - # Sam, do we really need this? 
export CC=/mingw64/bin/gcc export CXX=/mingw64/bin/g++ PARALLEL="true" + fi - fi - # Try some defaults for Visual Studio 2013 if user has not run vcvarsall.bat or something - if [ -z "${VCINSTALLDIR:-}" ]; then + # Try some defaults for Visual Studio 2013 if user has not run vcvarsall.bat or something + if [ -z "${VCINSTALLDIR:-}" ]; then export VisualStudioVersion=12.0 export VSINSTALLDIR="C:\\Program Files (x86)\\Microsoft Visual Studio $VisualStudioVersion" export VCINSTALLDIR="$VSINSTALLDIR\\VC" @@ -332,10 +332,10 @@ case "$OS" in export LIB="$VCINSTALLDIR\\LIB\\amd64;$WindowsSdkDir\\lib\\winv6.3\\um\\x64" export LIBPATH="$VCINSTALLDIR\\LIB\\amd64;$WindowsSdkDir\\References\\CommonConfiguration\\Neutral" export PATH="$PATH:$VCINSTALLDIR\\BIN\\amd64:$WindowsSdkDir\\bin\\x64:$WindowsSdkDir\\bin\\x86" - fi - # Make sure we are using 64-bit MinGW-w64 - export PATH=/mingw64/bin/:$PATH - # export GENERATOR="MSYS Makefiles" + fi + # Make sure we are using 64-bit MinGW-w64 + export PATH=/mingw64/bin/:/mingw64/lib:$PATH + # export GENERATOR="MSYS Makefiles" ;; esac @@ -534,6 +534,6 @@ if [ "$PARALLEL" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ" fi if [ "$VERBOSE" == "true" ]; then - MAKE_ARGUMENTS="$MAKE_ARGUMENTS VERBOSE=1" + MAKE_ARGUMENTS="$MAKE_ARGUMENTS $VERBOSE_ARG" fi eval $MAKE_COMMAND $MAKE_ARGUMENTS && cd ../../.. diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h index 677401954..3af77ca39 100644 --- a/libnd4j/include/array/DataTypeConversions.h +++ b/libnd4j/include/array/DataTypeConversions.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace nd4j { template @@ -50,9 +51,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? 
static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; @@ -105,9 +109,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; @@ -130,9 +137,12 @@ namespace nd4j { #else - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; } @@ -153,9 +163,12 @@ namespace nd4j { else TypeCast::convertGeneric(nullptr, tmp, length, buffer); #else - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < length; e++) - buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + buffer[e] = canKeep ? 
static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e])); + }; + + samediff::Threads::parallel_for(func, 0, length); #endif delete[] tmp; } diff --git a/libnd4j/include/buffer.h b/libnd4j/include/buffer.h index e2aa70046..79197753d 100755 --- a/libnd4j/include/buffer.h +++ b/libnd4j/include/buffer.h @@ -26,6 +26,7 @@ #ifdef __CUDACC__ #include #include +#include #endif #include diff --git a/libnd4j/include/cnpy/cnpy.h b/libnd4j/include/cnpy/cnpy.h index ac7fef863..06ff3336d 100644 --- a/libnd4j/include/cnpy/cnpy.h +++ b/libnd4j/include/cnpy/cnpy.h @@ -97,10 +97,10 @@ namespace cnpy { * @param t * @return */ - char mapType(const std::type_info &t); + ND4J_EXPORT char mapType(const std::type_info &t); template - char mapType(); + ND4J_EXPORT char mapType(); /** * @@ -111,7 +111,7 @@ namespace cnpy { * @return */ template - std::vector createNpyHeader(const void *data, + ND4J_EXPORT std::vector createNpyHeader(const void *data, const unsigned int *shape, const unsigned int ndims, unsigned int wordSize = 4); @@ -126,7 +126,7 @@ namespace cnpy { * @param ndims * @param fortranOrder */ - void parseNpyHeader(FILE *fp, + ND4J_EXPORT void parseNpyHeader(FILE *fp, unsigned int &wordSize, unsigned int *&shape, unsigned int &ndims, @@ -143,7 +143,7 @@ namespace cnpy { * @param ndims * @param fortran_order */ - void parseNpyHeaderPointer( + ND4J_EXPORT void parseNpyHeaderPointer( const char *header, unsigned int& word_size, unsigned int*& shape, @@ -156,7 +156,7 @@ namespace cnpy { * @param global_header_size * @param global_header_offset */ - void parseZipFooter(FILE *fp, + ND4J_EXPORT void parseZipFooter(FILE *fp, unsigned short &nrecs, unsigned int &global_header_size, unsigned int &global_header_offset); @@ -167,14 +167,14 @@ namespace cnpy { * @param varname * @return */ - NpyArray npzLoad(std::string fname, std::string varname); + ND4J_EXPORT NpyArray npzLoad(std::string fname, std::string varname); /** * * @param fname * @return */ - NpyArray 
npyLoad(std::string fname); + ND4J_EXPORT NpyArray npyLoad(std::string fname); /** * Parse the numpy header from @@ -187,7 +187,7 @@ namespace cnpy { * @param ndims * @param fortranOrder */ - void parseNpyHeaderStr(std::string header, + ND4J_EXPORT void parseNpyHeaderStr(std::string header, unsigned int &wordSize, unsigned int *&shape, unsigned int &ndims, @@ -199,14 +199,14 @@ namespace cnpy { * @param fp * @return */ - int * shapeFromFile(FILE *fp); + ND4J_EXPORT int* shapeFromFile(FILE *fp); /** * * @param data * @return */ - int * shapeFromPointer(char *data); + ND4J_EXPORT int* shapeFromPointer(char *data); /** * Load the numpy array from the given file. @@ -250,7 +250,7 @@ namespace cnpy { * @param ndims * @param fortran_order */ - void parseNpyHeader(std::string header, + ND4J_EXPORT void parseNpyHeader(std::string header, unsigned int &word_size, unsigned int *&shape, unsigned int &ndims, @@ -273,7 +273,7 @@ namespace cnpy { template - void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w"); + ND4J_EXPORT void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w"); } @@ -284,8 +284,8 @@ namespace cnpy { * @param rhs * @return */ -template -std::vector& operator+=(std::vector& lhs, const T rhs); + template + ND4J_EXPORT std::vector& operator+=(std::vector& lhs, const T rhs); #endif diff --git a/libnd4j/include/dll.h b/libnd4j/include/dll.h index 4b5a71eec..91d5a7677 100644 --- a/libnd4j/include/dll.h +++ b/libnd4j/include/dll.h @@ -20,6 +20,9 @@ #ifndef NATIVEOPERATIONS_DLL_H #define NATIVEOPERATIONS_DLL_H + +#include + #ifdef _WIN32 //#include # define ND4J_EXPORT __declspec(dllexport) diff --git a/libnd4j/include/execution/BlockingQueue.h b/libnd4j/include/execution/BlockingQueue.h new file mode 100644 index 000000000..a78196dfc --- /dev/null +++ b/libnd4j/include/execution/BlockingQueue.h @@ -0,0 +1,52 @@ 
+/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_BLOCKINGQUEUE_H +#define SAMEDIFF_BLOCKINGQUEUE_H + +#include +#include +#include +#include +#include + +namespace samediff { + template + class BlockingQueue { + private: + std::queue _queue; + std::mutex _lock; + std::atomic _size; + std::atomic _available; + + std::condition_variable _condition; + public: + BlockingQueue(int queueSize); + ~BlockingQueue() = default; + T poll(); + void put(const T &t); + + bool available(); + void markAvailable(); + void markUnavailable(); + }; +} + +#endif //DEV_TESTS_BLOCKINGQUEUE_H diff --git a/libnd4j/include/execution/CallableInterface.h b/libnd4j/include/execution/CallableInterface.h new file mode 100644 index 000000000..7e5502af1 --- /dev/null +++ b/libnd4j/include/execution/CallableInterface.h @@ -0,0 +1,94 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_CALLABLEINTERFACE_H +#define SAMEDIFF_CALLABLEINTERFACE_H + +#include +#include +#include +#include +#include +#include +#include + +namespace samediff { + /** + * This class is suited for passing functions to execution threads without queues + */ + class CallableInterface { + private: + // parallel_for functions + FUNC_1D _function_1d; + FUNC_2D _function_2d; + FUNC_3D _function_3d; + + // parallel function + FUNC_DO _function_do; + + // reduction functions + FUNC_RL _function_rl; + FUNC_RD _function_rd; + + std::array _arguments; + + volatile int _branch = 0; + volatile uint32_t _thread_id = 0; + volatile uint32_t _num_threads = 0; + + std::atomic _finished; + std::atomic _filled; + std::atomic _available; + + std::condition_variable _starter; + std::condition_variable _finisher; + + int64_t* _lptr = nullptr; + double* _dptr = nullptr; + + std::mutex _ms; + std::mutex _mf; + public: + CallableInterface(); + ~CallableInterface() = default; + + void waitForTask(); + void waitForCompletion(); + + void fill(int thread_id, int num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void fill(int thread_id, int num_threads, double *dpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x); + + void fill(int thread_id, int num_threads, FUNC_DO func); + void fill(int thread_id, int num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void fill(int thread_id, int num_threads, FUNC_2D func, 
int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + void fill(int thread_id, int num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + + bool available(); + void markAvailable(); + void markUnavailable(); + + void finish(); + + void execute(); + }; +} + + +#endif //DEV_TESTS_CALLABLEINTERFACE_H diff --git a/libnd4j/include/execution/CallableWithArguments.h b/libnd4j/include/execution/CallableWithArguments.h new file mode 100644 index 000000000..ebf1f0019 --- /dev/null +++ b/libnd4j/include/execution/CallableWithArguments.h @@ -0,0 +1,92 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef DEV_TESTS_CALLABLEWITHARGUMENTS_H +#define DEV_TESTS_CALLABLEWITHARGUMENTS_H + +#include +#include +#include +#include +#include + +namespace samediff { + class CallableWithArguments { + FUNC_DO _function_do; + FUNC_1D _function_1d; + FUNC_2D _function_2d; + FUNC_3D _function_3d; + + std::vector _arguments; + + std::atomic _finished; + + std::condition_variable _condition; + + std::mutex _lock; + + int _dimensions = 0; + + uint64_t _threadId; + uint64_t _numThreads; + public: + CallableWithArguments(FUNC_DO func, uint64_t thread_id, uint64_t numThreads); + CallableWithArguments(FUNC_1D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x); + CallableWithArguments(FUNC_2D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y); + CallableWithArguments(FUNC_3D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y, int64_t start_z, int64_t stop_z, int64_t increment_z); + + + /** + * This method returns number of dimensions + * @return + */ + int dimensions(); + + /** + * This method checks if this callable is finished + * @return + */ + bool finished(); + + /** + * this method marks this Callable as finished + */ + void finish(); + + /** + * This method blocks until callable is finished + */ + void waitUntilFinished(); + + std::vector& arguments(); + FUNC_DO function_do(); + FUNC_1D function_1d(); + FUNC_2D function_2d(); + FUNC_3D function_3d(); + + + uint64_t threadId(); + + uint64_t numThreads(); + }; +} + + +#endif //DEV_TESTS_CALLABLEWITHARGUMENTS_H diff --git a/libnd4j/include/execution/ThreadPool.h b/libnd4j/include/execution/ThreadPool.h new file mode 100644 index 000000000..e17b4b540 --- /dev/null +++ 
b/libnd4j/include/execution/ThreadPool.h @@ -0,0 +1,71 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_THREADPOOL_H +#define SAMEDIFF_THREADPOOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace samediff { + class ThreadPool { + private: + static ThreadPool* _INSTANCE; + + std::vector _threads; + std::vector*> _queues; + std::vector _interfaces; + + std::mutex _lock; + std::atomic _available; + std::queue _tickets; + protected: + ThreadPool(); + ~ThreadPool(); + public: + static ThreadPool* getInstance(); + + /** + * This method returns list of pointers to threads ONLY if num_threads of threads were available upon request, returning empty list otherwise + * @param num_threads + * @return + */ + Ticket* tryAcquire(int num_threads); + + /** + * This method marks specified number of threads as released, and available for use + * @param num_threads + */ + void release(int num_threads = 1); + + void release(Ticket *ticket); + }; +} + + +#endif //DEV_TESTS_THREADPOOL_H diff --git a/libnd4j/include/execution/Threads.h b/libnd4j/include/execution/Threads.h new file mode 100644 index 000000000..683220b61 --- /dev/null +++ 
b/libnd4j/include/execution/Threads.h @@ -0,0 +1,160 @@ +/******************************************************************************* + * Copyright (c) 2019 Konduit + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// +#ifndef SAMEDIFF_THREADS_H +#define SAMEDIFF_THREADS_H + +#include +#include +#include +#include +#include + +namespace samediff { + class ThreadsHelper { + public: + static int numberOfThreads(int maxThreads, uint64_t numberOfElements); + static int numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y); + static int numberOfThreads3d(int maxThreads, uint64_t iters_x, uint64_t iters_y, uint64_t iters_z); + static int pickLoop2d(int numThreads, uint64_t iters_x, uint64_t iters_y); + static int pickLoop3d(int numThreads, uint64_t iters_x, uint64_t iters_y, uint64_t iters_z); + }; + + class Span { + private: + int64_t _startX, _stopX, _incX; + public: + Span(int64_t start_x, int64_t stop_x, int64_t inc_x); + ~Span() = default; + + int64_t startX() const; + int64_t stopX() const; + int64_t incX() const; + + static Span build(uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x); + }; + + class Span2 { + private: + int64_t _startX, _stopX, _incX; + int64_t _startY, _stopY, _incY; + public: + Span2(int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t 
stop_y, int64_t inc_y); + ~Span2() = default; + + int64_t startX() const; + int64_t startY() const; + + int64_t stopX() const; + int64_t stopY() const; + + int64_t incX() const; + int64_t incY() const; + + static Span2 build(int loop, uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + }; + + class Span3 { + private: + int64_t _startX, _stopX, _incX; + int64_t _startY, _stopY, _incY; + int64_t _startZ, _stopZ, _incZ; + public: + Span3(int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + ~Span3() = default; + + int64_t startX() const; + int64_t startY() const; + int64_t startZ() const; + + int64_t stopX() const; + int64_t stopY() const; + int64_t stopZ() const; + + int64_t incX() const; + int64_t incY() const; + int64_t incZ() const; + + static Span3 build(int loop, uint64_t thread_id, uint64_t num_threads, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z); + }; + + class Threads { + public: + /** + * This function executes 1 dimensional loop for a given number of threads + * PLEASE NOTE: this function can use smaller number of threads than requested. 
+ * + * @param function + * @param numThreads + * @param start + * @param stop + * @param increment + * @return + */ + static int parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static int parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + /** + * + * @param function + * @param numThreads + * @param start_x + * @param stop_x + * @param inc_x + * @param start_y + * @param stop_y + * @param inc_y + * @return + */ + static int parallel_for(FUNC_2D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads(), bool debug = false); + + /** + * + * @param function + * @param numThreads + * @param start_x + * @param stop_x + * @param inc_x + * @param start_y + * @param stop_y + * @param inc_y + * @param start_z + * @param stop_z + * @param inc_z + * @return + */ + static int parallel_for(FUNC_3D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + /** + * + * @param function + * @param numThreads + * @return + */ + static int parallel_do(FUNC_DO function, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + + static double parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + }; +} + + +#endif //SAMEDIFF_THREADS_H diff --git 
a/libnd4j/include/execution/Ticket.h b/libnd4j/include/execution/Ticket.h new file mode 100644 index 000000000..e4152b66a --- /dev/null +++ b/libnd4j/include/execution/Ticket.h @@ -0,0 +1,67 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_TICKET_H +#define SAMEDIFF_TICKET_H + +#include +#include +#include +#include +#include +#include + +namespace samediff { + class Ticket { + private: + bool _acquired = false; + std::vector*> _queues; + std::vector _callables; + std::vector _interfaces; + + uint32_t _acquiredThreads = 0; + public: + explicit Ticket(const std::vector*> &queues); + Ticket(); + ~Ticket() = default; + + bool acquired(); + + void acquiredThreads(uint32_t threads); + + void attach(uint32_t thread_id, CallableInterface *interface); + + // deprecated one + void enqueue(int thread_id, CallableWithArguments* callable); + + void enqueue(uint32_t thread_id, uint32_t num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void enqueue(uint32_t thread_id, uint32_t num_threads, double *lpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x); + + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_DO func); + void enqueue(uint32_t 
thread_id, uint32_t num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y); + void enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_, int64_t stop_z, int64_t inc_z); + + void waitAndRelease(); + }; +} + + +#endif //DEV_TESTS_TICKET_H diff --git a/libnd4j/include/execution/impl/BlockingQueue.cpp b/libnd4j/include/execution/impl/BlockingQueue.cpp new file mode 100644 index 000000000..ff483fd28 --- /dev/null +++ b/libnd4j/include/execution/impl/BlockingQueue.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +namespace samediff { + template + BlockingQueue::BlockingQueue(int queueSize) { + _size = 0; + _available = true; + } + + template + T BlockingQueue::poll() { + // locking untill there's something within queue + std::unique_lock lock(_lock); + _condition.wait(lock, [&]{ return this->_size.load() != 0; }); + + T t(std::move(_queue.front())); + _queue.pop(); + _size--; + return t; + } + + template + void BlockingQueue::put(const T &t) { + { + // locking before push, unlocking after + std::unique_lock lock(_lock); + _queue.push(t); + _size++; + } + + // notifying condition + _condition.notify_one(); + } + + template + bool BlockingQueue::available() { + return _available.load(); + } + + template + void BlockingQueue::markAvailable() { + _available = true; + } + + template + void BlockingQueue::markUnavailable() { + _available = false; + } + + template class BlockingQueue; +} diff --git a/libnd4j/include/execution/impl/CallableInterface.cpp b/libnd4j/include/execution/impl/CallableInterface.cpp new file mode 100644 index 000000000..a719af848 --- /dev/null +++ b/libnd4j/include/execution/impl/CallableInterface.cpp @@ -0,0 +1,213 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include + +namespace samediff { + CallableInterface::CallableInterface() { + // initial state is available + _available = true; + _filled = false; + _finished = false; + } + + bool CallableInterface::available() { + return _available.load(); + } + + void CallableInterface::markUnavailable() { + _available = false; + } + + void CallableInterface::markAvailable() { + _available = true; + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_DO func) { + _function_do = std::move(func); + + _branch = 0; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_1D func, int64_t startX, int64_t stopX, int64_t incX) { + _function_1d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _branch = 1; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_2D func, int64_t startX, int64_t stopX, int64_t incX, int64_t start_y, int64_t stop_y, int64_t inc_y) { + _function_2d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + _arguments[3] = start_y; + _arguments[4] = stop_y; + _arguments[5] = inc_y; + + _branch = 2; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, FUNC_3D func, int64_t startX, int64_t stopX, int64_t incX, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t 
stop_z, int64_t inc_z) { + _function_3d = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + _arguments[3] = start_y; + _arguments[4] = stop_y; + _arguments[5] = inc_y; + _arguments[6] = start_z; + _arguments[7] = stop_z; + _arguments[8] = inc_z; + + _branch = 3; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, int64_t *lptr, FUNC_RL func, int64_t startX, int64_t stopX, int64_t incX) { + _function_rl = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _lptr = lptr; + + _branch = 4; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::fill(int threadID, int numThreads, double *dptr, FUNC_RD func, int64_t startX, int64_t stopX, int64_t incX) { + _function_rd = std::move(func); + _arguments[0] = startX; + _arguments[1] = stopX; + _arguments[2] = incX; + + _dptr = dptr; + + _branch = 5; + _num_threads = numThreads; + _thread_id = threadID; + _finished = false; + + { + std::unique_lock l(_ms); + _filled = true; + } + _starter.notify_one(); + } + + void CallableInterface::waitForTask() { + // block until task is available + std::unique_lock lock(_ms); + _starter.wait(lock, [&]{ return _filled.load(); }); + } + + void CallableInterface::waitForCompletion() { + //while (!_finished.load()); + + // block until finished + std::unique_lock lock(_mf); + _finisher.wait(lock, [&] { return _finished.load(); }); + } + + void CallableInterface::finish() { + // mark as finished + { + std::unique_lock l(_mf); + _finished.store(true); + } + _finisher.notify_one(); + } + + void CallableInterface::execute() { + // mark it as consumed + _filled = false; + + // actually executing op + switch (_branch) 
{ + case 0: + _function_do(_thread_id, _num_threads); + break; + case 1: + _function_1d(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + case 2: + _function_2d(_thread_id, _arguments[0], _arguments[1], _arguments[2], _arguments[3], _arguments[4], _arguments[5]); + break; + case 3: + _function_3d(_thread_id, _arguments[0], _arguments[1], _arguments[2], _arguments[3], _arguments[4], _arguments[5], _arguments[6], _arguments[7], _arguments[8]); + break; + case 4: + _lptr[0] = _function_rl(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + case 5: + _dptr[0] = _function_rd(_thread_id, _arguments[0], _arguments[1], _arguments[2]); + break; + } + + // notify that thread finished the job + this->finish(); + } +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/CallableWithArguments.cpp b/libnd4j/include/execution/impl/CallableWithArguments.cpp new file mode 100644 index 000000000..8f17622b7 --- /dev/null +++ b/libnd4j/include/execution/impl/CallableWithArguments.cpp @@ -0,0 +1,103 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include + +namespace samediff { + CallableWithArguments::CallableWithArguments(FUNC_DO func, uint64_t thread_id, uint64_t numThreads) { + _function_do = func; + _finished = false; + _threadId = thread_id; + _numThreads = numThreads; + _dimensions = 0; + } + + CallableWithArguments::CallableWithArguments(FUNC_3D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y, int64_t start_z, int64_t stop_z, int64_t increment_z) { + _function_3d = func; + _arguments = {start_x, stop_x, increment_x, start_y, stop_y, increment_y, start_z, stop_z, increment_z}; + _finished = false; + _threadId = thread_id; + _dimensions = 3; + } + + CallableWithArguments::CallableWithArguments(FUNC_1D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x) { + _function_1d = func; + _arguments = {start_x, stop_x, increment_x}; + _finished = false; + _threadId = thread_id; + _dimensions = 1; + } + + CallableWithArguments::CallableWithArguments(FUNC_2D func, uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t increment_x, int64_t start_y, int64_t stop_y, int64_t increment_y) { + _function_2d = func; + _arguments = {start_x, stop_x, increment_x, start_y, stop_y, increment_y}; + _finished = false; + _threadId = thread_id; + _dimensions = 2; + } + + int CallableWithArguments::dimensions() { + return _dimensions; + } + + std::vector& CallableWithArguments::arguments() { + return _arguments; + } + + bool CallableWithArguments::finished() { + return _finished.load(); + } + + void CallableWithArguments::finish() { + std::lock_guard lock(_lock); + _finished = true; + _condition.notify_one(); + } + + void CallableWithArguments::waitUntilFinished() { + std::unique_lock lock(_lock); + _condition.wait(lock, [&]{ return 
_finished.load(); }); + } + + + FUNC_1D CallableWithArguments::function_1d() { + return _function_1d; + } + + FUNC_2D CallableWithArguments::function_2d() { + return _function_2d; + } + + FUNC_DO CallableWithArguments::function_do() { + return _function_do; + } + + FUNC_3D CallableWithArguments::function_3d() { + return _function_3d; + } + + uint64_t CallableWithArguments::threadId() { + return _threadId; + } + + uint64_t CallableWithArguments::numThreads() { + return _numThreads; + } +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/ThreadPool.cpp b/libnd4j/include/execution/impl/ThreadPool.cpp new file mode 100644 index 000000000..5d9e2d5eb --- /dev/null +++ b/libnd4j/include/execution/impl/ThreadPool.cpp @@ -0,0 +1,194 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +//#include +#endif + +namespace samediff { + + // this function executed once per thread, it polls functions from queue, and executes them via wrapper + static void executionLoop_(int thread_id, BlockingQueue *queue) { + while (true) { + // this method blocks until there's something within queue + auto c = queue->poll(); + //nd4j_printf("ThreadPool: starting thread %i\n", c->threadId()); + switch (c->dimensions()) { + case 0: { + c->function_do()(c->threadId(), c->numThreads()); + c->finish(); + } + break; + case 1: { + auto args = c->arguments(); + c->function_1d()(c->threadId(), args[0], args[1], args[2]); + c->finish(); + } + break; + case 2: { + auto args = c->arguments(); + c->function_2d()(c->threadId(), args[0], args[1], args[2], args[3], args[4], args[5]); + c->finish(); + //nd4j_printf("ThreadPool: finished thread %i\n", c->threadId()); + } + break; + case 3: { + auto args = c->arguments(); + c->function_3d()(c->threadId(), args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8]); + c->finish(); + } + break; + default: + throw std::runtime_error("Don't know what to do with provided Callable"); + } + } + } + + static void executionLoopWithInterface_(int thread_id, CallableInterface *c) { + while (true) { + // blocking here until there's something to do + c->waitForTask(); + + // execute whatever we have + c->execute(); + } + } + + ThreadPool::ThreadPool() { + // TODO: number of threads must reflect number of cores for UMA system. 
In case of NUMA it should be per-device pool + // FIXME: on mobile phones this feature must NOT be used + _available = nd4j::Environment::getInstance()->maxThreads(); + + _queues.resize(_available.load()); + _threads.resize(_available.load()); + _interfaces.resize(_available.load()); + + // creating threads here + for (int e = 0; e < _available.load(); e++) { + _queues[e] = new BlockingQueue(2); + _interfaces[e] = new CallableInterface(); + _threads[e] = new std::thread(executionLoopWithInterface_, e, _interfaces[e]); + _tickets.push(new Ticket()); + // _threads[e] = new std::thread(executionLoop_, e, _queues[e]); + + // TODO: add other platforms here as well + // now we must set affinity, and it's going to be platform-specific thing +#ifdef LINUX_BUILD + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(e, &cpuset); + int rc = pthread_setaffinity_np(_threads[e]->native_handle(), sizeof(cpu_set_t), &cpuset); + if (rc != 0) + throw std::runtime_error("Failed to set pthread affinity"); +#endif + /* +#if defined(_WIN32) || defined(_WIN64) + // we can't set affinity to more than 64 cores + if (e <= 64) { + auto mask = (static_cast(1) << e); + auto result = SetThreadAffinityMask(_threads[e]->native_handle(), mask); + if (!result) + throw std::runtime_error("Failed to set pthread affinity"); + } + + // that's fine. 
no need for time_critical here + SetThreadPriority(_threads[e]->native_handle(), THREAD_PRIORITY_HIGHEST); +#endif + */ + } + } + + ThreadPool::~ThreadPool() { + // TODO: implement this one properly + for (int e = 0; e < _queues.size(); e++) { + // stop each and every thread + + // release queue and thread + //delete _queues[e]; + //delete _threads[e]; + } + } + + static std::mutex _lmutex; + + ThreadPool* ThreadPool::getInstance() { + std::unique_lock lock(_lmutex); + if (!_INSTANCE) + _INSTANCE = new ThreadPool(); + + return _INSTANCE; + } + + void ThreadPool::release(int numThreads) { + _available += numThreads; + } + + Ticket* ThreadPool::tryAcquire(int numThreads) { + //std::vector*> queues; + + Ticket *t = nullptr; + // we check for threads availability first + bool threaded = false; + { + // we lock before checking availability + std::unique_lock lock(_lock); + if (_available >= numThreads) { + threaded = true; + _available -= numThreads; + + // getting a ticket from the queue + t = _tickets.front(); + _tickets.pop(); + + // ticket must contain information about number of threads for the current session + t->acquiredThreads(numThreads); + + // filling ticket with executable interfaces + for (int e = 0, i = 0; e < _queues.size() && i < numThreads; e++) { + if (_interfaces[e]->available()) { + t->attach(i++, _interfaces[e]); + _interfaces[e]->markUnavailable(); + } + } + } + } + + // we either dispatch tasks to threads, or run single-threaded + if (threaded) { + return t; + } else { + // if there's no threads available - return nullptr + return nullptr; + } + } + + void ThreadPool::release(samediff::Ticket *ticket) { + // returning ticket back to the queue + std::unique_lock lock(_lock); + _tickets.push(ticket); + } + + + ThreadPool* ThreadPool::_INSTANCE = 0; +} diff --git a/libnd4j/include/execution/impl/Threads.cpp b/libnd4j/include/execution/impl/Threads.cpp new file mode 100644 index 000000000..f5ae5b5eb --- /dev/null +++ 
b/libnd4j/include/execution/impl/Threads.cpp @@ -0,0 +1,641 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// +#include +#include +#include +#include +#include +#include +#include + + +namespace samediff { + + int ThreadsHelper::numberOfThreads(int maxThreads, uint64_t numberOfElements) { + // let's see how many threads we actually need first + auto optimalThreads = nd4j::math::nd4j_max(1, numberOfElements / 1024); + + // now return the smallest value + return nd4j::math::nd4j_min(optimalThreads, maxThreads); + } + + Span3::Span3(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) { + _startX = startX; + _startY = startY; + _startZ = startZ; + _stopX = stopX; + _stopY = stopY; + _stopZ = stopZ; + _incX = incX; + _incY = incY; + _incZ = incZ; + } + + Span3 Span3::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) { + switch (loop) { + case 1: { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + 
return Span3(s, e, incX, startY, stopY, incY, startZ, stopZ, incZ); + } + break; + case 2: { + auto span = (stopY - startY) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopY; + + return Span3(startX, stopX, incX, s, e, incY, startZ, stopZ, incZ); + } + break; + case 3: { + auto span = (stopZ - startZ) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopZ; + + return Span3(startX, stopX, incX, startY, stopY, incY, s, e, incZ); + } + break; + default: + throw std::runtime_error(""); + } + return Span3(startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + } + + Span::Span(int64_t startX, int64_t stopX, int64_t incX) { + _startX = startX; + _stopX = stopX; + _incX = incX; + } + + Span Span::build(uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX) { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span(s, e, incX); + } + + Span2::Span2(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) { + _startX = startX; + _startY = startY; + _stopX = stopX; + _stopY = stopY; + _incX = incX; + _incY = incY; + } + + + Span2 Span2::build(int loop, uint64_t threadID, uint64_t numThreads, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY) { + + switch (loop) { + case 1: { + auto span = (stopX - startX) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopX; + + return Span2(s, e, incX, startY, stopY, incY); + } + break; + case 2: { + auto span = (stopY - startY) / numThreads; + auto s = span * threadID; + auto e = s + span; + if (threadID == numThreads - 1) + e = stopY; + + return Span2(startX, stopX, incX, s, e, incY); + } + break; + default: + throw std::runtime_error(""); + } + } + + int64_t 
Span::startX() const { + return _startX; + } + + int64_t Span::stopX() const { + return _stopX; + } + + int64_t Span::incX() const { + return _incX; + } + + int64_t Span2::startX() const { + return _startX; + } + + int64_t Span2::startY() const { + return _startY; + } + + int64_t Span2::stopX() const { + return _stopX; + } + + int64_t Span2::stopY() const { + return _stopY; + } + + int64_t Span2::incX() const { + return _incX; + } + + int64_t Span2::incY() const { + return _incY; + } + + int64_t Span3::startX() const { + return _startX; + } + + int64_t Span3::startY() const { + return _startY; + } + + int64_t Span3::startZ() const { + return _startZ; + } + + int64_t Span3::stopX() const { + return _stopX; + } + + int64_t Span3::stopY() const { + return _stopY; + } + + int64_t Span3::stopZ() const { + return _stopZ; + } + + int64_t Span3::incX() const { + return _incX; + } + + int64_t Span3::incY() const { + return _incY; + } + + int64_t Span3::incZ() const { + return _incZ; + } + + int ThreadsHelper::pickLoop2d(int numThreads, uint64_t itersX, uint64_t itersY) { + // if one of dimensions is definitely too small - we just pick the other one + if (itersX < numThreads && itersY >= numThreads) + return 2; + if (itersY < numThreads && itersX >= numThreads) + return 1; + + // next step - we pick the most balanced dimension + auto remX = itersX % numThreads; + auto remY = itersY % numThreads; + auto splitY = itersY / numThreads; + + // if there's no remainder left in some dimension - we're picking that dimension, because it'll be the most balanced work distribution + if (remX == 0) + return 1; + if (remY == 0) + return 2; + + // if there's no loop without a remainder - we're picking one with smaller remainder + if (remX < remY) + return 1; + if (remY < remX && splitY >= 64) // we don't want too small splits over last dimension, or vectorization will fail + return 2; + // if loops are equally sized - give the preference to the first thread + return 1; + } + + + static int 
threads_(int maxThreads, uint64_t elements) { + + if (elements == maxThreads) { + return maxThreads; + } + else if (elements > maxThreads) { + // if we have full load across thread, or at least half of threads can be utilized + auto rem = elements % maxThreads; + if (rem == 0 || rem >= maxThreads / 3) + return maxThreads; + else + return threads_(maxThreads - 1, elements); + + } + else if (elements < maxThreads) { + return elements; + } + + return 1; + } + + int ThreadsHelper::numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y) { + // in some cases there's nothing to think about, part 1 + if (iters_x < maxThreads && iters_y < maxThreads) + return nd4j::math::nd4j_max(iters_x, iters_y); + + auto remX = iters_x % maxThreads; + auto remY = iters_y % maxThreads; + + // in some cases there's nothing to think about, part 2 + if ((iters_x >= maxThreads && remX == 0 )|| (iters_y >= maxThreads && remY == 0)) + return maxThreads; + + // at this point we suppose that there's no loop perfectly matches number of our threads + // so let's pick something as equal as possible + if (iters_x > maxThreads || iters_y > maxThreads) + return maxThreads; + else + return numberOfThreads2d(maxThreads - 1, iters_x, iters_y); + } + + int ThreadsHelper::numberOfThreads3d(int maxThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) { + // we don't want to run underloaded threads + if (itersX * itersY * itersZ <= 32) + return 1; + + auto remX = itersX % maxThreads; + auto remY = itersY % maxThreads; + auto remZ = itersZ % maxThreads; + + // if we have perfect balance across one of dimensions - just go for it + if ((itersX >= maxThreads && remX == 0) || (itersY >= maxThreads && remY == 0) || (itersZ >= maxThreads && remZ == 0)) + return maxThreads; + + int threadsX = 0, threadsY = 0, threadsZ = 0; + + // now we look into possible number of + threadsX = threads_(maxThreads, itersX); + threadsY = threads_(maxThreads, itersY); + threadsZ = threads_(maxThreads, itersZ); + + 
// we want to split as close to outer loop as possible, so checking it out first + if (threadsX >= threadsY && threadsX >= threadsZ) + return threadsX; + else if (threadsY >= threadsX && threadsY >= threadsZ) + return threadsY; + else if (threadsZ >= threadsX && threadsZ >= threadsY) + return threadsZ; + + return 1; + } + + int ThreadsHelper::pickLoop3d(int numThreads, uint64_t itersX, uint64_t itersY, uint64_t itersZ) { + auto remX = itersX % numThreads; + auto remY = itersY % numThreads; + auto remZ = itersZ % numThreads; + + auto splitX = itersX / numThreads; + auto splitY = itersY / numThreads; + auto splitZ = itersZ / numThreads; + + // if there's no remainder left in some dimension - we're picking that dimension, because it'll be the most balanced work distribution + if (remX == 0) + return 1; + else if (remY == 0) + return 2; + else if (remZ == 0) // TODO: we don't want too small splits over last dimension? or we do? + return 3; + + if (itersX > numThreads) + return 1; + else if (itersY > numThreads) + return 2; + else if (itersZ > numThreads) + return 3; + + return 1; + } + + int Threads::parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_for got start > stop"); + + auto delta = (stop - start); + + if (numThreads > delta) + numThreads = delta; + + if (numThreads == 0) + return 0; + + // shortcut + if (numThreads == 1) { + function(0, start, stop, increment); + return 1; + } + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + // if we got our threads - we'll run our jobs here + auto span = delta / numThreads; + + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = start_ + span; + + // last thread will process tail + if (e == numThreads - 1) + stop_ = stop; + + // putting the task into the queue for a given thread + ticket->enqueue(e, numThreads, function, 
start_, stop_, increment); + } + + // block and wait till all threads finished the job + ticket->waitAndRelease(); + + // we tell that parallelism request succeeded + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, start, stop, increment); + + // we tell that parallelism request declined + return 1; + } + } + + int Threads::parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, uint32_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_for got start > stop"); + + auto delta = (stop - start); + + // in some cases we just fire func as is + if (delta == 0 || numThreads == 1) { + function(0, start, stop, increment); + return 1; + } + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it in parallel_tad. + numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + return parallel_tad(function, start, stop, increment, numThreads); + } + + int Threads::parallel_for(FUNC_2D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, uint64_t numThreads, bool debug) { + if (startX > stopX) + throw std::runtime_error("Threads::parallel_for got startX > stopX"); + + if (startY > stopY) + throw std::runtime_error("Threads::parallel_for got startY > stopY"); + + // number of elements per loop + auto delta_x = (stopX - startX); + auto delta_y = (stopY - startY); + + // number of iterations per loop + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + + // total number of iterations + auto iters_t = itersX * itersY; + + // we are checking the case of number of requested threads was smaller + numThreads = ThreadsHelper::numberOfThreads2d(numThreads, itersX, itersY); + + // basic shortcut for no-threading cases + if (numThreads == 1) { + function(0, startX, stopX, incX, startY, stopY, incY); + return 1; + } + + 
// We have couple of scenarios: + // either we split workload along 1st loop, or 2nd + auto splitLoop = ThreadsHelper::pickLoop2d(numThreads, itersX, itersY); + + // for debug mode we execute things inplace, without any threads + if (debug) { + for (int e = 0; e < numThreads; e++) { + auto span = Span2::build(splitLoop, e, numThreads, startX, stopX, incX, startY, stopY, incY); + + function(e, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY()); + } + + // but we still mimic multithreaded execution + return numThreads; + } else { + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + + for (int e = 0; e < numThreads; e++) { + auto threadId = numThreads - e - 1; + auto span = Span2::build(splitLoop, threadId, numThreads, startX, stopX, incX, startY, stopY, incY); + + ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY()); + } + + // block until all threads finish their job + ticket->waitAndRelease(); + + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, startX, stopX, incX, startY, stopY, incY); + + // we tell that parallelism request declined + return 1; + } + }; + } + + + int Threads::parallel_for(FUNC_3D function, int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ, uint64_t numThreads) { + if (startX > stopX) + throw std::runtime_error("Threads::parallel_for got startX > stopX"); + + if (startY > stopY) + throw std::runtime_error("Threads::parallel_for got startY > stopY"); + + if (startZ > stopZ) + throw std::runtime_error("Threads::parallel_for got startZ > stopZ"); + + auto delta_x = stopX - startX; + auto delta_y = stopY - startY; + auto delta_z = stopZ - startZ; + + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + auto itersZ = delta_z / 
incZ; + + numThreads = 1; //ThreadsHelper::numberOfThreads3d(numThreads, itersX, itersY, itersZ); + if (numThreads == 1) { + // loop is too small - executing function as is + function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + return 1; + } + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads); + if (ticket != nullptr) { + auto splitLoop = ThreadsHelper::pickLoop3d(numThreads, itersX, itersY, itersZ); + + for (int e = 0; e < numThreads; e++) { + auto thread_id = numThreads - e - 1; + auto span = Span3::build(splitLoop, thread_id, numThreads, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + + ticket->enqueue(e, numThreads, function, span.startX(), span.stopX(), span.incX(), span.startY(), span.stopY(), span.incY(), span.startZ(), span.stopZ(), span.incZ()); + } + + // block until we're done + ticket->waitAndRelease(); + + // we tell that parallelism request succeeded + return numThreads; + } else { + // if there were no threads available - we'll execute function right within current thread + function(0, startX, stopX, incX, startY, stopY, incY, startZ, stopZ, incZ); + + // we tell that parallelism request declined + return 1; + } + + } + + int Threads::parallel_do(FUNC_DO function, uint64_t numThreads) { + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket != nullptr) { + + // submit tasks one by one + for (uint64_t e = 0; e < numThreads - 1; e++) + ticket->enqueue(e, numThreads, function); + + function(numThreads - 1, numThreads); + + ticket->waitAndRelease(); + + return numThreads; + } else { + // if there's no threads available - we'll execute function sequentially one by one + for (uint64_t e = 0; e < numThreads; e++) + function(e, numThreads); + + return numThreads; + } + + + return numThreads; + } + + int64_t Threads::parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) { + if (start > stop) + throw 
std::runtime_error("Threads::parallel_long got start > stop"); + + auto delta = (stop - start); + if (delta == 0 || numThreads == 1) + return function(0, start, stop, increment); + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it + numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + if (numThreads == 1) + return function(0, start, stop, increment); + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket == nullptr) + return function(0, start, stop, increment); + + // create temporary array + int64_t intermediatery[256]; + auto span = delta / numThreads; + + // execute threads in parallel + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = span * (e + 1) + start; + + if (e == numThreads - 1) + intermediatery[e] = function(e, start_, stop, increment); + else + ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment); + } + + ticket->waitAndRelease(); + + // aggregate results in single thread + for (uint64_t e = 1; e < numThreads; e++) + intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]); + + // return accumulated result + return intermediatery[0]; + } + + double Threads::parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment, uint64_t numThreads) { + if (start > stop) + throw std::runtime_error("Threads::parallel_double got start > stop"); + + auto delta = (stop - start); + if (delta == 0 || numThreads == 1) + return function(0, start, stop, increment); + + auto numElements = delta / increment; + + // we decide what's optimal number of threads we need here, and execute it + numThreads = ThreadsHelper::numberOfThreads(numThreads, numElements); + if (numThreads == 1) + return function(0, start, stop, increment); + + auto ticket = ThreadPool::getInstance()->tryAcquire(numThreads - 1); + if (ticket == nullptr) + return 
function(0, start, stop, increment); + + // create temporary array + double intermediatery[256]; + auto span = delta / numThreads; + + // execute threads in parallel + for (uint32_t e = 0; e < numThreads; e++) { + auto start_ = span * e + start; + auto stop_ = span * (e + 1) + start; + + if (e == numThreads - 1) + intermediatery[e] = function(e, start_, stop, increment); + else + ticket->enqueue(e, numThreads, &intermediatery[e], function, start_, stop_, increment); + } + + ticket->waitAndRelease(); + + // aggregate results in single thread + for (uint64_t e = 1; e < numThreads; e++) + intermediatery[0] = aggregator(intermediatery[0], intermediatery[e]); + + // return accumulated result + return intermediatery[0]; + } + +} \ No newline at end of file diff --git a/libnd4j/include/execution/impl/Ticket.cpp b/libnd4j/include/execution/impl/Ticket.cpp new file mode 100644 index 000000000..5bf911fd0 --- /dev/null +++ b/libnd4j/include/execution/impl/Ticket.cpp @@ -0,0 +1,94 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include +#include +#include +#include + +namespace samediff { + Ticket::Ticket(const std::vector*> &queues) { + _acquired = true; + _queues = queues; + } + + Ticket::Ticket() { + _acquired = true; + _interfaces.resize(nd4j::Environment::getInstance()->maxThreads()); + } + + bool Ticket::acquired() { + return _acquired; + } + + void Ticket::enqueue(int thread_id, samediff::CallableWithArguments *callable) { + _queues[thread_id]->put(callable); + _callables.emplace_back(callable); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_DO func) { + _interfaces[thread_id]->fill(thread_id, num_threads, func); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_1D func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, int64_t *lpt, FUNC_RL func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, lpt, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, double *dpt, FUNC_RD func, int64_t start_x, int64_t stop_x, int64_t inc_x) { + _interfaces[thread_id]->fill(thread_id, num_threads, dpt, func, start_x, stop_x, inc_x); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_2D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y) { + _interfaces[thread_id]->fill(thread_id, num_threads, std::move(func), start_x, stop_x, inc_x, start_y, stop_y, inc_y); + } + + void Ticket::enqueue(uint32_t thread_id, uint32_t num_threads, FUNC_3D func, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t 
start_z, int64_t stop_z, int64_t inc_z) { + _interfaces[thread_id]->fill(thread_id, num_threads, func, start_x, stop_x, inc_x, start_y, stop_y, inc_y, start_z, stop_z, inc_z); + } + + void Ticket::acquiredThreads(uint32_t threads) { + _acquiredThreads = threads; + } + + void Ticket::waitAndRelease() { + for (uint32_t e = 0; e < this->_acquiredThreads; e++) { + // block until finished + _interfaces[e]->waitForCompletion(); + + // mark available + _interfaces[e]->markAvailable(); + + // increment availability counter + ThreadPool::getInstance()->release(); + } + + // return this ticket back to the pool + ThreadPool::getInstance()->release(this); + } + + + void Ticket::attach(uint32_t thread_id, samediff::CallableInterface *interface) { + _interfaces[thread_id] = interface; + } +} \ No newline at end of file diff --git a/libnd4j/include/graph/Node.h b/libnd4j/include/graph/Node.h index 3eac03e07..b57998e38 100644 --- a/libnd4j/include/graph/Node.h +++ b/libnd4j/include/graph/Node.h @@ -232,6 +232,7 @@ namespace nd4j { } static nd4j::ops::DeclarableOp* buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar); + static void deleteOpByType(OpType opType, void *op); }; } } diff --git a/libnd4j/include/graph/impl/Graph.cpp b/libnd4j/include/graph/impl/Graph.cpp index f4514efdb..2acedcea3 100644 --- a/libnd4j/include/graph/impl/Graph.cpp +++ b/libnd4j/include/graph/impl/Graph.cpp @@ -19,6 +19,7 @@ // #include +#include #include #include #include @@ -154,7 +155,7 @@ namespace nd4j { Nd4jLong *newShape = nullptr; // if that's scalar output - we don't care about previous node - if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == MAX_INT)) { + if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == nd4j::DataTypeUtils::max())) { newShape = new Nd4jLong[8]; newShape[0] = 2; diff --git 
a/libnd4j/include/graph/impl/Node.cpp b/libnd4j/include/graph/impl/Node.cpp index d365ddd6a..795d9b7f0 100644 --- a/libnd4j/include/graph/impl/Node.cpp +++ b/libnd4j/include/graph/impl/Node.cpp @@ -682,8 +682,9 @@ namespace nd4j { if (_protoContext != nullptr) delete _protoContext; - if (_isDeductable && _customOp != nullptr) - delete _customOp; + if (_isDeductable && _customOp != nullptr) { + Node::deleteOpByType(_opType, _customOp); + } } int nd4j::graph::Node::getRewindNode() { @@ -710,6 +711,70 @@ namespace nd4j { return false; } + void nd4j::graph::Node::deleteOpByType(OpType opType, void *op) { + switch (opType) { + case OpType_PAIRWISE: + delete reinterpret_cast(op); + break; + case OpType_PAIRWISE_BOOL: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_STRICT: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_SAME: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_FLOAT: + delete reinterpret_cast(op); + break; + case OpType_TRANSFORM_BOOL: + delete reinterpret_cast(op); + break; + case OpType_SCALAR: + delete reinterpret_cast(op); + break; + case OpType_SCALAR_BOOL: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_3: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_SAME: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_FLOAT: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_LONG: + delete reinterpret_cast(op); + break; + case OpType_REDUCE_BOOL: + delete reinterpret_cast(op); + break; + case OpType_INDEX_REDUCE: + delete reinterpret_cast(op); + break; + case OpType_SUMMARYSTATS: + delete reinterpret_cast(op); + break; + case OpType_RANDOM: + delete reinterpret_cast(op); + break; + case OpType_BROADCAST: + delete reinterpret_cast(op); + break; + case OpType_BROADCAST_BOOL: + delete reinterpret_cast(op); + break; + case OpType_CUSTOM: + delete reinterpret_cast(op); + break; + default: + throw std::runtime_error("Bad opType passed in"); + } + } + nd4j::ops::DeclarableOp* 
nd4j::graph::Node::buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar) { switch (opType) { case OpType_PAIRWISE: diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h index 392ed3edf..fb1582056 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -31,6 +31,7 @@ #include #include #include +#include namespace nd4j { @@ -40,43 +41,43 @@ namespace nd4j { public: template - static FORCEINLINE void loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, E* extraParams); + static FORCEINLINE void loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, E* extraParams, int64_t start, int64_t stop); }; template class ReductionFloatLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); }; template class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); 
template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); }; template class ND4J_EXPORT ReductionLongLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); }; template class ND4J_EXPORT ReductionSameLoops : public ReductionLoops { public: - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams); + static void innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); }; @@ -96,8 +97,8 @@ namespace nd4j { public: - template - static FORCEINLINE void loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, 
Nd4jLong* zShapeInfo, E* extraParams); + template + static FORCEINLINE void loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, E* extraParams, uint64_t threadId, uint64_t numThreads); }; template @@ -105,20 +106,20 @@ namespace nd4j { public: template - static FORCEINLINE void loopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static FORCEINLINE void loopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); template - static FORCEINLINE void loopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static FORCEINLINE void loopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); - static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); - static void wrapperAll(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static void wrapperAll(const int opNum, X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t 
stop); template - static void innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams); + static void innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop); template - static void innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams); + static void innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop); }; @@ -265,7 +266,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, void nd4j::ReductionLoops::loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, - E* extraParams) { + E* extraParams, int64_t start, int64_t stop) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, tadShapeInfo); @@ -319,263 +320,170 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; 
//*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadLen; ++i0) - start = OpType::update(start, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, 
extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (uint i3 = 0; i3 < tadShape[3]; ++i3) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2] + i3*tadStride[3]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) 
{ + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) for (uint i1 = 0; i1 < tadShape[1]; ++i1) for (uint i2 = 0; i2 < tadShape[2]; ++i2) for (uint i3 = 0; i3 < tadShape[3]; ++i3) for (uint i4 = 0; i4 < tadShape[4]; ++i4) - start = OpType::update(start, OpType::op(tad[i0*tadStride[0] + i1*tadStride[1] + i2*tadStride[2] + i3*tadStride[3] + i4*tadStride[4] ], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::X_EWSNONZERO: { uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - } + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::Z_EWSNONZERO: { uint castTadShapeInfo[MAX_RANK]; const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for 
(auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) { auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; - - //*********************************************// - // default: { - // uint castTadShapeInfo[MAX_RANK]; - // uint castZShapeInfo[MAX_RANK]; - // const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - // const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - - // PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - // for (uint i = 0; i < zLen; i++) { - // auto tad = x + tadOffsets[i]; - // auto start = OpType::startingValue(tad); - - // for (uint j = 0; j < tadLen; j++) { - // auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - // start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); - // } - - // auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - // } - // } + break; //*********************************************// default: { - - Nd4jLong* innertadOffsets = new Nd4jLong[tadLen]; + auto innertadOffsets = new Nd4jLong[tadLen]; shape::calcOffsets(tadShapeInfo, innertadOffsets); uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { + for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; - auto start = 
OpType::startingValue(tad); + auto s = OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) - start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); + s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - } + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; - delete []innertadOffsets; + delete[] innertadOffsets; } - - //*********************************************// - // default: { - - // Nd4jLong* innertadOffsets = new Nd4jLong[tadLen]; - // shape::calcOffsets(tadShapeInfo, innertadOffsets); - - // const int zRankMinusOne = shape::rank(zShapeInfo) - 1; - - // Nd4jLong* offsetPerDimZ = new Nd4jLong[zRankMinusOne]; - // int* idxZ = new int[zRankMinusOne]; - - // memset(idxZ, 0, sizeof(Nd4jLong) * zRankMinusOne); - - // const Nd4jLong* shapeZ = shape::shapeOf(zShapeInfo); - // const Nd4jLong* strideZ = shape::stride(zShapeInfo); - - // PRAGMA_OMP_SIMD - // for (int k = 0; k < zRankMinusOne; ++k) - // offsetPerDimZ[k] = (shapeZ[k] - 1) * strideZ[k]; - - // int dimZ = zRankMinusOne, lZ = 1; - // Nd4jLong initZ = 0, zOffset = 0, e = 1; - - // // first iteration - // auto tad = x + tadOffsets[0]; - // auto start = OpType::startingValue(tad); - // for (uint j = 0; j < tadLen; j++) - // start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - // z[0] = OpType::postProcess(start, OpType::startingValue(x), extraParams); - - // // rest iterations - // while (dimZ >= 0) { - - // if(shapeZ[dimZ] == 1) { --dimZ; continue; } // ignore dimensions equal to unity - // if(dimZ == zRankMinusOne) { // last dimension - // if(lZ < shapeZ[dimZ]) { zOffset += strideZ[dimZ]; ++lZ;} - // else { lZ = 1; --dimZ; continue; } - // } - // else if(idxZ[dimZ] < shapeZ[dimZ] - 1) { initZ += strideZ[dimZ]; zOffset = initZ; 
++idxZ[dimZ]; dimZ = zRankMinusOne; } - // else { initZ -= offsetPerDimZ[dimZ]; idxZ[dimZ--] = 0; continue;} - - // start = OpType::startingValue(tad); - // tad = x + tadOffsets[e++]; - - // for (uint j = 0; j < tadLen; j++) - // start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - - // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); - // } - - // delete []innertadOffsets; - // } } } @@ -583,10 +491,10 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, ////////////////////////////////////////////////////////////////////////////// template - template + template void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, - E* extraParams) { + E* extraParams, uint64_t threadId, uint64_t numThreads) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); @@ -596,265 +504,176 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const Nd4jLong len = shape::length(xShapeInfo); - OmpLaunchHelper threadsInfo(len, doParallel ? 
-1 : 1); + if (len == 0) + return; switch (kindOfLoop) { //*********************************************// case LoopKind::EWS1: { + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - - const auto xi = x + threadOffset; - const auto zi = z + threadOffset; - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i] = OpType::op(xi[i], extraParams); + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], extraParams); } - } break; //*********************************************// case LoopKind::EWSNONZERO: { - const uint xEws = shape::elementWiseStride(xShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); + const uint xEws = shape::elementWiseStride(xShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - const auto xi = x + threadOffset * xEws; - auto zi = z + threadOffset * zEws; - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], extraParams); + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], extraParams); } - } break; //*********************************************// case LoopKind::Z_EWSNONZERO: { - const uint zEws = shape::elementWiseStride(zShapeInfo); - uint castXShapeInfo[MAX_RANK]; - const bool canCastX = 
nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); + uint castXShapeInfo[MAX_RANK]; + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - - auto zi = z + threadOffset * zEws; + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); if (zEws > 1) { - - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); - zi[i * zEws] = OpType::op(x[xOffset], extraParams); + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i * zEws] = OpType::op(x[xOffset], extraParams); } } else { - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); - zi[i] = OpType::op(x[xOffset], extraParams); + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i] = OpType::op(x[xOffset], extraParams); } } } - } break; //*********************************************// case LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(threadsInfo._numThreads) - for (uint i0 = 0; i0 < len; ++i0) - z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); - } + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); + } break; //*********************************************// case 
LoopKind::RANK2: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); - //PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(threadsInfo._numThreads) - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i0 = 0; i0 < uXShape0; ++i0) { + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - auto z0 = i0 * zStride[0]; - auto x0 = i0 * xStride[0]; - for (uint i1 = 0; i1 < uXShape1; ++i1) - z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) { + auto z0 = i0 * zStride[0]; + auto x0 = i0 * xStride[0]; + + for (uint i1 = span.startY(); i1 < span.stopY(); ++i1) + z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + } } - } break; //*********************************************// case LoopKind::RANK3: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 2) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < uXShape1; ++i1) { + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - auto z0 = i0 * zStride[0] + i1 * zStride[1]; - auto x0 = i0 * xStride[0] + i1 * xStride[1]; - for (uint i2 = 0; i2 < uXShape2; ++i2) - z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); - } - } + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) { + auto z0 = i0 * zStride[0] + i1 * 
zStride[1]; + auto x0 = i0 * xStride[0] + i1 * xStride[1]; + + for (uint i2 = 0; i2 < uXShape2; ++i2) + z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); + } + } break; //*********************************************// case LoopKind::RANK4: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); + auto uXShape3 = static_cast(xShape[3]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 2) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < uXShape1; ++i1) - for (uint i2 = 0; i2 < uXShape2; ++i2) { + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - for (uint i3 = 0; i3 < uXShape3; ++i3) - z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); - } - } + for (uint i3 = 0; i3 < uXShape3; ++i3) + z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); + } + } break; //*********************************************// case LoopKind::RANK5: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); - auto uXShape4 = static_cast(xShape[4]); + auto 
uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + auto uXShape2 = static_cast(xShape[2]); + auto uXShape3 = static_cast(xShape[3]); + auto uXShape4 = static_cast(xShape[4]); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS_COLLAPSE(threadsInfo._numThreads, 3) - for (uint i0 = 0; i0 < uXShape0; ++i0) - for (uint i1 = 0; i1 < uXShape1; ++i1) - for (uint i2 = 0; i2 < uXShape2; ++i2) { + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - for (uint i3 = 0; i3 < uXShape3; ++i3) { + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z1 = z0 + i3 * zStride[3]; - auto x1 = x0 + i3 * xStride[3]; + for (uint i3 = 0; i3 < uXShape3; ++i3) { - for (uint i4 = 0; i4 < uXShape4; ++i4) - z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); + auto z1 = z0 + i3 * zStride[3]; + auto x1 = x0 + i3 * xStride[3]; + for (uint i4 = 0; i4 < uXShape4; ++i4) + z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); + + } } - } - } + + } break; //*********************************************// default: { - uint xShapeInfoCast[MAX_RANK]; - uint zShapeInfoCast[MAX_RANK]; + uint xShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; - bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastZ = 
DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = threadsInfo.getThreadOffset(threadNum); - auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (auto i = span.startX(); i < span.stopX(); i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], extraParams); } } - } - // default: { - - // const int xRankMinusOne = shape::rank(xShapeInfo) - 1; - // const int zRankMinusOne = shape::rank(zShapeInfo) - 1; - - // printf("%i %i \n", xRankMinusOne, zRankMinusOne); - - // uint* xIdx = new uint[xRankMinusOne + 1]; - // uint* zIdx = new uint[zRankMinusOne + 1]; - - // Nd4jLong* xOffsetPerDim = new Nd4jLong[xRankMinusOne]; - // Nd4jLong* zOffsetPerDim = new Nd4jLong[zRankMinusOne]; - - // memset(xIdx, 0, sizeof(uint) * xRankMinusOne); - // memset(zIdx, 0, sizeof(uint) * zRankMinusOne); - - // xIdx[xRankMinusOne] = zIdx[zRankMinusOne] = 1; - - // const Nd4jLong* xShape = shape::shapeOf(xShapeInfo); - // const Nd4jLong* zShape = shape::shapeOf(zShapeInfo); - // const Nd4jLong* xStride = shape::stride(xShapeInfo); - // const Nd4jLong* zStride = shape::stride(zShapeInfo); - - // PRAGMA_OMP_SIMD - // for (int k = 0; k < xRankMinusOne; ++k) - // xOffsetPerDim[k] = (xShape[k] - 1) * xStride[k]; - // PRAGMA_OMP_SIMD - // for (int k = 0; k < zRankMinusOne; ++k) - // zOffsetPerDim[k] = (zShape[k] - 1) * zStride[k]; - - // Nd4jLong xInit = 0, zInit = 0, xOffset = 0, zOffset = 0; - 
// int jX = xRankMinusOne, jZ = zRankMinusOne; - - // // first iteration - // z[0] = OpType::op(x[0], extraParams); - - // // rest iterations - // for (uint i = 1; i < len; i++) { - - // while(true) { - // if(xShape[jX] == 1) { --jX; continue; } - // if(jX == xRankMinusOne) { - // if(xIdx[jX] < xShape[jX]) { xOffset += xStride[jX]; ++xIdx[jX]; break; } - // else { xIdx[jX] = 1; --jX; continue; } - // } - // else if(xIdx[jX] < xShape[jX] - 1) { xInit += xStride[jX]; xOffset = xInit; ++xIdx[jX]; jX = xRankMinusOne; break; } - // else { xInit -= xOffsetPerDim[jX]; xIdx[jX--] = 0; continue; } - // } - - // while(true) { - // if(zShape[jZ] == 1) { --jZ; continue; } - // if(jZ == zRankMinusOne) { - // if(zIdx[jZ] < zShape[jZ]) { zOffset += zStride[jZ]; ++zIdx[jZ]; break; } - // else { zIdx[jZ] = 1; --jZ; continue; } - // } - // else if(zIdx[jZ] < zShape[jZ] - 1) { zInit += zStride[jZ]; zOffset = zInit; ++zIdx[jZ]; jZ = zRankMinusOne; break; } - // else { zInit -= zOffsetPerDim[jZ]; zIdx[jZ--] = 0; continue; } - // } - // z[zOffset] = OpType::op(x[xOffset], extraParams); - // } - - // delete []xIdx; - // delete []zIdx; - // delete []xOffsetPerDim; - // delete []zOffsetPerDim; - // } } } @@ -866,12 +685,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, - Z* extraParameters) { + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ Z param0(OpType::startingValue(x)), param1(OpType::startingValue(x)), param2(extraParameters ? 
extraParameters[0] : OpType::startingValue(x)); - Z extraParams[3] = {param0, param1, param2}; const Nd4jLong xLen = shape::length(xShapeInfo); const Nd4jLong yLen = shape::length(yShapeInfo); @@ -921,139 +739,128 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[i] = OpType::postProcess(start, tadLen, extraParams); - } + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadLen; ++i0) { const auto xTadOffset = i0 * xTadStride[0]; const auto yTadOffset = i0 * yTadStride[0]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (uint i2 = 0; i2 < tadShape[2]; ++i2) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1061,29 +868,27 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i3 = 0; i3 < tadShape[3]; ++i3) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; i++) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1092,68 +897,62 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i4 = 0; i4 < tadShape[4]; ++i4) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } - break; + break; //*********************************************// default: { - uint castXTadShapeInfo[MAX_RANK]; const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } else { - uint castYTadShapeInfo[MAX_RANK]; const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint i = 0; i < zLen; ++i) { - + Z extraParams[3]; + for (auto i = start; i < stop; i++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; - auto start = OpType::startingValue(xTad); + auto s = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - - z[i * zEws] = OpType::postProcess(start, tadLen, extraParams); - } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; } } } @@ -1167,12 +966,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, - Z* extraParameters) { + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ Z param0(OpType::startingValue(x)), param1(OpType::startingValue(x)), param2(extraParameters ? 
extraParameters[0] : OpType::startingValue(x)); - Z extraParams[3] = {param0, param1, param2}; const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXYZ(xTadShapeInfo, yTadShapeInfo, zShapeInfo); @@ -1195,159 +993,146 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads*numYTads); switch (kindOfLoop) { - //*********************************************// case LoopKind::EWS1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[zInd] = OpType::postProcess(start, tadLen, extraParams); + z[zInd] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::EWSNONZERO: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; 
- const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) - start = OpType::update(start, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK1: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadLen; ++i0) { const auto xTadOffset = i0 * xTadStride[0]; const auto yTadOffset = i0 * yTadStride[0]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK2: { - - 
PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK3: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + 
auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { for (uint i2 = 0; i2 < tadShape[2]; ++i2) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK4: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1355,32 +1140,30 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i3 = 0; i3 < tadShape[3]; ++i3) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = 
OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// case LoopKind::RANK5: { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint i0 = 0; i0 < tadShape[0]; ++i0) { for (uint i1 = 0; i1 < tadShape[1]; ++i1) { @@ -1389,7 +1172,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint i4 = 0; i4 < tadShape[4]; ++i4) { const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } @@ -1397,66 +1180,61 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, } z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); } - } + }; } - break; + break; //*********************************************// default: { - uint castXTadShapeInfo[MAX_RANK]; const bool 
canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) { const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } else { - uint castYTadShapeInfo[MAX_RANK]; const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(2) num_threads(numThreads) OMP_IF(numThreads > 1) private(extraParams)) - for (uint ix = 0; ix < numXTads; ++ix) { - for (uint iy = 0; iy < numYTads; ++iy) { - + Z extraParams[3]; + for (auto ix = 0; ix < numXTads; ix++) { + for (auto iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto start = startVal; + const auto xTad = x 
+ xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; for (uint j = 0; j < tadLen; ++j) { const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - } + }; } } } diff --git a/libnd4j/include/helpers/TAD.h b/libnd4j/include/helpers/TAD.h index 9888bb1fd..fb52e639c 100644 --- a/libnd4j/include/helpers/TAD.h +++ b/libnd4j/include/helpers/TAD.h @@ -721,7 +721,7 @@ namespace shape { INLINEDEF void TAD::createOffsets() { this->tadOffsets = new Nd4jLong[this->numTads]; uint nT = this->numTads; - PRAGMA_OMP_PARALLEL_FOR_SIMD + for(uint i = 0; i < nT; i++) this->tadOffsets[i] = this->tadOffset(i); } diff --git a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h index fe64b364f..7c1330648 100644 --- a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h +++ b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h @@ -19,7 +19,6 @@ // #include "../OpBenchmark.h" -#include #include #ifndef DEV_TESTS_MATRIXBENCHMARK_H diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp index fbf2fbc20..fca40d564 100644 --- a/libnd4j/include/helpers/cpu/MmulHelper.cpp +++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -74,26 +75,28 @@ static void usualGemm(const char cOrder, const bool transA, const bool transB, c // } // } - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(M*N > 
Environment::getInstance()->elementwiseThreshold()) schedule(guided) collapse(2)) - for(uint row = 0; row < M; ++row) { - for(uint col = 0; col < N; ++col) { - - T3* c = flagC ? (C + row + col * ldc) : (C + row * ldc + col); - T3 val = 0; + auto func = PRAGMA_THREADS_FOR_2D { ; + for (auto row = start_x; row < stop_x; row += inc_x) { + for (auto col = start_y; col < stop_y; col += inc_y) { + T3 *c = flagC ? (C + row + col * ldc) : (C + row * ldc + col); + T3 val = 0; - PRAGMA_OMP_SIMD - for(uint i = 0; i < K; ++i) { - T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda); - T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i); - val += alphaZ * a * b; + PRAGMA_OMP_SIMD + for (uint i = 0; i < K; ++i) { + T3 a = flagA ? *(A + row * lda + i) : *(A + row + i * lda); + T3 b = flagB ? *(B + col + i * ldb) : *(B + col * ldb + i); + val += alphaZ * a * b; + } + + if (betaZ) + *c = val + betaZ * *c; + else + *c = val; } - - if(betaZ) - *c = val + betaZ * *c; - else - *c = val; - } - } + } + }; + + samediff::Threads::parallel_for(func, 0, M, 1, 0, N, 1); } ////////////////////////////////////////////////////////////////////////////// @@ -108,24 +111,27 @@ static void usualGemv(const char aOrder, const int M, const int N, const double const bool flagA = aOrder == 'f'; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(M > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) - for(int row = 0; row < M; ++row) { - - T3* y = Y + row * incy; - T3 val = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto row = start; row < stop; row += increment) { - PRAGMA_OMP_SIMD - for(int i = 0; i < N; ++i) { - T3 a = flagA ? *(A + row + i * lda) : *(A + row * lda + i); - T3 x = *(X + i * incx); - val += alphaZ * a * x; + T3 *y = Y + row * incy; + T3 val = 0; + + PRAGMA_OMP_SIMD + for (int i = 0; i < N; ++i) { + T3 a = flagA ? 
*(A + row + i * lda) : *(A + row * lda + i); + T3 x = *(X + i * incx); + val += alphaZ * a * x; + } + + if (betaZ) + *y = val + betaZ * *y; + else + *y = val; } - - if(betaZ) - *y = val + betaZ * *y; - else - *y = val; - } + }; + + samediff::Threads::parallel_for(func, 0, M); } ////////////////////////////////////////////////////////////////////////////// @@ -141,7 +147,7 @@ static void usualDot(const Nd4jLong length, const double alpha, const void* vX, T3 sum = 0; PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(length > Environment::getInstance()->elementwiseThreshold()) schedule(guided) reduction(OMP_SUMT:sum)) for(int i = 0; i < length; ++i) - sum = sum + X[i * incx] * Y[i * incy]; + sum += X[i * incx] * Y[i * incy]; *Z = alphaZ * sum + betaZ * *Z; } diff --git a/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp index 5f8789077..c4c2fa995 100644 --- a/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp +++ b/libnd4j/include/helpers/cpu/TrueBroadcastHelper.cpp @@ -19,6 +19,7 @@ // #include +#include using namespace simdOps; diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp index 22ff3e6b1..4bd456da2 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp @@ -44,62 +44,67 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const Nd4jLong* tadShape = shape::shapeOf(const_cast(tadShapeInfo)); const Nd4jLong* tadStride = shape::stride(const_cast(tadShapeInfo)); - int tadsPerThread = zLen / TAD_THRESHOLD; - int numThreads = nd4j::math::nd4j_max(1, tadsPerThread); - numThreads = nd4j::math::nd4j_min(numThreads, omp_get_max_threads()); - switch (kindOfLoop) { //*********************************************// case nd4j::LoopKind::EWS1: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) 
+ tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i] = (Z) indexValue.index; } + }; - z[i] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; //*********************************************// case nd4j::LoopKind::EWSNONZERO: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j * tadEws], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j * tadEws], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i * zEws] = (Z) indexValue.index; } + }; - z[i * zEws] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; //*********************************************// case nd4j::LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + 
tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadLen; ++i0) { - functions::indexreduce::IndexValue comp(tad[i0 * tadStride[0]], i0); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadLen; ++i0) { + functions::indexreduce::IndexValue comp(tad[i0 * tadStride[0]], i0); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i] = (Z) indexValue.index; } + }; - z[i] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -108,22 +113,25 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[2]; shape::updateStrides(2, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1]; - const auto tadIndex = i0 * newStride[0] + i1; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1]; + const auto tadIndex = i0 * newStride[0] + i1; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -132,24 +140,27 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* 
xShapeInfo, Nd4jLong newStride[3]; shape::updateStrides(3, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]; + const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -158,26 +169,29 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[4]; shape::updateStrides(4, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < 
tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + for (uint i3 = 0; i3 < tadShape[3]; ++i3) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]; + const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -186,28 +200,31 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Nd4jLong newStride[5]; shape::updateStrides(5, tadShape, newStride, 'c'); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; ++i) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - for (uint i4 = 0; i4 < tadShape[4]; ++i4) { - const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * 
tadStride[3] + i4 * tadStride[4]; - const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4; - functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint i0 = 0; i0 < tadShape[0]; ++i0) { + for (uint i1 = 0; i1 < tadShape[1]; ++i1) { + for (uint i2 = 0; i2 < tadShape[2]; ++i2) { + for (uint i3 = 0; i3 < tadShape[3]; ++i3) { + for (uint i4 = 0; i4 < tadShape[4]; ++i4) { + const auto tadOffset = i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]; + const auto tadIndex = i0 * newStride[0] + i1 * newStride[1] + i2 * newStride[2] + i3 * newStride[3] + i4; + functions::indexreduce::IndexValue comp(tad[tadOffset], tadIndex); + indexValue = OpType::update(indexValue, comp, extraParams); + } } } } } - } - z[i] = (Z) indexValue.index; - } + z[i] = (Z) indexValue.index; + } + }; + + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -216,19 +233,22 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, uint castZShapeInfo[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - functions::indexreduce::IndexValue comp(tad[j * tadEws], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + functions::indexreduce::IndexValue comp(tad[j * tadEws], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, 
canCastZ); + z[zOffset] = (Z) indexValue.index; } + }; - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -237,19 +257,22 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, uint castTadShapeInfo[MAX_RANK]; const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - functions::indexreduce::IndexValue comp(tad[tadOffset], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + functions::indexreduce::IndexValue comp(tad[tadOffset], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + z[i * zEws] = (Z) indexValue.index; } + }; - z[i * zEws] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } break; @@ -260,20 +283,23 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - PRAGMA_OMP_PARALLEL_FOR_THREADS(numThreads) - for (uint i = 0; i < zLen; i++) { - auto tad = const_cast(x) + tadOffsets[i]; - auto indexValue = OpType::startingIndexValue(tad); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += 
increment) { + auto tad = const_cast(x) + tadOffsets[i]; + auto indexValue = OpType::startingIndexValue(tad); - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - functions::indexreduce::IndexValue comp(tad[tadOffset], j); - indexValue = OpType::update(indexValue, comp, extraParams); + for (uint j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + functions::indexreduce::IndexValue comp(tad[tadOffset], j); + indexValue = OpType::update(indexValue, comp, extraParams); + } + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = (Z) indexValue.index; } + }; - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = (Z) indexValue.index; - } + samediff::Threads::parallel_tad(func, 0, zLen); } } } diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp index 16bf3b08b..b8405553e 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp @@ -28,31 +28,31 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); #endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* 
yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); #endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); #endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, 
int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp index 4e350ce15..44ccea08c 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp @@ -28,31 +28,31 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); #endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template 
loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); #endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); #endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); #endif } diff --git 
a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp index e869793a8..ec261a7ea 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp @@ -28,31 +28,31 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams) { + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); #endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); #endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* 
dims, int dimsLen, Y *extraParams) { + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); #endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp index 474443fd3..3b1efadc9 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp @@ -28,31 +28,31 @@ namespace nd4j { template template - void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* 
extraParams) { + void Reduction3Loops::innerloopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams); + Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams, start, stop); #endif } template template - void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams) { + void Reduction3Loops::innerloopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams); + Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop); #endif } template - void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams) { + void Reduction3Loops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, int* dims, int dimsLen, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dims, dimsLen, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3, PARAMS(x, xShapeInfo, y, yShapeInfo, z, 
zShapeInfo, dims, dimsLen, extraParams, start, stop), REDUCE3_OPS); #endif } template - void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams) { + void Reduction3Loops::wrapperAll(const int opNum, X *x, Nd4jLong *xShapeInfo, X *y, Nd4jLong *yShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, Y* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce3All, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xTadOffsets, yTadShapeInfo, yTadOffsets, extraParams, start, stop), REDUCE3_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp index 3d7a85eff..151bc6a82 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp @@ -26,9 +26,9 @@ namespace nd4j { template template - void ReductionBoolLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { + void ReductionBoolLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -36,9 +36,9 @@ 
namespace nd4j { void ReductionBoolLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, - X *extraParams) { + X *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_BOOL_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_BOOL_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp index f545c8c83..af8b0b451 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp @@ -28,18 +28,18 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); #endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, 
xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp index fa52015ca..137ffc011 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp @@ -28,18 +28,18 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); #endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp index eb144fcc6..79b11b419 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp @@ -28,18 +28,18 @@ namespace nd4j { 
template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); #endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp index d2991b51b..ddedd6c18 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp @@ -28,18 +28,18 @@ namespace nd4j { template template - void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams) { + void ReductionFloatLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - ReductionLoops::template loopReduce(x, xShapeInfo, 
z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); #endif } template void ReductionFloatLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, Y *extraParams) { + Nd4jLong *tadOffsets, Y *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_FLOAT_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_FLOAT_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp index 04a3d8559..2e7708497 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp @@ -33,18 +33,18 @@ namespace nd4j { template template - void ReductionLongLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z *z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { + void ReductionLongLoops::innerloopReduce(X * x, Nd4jLong* xShapeInfo, Z *z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); #endif } template void ReductionLongLoops::wrapper(const int opNum, X *x, Nd4jLong *xShapeInfo, Y *z, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, X *extraParams) { + Nd4jLong *tadOffsets, X *extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - 
DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams ), REDUCE_LONG_OPS); + DISPATCH_BY_OPNUM_TT(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_LONG_OPS); #endif } diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp index 9932b04c5..08a67ec59 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp @@ -26,9 +26,9 @@ namespace nd4j { template template - void ReductionSameLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { + void ReductionSameLoops::innerloopReduce(X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS - ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams); + ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -36,13 +36,13 @@ namespace nd4j { void ReductionSameLoops::wrapper(const int opNum, X *vx, Nd4jLong *xShapeInfo, X *vz, Nd4jLong *zShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, - X *vextraParams) { + X *vextraParams, int64_t start, int64_t stop) { #ifndef INLINE_LOOPS auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams), REDUCE_SAME_OPS); + DISPATCH_BY_OPNUM_T(innerloopReduce, PARAMS(x, xShapeInfo, z, zShapeInfo, tadShapeInfo, tadOffsets, extraParams, start, stop), REDUCE_SAME_OPS); #endif } diff --git 
a/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu index 152e74652..8f67f0004 100644 --- a/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu +++ b/libnd4j/include/helpers/cuda/TrueBroadcastHelper.cu @@ -24,6 +24,7 @@ #include #include #include +#include // #include // #include diff --git a/libnd4j/include/helpers/impl/BlasHelper.cpp b/libnd4j/include/helpers/impl/BlasHelper.cpp index 61b542697..bf52fe2c6 100644 --- a/libnd4j/include/helpers/impl/BlasHelper.cpp +++ b/libnd4j/include/helpers/impl/BlasHelper.cpp @@ -74,7 +74,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMV() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemv; @@ -83,7 +83,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMV() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemv; @@ -132,14 +132,14 @@ namespace nd4j { bool BlasHelper::hasGEMV(const nd4j::DataType dtype) { if(dtype == DataType::FLOAT32) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemv; #endif } if(dtype == DataType::DOUBLE) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemv; @@ -150,7 +150,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMM() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemm; @@ -159,7 +159,7 @@ namespace nd4j { template <> bool BlasHelper::hasGEMM() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || 
defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemm; @@ -208,14 +208,14 @@ namespace nd4j { bool BlasHelper:: hasGEMM(const nd4j::DataType dtype) { if(dtype == DataType::FLOAT32) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasSgemm; #endif } if(dtype == DataType::DOUBLE) { - #if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) + #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; #else return _hasDgemm; @@ -276,14 +276,14 @@ namespace nd4j { } CblasSgemv BlasHelper::sgemv() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__)|| defined(HAVE_OPENBLAS) return (CblasSgemv)&cblas_sgemv; #else return this->cblasSgemv; #endif } CblasDgemv BlasHelper::dgemv() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasDgemv)&cblas_dgemv; #else return this->cblasDgemv; @@ -291,7 +291,7 @@ namespace nd4j { } CblasSgemm BlasHelper::sgemm() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasSgemm)&cblas_sgemm; #else return this->cblasSgemm; @@ -299,7 +299,7 @@ namespace nd4j { } CblasDgemm BlasHelper::dgemm() { -#if defined(__EXTERNAL_BLAS__) || defined(HAVE_MKLDNN) || defined(HAVE_OPENBLAS) +#if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return (CblasDgemm)&cblas_dgemm; #else return this->cblasDgemm; diff --git a/libnd4j/include/helpers/impl/DebugHelper.cpp b/libnd4j/include/helpers/impl/DebugHelper.cpp index f1ba8a755..704c463e6 100644 --- a/libnd4j/include/helpers/impl/DebugHelper.cpp +++ b/libnd4j/include/helpers/impl/DebugHelper.cpp @@ -23,6 +23,7 @@ #include #include 
#include +#include namespace nd4j { DebugInfo DebugHelper::debugStatistics(NDArray const* input) { @@ -88,11 +89,18 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_m } *info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount}; _stdDevValue = 0; //math::nd4j_sqrt(info->_stdDevValue / (input->lengthOf() - 1)); -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule (static) reduction(+:_stdDevValue)) - for (Nd4jLong e = 0; e < input->lengthOf(); e++) { - double current = input->e(e); - _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue; - } + + auto func = PRAGMA_REDUCE_DOUBLE { + auto _stdDevValue = 0.0; + for (auto e = start; e < stop; e++) { + double current = input->e(e); + _stdDevValue += (info->_meanValue - current) * (info->_meanValue - current); //info->_minValue; + } + + return _stdDevValue; + }; + _stdDevValue = samediff::Threads::parallel_double(func, LAMBDA_AD { return _old + _new; }, 0, input->lengthOf()); + info->_stdDevValue = math::nd4j_sqrt(_stdDevValue / input->lengthOf()); } diff --git a/libnd4j/include/helpers/impl/GradCheck.cpp b/libnd4j/include/helpers/impl/GradCheck.cpp index a3ae7d1ac..8b24e5f16 100644 --- a/libnd4j/include/helpers/impl/GradCheck.cpp +++ b/libnd4j/include/helpers/impl/GradCheck.cpp @@ -33,13 +33,11 @@ void GradCheck::fillGradArrays(const LossFunc loss, const std::vector& switch(loss) { case MEAN: - PRAGMA_OMP_PARALLEL_FOR_IF(numInGradArrs > 1) for(int i = 0; i < numInGradArrs; ++i) *gradArrs[i] = 1. 
/ gradArrs[i]->lengthOf(); break; case SUM: - PRAGMA_OMP_PARALLEL_FOR_IF(numInGradArrs > 1) for(int i = 0; i < numInGradArrs; ++i) *gradArrs[i] = 1.; break; diff --git a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp index a4b9c4000..80e456e29 100644 --- a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp +++ b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp @@ -45,7 +45,7 @@ OmpLaunchHelper::OmpLaunchHelper(const Nd4jLong N, float desiredNumThreads) { else desiredNumThreads = nd4j::math::nd4j_min(omp_get_max_threads(), desiredNumThreads); #else - desiredNumThreads = 1; + desiredNumThreads = nd4j::Environment::getInstance()->maxThreads(); #endif _numThreads = nd4j::math::nd4j_min(N / maxItersPerThread, desiredNumThreads); } @@ -75,7 +75,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP return betterThreads(N, omp_get_max_threads()); #else - return 1; + return betterThreads(N, nd4j::Environment::getInstance()->maxThreads()); #endif } @@ -92,7 +92,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP auto maxThreads = omp_get_max_threads(); #else - auto maxThreads = 1; + auto maxThreads = nd4j::Environment::getInstance()->maxThreads(); #endif // if there's only 1 thread allowed - nothing to do here diff --git a/libnd4j/include/loops/aggregates.h b/libnd4j/include/loops/aggregates.h deleted file mode 100644 index 8fbdefcaf..000000000 --- a/libnd4j/include/loops/aggregates.h +++ /dev/null @@ -1,66 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. 
- * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// - -#ifndef LIBND4J_AGGREGATES_H -#define LIBND4J_AGGREGATES_H - -#include -#include -#include - -namespace functions { -namespace aggregate { - - template - class AggregatedFunction { - - public: -#ifdef __CUDACC__ - template - __device__ static void execCuda(X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments); - - __device__ static void execCuda(int opNum, X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments); - - __device__ static void aggregateBatch(int numAggregates, int opNum, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments); - - __host__ static void aggregateBatchKernelGeneric(dim3& launchDims, cudaStream_t *stream, int opNum, int numAggregates, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, void *ptrToArguments); - - __host__ static void aggregateKernelGeneric(dim3& launchDims, cudaStream_t *stream, int opNum, void **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, void *realArguments, int numRealArguments); - -#endif - - template - inline static void exec(X 
**arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments) { - OpClass::executeAggregate(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); - } - - inline static void exec(int opNum, X **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, X *realArguments, int numRealArguments) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), AGGREGATE_OPS); - } - }; -} -} - -#ifdef __CUDACC__ - - -#endif - -#endif //LIBND4J_AGGREGATES_H diff --git a/libnd4j/include/loops/broadcasting.h b/libnd4j/include/loops/broadcasting.h index cc0331549..a38e79c3f 100755 --- a/libnd4j/include/loops/broadcasting.h +++ b/libnd4j/include/loops/broadcasting.h @@ -91,7 +91,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void execInverse(int opNum, void *x, @@ -105,7 +105,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void exec(int opNum, void *x, @@ -119,7 +121,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * 
CPU execution @@ -144,7 +148,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -158,7 +164,10 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/broadcasting_bool.h b/libnd4j/include/loops/broadcasting_bool.h index a3098abbb..3b0958be1 100644 --- a/libnd4j/include/loops/broadcasting_bool.h +++ b/libnd4j/include/loops/broadcasting_bool.h @@ -89,7 +89,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void exec(int opNum, void *x, @@ -103,7 +103,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void execInverse(int opNum, void *x, @@ -117,7 +119,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -142,7 +146,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -156,7 +162,10 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, 
+ uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/broadcasting_int.h b/libnd4j/include/loops/broadcasting_int.h index 84bc0f949..92e4ca7dd 100644 --- a/libnd4j/include/loops/broadcasting_int.h +++ b/libnd4j/include/loops/broadcasting_int.h @@ -89,7 +89,7 @@ namespace functions { static __host__ void execInverseBroadcast(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *result, Nd4jLong *resultShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else static void exec(int opNum, void *x, @@ -103,7 +103,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); static void execInverse(int opNum, void *x, @@ -117,7 +119,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); /** * CPU execution @@ -142,7 +146,9 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); template static void execInverse(void *x, @@ -156,7 +162,10 @@ namespace functions { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ); + Nd4jLong *tadOffsetZ, + uint64_t start, + uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 3bd619827..37dbf833f 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, 
Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TTT(execInverse, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_OPS); + zTadOffset, start, stop), BROADCAST_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_OPS); + zTadOffset, start, stop), BROADCAST_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -131,10 +138,6 @@ namespace functions { auto lenZ = shape::length(zTadShapeInfo); auto lenY = shape::length(yShapeInfo); - int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); - auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); @@ -142,19 +145,17 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + for (auto i = start; i < stop; i++) { + auto oX = x + tadOffsets[i]; + auto oZ = z + zTadOffset[i]; - PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) - oZ[f] 
= OpType::op(oX[f], y[f]); + PRAGMA_OMP_SIMD + for (unsigned int f = 0; f < tadLength; f++) + oZ[f] = OpType::op(oX[f], y[f]); } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO){ - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -164,13 +165,10 @@ namespace functions { } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -182,70 +180,61 @@ namespace functions { } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = 
nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -253,17 +242,15 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < 
stop; i++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } } @@ -285,7 +272,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -319,7 +308,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -328,8 +317,7 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if(kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (unsigned int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -339,24 +327,20 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; 
f++) oZ[f * zEws] = OpType::op(x[f * xEws], oY[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oY = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -365,73 +349,63 @@ namespace functions { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < 
tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; @@ -439,20 +413,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, 
canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } - } + }; } } } diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index bca423e3e..7a3eb1e31 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_BOOL_OPS); + zTadOffset, start, stop), BROADCAST_BOOL_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_TT(execInverse, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_BOOL_OPS); + zTadOffset, start, stop), BROADCAST_BOOL_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -133,7 +140,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = 
nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); @@ -142,10 +149,9 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) @@ -153,101 +159,86 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } - } + }; } else 
if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } - } + }; + } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); 
bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -255,20 +246,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } - } + }; } } @@ -286,7 +275,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -320,7 +311,7 @@ namespace functions { int 
tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -329,8 +320,7 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -340,8 +330,7 @@ namespace functions { } } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -355,14 +344,10 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); @@ -377,15 +362,13 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + 
tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } @@ -398,15 +381,13 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } @@ -419,16 +400,14 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -442,9 +421,7 @@ namespace functions { bool canCastY = 
nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp b/libnd4j/include/loops/cpu/broadcasting_int.cpp index 375d7577a..9dcce7545 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -43,7 +44,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, y, @@ -55,7 +58,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_INT_OPS); + zTadOffset, start, stop), BROADCAST_INT_OPS); } template @@ -71,7 +74,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { DISPATCH_BY_OPNUM_T(execInverse, PARAMS(x, xShapeInfo, y, @@ -83,7 +88,7 @@ namespace functions { xTadShapeInfo, xTadOffset, zTadShapeInfo, - zTadOffset), BROADCAST_INT_OPS); + zTadOffset, start, stop), BROADCAST_INT_OPS); } template @@ -99,7 +104,9 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -133,7 +140,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + 
threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); @@ -142,112 +149,95 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; - auto oZ = z + zTadOffset[i]; + auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], y[f]); - } + }; } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; bool canCastX = 
nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - 
+ for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } - } + }; } else { - uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; @@ -255,20 +245,18 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } - } + }; } } @@ -286,7 +274,9 @@ namespace functions { Nd4jLong *yTadShapeInfo, Nd4jLong *yTadOffset, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffset) { + Nd4jLong *zTadOffset, + uint64_t start, + uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -320,7 +310,7 @@ namespace functions { int tadsPerThread = tads / TAD_THRESHOLD; int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); + 
threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); @@ -329,46 +319,39 @@ namespace functions { const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(x[f], oY[f]); - } + }; } else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (uint f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(x[f * xEws], oY[f * yEws]); - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; - // TODO: cover this codebranch with tests - // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo)) { @@ -377,64 +360,54 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, 
tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; - PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } - } + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { - uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z 
+ zTadOffset[i]; auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } - } + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; @@ -442,9 +415,7 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); - PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) - for (int i = 0; i < tads; i++) { - + for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; @@ -455,7 +426,7 @@ namespace functions { auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } - } + }; } } diff --git a/libnd4j/include/loops/cpu/indexreduce.cpp b/libnd4j/include/loops/cpu/indexreduce.cpp index 23286ecd9..df3fd64a9 100644 --- a/libnd4j/include/loops/cpu/indexreduce.cpp +++ b/libnd4j/include/loops/cpu/indexreduce.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include "../legacy_ops.h" using namespace simdOps; @@ -44,8 +45,7 @@ void IndexReduce::exec(const int opNum, void *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - -DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), INDEX_REDUCE_OPS); } //////////////////////////////////////////////////////////////////////// 
@@ -64,42 +64,41 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + IndexValue intermediatery[64]; + for (int e = 0; e < maxThreads; e++) + intermediatery[e].index = -1; if (xEws == 1) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingIndexValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); + auto func = PRAGMA_THREADS_FOR { + intermediatery[thread_id] = OpType::startingIndexValue(x); - auto ulen = info.getItersPerThread(threadNum); - - for (Nd4jLong i = 0; i < ulen; i++) { - IndexValue curr(x[i + threadOffset], i + threadOffset); - local = OpType::update(local, curr, extraParams); + for (auto i = start; i < stop; i += increment) { + IndexValue curr(x[i], i); + intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, len, 1, maxThreads); + + for (int e = 0; e < maxThreads; e++) + startingIndex = OpType::update(startingIndex, intermediatery[e], extraParams); - PRAGMA_OMP_CRITICAL - startingIndex = OpType::update(startingIndex, local, extraParams); - } } else { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingIndexValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); + auto func = PRAGMA_THREADS_FOR { + intermediatery[thread_id] = OpType::startingIndexValue(x); - auto ulen = info.getItersPerThread(threadNum); - - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, canCastX); - IndexValue curr(x[offset], threadOffset + i); - local = OpType::update(local, curr, extraParams); + for (auto i = start; i < stop; 
i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + IndexValue curr(x[offset], i); + intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams); } + }; - PRAGMA_OMP_CRITICAL - startingIndex = OpType::update(startingIndex, local, extraParams); - } + maxThreads = samediff::Threads::parallel_for(func, 0, len, 1, maxThreads); + + for (int e = 0; e < maxThreads; e++) + startingIndex = OpType::update(startingIndex, intermediatery[e], extraParams); } return startingIndex.index; } @@ -124,9 +123,10 @@ void IndexReduce::exec(void *vx, Nd4jLong *xShapeInfo, if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto indexValue = OpType::startingIndexValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(zLen > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < zLen; i++) - z[i] = (Z) indexValue.index;; + z[i] = (Z) indexValue.index; + return; } diff --git a/libnd4j/include/loops/cpu/pairwise.hpp b/libnd4j/include/loops/cpu/pairwise.hpp index 9dfa129aa..1fc85e5d8 100644 --- a/libnd4j/include/loops/cpu/pairwise.hpp +++ b/libnd4j/include/loops/cpu/pairwise.hpp @@ -26,6 +26,7 @@ #include #include #include +#include using namespace simdOps; @@ -42,7 +43,9 @@ namespace functions { void *z, Nd4jLong zEws, void *extraParams, - Nd4jLong n) { + Nd4jLong n, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, xEws, y, @@ -50,7 +53,7 @@ namespace functions { z, zEws, extraParams, - n), PAIRWISE_TRANSFORM_OPS); + n, start, stop), PAIRWISE_TRANSFORM_OPS); }; @@ -61,48 +64,24 @@ namespace functions { void *vy, Nd4jLong yEws, void *vz, Nd4jLong zEws, void *vextraParams, - const Nd4jLong n) { + const Nd4jLong n, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(n); - if 
(xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -115,14 +94,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_TRANSFORM_OPS); }; @@ -136,7 +117,9 @@ namespace functions { Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -148,7 +131,6 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); if (shape::isScalar(yShapeInfo)) { @@ -156,38 +138,22 @@ namespace functions { const bool 
canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ -198,96 +164,63 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, 
zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); } } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], 
extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = 
shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -295,20 +228,13 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/pairwise2.hpp b/libnd4j/include/loops/cpu/pairwise2.hpp deleted file mode 100644 index 17acd35b7..000000000 --- 
a/libnd4j/include/loops/cpu/pairwise2.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by remote on 2018-09-20. -// - -#include -#include -#include -#include -#include -#include -#include - -using namespace simdOps; - -namespace functions { - namespace pairwise_transforms { - - template - void PairWiseTransform::exec( - const int opNum, - void *x, - Nd4jLong xEws, - void *y, - Nd4jLong yEws, - void *z, - Nd4jLong zEws, - void *extraParams, - Nd4jLong n) { - DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, - xEws, - y, - yEws, - z, - zEws, - extraParams, - n), PAIRWISE_TRANSFORM_OPS); - }; - - - - template - template - void PairWiseTransform::exec(void *vx, Nd4jLong xEws, - void *vy, Nd4jLong yEws, - void *vz, Nd4jLong zEws, - void *vextraParams, - const Nd4jLong n) { - - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); - - nd4j::OmpLaunchHelper info(n); - - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; 
- auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } - } - else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } - } - } - } -} diff --git a/libnd4j/include/loops/cpu/pairwise_bool.cpp b/libnd4j/include/loops/cpu/pairwise_bool.cpp index 8feabb98a..2259c37b0 100644 --- a/libnd4j/include/loops/cpu/pairwise_bool.cpp +++ b/libnd4j/include/loops/cpu/pairwise_bool.cpp @@ -22,6 +22,7 @@ #include #include #include +#include using namespace simdOps; @@ -38,7 +39,9 @@ namespace functions { void *z, Nd4jLong zEws, void *extraParams, - Nd4jLong n) { + Nd4jLong n, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xEws, y, @@ -46,7 +49,7 @@ namespace functions { z, zEws, extraParams, - n), PAIRWISE_BOOL_OPS); + n, start, stop), PAIRWISE_BOOL_OPS); }; @@ -60,46 +63,24 @@ namespace functions { void *vz, Nd4jLong zEws, void *vextraParams, - const Nd4jLong n) { + const Nd4jLong n, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(n); - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = 
static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -112,14 +93,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - extraParams), + extraParams, start, stop), PAIRWISE_BOOL_OPS); }; @@ -129,7 +112,9 @@ namespace functions { void PairWiseBoolTransform::exec(void *vx, Nd4jLong* xShapeInfo, void *vy, Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -141,8 +126,6 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); - if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; @@ -150,37 +133,22 @@ namespace functions { if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - 
auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ -189,96 +157,62 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, 
shape::length(yShapeInfo), start, stop); } else { - if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = 
OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + 
PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -286,20 +220,13 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/pairwise_int.cpp b/libnd4j/include/loops/cpu/pairwise_int.cpp index 63b9dc8c8..673951d6a 100644 --- a/libnd4j/include/loops/cpu/pairwise_int.cpp +++ b/libnd4j/include/loops/cpu/pairwise_int.cpp @@ -22,6 +22,7 @@ #include #include #include +#include using namespace simdOps; @@ -38,7 +39,9 @@ namespace functions { void *z, Nd4jLong zEws, void *extraParams, - Nd4jLong n) { + 
Nd4jLong n, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xEws, y, @@ -46,7 +49,7 @@ namespace functions { z, zEws, extraParams, - n), PAIRWISE_INT_OPS); + n, start, stop), PAIRWISE_INT_OPS); }; @@ -60,46 +63,24 @@ namespace functions { void *vz, Nd4jLong zEws, void *vextraParams, - const Nd4jLong n) { + const Nd4jLong n, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(n); - if (xEws == 1 && yEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto yi = y + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], yi[i], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], y[i], extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto yi = y + yEws*threadOffset; - auto zi = z + zEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - zi[i*zEws] = OpType::op(xi[i*xEws], yi[i*yEws], extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i*zEws] = OpType::op(x[i*xEws], y[i*yEws], extraParams); } } @@ -112,14 +93,16 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams) { + void *extraParams, + const uint64_t start, + const uint64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, - 
extraParams), + extraParams, start, stop), PAIRWISE_INT_OPS); }; @@ -129,7 +112,9 @@ namespace functions { void PairWiseIntTransform::exec(void *vx, Nd4jLong* xShapeInfo, void *vy, Nd4jLong* yShapeInfo, void *vz, Nd4jLong* zShapeInfo, - void *vextraParams) { + void *vextraParams, + const uint64_t start, + const uint64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -141,46 +126,28 @@ namespace functions { auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::OmpLaunchHelper info(n); - if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[0], extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - 
z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); - } - } + PRAGMA_OMP_SIMD + for(auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); + }; } return; } @@ -189,96 +156,63 @@ namespace functions { const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { - exec(x, xEws, y, yEws, z, zEws, extraParams, n); + exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape - exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); + exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool 
canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[offset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[offset], y[offset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[offset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, 
canCastY); + z[offset] = OpType::op(x[offset], y[yOffset], extraParams); + }; } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - z[offset] = OpType::op(x[xOffset], y[offset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + z[offset] = OpType::op(x[xOffset], y[offset], extraParams); + }; } else { - uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -286,20 +220,13 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = 
shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); + }; } } } diff --git a/libnd4j/include/loops/cpu/random.cpp b/libnd4j/include/loops/cpu/random.cpp index 5abc1447a..d4c808719 100644 --- a/libnd4j/include/loops/cpu/random.cpp +++ b/libnd4j/include/loops/cpu/random.cpp @@ -52,28 +52,22 @@ namespace functions { auto length = shape::length(zShapeInfo); -// nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - nd4j::OmpLaunchHelper info(length); - if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + for (auto i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { @@ -82,19 +76,16 @@ namespace functions { const 
bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -103,19 +94,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); 
z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else if (shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { @@ -124,19 +112,16 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else { @@ -147,20 +132,17 @@ namespace functions { const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, 
yShapeInfoCast, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], y[yOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } }; @@ -184,41 +166,34 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - nd4j::OmpLaunchHelper info(length); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } else { uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, 
xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + for (uint64_t i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } } @@ -232,25 +207,21 @@ namespace functions { auto length = shape::length(zShapeInfo); - //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[offset] = OpClass::op(i+threadOffset, length, rng, extraArguments); + for (uint64_t i = start; i < stop; i += increment) { + auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[offset] = OpClass::op(i, length, rng, extraArguments); } - } + }; + + samediff::Threads::parallel_for(func, 0, length, 1); } template diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index 246d18ac4..882b1740e 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -55,7 +55,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = 
OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -65,25 +65,14 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); - - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + z[0] = OpType::postProcess(startingValue, length, extraParams); } } @@ -102,23 +91,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], 
OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -150,8 +130,8 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), REDUCE_BOOL_OPS); + Nd4jLong *tadOffset, int64_t start, int64_t stop) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset, start, stop), REDUCE_BOOL_OPS); } template @@ -164,7 +144,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -176,7 +156,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -205,9 +185,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - 
nd4j::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -227,49 +207,33 @@ namespace functions { template template Z _CUDA_H ReduceBoolFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); - if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) { - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - } - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); + auto func = PRAGMA_THREADS_FOR { + if (xEws == 1) { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = 
static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); + // return result + return OpType::postProcess(intermediate[0], length, extraParams); } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp index a94a19b25..112656852 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp @@ -59,9 +59,10 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; + return; } @@ -69,25 +70,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = 
OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -105,23 +110,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < omp_get_max_threads(); e++) - start = OpType::update(start, 
intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -153,7 +149,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, @@ -162,7 +158,7 @@ namespace functions { dimension, dimensionLength, tadShapeInfo, - tadOffset), + tadOffset, start, stop), REDUCE_FLOAT_OPS); } @@ -176,7 +172,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -188,7 +184,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = std::is_same>::value ? 
nd4j::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(x)); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; return; @@ -222,9 +218,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -245,49 +241,34 @@ namespace functions { template Z _CUDA_H ReduceFloatFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); - int nt = info._numThreads; + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); - if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, 
local, extraParams); + auto func = PRAGMA_THREADS_FOR { + if (xEws == 1) { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceFloatFunction, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index 1a148805e..76dc209f6 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -55,7 +55,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -65,25 
+65,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -103,23 +107,14 @@ namespace functions { return execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - auto intermediate = new X[nd4j::math::nd4j_max(1, omp_get_max_threads())]; - for (int e = 0; e < omp_get_max_threads(); e++) - intermediate[e] = 
start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < omp_get_max_threads(); e++) - start = OpType::update(start, intermediate[e], extraParams); - - delete[] intermediate; - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -152,8 +147,8 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset), REDUCE_LONG_OPS); + Nd4jLong *tadOffset, int64_t start, int64_t stop) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffset, start, stop), REDUCE_LONG_OPS); } template @@ -166,7 +161,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vresult); @@ -178,7 +173,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = startingVal; 
return; @@ -212,9 +207,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -235,48 +230,34 @@ namespace functions { template Z _CUDA_H ReduceLongFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + 
intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ReduceLongFunction, , LIBND4J_TYPES, LONG_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index 0dfff5e73..cbd7e6e12 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -57,7 +57,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; return; @@ -67,25 +67,29 @@ namespace functions { z[0] = execScalar(x, xEws, length, extraParams); } else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = 
start; - + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + X intermediate[64]; - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + }; - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - z[0] = OpType::postProcess(start, length, extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); + + // write out results + z[0] = OpType::postProcess(intermediate[0], length, extraParams); } } @@ -103,26 +107,15 @@ namespace functions { if (xEws >= 1) { return execScalar(x, xEws, length, extraParams); - } - else { - X start = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - X intermediate[256]; - - for (int e = 0; e < maxThreads; e++) - intermediate[e] = start; - + } else { + auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - 
PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) - for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); + for (auto i = 0; i < length; i++) + startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); - for (int e = 0; e < maxThreads; e++) - start = OpType::update(start, intermediate[e], extraParams); - - return OpType::postProcess(start, shape::length(xShapeInfo), extraParams); + return OpType::postProcess(startingValue, length, extraParams); } } @@ -154,7 +147,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, extraParams, @@ -163,7 +156,7 @@ namespace functions { dimension, dimensionLength, tadShapeInfo, - tadOffset), + tadOffset, start, stop), REDUCE_SAME_OPS); } @@ -177,7 +170,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { + Nd4jLong *tadOffset, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -189,7 +182,7 @@ namespace functions { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(zLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < zLength; i++) z[i] = startingVal; return; @@ -223,9 +216,9 @@ namespace functions { } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); 
#else - nd4j::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + nd4j::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -246,48 +239,34 @@ namespace functions { template X _CUDA_H ReduceSameFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { - auto x = reinterpret_cast(vx); - auto extraParams = reinterpret_cast(vextraParams); + auto x = reinterpret_cast(vx); + auto extraParams = reinterpret_cast(vextraParams); + int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + X intermediate[64]; - auto startingVal = OpType::startingValue(x); - nd4j::OmpLaunchHelper info(length); + PRAGMA_OMP_SIMD + for (auto e = 0; e < maxThreads; e++) + intermediate[e] = OpType::startingValue(x); + auto func = PRAGMA_THREADS_FOR { if (xEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i], extraParams), extraParams); - - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], extraParams), extraParams); + } else { + for (auto i = start; i < stop; i++) + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i * xEws], extraParams), extraParams); } - else { + }; - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto local = OpType::startingValue(x); - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = 
x + xEws*threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); - for (Nd4jLong i = 0; i < ulen; i++) - local = OpType::update(local, OpType::op(xi[i*xEws], extraParams), extraParams); + // merge results + for (int e = 1; e < maxThreads; e++) + intermediate[0] = OpType::update(intermediate[0], intermediate[e], extraParams); - PRAGMA_OMP_CRITICAL - startingVal = OpType::update(startingVal, local, extraParams); - } - } - return OpType::postProcess(startingVal, length, extraParams); - } + // return result + return OpType::postProcess(intermediate[0], length, extraParams); + } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT ReduceSameFunction, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cpu/reduce3.cpp b/libnd4j/include/loops/cpu/reduce3.cpp index fd09dc0e1..dbe93620a 100644 --- a/libnd4j/include/loops/cpu/reduce3.cpp +++ b/libnd4j/include/loops/cpu/reduce3.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -51,72 +52,82 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); - PRAGMA_OMP_PARALLEL_FOR_IF(length > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < length; i++) z[i] = startingVal; + return; } Z extraParamsVals[3] = {(Z) 0.0f, (Z) 0.0f, (Z) 0.0f}; - // it's possible case for EqualsWithEps op - if (extraParams != nullptr) - extraParamsVals[2] = extraParams[0]; uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); Z startingVal = OpType::startingValue(x); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); - nd4j::OmpLaunchHelper t(length, maxThreads); - Z intermediate[256]; - Z extraParamsLocal[3 * 256]; + int maxThreads = nd4j::math::nd4j_min(64, 
nd4j::Environment::getInstance()->maxThreads()); + Z intermediate[64]; + Z extraParamsLocal[3 * 64]; PRAGMA_OMP_SIMD for (int e = 0; e < maxThreads; e++) intermediate[e] = startingVal; - memset(extraParamsLocal, 0, 3 * 256 * sizeof(Z)); - if (extraParams != nullptr) + memset(extraParamsLocal, 0, 3 * 64 * sizeof(Z)); + if (extraParams != nullptr) { PRAGMA_OMP_SIMD - for (int e = 0; e < maxThreads; e++) - extraParamsLocal[3 * e + 2] = extraParams[0]; + // mostly for future reference + for (int e = 0; e < maxThreads; e++) { + extraParamsLocal[3 * e] = extraParams[0]; + extraParamsLocal[3 * e + 1] = extraParams[1]; + extraParamsLocal[3 * e + 2] = extraParams[2]; + } + } nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, yShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[i], y[i], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += 
increment) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } else { uint yShapeInfoCast[MAX_RANK]; const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) - for(unsigned int i = 0; i < length; i++) { - const auto threadNum = omp_get_thread_num(); - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); - intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); + intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } // merge step for (int e = 0; e < maxThreads; e++) OpType::aggregateExtraParams(extraParamsVals, extraParamsLocal + 3 * e); + for (int e = 0; e < maxThreads; e++) startingVal = OpType::update(startingVal, intermediate[e], extraParamsVals); + // writing out result z[0] = OpType::postProcess(startingVal, length, extraParamsVals); } @@ -139,7 +150,7 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength) { + int 
*dimension, int dimensionLength, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -151,9 +162,9 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, return; } #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -165,16 +176,16 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -188,7 +199,7 @@ void Reduce3:: execAll(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int 
dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, - Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { + Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop) { auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -196,9 +207,9 @@ void Reduce3:: execAll(void *vx, Nd4jLong *xShapeInfo, auto extraParams = reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams); + nd4j::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams); + nd4j::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #endif } @@ -209,9 +220,9 @@ void Reduce3::exec( const int opNum, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength) { + int *dimension, int dimensionLength, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, start, stop), REDUCE3_OPS); } @@ -223,9 +234,9 @@ void Reduce3::exec( const int opNum, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx,xShapeInfo,extraParamsVals,vy, 
yShapeInfo,vz,zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(exec, PARAMS(vx,xShapeInfo,extraParamsVals,vy, yShapeInfo,vz,zShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), REDUCE3_OPS); } @@ -238,9 +249,9 @@ void Reduce3::execAll(const int opNum, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, - Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { + Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop) { - DISPATCH_BY_OPNUM_TT(execAll, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), REDUCE3_OPS); + DISPATCH_BY_OPNUM_TT(execAll, PARAMS(vx, xShapeInfo, extraParamsVals, vy, yShapeInfo, vz, zShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, start, stop), REDUCE3_OPS); } diff --git a/libnd4j/include/loops/cpu/scalar.hpp b/libnd4j/include/loops/cpu/scalar.hpp index 79e53e4a2..071913e22 100644 --- a/libnd4j/include/loops/cpu/scalar.hpp +++ b/libnd4j/include/loops/cpu/scalar.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" using namespace simdOps; @@ -39,7 +40,8 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -63,29 +65,27 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == 
nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } else { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -98,9 +98,10 @@ void ScalarTransform::transform(int opNum, void *scalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -110,9 +111,10 @@ void ScalarTransform::transform(const int opNum, void *z, Nd4jLong zStride, void *scalar, void *extraParams, - const Nd4jLong n, bool allowParallelism) { + const uint64_t n, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xStride, z, zStride, scalar, extraParams, n, allowParallelism), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xStride, z, zStride, scalar, extraParams, n, start, stop), SCALAR_OPS); } 
//////////////////////////////////////////////////////////////////////// @@ -121,9 +123,10 @@ void ScalarTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams, bool allowParallelism) { + void *extraParams, + const uint64_t start, const uint64_t stop) { - DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, allowParallelism), SCALAR_OPS); + DISPATCH_BY_OPNUM_TTT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_OPS); } //////////////////////////////////////////////////////////////////////// @@ -132,7 +135,8 @@ template void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams, bool allowParallelism) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -146,48 +150,30 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len, allowParallelism); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); } else { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len, allowParallelism ? 
-1 : 1); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } } @@ -199,44 +185,22 @@ void ScalarTransform::transform(void *vx, Nd4jLong xEws, void *vz, Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len, bool allowParallelism) { + const uint64_t len, const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = 
reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len, allowParallelism ? -1 : 1); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index b37bdd6ef..d6dce445b 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" @@ -39,7 +40,8 @@ namespace functions { void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -64,29 +66,27 @@ namespace functions { 
return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } - else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + else { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -103,8 +103,8 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffsets) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_BOOL_OPS); + Nd4jLong *zTadOffsets, const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_BOOL_OPS); } @@ -116,8 +116,9 @@ namespace functions { Nd4jLong zEws, void *scalar, void *extraParams, - const Nd4jLong n) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n), SCALAR_BOOL_OPS); + const uint64_t n, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n, start, stop), SCALAR_BOOL_OPS); } template @@ -127,8 +128,9 @@ 
namespace functions { void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams) { - DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams), SCALAR_BOOL_OPS); + void *extraParams, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_TT(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_BOOL_OPS); } template @@ -138,7 +140,8 @@ namespace functions { void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -149,53 +152,33 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - // nd4j_logger("Launching scalar: xOrder: %i; zOrder: %i; xEWS: %i\n", xOrder, zOrder, xEws); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = 
OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } @@ -208,44 +191,23 @@ namespace functions { Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len) { + const uint64_t len, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - 
auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index 9e73e2756..5f2308418 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include "../legacy_ops.h" @@ -39,7 +40,8 @@ namespace functions { void *vscalars, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, - Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { + Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -64,29 +66,27 @@ namespace functions { return; } - int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); + int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); - } + }; } - else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (unsigned int r = 0; r < numTads; r++) { + else { + for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; 
f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); - } + }; } } @@ -103,8 +103,10 @@ namespace functions { Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, - Nd4jLong *zTadOffsets) { - DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets), SCALAR_INT_OPS); + Nd4jLong *zTadOffsets, + const uint64_t start, const uint64_t stop) { + + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, xTadShapeInfo, xTadOffsets, zTadShapeInfo, zTadOffsets, start, stop), SCALAR_INT_OPS); } @@ -116,8 +118,9 @@ namespace functions { Nd4jLong zEws, void *scalar, void *extraParams, - const Nd4jLong n) { - DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n), SCALAR_INT_OPS); + const uint64_t n, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xEws, z, zEws, scalar, extraParams, n, start, stop), SCALAR_INT_OPS); } template @@ -127,8 +130,9 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *scalar, - void *extraParams) { - DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams), SCALAR_INT_OPS); + void *extraParams, + const uint64_t start, const uint64_t stop) { + DISPATCH_BY_OPNUM_T(transform, PARAMS(x, xShapeInfo, z, zShapeInfo, scalar, extraParams, start, stop), SCALAR_INT_OPS); } template @@ -138,7 +142,8 @@ namespace functions { void *vz, Nd4jLong *zShapeInfo, void *vscalar, - void *vextraParams) { + void *vextraParams, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); @@ -149,53 +154,33 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - // nd4j_logger("Launching scalar: xOrder: %i; zOrder: %i; xEWS: %i\n", xOrder, zOrder, 
xEws); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { - transform(x, xEws, z, zEws, vscalar, extraParams, len); + transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::OmpLaunchHelper info(len); - if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - z[offset] = OpType::op(x[offset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + z[offset] = OpType::op(x[offset], scalar, extraParams); + }; } else { - uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); - } - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = 
shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); + }; } } @@ -208,44 +193,23 @@ namespace functions { Nd4jLong zEws, void *vscalar, void *vextraParams, - const Nd4jLong len) { + const uint64_t len, + const uint64_t start, const uint64_t stop) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; auto extraParams = reinterpret_cast(vextraParams); - nd4j::OmpLaunchHelper info(len); - if (xEws == 1 && zEws == 1) { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + threadOffset; - auto zi = z + threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i] = OpType::op(xi[i], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], scalar, extraParams); } else { - - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = info.getThreadOffset(threadNum); - auto xi = x + xEws * threadOffset; - auto zi = z + zEws * threadOffset; - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (unsigned int i = 0; i < ulen; i++) - zi[i * zEws] = OpType::op(xi[i * xEws], scalar, extraParams); - } + PRAGMA_OMP_SIMD + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], scalar, extraParams); } } diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index 1f5a7c339..a8f766f6a 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -24,6 +24,7 @@ #include #include #include +#include using namespace simdOps; @@ -90,8 +91,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; const bool 
canCast = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (Nd4jLong i = 0; i < length; i++) { - + for (uint64_t i = 0; i < length; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); SummaryStatsData curr; @@ -123,7 +123,7 @@ namespace functions { return; SummaryStatsData comp; comp.initWithValue(x[0]); - PRAGMA_OMP_PARALLEL_FOR_IF(resultLength > nd4j::Environment::getInstance()->elementwiseThreshold()) + for (uint i = 0; i < resultLength; i++) z[i] = OpType::getValue(biasCorrected, comp); return; @@ -157,35 +157,37 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast); - PRAGMA_OMP_PARALLEL_FOR - for (int r = 0; r < resultLength; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { - auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; - auto tx = x + tadOffsetForBlock; - SummaryStatsData comp; - comp.initWithValue(tx[0]); + auto tadOffsetForBlock = tadPack.primaryOffsets()[r]; + auto tx = x + tadOffsetForBlock; + SummaryStatsData comp; + comp.initWithValue(tx[0]); - if (tadEWS == 1 && tadOrder == 'c') { - for (int i = 1; i < tadLength; i ++) { - SummaryStatsData indexVal2; - indexVal2.initWithValue(tx[i]); + if (tadEWS == 1 && tadOrder == 'c') { + for (int i = 1; i < tadLength; i++) { + SummaryStatsData indexVal2; + indexVal2.initWithValue(tx[i]); - comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + } + } else { + for (int i = 1; i < tadLength; i++) { + auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); + + SummaryStatsData indexVal2; + indexVal2.initWithValue(tx[xOffset]); + + comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); + } } + + z[r] = OpType::getValue(biasCorrected, 
comp); } - else { - for (int i = 1; i < tadLength; i ++) { - auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); + }; - SummaryStatsData indexVal2; - indexVal2.initWithValue(tx[xOffset]); - - comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); - } - } - - z[r] = OpType::getValue(biasCorrected, comp); - } + samediff::Threads::parallel_tad(func, 0, resultLength, 1); } diff --git a/libnd4j/include/loops/cpu/transform/transform_any.cpp b/libnd4j/include/loops/cpu/transform/transform_any.cpp index 5727c096d..5b3c4a0f8 100644 --- a/libnd4j/include/loops/cpu/transform/transform_any.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_any.cpp @@ -37,9 +37,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets, bool allowParallelism) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets, allowParallelism), TRANSFORM_ANY_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_ANY_OPS); } ///////////////////////////////////////////////////////////////////// @@ -47,22 +46,13 @@ template template void _CUDA_H TransformAny::exec(void *vx, Nd4jLong *xShapeInfo, void *vz,Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo,Nd4jLong *tadOffsets, bool allowParallelism) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - if (allowParallelism) - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); - else - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, 
extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } diff --git a/libnd4j/include/loops/cpu/transform/transform_bool.cpp b/libnd4j/include/loops/cpu/transform/transform_bool.cpp index 3560c85fe..fdfde93f5 100644 --- a/libnd4j/include/loops/cpu/transform/transform_bool.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_bool.cpp @@ -37,9 +37,8 @@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_BOOL_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_BOOL_OPS); } template @@ -49,20 +48,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); + auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_float.cpp b/libnd4j/include/loops/cpu/transform/transform_float.cpp index 922a76265..8e164a90f 100644 --- a/libnd4j/include/loops/cpu/transform/transform_float.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_float.cpp @@ -36,9 +36,8 
@@ namespace functions { void *z, Nd4jLong *zShapeInfo, void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_FLOAT_OPS); + uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_TT(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_FLOAT_OPS); } template @@ -48,20 +47,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_same.cpp b/libnd4j/include/loops/cpu/transform/transform_same.cpp index f821d73bc..67f7762f0 100644 --- a/libnd4j/include/loops/cpu/transform/transform_same.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_same.cpp @@ -36,10 +36,8 @@ namespace functions { Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_SAME_OPS); + void *extraParams, uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_SAME_OPS); } template @@ 
-47,18 +45,14 @@ namespace functions { void _CUDA_H TransformSame::exec(void *vx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vextraParams, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { + uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_strict.cpp b/libnd4j/include/loops/cpu/transform/transform_strict.cpp index e600d2fb8..29964e3e0 100644 --- a/libnd4j/include/loops/cpu/transform/transform_strict.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_strict.cpp @@ -36,10 +36,8 @@ namespace functions { Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets), TRANSFORM_STRICT_OPS); + void *extraParams, uint64_t threadId, uint64_t numThreads) { + DISPATCH_BY_OPNUM_T(exec, PARAMS(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads), TRANSFORM_STRICT_OPS); } template @@ -49,20 +47,13 @@ namespace functions { Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { + void *vextraParams, uint64_t threadId, uint64_t numThreads) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - if(OpType::requiresSpecial) { - OpType::execSpecial(x, xShapeInfo, z, 
zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - return; - } - - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams); + nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); diff --git a/libnd4j/include/loops/cuda/aggregates.cu b/libnd4j/include/loops/cuda/aggregates.cu deleted file mode 100644 index 9ced20e51..000000000 --- a/libnd4j/include/loops/cuda/aggregates.cu +++ /dev/null @@ -1,145 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// @author Yurii Shyrma, created on 27.11.2018 -// - -#include "../aggregates.h" - -namespace functions { -namespace aggregate { - -/////////////////////////////////////////////////////////////////////// -template -template -__device__ void AggregatedFunction::execCuda(X **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - X *realArguments, int numRealArguments) { - - OpClass::executeAggregateCuda(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__device__ void AggregatedFunction::execCuda(int opNum, - X **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - X *realArguments, int numRealArguments) { - - DISPATCH_BY_OPNUM_T(execCuda, PARAMS(arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments), AGGREGATE_OPS); -} - -/////////////////////////////////////////////////////////////////////// -template -__global__ static void execAggregateKernel(int opNum, - void **varguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - void *vrealArguments, int numRealArguments) { - - auto arguments = reinterpret_cast(varguments); - auto realArguments = reinterpret_cast(vrealArguments); - functions::aggregate::AggregatedFunction::execCuda(opNum, arguments, numArguments, shapeArguments, numShapeArguments, 
indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__host__ void AggregatedFunction::aggregateKernelGeneric(dim3& launchDims, cudaStream_t *stream, - int opNum, - void **arguments, int numArguments, - Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, - int **intArrays, int numIntArrays, - void *realArguments, int numRealArguments) { - - execAggregateKernel<<>>(opNum, arguments, numArguments, shapeArguments, numShapeArguments, indexArguments, numIndexArguments, intArrays, numIntArrays, realArguments, numRealArguments); - nd4j::DebugHelper::checkErrorCode(stream, "aggregateKernelGeneric(...) failed"); -} - -/////////////////////////////////////////////////////////////////////// -template -__device__ void AggregatedFunction::aggregateBatch(int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - nd4j::PointersHelper helper(ptrToArguments, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals); - - // TODO: we probably should lift this restriction - __shared__ int *intArrays[32]; - - __shared__ X **arguments; - __shared__ Nd4jLong **shapes; - __shared__ int *idxArg; - __shared__ X *realArg; - - for(int r = blockIdx.x; r < numAggregates; r += gridDim.x) { - if (threadIdx.x == 0) { - arguments = helper.getArguments(r); - shapes = helper.getShapeArguments(r); - idxArg = helper.getIndexArguments(r); - realArg = helper.getRealArguments(r); - } - - // we fill intArrays param in parallel within block - if (threadIdx.x < 32 && threadIdx.x < maxIntArrays) { - intArrays[threadIdx.x] = helper.getIntArrayArguments(r, threadIdx.x); - } - __syncthreads(); - - functions::aggregate::AggregatedFunction::execCuda(opNum, arguments, helper.getNumArguments(r), shapes, 
helper.getNumShapeArguments(r), idxArg, helper.getNumIndexArguments(r), intArrays, helper.getNumIntArrayArguments(r), realArg, helper.getNumRealArguments(r)); - } -} - -/////////////////////////////////////////////////////////////////////// -template -__global__ static void execAggregateBatch(int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - functions::aggregate::AggregatedFunction::aggregateBatch(opNum, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments); -} - -/////////////////////////////////////////////////////////////////////// -template -__host__ void AggregatedFunction::aggregateBatchKernelGeneric(dim3& launchDims, cudaStream_t *stream, - int opNum, int numAggregates, - int maxArgs, int maxShapes, - int maxIntArrays, int maxIntArraySize, - int maxIdx, int maxReals, - void *ptrToArguments) { - - execAggregateBatch<<>>(opNum, numAggregates, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIdx, maxReals, ptrToArguments); - nd4j::DebugHelper::checkErrorCode(stream, "aggregateBatchKernel(...) 
failed"); -} - - - - - -BUILD_SINGLE_TEMPLATE(template class AggregatedFunction, , FLOAT_TYPES); -} -} diff --git a/libnd4j/include/loops/cuda/broadcasting.cu b/libnd4j/include/loops/cuda/broadcasting.cu index 8028db2ba..8846e5473 100644 --- a/libnd4j/include/loops/cuda/broadcasting.cu +++ b/libnd4j/include/loops/cuda/broadcasting.cu @@ -32,84 +32,6 @@ namespace functions { namespace broadcast { - template - void Broadcast::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - // - } - template - void Broadcast::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - /** - * CPU execution - * @param x the input - * @param xShapeInfo the x shape information - * @param y the y data - * @param yShapeInfo the y shape information - * @param result the result - * @param resultShapeInfo the result shape information - * @param dimension the dimension to broadcast along long - * @param dimensionLength the length of the dimension buffer - */ - template - template - void Broadcast::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - // - } - - - template - template - void Broadcast::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, 
- Nd4jLong *tadOffsetZ) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu index aaec44690..af354a2e2 100644 --- a/libnd4j/include/loops/cuda/broadcasting_bool.cu +++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu @@ -224,76 +224,6 @@ namespace functions { } - template - void BroadcastBool::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - void BroadcastBool::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastBool::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastBool::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT BroadcastBool, , LIBND4J_TYPES, BOOL_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu index fc613a438..f183c009e 100644 --- a/libnd4j/include/loops/cuda/broadcasting_int.cu +++ 
b/libnd4j/include/loops/cuda/broadcasting_int.cu @@ -217,75 +217,6 @@ namespace functions { } } - - template - void BroadcastInt::exec(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - void BroadcastInt::execInverse(int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastInt::exec(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - - template - template - void BroadcastInt::execInverse(void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset, - Nd4jLong *tadShapeInfoZ, - Nd4jLong *tadOffsetZ) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT BroadcastInt, , INTEGER_TYPES); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu index 8a560e416..1bd5d10cb 100644 --- a/libnd4j/include/loops/cuda/indexreduce.cu +++ b/libnd4j/include/loops/cuda/indexreduce.cu @@ -359,32 +359,6 @@ namespace functions { } } - - - - template - Nd4jLong IndexReduce::execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams) { - return 0; - } - - template - void IndexReduce::exec(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void 
*result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - - } - - template - template - Nd4jLong IndexReduce:: execScalar(void *x, Nd4jLong *xShapeInfo, void *extraParams) { - return 0; - } - - template - template - _CUDA_H void IndexReduce::exec(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES, INDEXING_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise.cu b/libnd4j/include/loops/cuda/pairwise.cu index 17f8537e5..4833d32d0 100644 --- a/libnd4j/include/loops/cuda/pairwise.cu +++ b/libnd4j/include/loops/cuda/pairwise.cu @@ -22,58 +22,6 @@ namespace functions { namespace pairwise_transforms { - template - void PairWiseTransform::exec( - const int opNum, - void *x, - Nd4jLong *xShapeInfo, - void *y, - Nd4jLong *yShapeInfo, - void *z, - Nd4jLong *zShapeInfo, - void *extraParams) { - } - - template - void PairWiseTransform::exec( - const int opNum, - void *x, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *z, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong len) { - - } - - - template - template - void PairWiseTransform:: exec( - void *vx, - Nd4jLong* xShapeInfo, - void *vy, - Nd4jLong* yShapeInfo, - void *vresult, - Nd4jLong* zShapeInfo, - void *vextraParams) { - - } - - template - template - void PairWiseTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong len) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu index 414aadd30..05adbbce4 100644 --- a/libnd4j/include/loops/cuda/pairwise_bool.cu +++ b/libnd4j/include/loops/cuda/pairwise_bool.cu @@ -110,63 +110,6 
@@ void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_ DISPATCH_BY_OPNUM_TT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_BOOL_OPS); } - - template - void PairWiseBoolTransform::exec( - const int opNum, - void *dx, - Nd4jLong *xShapeBuffer, - void *y, - Nd4jLong *yShapeBuffer, - void *result, - Nd4jLong *resultShapeBuffer, - void *extraParams) { - - } - - template - void PairWiseBoolTransform::exec( - const int opNum, - void *dx, - Nd4jLong xStride, - void *y, - Nd4jLong yStride, - void *result, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong n) { - - } - - - template - template - void PairWiseBoolTransform::exec( - void *vx, - Nd4jLong* xShapeBuffer, - void *vy, - Nd4jLong* yShapeBuffer, - void *vresult, - Nd4jLong* resultShapeBuffer, - void *vextraParams) { - - } - - template - template - void PairWiseBoolTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong n) { - - } - - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT PairWiseBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise_int.cu b/libnd4j/include/loops/cuda/pairwise_int.cu index 2bedb4a82..85dce56f2 100644 --- a/libnd4j/include/loops/cuda/pairwise_int.cu +++ b/libnd4j/include/loops/cuda/pairwise_int.cu @@ -109,63 +109,6 @@ void PairWiseIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t * DISPATCH_BY_OPNUM_T(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_INT_OPS); } - - template - void PairWiseIntTransform::exec( - const int opNum, - void *dx, - Nd4jLong *xShapeBuffer, - void *y, - Nd4jLong *yShapeBuffer, - void *result, - Nd4jLong *resultShapeBuffer, - void *extraParams) { - - } - - template - void PairWiseIntTransform::exec( - const int opNum, - void *dx, - Nd4jLong 
xStride, - void *y, - Nd4jLong yStride, - void *result, - Nd4jLong resultStride, - void *extraParams, - Nd4jLong n) { - - } - - - template - template - void PairWiseIntTransform::exec( - void *vx, - Nd4jLong* xShapeBuffer, - void *vy, - Nd4jLong* yShapeBuffer, - void *vresult, - Nd4jLong* resultShapeBuffer, - void *vextraParams) { - - } - - template - template - void PairWiseIntTransform::exec(void *vx, - Nd4jLong xStride, - void *vy, - Nd4jLong yStride, - void *vresult, - Nd4jLong resultStride, - void *vextraParams, - const Nd4jLong n) { - - } - - - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT PairWiseIntTransform, , INTEGER_TYPES); } } diff --git a/libnd4j/include/loops/cuda/random.cu b/libnd4j/include/loops/cuda/random.cu index 3bf06ae91..47ced2769 100644 --- a/libnd4j/include/loops/cuda/random.cu +++ b/libnd4j/include/loops/cuda/random.cu @@ -442,39 +442,6 @@ namespace functions { DEBUG_KERNEL(stream, opNum); } - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - template - void RandomFunction::execTransform(Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - - template - void RandomFunction::execTransform(int opNum, Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments) { - - } - 
BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT RandomFunction, , FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp index fa1ab2e17..ac1d1adc3 100644 --- a/libnd4j/include/loops/cuda/reduce3.chpp +++ b/libnd4j/include/loops/cuda/reduce3.chpp @@ -132,7 +132,7 @@ __device__ void Reduce3::execScalarCuda( void *vx, Nd4jLong *xShapeInfo, extraZ[1] = (Z) 0.0f; if (extraParams != nullptr) - extraZ[2] = *(static_cast(extraParams)); + extraZ[2] = static_cast(extraParams)[2]; else extraZ[2] = (Z) 0.0f; } diff --git a/libnd4j/include/loops/cuda/reduce3.cu b/libnd4j/include/loops/cuda/reduce3.cu index 1ad94beee..4f0e0457c 100644 --- a/libnd4j/include/loops/cuda/reduce3.cu +++ b/libnd4j/include/loops/cuda/reduce3.cu @@ -27,56 +27,7 @@ namespace functions { namespace reduce3 { - template - template - void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo) { - } - - - template - void Reduce3::execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParamsVals, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo) { - - } - - - template - template - void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength) { - - } - - - template - template - void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - - template - template - void Reduce3::execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { - - } - - - template - void Reduce3::exec(const 
int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength) { - - } - - - template - void Reduce3::exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - - template - void Reduce3::execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { - - } } } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/scalar_bool.cu b/libnd4j/include/loops/cuda/scalar_bool.cu index 37939b9b9..bb498c3a9 100644 --- a/libnd4j/include/loops/cuda/scalar_bool.cu +++ b/libnd4j/include/loops/cuda/scalar_bool.cu @@ -231,41 +231,6 @@ void ScalarBoolTransform::executeCudaAlongDimension(dim3& launchDims, cudaS } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT ScalarBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); - - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarBoolTransform::transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarBoolTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - template - 
void ScalarBoolTransform::transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - - template - template - void ScalarBoolTransform::transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } } } diff --git a/libnd4j/include/loops/cuda/scalar_int.cu b/libnd4j/include/loops/cuda/scalar_int.cu index 44c73fcb4..f25beca82 100644 --- a/libnd4j/include/loops/cuda/scalar_int.cu +++ b/libnd4j/include/loops/cuda/scalar_int.cu @@ -230,40 +230,6 @@ void ScalarIntTransform::executeCudaAlongDimension(dim3& launchDims, cudaStre BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT ScalarIntTransform, , INTEGER_TYPES); - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarIntTransform::transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - - } - - template - void ScalarIntTransform::transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - template - void ScalarIntTransform::transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong 
*xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams) { - - } - - - template - template - void ScalarIntTransform::transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n) { - - } } } diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu index 4867f5de1..e505929e6 100644 --- a/libnd4j/include/loops/cuda/summarystatsreduce.cu +++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu @@ -414,73 +414,6 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa } - template - Y SummaryStatsReduce::execScalar(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams) { - return 0; - } - - template - void SummaryStatsReduce::execScalar(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer) { - - } - - template - void SummaryStatsReduce::exec(int opNum, - bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, int dimensionLength) { - - } - - template - template - Y SummaryStatsReduce::execScalar(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams) { - return 0; - } - - template - template - void SummaryStatsReduce::execScalar(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer) { - // - } - - - template - template - void SummaryStatsReduce::exec(bool biasCorrected, - void *x, - Nd4jLong *xShapeInfo, - void *extraParams, - void *vz, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, - int dimensionLength) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT SummaryStatsReduce, , LIBND4J_TYPES, FLOAT_TYPES); } } \ No newline at end of file diff --git 
a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 18b53cea7..5ca6f0067 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -114,17 +114,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformAny(...) failed"); } - template - void TransformAny::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { - - } - - template - template - void TransformAny::exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { - - } - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformAny, , LIBND4J_TYPES, LIBND4J_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index e88a4274b..0f56020b0 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -120,17 +120,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformBool(...) 
failed"); } - template - void TransformBool::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformBool::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 44ddb0246..49d6ab26f 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -142,18 +142,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformFloat(...) failed"); } - template - void TransformFloat::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformFloat::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index e59381fba..4c587111b 100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -118,17 +118,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformSame(...) 
failed"); } - template - void TransformSame::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformSame::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index 0befdf35f..1136ef695 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -119,17 +119,6 @@ namespace functions { nd4j::DebugHelper::checkErrorCode(stream, "transformStrict(...) failed"); } - template - void TransformStrict::exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - - template - template - void TransformStrict::exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - } - BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); } } diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp index dc85b9554..5a4a9db41 100644 --- a/libnd4j/include/loops/impl/type_conversions.cpp +++ b/libnd4j/include/loops/impl/type_conversions.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -79,10 +80,13 @@ namespace nd4j { auto amin = nd4j::math::nd4j_abs(min); // now we actually apply quantization - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < N; e++) { - rz[e] = static_cast(nd4j::math::nd4j_round(1.0f * x[e] / nd4j::math::nd4j_max(amax, amin) * max_byte)); - 
} + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + rz[e] = static_cast(nd4j::math::nd4j_round(1.0f * x[e] / nd4j::math::nd4j_max(amax, amin) * max_byte)); + } + }; + + samediff::Threads::parallel_for(func, 0, N); } template @@ -172,12 +176,15 @@ PRAGMA_OMP_ATOMIC_ARGS(write) // we use 3 as offset, since first 12 bytes are occupied with header int flimit = limit + 4; - PRAGMA_OMP_PARALLEL_FOR_IF(flimit > Environment::getInstance()->elementwiseThreshold()) - for (int e = 4; e < flimit; e++) { - int el = x[e]; - int ael = nd4j::math::nd4j_abs(el) - 1; - z[ael] += el > 0 ? threshold : -threshold; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + int el = x[e]; + int ael = nd4j::math::nd4j_abs(el) - 1; + z[ael] += el > 0 ? threshold : -threshold; + } + }; + + samediff::Threads::parallel_for(func, 4, flimit); } /** @@ -194,19 +201,12 @@ PRAGMA_OMP_ATOMIC_ARGS(write) auto x = reinterpret_cast(dx); auto z = reinterpret_cast(dz); - if (N < nd4j::Environment::getInstance()->elementwiseThreshold()) { - for (int i = 0; i < N; i++) { - // FIXME: get rid of through-float though + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { z[i] = static_cast(static_cast(x[i])); } - } else { - - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < N; i++) { - // FIXME: get rid of through-float though - z[i] = static_cast(static_cast(x[i])); - } - } + }; + samediff::Threads::parallel_for(func, 0, N); }; template void TypeCast::convertFromThreshold(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz); diff --git a/libnd4j/include/loops/indexreduce.h b/libnd4j/include/loops/indexreduce.h index 792ed16a9..ad4472dec 100755 --- a/libnd4j/include/loops/indexreduce.h +++ b/libnd4j/include/loops/indexreduce.h @@ -37,10 +37,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include @@ -70,7 +66,7 @@ namespace functions { static 
_CUDA_H void executeIndexReduceScalar(dim3 launchDims, cudaStream_t *stream, const int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets); static _CUDA_H void executeIndexReduce(dim3 launchDims, cudaStream_t *stream, const int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *result, Nd4jLong *resultShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot, int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets); -#endif +#else static Nd4jLong execScalar(const int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams); @@ -81,6 +77,7 @@ namespace functions { template static _CUDA_H void exec(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *result, Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffset); +#endif }; } } diff --git a/libnd4j/include/loops/legacy_ops.h b/libnd4j/include/loops/legacy_ops.h index 0e5200321..92fd58d7a 100644 --- a/libnd4j/include/loops/legacy_ops.h +++ b/libnd4j/include/loops/legacy_ops.h @@ -92,8 +92,6 @@ (5, TimesOneMinus), \ (6, Cube), \ (7, OneMinus), \ - (8, Col2Im), \ - (9, Im2col),\ (11, Reciprocal), \ (12, Square), \ (13, CompareAndSetTransform) ,\ @@ -101,7 +99,6 @@ (17, Ceiling), \ (18, Floor), \ (19, ClipByValue) ,\ - (20, Reverse), \ (21, Copy) #define TRANSFORM_ANY_OPS \ diff --git a/libnd4j/include/loops/pairwise_bool.h b/libnd4j/include/loops/pairwise_bool.h index 0ff4ebdee..f7a65c3f5 100644 --- a/libnd4j/include/loops/pairwise_bool.h +++ b/libnd4j/include/loops/pairwise_bool.h @@ -40,11 +40,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -68,8 +63,7 @@ namespace 
functions { static __host__ void executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void *extraParams); -#endif - public: +#else static void exec( const int opNum, @@ -79,7 +73,9 @@ namespace functions { Nd4jLong *yShapeBuffer, void *result, Nd4jLong *resultShapeBuffer, - void *extraParams); + void *extraParams, + const uint64_t start, + const uint64_t stop); static void exec( const int opNum, @@ -90,7 +86,9 @@ namespace functions { void *result, Nd4jLong resultStride, void *extraParams, - Nd4jLong n); + Nd4jLong n, + const uint64_t start, + const uint64_t stop); template @@ -101,7 +99,9 @@ namespace functions { Nd4jLong* yShapeBuffer, void *vresult, Nd4jLong* resultShapeBuffer, - void *vextraParams); + void *vextraParams, + const uint64_t start, + const uint64_t stop); template static void exec(void *vx, @@ -111,7 +111,10 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong n); + const Nd4jLong n, + const uint64_t start, + const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/pairwise_int.h b/libnd4j/include/loops/pairwise_int.h index 14d273285..aa6437d17 100644 --- a/libnd4j/include/loops/pairwise_int.h +++ b/libnd4j/include/loops/pairwise_int.h @@ -40,10 +40,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -68,8 +64,7 @@ namespace functions { static __host__ void executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void *extraParams); -#endif - public: +#else static void exec( const int opNum, @@ -79,7 +74,9 @@ namespace functions { Nd4jLong *yShapeBuffer, void *result, Nd4jLong *resultShapeBuffer, - void *extraParams); + void *extraParams, + const uint64_t start, + const uint64_t stop); static 
void exec( const int opNum, @@ -90,7 +87,9 @@ namespace functions { void *result, Nd4jLong resultStride, void *extraParams, - Nd4jLong n); + Nd4jLong n, + const uint64_t start, + const uint64_t stop); template @@ -101,7 +100,9 @@ namespace functions { Nd4jLong* yShapeBuffer, void *vresult, Nd4jLong* resultShapeBuffer, - void *vextraParams); + void *vextraParams, + const uint64_t start, + const uint64_t stop); template static void exec(void *vx, @@ -111,7 +112,10 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong n); + const Nd4jLong n, + const uint64_t start, + const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/pairwise_transform.h b/libnd4j/include/loops/pairwise_transform.h index 4fe3eb0cc..0109b309f 100755 --- a/libnd4j/include/loops/pairwise_transform.h +++ b/libnd4j/include/loops/pairwise_transform.h @@ -41,12 +41,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - - namespace functions { namespace pairwise_transforms { @@ -76,7 +70,9 @@ namespace functions { Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, - void *extraParams); + void *extraParams, + uint64_t start, + uint64_t stop); static void exec( const int opNum, @@ -87,7 +83,9 @@ namespace functions { void *z, Nd4jLong resultStride, void *extraParams, - Nd4jLong len); + Nd4jLong len, + uint64_t start, + uint64_t stop); template @@ -98,7 +96,9 @@ namespace functions { Nd4jLong* yShapeInfo, void *vresult, Nd4jLong* zShapeInfo, - void *vextraParams); + void *vextraParams, + uint64_t start, + uint64_t stop); template static void exec(void *vx, @@ -108,7 +108,9 @@ namespace functions { void *vresult, Nd4jLong resultStride, void *vextraParams, - const Nd4jLong len); + Nd4jLong len, + uint64_t start, + uint64_t stop); }; } } diff --git a/libnd4j/include/loops/random.h b/libnd4j/include/loops/random.h index 620187b82..5048e5ce0 100644 --- a/libnd4j/include/loops/random.h +++ 
b/libnd4j/include/loops/random.h @@ -52,7 +52,7 @@ namespace functions { static _CUDA_H void executeCudaSingle(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static _CUDA_H void executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static _CUDA_H void executeCudaTriple(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); -#endif +#else template static void execTransform(Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); @@ -66,6 +66,7 @@ namespace functions { static void execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static void execTransform(int opNum, Nd4jPointer state, void *x, Nd4jLong *xShapeBuffer, void *y, Nd4jLong *yShapeBuffer, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); static void execTransform(int opNum, Nd4jPointer state, void *z, Nd4jLong *zShapeBuffer, void *extraArguments); +#endif }; } } diff --git a/libnd4j/include/loops/reduce3.h b/libnd4j/include/loops/reduce3.h index 781a17bb7..178bac7c2 100755 --- a/libnd4j/include/loops/reduce3.h +++ b/libnd4j/include/loops/reduce3.h @@ -44,10 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -114,7 +110,7 @@ class Reduce3 { -#endif +#else template static void execScalar(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo); @@ -124,25 +120,25 @@ class Reduce3 { template - static void exec(void *vx, Nd4jLong *xShapeInfo, void 
*vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength); + static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int64_t start, int64_t stop); template - static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static void exec(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop); template - static void execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets); + static void execAll(void *vx, Nd4jLong *xShapeInfo, void *vextraParams, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop); - static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength); + static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, int64_t start, int64_t stop); - static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, 
Nd4jLong *tadOffsets); + static void exec(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, int64_t start, int64_t stop); - static void execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets); - + static void execAll(const int opNum, void *vx, Nd4jLong *xShapeInfo, void *extraParamsVals, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets, int64_t start, int64_t stop); +#endif }; diff --git a/libnd4j/include/loops/reduce_bool.h b/libnd4j/include/loops/reduce_bool.h index 89df1330f..540a6041d 100644 --- a/libnd4j/include/loops/reduce_bool.h +++ b/libnd4j/include/loops/reduce_bool.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,10 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -77,7 +72,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong 
*tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -121,7 +116,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -145,7 +140,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -178,8 +173,10 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; + #ifdef __CUDACC__ /** * diff --git a/libnd4j/include/loops/reduce_float.h b/libnd4j/include/loops/reduce_float.h index 9856e1d8e..ff2c0e668 100644 --- a/libnd4j/include/loops/reduce_float.h +++ b/libnd4j/include/loops/reduce_float.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,10 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif #include "legacy_ops.h" @@ -79,7 +74,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShape, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -123,7 +118,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -147,7 +142,7 @@ 
namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -180,8 +175,10 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; + #ifdef __CUDACC__ /** * diff --git a/libnd4j/include/loops/reduce_long.h b/libnd4j/include/loops/reduce_long.h index 193160074..a5d2a9498 100644 --- a/libnd4j/include/loops/reduce_long.h +++ b/libnd4j/include/loops/reduce_long.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #pragma once @@ -37,11 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" //an op for the kernel @@ -78,7 +72,7 @@ namespace functions { static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -122,7 +116,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -146,7 +140,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -179,6 +173,7 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); +#endif }; #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/reduce_same.h b/libnd4j/include/loops/reduce_same.h index c7f5f9173..e828ecf46 100644 --- a/libnd4j/include/loops/reduce_same.h +++ b/libnd4j/include/loops/reduce_same.h @@ -28,7 +28,6 @@ #include #include #include 
-#include #include #pragma once @@ -37,11 +36,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" //an op for the kernel @@ -80,7 +74,7 @@ namespace functions { static __host__ void execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo); static __host__ void execReduceXD(dim3 launchDims, cudaStream_t *stream, int opNum, int rank, void *vx, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else /** * Reduce down to 1 number @@ -124,7 +118,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * Execute on the cpu @@ -148,7 +142,7 @@ namespace functions { int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset); + Nd4jLong *tadOffset, int64_t start, int64_t stop); /** * CPU implementation @@ -181,6 +175,8 @@ namespace functions { Nd4jLong xElementWiseStride, Nd4jLong length, void *extraParams); + +#endif }; #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/scalar.h b/libnd4j/include/loops/scalar.h index b2ee46dba..0f32dedf3 100755 --- a/libnd4j/include/loops/scalar.h +++ b/libnd4j/include/loops/scalar.h @@ -70,15 +70,15 @@ namespace functions { __host__ static void executeCudaAlongDimension(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalars, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong 
*tadShapeInfoZ, Nd4jLong *tadOffsetsZ); -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, bool allowParallelism); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong len, bool allowParallelism); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t len, const uint64_t start, const uint64_t stop); @@ -101,7 +101,7 @@ namespace functions { */ template - static void 
transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, bool allowParallelism); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -117,7 +117,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong len, bool allowParallelism); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t len, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/scalar_bool.h b/libnd4j/include/loops/scalar_bool.h index ddc039d89..a5931ddfb 100644 --- a/libnd4j/include/loops/scalar_bool.h +++ b/libnd4j/include/loops/scalar_bool.h @@ -86,15 +86,15 @@ namespace functions { /* #include "cuda/scalar_temp.cu" */ -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, 
int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); @@ -117,7 +117,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -133,7 +133,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/scalar_int.h b/libnd4j/include/loops/scalar_int.h index f873d5419..509d7574f 100644 --- a/libnd4j/include/loops/scalar_int.h +++ b/libnd4j/include/loops/scalar_int.h @@ -83,18 +83,15 @@ namespace functions { static void executeCudaAlongDimension(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, 
void *scalars, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); -/* -#include "cuda/scalar_temp.cu" -*/ -#endif +#else template - static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); + static void transform(int opNum, void *x, Nd4jLong *xShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, void *scalars, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(const int opNum, void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); - static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(const int opNum, void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, 
const uint64_t start, const uint64_t stop); @@ -117,7 +114,7 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams); + static void transform(void *x, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *scalar, void *extraParams, const uint64_t start, const uint64_t stop); /** @@ -133,7 +130,8 @@ namespace functions { */ template - static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const Nd4jLong n); + static void transform(void *x, Nd4jLong xStride, void *result, Nd4jLong resultStride, void *scalar, void *extraParams, const uint64_t n, const uint64_t start, const uint64_t stop); +#endif }; } } diff --git a/libnd4j/include/loops/summarystatsreduce.h b/libnd4j/include/loops/summarystatsreduce.h index 915293904..afaee9c47 100755 --- a/libnd4j/include/loops/summarystatsreduce.h +++ b/libnd4j/include/loops/summarystatsreduce.h @@ -286,7 +286,7 @@ namespace functions { static _CUDA_H void execSummaryStatsReduceScalar(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); static _CUDA_H void execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer); static _CUDA_H void execSummaryStatsReduce(dim3& launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *extraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong 
*tadOffsets, bool biasCorrected, void *reductionBuffer); -#endif +#else static Z execScalar(int opNum, bool biasCorrected, @@ -335,7 +335,7 @@ namespace functions { Nd4jLong *resultShapeInfoBuffer, int *dimension, int dimensionLength); - +#endif }; } } diff --git a/libnd4j/include/loops/transform_any.h b/libnd4j/include/loops/transform_any.h index ab9ad47c4..d97e3e90e 100644 --- a/libnd4j/include/loops/transform_any.h +++ b/libnd4j/include/loops/transform_any.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -69,12 +64,12 @@ class TransformAny { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } diff --git a/libnd4j/include/loops/transform_bool.h b/libnd4j/include/loops/transform_bool.h index ee416ea87..4c87ae58c 100644 --- a/libnd4j/include/loops/transform_bool.h +++ b/libnd4j/include/loops/transform_bool.h @@ -27,7 +27,7 @@ #include #include 
#include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -78,12 +73,12 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_float.h b/libnd4j/include/loops/transform_float.h index 66547ee79..ae28e069f 100644 --- a/libnd4j/include/loops/transform_float.h +++ b/libnd4j/include/loops/transform_float.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -102,11 +97,12 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, 
Nd4jLong *tadOffsets); -#endif - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_same.h b/libnd4j/include/loops/transform_same.h index ef646a1b6..ae5b498e6 100644 --- a/libnd4j/include/loops/transform_same.h +++ b/libnd4j/include/loops/transform_same.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -79,12 +74,13 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); +#else + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong 
*tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); +#endif }; } } diff --git a/libnd4j/include/loops/transform_strict.h b/libnd4j/include/loops/transform_strict.h index fe520743e..96917ebc1 100644 --- a/libnd4j/include/loops/transform_strict.h +++ b/libnd4j/include/loops/transform_strict.h @@ -27,7 +27,7 @@ #include #include #include -#include + #ifdef _OPENMP #include #endif @@ -44,11 +44,6 @@ #include #endif -#ifndef _OPENMP -#define omp_get_thread_num() 0 -#define omp_get_max_threads() 1 -#endif - #include "legacy_ops.h" @@ -79,12 +74,16 @@ namespace functions { static _CUDA_H void executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -#endif +#else - static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + + + static void exec(int opNum, void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); template - static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); + static ND4J_EXPORT void exec(void *dx, Nd4jLong *xShapeInfo, void *result, Nd4jLong *resultShapeInfo, void *extraParams, uint64_t threadId, uint64_t numThreads); + +#endif }; } } diff --git a/libnd4j/include/msvc.h b/libnd4j/include/msvc.h new file mode 100644 index 000000000..c884736f3 --- /dev/null +++ b/libnd4j/include/msvc.h @@ -0,0 +1,39 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 
Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef SAMEDIFF_MSVC_H +#define SAMEDIFF_MSVC_H + +#if defined(_MSC_VER) + +#pragma warning( disable : 4244 ) +#pragma warning( disable : 4267 ) +#pragma warning( disable : 4251 ) +#pragma warning( disable : 4101 ) +#pragma warning( disable : 4305 ) +#pragma warning( disable : 4309 ) +#pragma warning( disable : 4333 ) +#pragma warning( disable : 4146 ) +#pragma warning( disable : 4018 ) +#pragma warning( disable : 4297 ) + +#endif + +#endif //DEV_TESTS_MSVC_H diff --git a/libnd4j/include/op_boilerplate.h b/libnd4j/include/op_boilerplate.h index 4f70d9bf2..102a1776a 100644 --- a/libnd4j/include/op_boilerplate.h +++ b/libnd4j/include/op_boilerplate.h @@ -1461,7 +1461,7 @@ #ifdef _RELEASE -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT)); if 
(erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); } #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #else @@ -1528,6 +1528,7 @@ #elif _MSC_VER #define FORCEINLINE __forceinline #elif __GNUC__ +#define INLINE_LOOPS #define FORCEINLINE __attribute__((always_inline)) inline #elif __CUDACC__ #define FORCEINLINE __forceinline__ inline diff --git a/libnd4j/include/openmp_pragmas.h b/libnd4j/include/openmp_pragmas.h index f1d4a8f67..667f54521 100644 --- a/libnd4j/include/openmp_pragmas.h +++ b/libnd4j/include/openmp_pragmas.h @@ -23,7 +23,7 @@ #if defined(_MSC_VER) -#define OMP_STRINGIFY(args) +#define OMP_STRINGIFY(args) #args #define OMP_IF(args) #define OMP_SCHEDULE(args) #define OMP_MAXT @@ -32,7 +32,7 @@ #define PRAGMA_OMP_ATOMIC #define PRAGMA_OMP_ATOMIC_ARGS(args) #define PRAGMA_OMP_CRITICAL -#define PRAGMA_OMP_SIMD +#define PRAGMA_OMP_SIMD __pragma(omp simd) #define PRAGMA_OMP_SIMD_ARGS(args) #define PRAGMA_OMP_SIMD_SUM(args) #define PRAGMA_OMP_SIMD_MAX(args) @@ -61,6 +61,7 @@ #else + #define OMP_STRINGIFY(args) #args #define OMP_IF(args) if(args) #define OMP_SCHEDULE(args) schedule(args) @@ -99,4 +100,39 @@ #endif +// reductions +#define FUNC_RL std::function +#define FUNC_AL std::function + +// aggregation functions +#define FUNC_RD std::function +#define FUNC_AD std::function + +// parallel block +#define FUNC_DO std::function + +// parallel_for block +#define FUNC_1D std::function +#define FUNC_2D std::function +#define FUNC_3D std::function + +// aggregation lambda +#define LAMBDA_AL [&](int64_t _old, int64_t _new) -> int64_t +#define LAMBDA_AD [&](double _old, double 
_new) -> double + +#define LAMBDA_SUML LAMBDA_AL {return _old + _new; } +#define LAMBDA_SUMD LAMBDA_AD {return _old + _new; } + +// reduction lambda +#define PRAGMA_REDUCE_LONG [&] (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) mutable -> int64_t +#define PRAGMA_REDUCE_DOUBLE [&] (uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) mutable -> double + +// paralllel block lambda +#define PRAGMA_THREADS_DO [&](uint64_t thread_id, uint64_t numThreads) -> void + +// paralllel_for lambdas +#define PRAGMA_THREADS_FOR [&](uint64_t thread_id, int64_t start, int64_t stop, int64_t increment) -> void +#define PRAGMA_THREADS_FOR_2D [&](uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y) -> void +#define PRAGMA_THREADS_FOR_3D [&](uint64_t thread_id, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z) -> void + #endif //DEV_TESTS_OPENMP_PRAGMAS_H diff --git a/libnd4j/include/ops/aggregate_ops.h b/libnd4j/include/ops/aggregate_ops.h deleted file mode 100644 index a10a2912e..000000000 --- a/libnd4j/include/ops/aggregate_ops.h +++ /dev/null @@ -1,996 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// -#ifndef LIBND4J_AGGREGATE_OPS_H -#define LIBND4J_AGGREGATE_OPS_H - -#include -#include - -#define HS_MAX_EXP 6.0f - -#ifdef __CUDACC__ -#define aggregate_def __device__ inline static -#else -#include -#define aggregate_def inline static -#endif -/* - * - * - * Aggregate Ops are special things suited for CUDA mostly. They are meant to be executed within single block ONLY. - * So, when batched, they should provide proper parallelism levels on poorly parallel tasks otherwise. - * - * On CPU aggregate ops are trying to minimize OpenMP multi-threading use, only SIMD is enforced - * - * - */ -namespace aggregateOps { - - template - class GEMM { - public: -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - // no-op - } -#endif - -#ifndef __CUDACC__ - static CBLAS_ORDER convertOrder(int from) { - switch(from) { - //'c' - case 99: - return CblasRowMajor; - //'C' - case 67: return CblasRowMajor; - //'f' - case 102: return CblasColMajor; - //'F' - case 70: return CblasColMajor; - default: return CblasColMajor; - - } - } - - - static CBLAS_TRANSPOSE convertTranspose(int from) { - switch(from) { - //'t' - case 116: return CblasTrans; - //'T' - case 84: return CblasTrans; - //'n' - case 110: return CblasNoTrans; - //'N' - case 78: return CblasNoTrans; - //'c' - case 99: return CblasConjTrans; - //'C' - case 67: return CblasConjTrans; - default: return CblasNoTrans; - } - } -#endif - -#ifndef __CUDACC__ - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T 
*realArguments, int numRealArguments) { - int M = indexArguments[0]; - int N = indexArguments[1]; - int K = indexArguments[2]; - int lda = indexArguments[3]; - int ldb = indexArguments[4]; - int ldc = indexArguments[5]; - int TransA = indexArguments[6]; - int TransB = indexArguments[7]; - int Order = indexArguments[8]; - - T alpha = realArguments[0]; - T beta = realArguments[1]; - - T *A = arguments[0]; - T *B = arguments[1]; - T *C = arguments[2]; - - nd4j::blas::GEMM::op(convertOrder(Order), convertTranspose(TransA), convertTranspose(TransB),M,N,K,(T) alpha,A,lda,B,ldb,(T) beta,C,ldc); - } -#else - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - // stub for nvcc - } -#endif - }; - - /** - * We don't include this class into ops directly, since it won't be ever used directly, - * Only as part of SkipGram or CBOW - */ - template - class HierarchicSoftmax { - private: - - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int expLength = indexArguments[1]; - int code = indexArguments[2]; - int isInference = indexArguments[3]; - - T *syn0 = arguments[0]; // we pass row pointer here - T *syn1 = arguments[1]; // we pass row pointer here - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - - T dot(0.0f); - T g(0.0f); - T f(0.0f); - T alpha = realArguments[0]; - - //nd4j_printf("Vector length: [%i]; expLength: [%i]; Code: [%i]; Inf: [%i]\n", vectorLength, expLength, code, isInference); - - -// shape::printArray(syn0, vectorLength, "syn0"); -// shape::printArray(syn1, vectorLength, "syn1"); -// shape::printArray(neu1e, vectorLength, 
"neu1e"); - - // dot - for (int x = 0; x < vectorLength; x++) { - dot += syn0[x] * syn1[x]; - } - - // gradient - if (dot < (T) - HS_MAX_EXP || dot >= (T) HS_MAX_EXP) { - return; - } - - int idx = static_cast((dot + HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0f)); - - if (idx >= expLength || idx < 0) { - return; - } - - f = expTable[idx]; - g = (static_cast(1.0f) - static_cast(code) - f) * alpha; - - //nd4j_printf("dot: [%f]; idx: [%i]; f: [%f]; g: [%f]\n", (float) dot, idx, (float) f, (float) g); - - // axpy1 - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - neu1e[x] = g * syn1[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn1[x] = g * syn0[x] + syn1[x]; - } - } - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - /* - We know that syn0 & syn1 are 2D matrices, so we can just use offsets here - */ - __shared__ int vectorLength; - __shared__ int expLength; - __shared__ int code; - __shared__ int isInference; - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - __shared__ T dot; - __shared__ T g; - __shared__ T f; - __shared__ T alpha; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - expLength = indexArguments[1]; - code = indexArguments[2]; - isInference = indexArguments[3]; - - dot = (T) 0.0f; - - alpha = realArguments[0]; - } - __syncthreads(); - - - // TODO: it would be great to implement dot without atomicAdd call. 
like aggregateParticles, or something like that - // dot - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - T prod = syn0[x] * syn1[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - - - // gradient - __syncthreads(); - - if (dot < - (T) HS_MAX_EXP || dot >= (T) HS_MAX_EXP) - return; - - int idx = (int) ((dot + HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0)); - - if (idx >= expLength) - return; - - - if (threadIdx.x == 0) { - // gradient calculation - f = expTable[idx]; - g = ((T) 1.0f - (T) code - f) * alpha; - } - __syncthreads(); - - // axpy1 - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - neu1e[x] = g * syn1[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - syn1[x] = g * syn0[x] + syn1[x]; - } - } -#endif - }; - - /** - * We don't include this class into ops directly, since it won't be ever used directly, - * Only as part of SkipGram or CBOW - */ - template - class NegativeSampling { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int expLength = indexArguments[1]; - int code = indexArguments[2]; - int isInference = indexArguments[3]; - - T *syn0 = arguments[0]; // we pass row pointer here - T *syn1Neg = arguments[1]; // we pass row pointer here - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - T dot = (T) 0.0f; - T g = (T) 0.0f; - T alpha = realArguments[0]; - - // dot - for (int x = 0; x < vectorLength; x++) { - dot += syn0[x] * syn1Neg[x]; - } - - if (dot > HS_MAX_EXP) - g = (code - 1) * alpha; - else if (dot < (T) - HS_MAX_EXP) - g = (code - 0) * alpha; - else { - int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / HS_MAX_EXP / 2.0)); - if (idx >= expLength) - return; - - if (idx < 0) 
- return; - - g = ((T) code - expTable[idx]) * alpha; - } - - // axpy1 - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - neu1e[x] = g * syn1Neg[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn1Neg[x] = g * syn0[x] + syn1Neg[x]; - } - } - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - /* - We know that syn0 & syn1 are 2D matrices, so we can just use offsets here - */ - __shared__ int vectorLength; - __shared__ int expLength; - __shared__ int code; - __shared__ int isInference; - - T *syn0 = arguments[0]; - T *syn1Neg = arguments[1]; - T *expTable = arguments[2]; - T *neu1e = arguments[3]; - - __shared__ T dot; - __shared__ T g; - __shared__ T alpha; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - expLength = indexArguments[1]; - code = indexArguments[2]; - isInference = indexArguments[3]; - - dot = (T) 0.0f; - - alpha = realArguments[0]; - } - __syncthreads(); - - - // TODO: it would be great to implement dot without atomicAdd call. 
like aggregateParticles, or something like that - // dot - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - T prod = syn0[x] * syn1Neg[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - - - // gradient - __syncthreads(); - - - int idx = (int) ((dot + (T) HS_MAX_EXP) * ((T) expLength / (T) HS_MAX_EXP / 2.0)); - if (idx >= expLength && dot <= (T) HS_MAX_EXP && dot >= (T) -HS_MAX_EXP) - return; - - - if (threadIdx.x == 0) { - // gradient calculation - if (dot > (T) HS_MAX_EXP) - g = (code - 1) * alpha; - else if (dot < (T) - HS_MAX_EXP) - g = (code - 0) * alpha; - else { - - - g = ((T) code - expTable[idx]) * alpha; - } - - // printf("dot: [%f]; g: [%f]\n", dot, g); - } - __syncthreads(); - - // printf("before syn1Neg[%i]: [%f], dot: [%f]; g: [%f]; vectorLength: [%i]\n", threadIdx.x, syn1Neg[threadIdx.x], dot, g, vectorLength); - - // axpy1 - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - neu1e[x] = g * syn1Neg[x] + neu1e[x]; - } - - // axpy2 - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - syn1Neg[x] = g * syn0[x] + syn1Neg[x]; - } - - // printf("after syn1Neg[%i]: [%f]\n", threadIdx.x, syn1Neg[threadIdx.x]); - - } -#endif - }; - - template - class Dot { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - T *vecZ = arguments[2]; - - T dot = (T) 0.0f; - - int vectorLength = indexArguments[0]; - - PRAGMA_OMP_SIMD_SUM(dot) - for (int x = 0; x < vectorLength; x++) { - dot += vecX[x] * vecY[x]; - } - - vecZ[0] = dot; - }; - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T 
*realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - T *vecZ = arguments[2]; - - int vectorLength = indexArguments[0]; - - __shared__ T dot; - if (threadIdx.x == 0) - dot = (T) 0.0f; - __syncthreads(); - - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - T prod = vecX[x] * vecY[x]; - nd4j::math::atomics::nd4j_atomicAdd(&dot, prod); - } - __syncthreads(); - - if (threadIdx.x == 0) - vecZ[0] = dot; - } -#endif - }; - - template - class Axpy { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - - T alpha = realArguments[0]; - - int vectorLength = indexArguments[0]; - - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - vecY[x] = alpha * vecX[x] + vecY[x]; - } - }; - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - T *vecX = arguments[0]; - T *vecY = arguments[1]; - - T alpha = realArguments[0]; - - int vectorLength = indexArguments[0]; - - for (int x = threadIdx.x; x < vectorLength; x+=blockDim.x) { - vecY[x] = alpha * vecX[x] + vecY[x]; - } - __syncthreads(); - } -#endif - }; - - - template - class SkipGram { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - int syn0Row = indexArguments[0]; - int vectorLength = indexArguments[1]; - int hsRounds = indexArguments[2]; - int ngRounds = indexArguments[3]; - int expLength = indexArguments[4]; 
- int vocabSize = indexArguments[5]; - int ngStarter = indexArguments[6]; - int negTableLength = indexArguments[7]; - int isInference = indexArguments[8]; - - - auto neu1e = new T[vectorLength]; - std::memset(neu1e, 0, sizeof(T) * vectorLength); - - T *args[4]; - int idxArgs[4]; - - args[1] = arguments[1]; // syn1 - args[2] = arguments[2]; // expTable - args[3] = neu1e; - - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - T *syn0 = isInference == 1 ? inferenceVector : arguments[0] + (syn0Row * vectorLength); - - args[0] = syn0;// syn0 - - int *idxSyn1 = intArrays[0]; - int *codes = intArrays[1]; - - //nd4j_printf("syn0Row: [%i]; vecLen: [%i]; hsRounds: [%i]; ngRounds: [%i]; expLength: [%i]; vocabSize: [%i]; ngStarter: [%i]; negTableLength: [%i]; isInf: [%i]\n", syn0Row, vectorLength, hsRounds, ngRounds, expLength, vocabSize, ngStarter, negTableLength, isInference); - - auto next_random = static_cast(realArguments[1]); - - if (hsRounds > 0) { - for (int r = 0; r < hsRounds; r++) { - args[1] = arguments[1] + (idxSyn1[r] * vectorLength); // syn1 row - idxArgs[2] = codes[r]; // code for row - - //nd4j_printf("idx syn1: [%i]; code: [%i]\n", idxSyn1[r], idxArgs[2]); - - HierarchicSoftmax::executeAggregate(args, 4, nullptr, 0, idxArgs, 5, nullptr, 0, realArguments, 1); - } - } - - - - int target = ngStarter; - if (ngRounds > 0) { - for (int r = 0; r < ngRounds + 1; r++) { - if (r == 0) { - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - if (target == ngStarter) - continue; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - - NegativeSampling::executeAggregate(args, 
4, nullptr, 0, idxArgs, 5, nullptr, 0, realArguments, 1); - } - } - - //nd4j_printf("applying...\n",""); - - if (!isInference) { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - syn0[x] += neu1e[x]; - } - } else { - PRAGMA_OMP_SIMD - for (int x = 0; x < vectorLength; x++) { - inferenceVector[x] += neu1e[x]; - } - } - - delete[] neu1e; - } - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, T *realArguments, int numRealArguments) { - __shared__ int syn0Row; - __shared__ int vectorLength; - __shared__ int hsRounds; - __shared__ int ngRounds; - __shared__ int expLength; - __shared__ int vocabSize; - __shared__ int ngStarter; - __shared__ int negTableLength; - __shared__ int isInference; - - __shared__ T *neu1e; - - __shared__ T *args[4]; - __shared__ int idxArgs[4]; - - - __shared__ unsigned long long next_random; - - __shared__ T *negTable; - T *syn1Neg = arguments[3]; - __shared__ T *inferenceVector; - - if (threadIdx.x == 0) { - extern __shared__ unsigned char shmem[]; - neu1e = (T *) shmem; - - syn0Row = indexArguments[0]; - vectorLength = indexArguments[1]; - hsRounds = indexArguments[2]; - ngRounds = indexArguments[3]; - expLength = indexArguments[4]; - vocabSize = indexArguments[5]; - ngStarter = indexArguments[6]; - negTableLength = indexArguments[7]; - isInference = indexArguments[8]; - - inferenceVector = arguments[5]; - - next_random = (unsigned long long) realArguments[1]; - - args[0] = isInference == 1 ? inferenceVector : arguments[0] + (syn0Row * vectorLength); // syn0 - args[1] = arguments[1]; // syn1 - args[2] = arguments[2]; // expTable - args[3] = neu1e; - - negTable = arguments[4]; - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - } - __syncthreads(); - - T *syn0 = isInference ? 
inferenceVector : arguments[0] + (syn0Row * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i+=blockDim.x) { - neu1e[i] = (T) 0.0f; - } - - int *idxSyn1 = intArrays[0]; - int *codes = intArrays[1]; - - - for (int r = 0; r < hsRounds; r++) { - if (threadIdx.x == 0) { - args[1] = arguments[1] + (idxSyn1[r] * vectorLength);// syn1 row - idxArgs[2] = codes[r]; // code for row - } - __syncthreads(); - - HierarchicSoftmax::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 1); - } - __syncthreads(); - - - __shared__ int target; - if (ngRounds > 0) - for (int r = 0; r < ngRounds + 1; r++) { - if (threadIdx.x == 0) { - if (r == 0) { - // this line isn't a mistake - target = ngStarter; - - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long)25214903917 + 11 + blockIdx.x; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); - } - __syncthreads(); - - // we put it here, to make sure all threads pick up continue call - if (r != 0 && target == ngStarter) - continue; - - NegativeSampling::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 1); - } - - - - // final axpy with 1.0f as alpha - if (!isInference) - for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) { - syn0[x] += neu1e[x]; - } - else - for (int x = threadIdx.x; x < vectorLength; x+= blockDim.x) { - inferenceVector[x] += neu1e[x]; - } - } -#endif - }; - - template - class CBOW { - public: - - aggregate_def void executeAggregate(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, - T *realArguments, int numRealArguments) { - int vectorLength = indexArguments[0]; - int hsRounds = indexArguments[1]; - int ngRounds = indexArguments[2]; - int 
expLength = indexArguments[3]; - int vocabSize = indexArguments[4]; - int ngStarter = indexArguments[5]; - int negTableLength = indexArguments[6]; - int idxSyn0Length = indexArguments[7]; - //int initialIdx = indexArguments[8]; - int numLabels = indexArguments[9]; - int trainWords = indexArguments[10]; - int isInference = indexArguments[11]; - - - int *idxSyn0 = intArrays[0]; - int *idxSyn1 = intArrays[1]; - int *codes = intArrays[2]; - - - T *neu1 = new T[vectorLength]; - T *neu1e = new T[vectorLength]; - std::memset(neu1, 0, sizeof(T) * vectorLength); - std::memset(neu1e, 0, sizeof(T) * vectorLength); - - T *syn0 = arguments[0]; - T *syn1 = arguments[1]; - T *expTable = arguments[2]; - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - T *args[4]; - int idxArgs[4]; - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - - unsigned long long next_random = (unsigned long long) realArguments[1]; - - // building neu1 for current window - for (int c = 0; c < idxSyn0Length; c++) { - T *syn0word = syn0 + (idxSyn0[c] * vectorLength); - - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] += syn0word[i]; - } - } - - // for inference we use additional inference vector - if (isInference) { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] += inferenceVector[i]; - } - } - - - // average neu1 - if (idxSyn0Length > 0) { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - neu1[i] /= idxSyn0Length + isInference; - } - } - - args[0] = neu1; - args[2] = expTable; - args[3] = neu1e; - - if (hsRounds > 0) - for (int i = 0; i < hsRounds; i++) { - args[1] = syn1 + (idxSyn1[i] * vectorLength); - idxArgs[2] = codes[i]; - - HierarchicSoftmax::executeAggregate((T **)args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - int target = ngStarter; - if (ngRounds > 0) - for (int i = 0; i < ngRounds + 1; i++) { - if (i == 0) 
{ - idxArgs[2] = 1; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - if (target == ngStarter) - continue; - - idxArgs[2] = 0; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - - //printf("Negative round: target: [%i]; code: [%i]; neu1e[0]: [%f]\n", target, idxArgs[4], neu1e[0]); - - NegativeSampling::executeAggregate((T **)args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - - // if we don't train words - we skip start of idxSyn0 - int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels; - - // propagate neu1e -> syn0 - if (!isInference) { - for (int c = starter; c < idxSyn0Length; c++) { - T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength); - - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - syn0word[i] += neu1e[i]; - } - } - } else { - PRAGMA_OMP_SIMD - for (int i = 0; i < vectorLength; i++) { - inferenceVector[i] += neu1e[i]; - } - } - - - - delete[] neu1; - delete[] neu1e; - } - - -#ifdef __CUDACC__ - aggregate_def void executeAggregateCuda(T **arguments, int numArguments, Nd4jLong **shapeArguments, int numShapeArguments, - int *indexArguments, int numIndexArguments, int **intArrays, int numIntArrays, - T *realArguments, int numRealArguments) { - __shared__ int vectorLength; - __shared__ int hsRounds; - __shared__ int ngRounds; - __shared__ int expLength; - __shared__ int vocabSize; - __shared__ int ngStarter; - __shared__ int negTableLength; - __shared__ int idxSyn0Length; - __shared__ int initialIdx; - __shared__ int numLabels; - __shared__ int trainWords; - __shared__ int isInference; - - int *idxSyn0 = intArrays[0]; - int *idxSyn1 = intArrays[1]; - int *codes = intArrays[2]; - - __shared__ T *neu1; - __shared__ T *neu1e; - - __shared__ T *args[5]; - __shared__ int idxArgs[4]; - - T *syn0 = arguments[0]; - 
T *syn1 = arguments[1]; - //T *expTable = arguments[2]; - T *syn1Neg = arguments[3]; - T *negTable = arguments[4]; - T *inferenceVector = arguments[5]; - - if (threadIdx.x == 0) { - vectorLength = indexArguments[0]; - hsRounds = indexArguments[1]; - ngRounds = indexArguments[2]; - expLength = indexArguments[3]; - vocabSize = indexArguments[4]; - ngStarter = indexArguments[5]; - negTableLength = indexArguments[6]; - idxSyn0Length = indexArguments[7]; - initialIdx = indexArguments[8]; - numLabels = indexArguments[9]; - trainWords = indexArguments[10]; - isInference = indexArguments[11]; - - extern __shared__ unsigned char shmem[]; - neu1 = (T *) shmem; - neu1e = neu1 + vectorLength; - - args[0] = neu1; - args[2] = arguments[2]; //expTable - args[3] = neu1e; - - idxArgs[0] = vectorLength; // vectorLength - idxArgs[1] = expLength; // expLength - idxArgs[3] = isInference; - } - __syncthreads(); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] = (T) 0.0f; - neu1e[i] = (T) 0.0f; - } - - unsigned long long next_random = (unsigned long long) realArguments[1]; - for (int c = 0; c < idxSyn0Length; c++) { - T *syn0word = syn0 + (idxSyn0[c] * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] += syn0word[i]; - } - } - - if (isInference) - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] += inferenceVector[i]; - } - - // average neu1 - if (idxSyn0Length > 0) { - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - neu1[i] /= idxSyn0Length + + isInference; - } - } - __syncthreads(); - - - - if (hsRounds > 0) - for (int i = 0; i < hsRounds; i++) { - if (threadIdx.x == 0) { - args[1] = syn1 + (idxSyn1[i] * vectorLength); - idxArgs[2] = codes[i]; - } - __syncthreads(); - - HierarchicSoftmax::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - } - - __shared__ int target; - if (ngRounds > 0) - for (int i = 0; i < ngRounds + 1; i++) { - if 
(threadIdx.x == 0) { - if (i == 0) { - target = ngStarter; - } else { - next_random = next_random * (unsigned long long) 25214903917 + 11; - target = negTable[(next_random >> 16) % negTableLength]; - - if (target <= 0 || target >= vocabSize) target = next_random % (vocabSize - 1) + 1; - } - - args[1] = syn1Neg + (target * vectorLength); // syn1Neg instead of syn1 - idxArgs[2] = i == 0 ? 1 : 0; - } - __syncthreads(); - - if (i != 0 && target == ngStarter) - continue; - - - NegativeSampling::executeAggregateCuda(args, 4, nullptr, 0, idxArgs, 3, nullptr, 0, realArguments, 2); - - //printf("Negative round: target: [%i]; code: [%i]; neu1[%i]: [%f]; neu1e[%i]: [%f]\n", target, idxArgs[2], threadIdx.x, neu1[threadIdx.x], threadIdx.x, neu1e[threadIdx.x]); - } - - - // if we don't train words - we skip start of idxSyn0 - int starter = trainWords == 1 ? 0 : idxSyn0Length - numLabels; - - if (!isInference) - for (int c = starter; c < idxSyn0Length; c++) { - T *syn0word = arguments[0] + (idxSyn0[c] * vectorLength); - - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - syn0word[i] += neu1e[i]; - } - } - else { - for (int i = threadIdx.x; i < vectorLength; i += blockDim.x) { - inferenceVector[i] += neu1e[i]; - } - } - - } -#endif - }; - -} - -#endif //LIBND4J_AGGREGATE_OPS_H diff --git a/libnd4j/include/ops/declarable/BooleanOp.h b/libnd4j/include/ops/declarable/BooleanOp.h index b341ce394..b741c61c4 100644 --- a/libnd4j/include/ops/declarable/BooleanOp.h +++ b/libnd4j/include/ops/declarable/BooleanOp.h @@ -35,7 +35,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: BooleanOp(const char *name, int numInputs, bool scalar); - ~BooleanOp(); bool evaluate(std::initializer_list args); bool evaluate(std::vector& args); diff --git a/libnd4j/include/ops/declarable/BroadcastableOp.h b/libnd4j/include/ops/declarable/BroadcastableOp.h index bc2cddc59..39435195b 100644 --- a/libnd4j/include/ops/declarable/BroadcastableOp.h +++ 
b/libnd4j/include/ops/declarable/BroadcastableOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: BroadcastableOp(const char *name, int numTArgs, int numIArgs); - ~BroadcastableOp(); ShapeList *calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context& block) override; }; diff --git a/libnd4j/include/ops/declarable/DeclarableCustomOp.h b/libnd4j/include/ops/declarable/DeclarableCustomOp.h index 38cc20e71..49d3735d4 100644 --- a/libnd4j/include/ops/declarable/DeclarableCustomOp.h +++ b/libnd4j/include/ops/declarable/DeclarableCustomOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ~DeclarableCustomOp(); ShapeList* calculateOutputShape(ShapeList* inputShapes, nd4j::graph::Context& block) override = 0; }; diff --git a/libnd4j/include/ops/declarable/DeclarableListOp.h b/libnd4j/include/ops/declarable/DeclarableListOp.h index 6fa4fe086..2d6115027 100644 --- a/libnd4j/include/ops/declarable/DeclarableListOp.h +++ b/libnd4j/include/ops/declarable/DeclarableListOp.h @@ -34,13 +34,12 @@ namespace nd4j { protected: Nd4jStatus validateAndExecute(Context& block) override = 0; - nd4j::NDArray* getZ(Context& block, int inputId); + nd4j::NDArray* getZ(Context& block, int inputId) ; void setupResult(NDArray* array, Context& block); void setupResultList(NDArrayList* arrayList, Context& block); public: DeclarableListOp(int numInputs, int numOutputs, const char* opName, int tArgs, int iArgs); - ~DeclarableListOp(); Nd4jStatus execute(Context* block) override; diff --git a/libnd4j/include/ops/declarable/DeclarableOp.h b/libnd4j/include/ops/declarable/DeclarableOp.h index f8c96d400..5da74860b 100644 --- a/libnd4j/include/ops/declarable/DeclarableOp.h +++ b/libnd4j/include/ops/declarable/DeclarableOp.h @@ -126,7 +126,7 @@ namespace nd4j { 
DeclarableOp(const char *name, bool isLogical); // default testructor - ~DeclarableOp(); + virtual ~DeclarableOp(); // this method returns OpDescriptor, describing this Op instance OpDescriptor *getOpDescriptor(); diff --git a/libnd4j/include/ops/declarable/DeclarableReductionOp.h b/libnd4j/include/ops/declarable/DeclarableReductionOp.h index 4a75c5daf..5306f60eb 100644 --- a/libnd4j/include/ops/declarable/DeclarableReductionOp.h +++ b/libnd4j/include/ops/declarable/DeclarableReductionOp.h @@ -33,7 +33,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(Context& block) override = 0; public: DeclarableReductionOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ~DeclarableReductionOp(); ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; }; diff --git a/libnd4j/include/ops/declarable/LegacyOp.h b/libnd4j/include/ops/declarable/LegacyOp.h index 951f60165..a7c7ad055 100644 --- a/libnd4j/include/ops/declarable/LegacyOp.h +++ b/libnd4j/include/ops/declarable/LegacyOp.h @@ -45,6 +45,7 @@ namespace nd4j { public: LegacyOp(int numInputs); LegacyOp(int numInputs, int opNum); + ~LegacyOp() = default; // All Op classes provide own specific implementation for this method ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override = 0; diff --git a/libnd4j/include/ops/declarable/LogicOp.h b/libnd4j/include/ops/declarable/LogicOp.h index 026afe634..70fa3a6ff 100644 --- a/libnd4j/include/ops/declarable/LogicOp.h +++ b/libnd4j/include/ops/declarable/LogicOp.h @@ -37,7 +37,6 @@ namespace nd4j { Nd4jStatus validateAndExecute(nd4j::graph::Context& block) override; public: LogicOp(const char *name); - ~LogicOp() = default; ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; }; diff --git a/libnd4j/include/ops/declarable/OpTuple.h b/libnd4j/include/ops/declarable/OpTuple.h index e0296dd9c..fc0fd594a 100644 --- 
a/libnd4j/include/ops/declarable/OpTuple.h +++ b/libnd4j/include/ops/declarable/OpTuple.h @@ -29,7 +29,7 @@ namespace nd4j { namespace ops { class ND4J_EXPORT OpTuple { public: - const char * _opName; + std::string _opName; std::vector _inputs; std::vector _outputs; std::vector _tArgs; diff --git a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp index 986b93019..1b949eb35 100644 --- a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp @@ -30,9 +30,10 @@ namespace nd4j { auto y = INPUT_VARIABLE(1); auto z = OUTPUT_VARIABLE(0); - REQUIRE_TRUE(x->isSameShape(y),0, "Axpy: both arguments should have the same shape") + REQUIRE_TRUE(x->isSameShape(y),0, "Axpy: both arguments should have the same shape"); + REQUIRE_TRUE(x->dataType() == y->dataType() && x->dataType() == z->dataType(), 0, "Axpy: all arguments must have the same data type"); - double a = (double) 1.0f; + double a = 1.0; if (block.width() > 2) { auto alpha = INPUT_VARIABLE(2); @@ -41,15 +42,6 @@ namespace nd4j { a = T_ARG(0); } - /* - auto lambda = LAMBDA_TT(_y, _x, a) { - return a * _x + _y; - }; - - y->applyPairwiseLambda(x, lambda, z); - */ - - // FIXME: set proper extras here ExtraArguments arguments({a}); y->applyPairwiseTransform(pairwise::Axpy, x, z, &arguments); @@ -59,9 +51,9 @@ namespace nd4j { DECLARE_TYPES(axpy) { getOpDescriptor() - ->setAllowedInputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedInputTypes(1, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}) - ->setAllowedOutputTypes(0, {DataType::FLOAT32, DataType ::DOUBLE, DataType::HALF}); + ->setAllowedInputTypes(0, {ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_FLOATS}) + ->setAllowedOutputTypes(0, {ALL_FLOATS}); } } } diff --git a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp index 6897f7f77..ad7b7fee2 100644 --- 
a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp @@ -30,14 +30,6 @@ namespace nd4j { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - // TODO: once we add support for multiple dtypes - uncommend this - /* - int it = INT_ARG(0); - DataType newType = DataTypeUtils::fromInt(it); - - input->cast(output, newType); - */ - if(input->isEmpty()){ REQUIRE_TRUE(output->isEmpty(), 0, "If input is empty, output array must also be empty"); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp index bdfdfb6c6..3fd5e2250 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp @@ -78,7 +78,7 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp index a80194eb2..91e9d5a41 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp @@ -77,7 +77,7 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp index 
f027bfca3..eb1a01861 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp @@ -95,11 +95,9 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { seqLen->assign(time); // set each element of seqLen to be equal to time } - std::initializer_list dimsForReverse = timeMajor ? std::initializer_list{0,1} : std::initializer_list{1,0}; - // reverse x nd4j::ops::reverse_sequence reverse; - auto resultsIn = reverse.execute({x, seqLen}, {}, dimsForReverse, {}, false, x->dataType()); + auto resultsIn = timeMajor ? reverse.execute({x, seqLen}, {}, {0, 1}, {}, false, x->dataType()) : reverse.execute({x, seqLen}, {}, {1, 0}, {}, false, x->dataType()); REQUIRE_TRUE (resultsIn->status() == ND4J_STATUS_OK, 0, "dynamic_bidirectional_rnn: there is a problem with reverse on the sequence."); auto revInput = resultsIn->at(0); @@ -109,7 +107,7 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { hBWFinal->assign(resultsBW->at(1)); // reverse hBWtemp - auto resultsOut = reverse.execute({hBWtemp, seqLen}, {}, dimsForReverse, {}); + auto resultsOut = timeMajor ? 
reverse.execute({hBWtemp, seqLen}, {}, {0, 1}, {}) : reverse.execute({hBWtemp, seqLen}, {}, {1, 0}, {}); hBW->assign(resultsOut->at(0)); delete resultsOut; diff --git a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp index fef13d44b..b3c2a93d4 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp @@ -28,7 +28,7 @@ namespace nd4j { namespace ops { CUSTOM_OP_IMPL(reverse_sequence, 2, 1, false, 0, 2) { - + auto input = INPUT_VARIABLE(0); auto seqLengths = INPUT_VARIABLE(1); auto output = OUTPUT_VARIABLE(0); @@ -39,13 +39,13 @@ CUSTOM_OP_IMPL(reverse_sequence, 2, 1, false, 0, 2) { REQUIRE_TRUE(input->rankOf() > 1, 0, "REVERSE_SEQUENSE operation: input array must have rank > 1, but got %i instead !", input->rankOf()); REQUIRE_TRUE(seqLengths->rankOf() == 1, 0, "REVERSE_SEQUENSE operation: input array seqLengths must be 1D vector, that is it must have rank == 1, but got %i instead !", seqLengths->rankOf()); REQUIRE_TRUE(seqLengths->lengthOf() == input->sizeAt(batchDim), 0, "REVERSE_SEQUENSE custom operation: the length of array seqLengths must be equal to the value of batchDim dimension of input array, but got %i and %i correspondingly !", seqLengths->lengthOf(), input->sizeAt(batchDim)); - REQUIRE_TRUE(seqDim != batchDim, 0, "REVERSE_SEQUENSE operation: input integer parameters seqDim and batchDim must be different, but they are %i and %i correspondingly !", seqDim, batchDim); + REQUIRE_TRUE(seqDim != batchDim, 0, "REVERSE_SEQUENSE operation: input integer parameters seqDim and batchDim must be different, but they both are equal to %i !", batchDim); REQUIRE_TRUE(batchDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, input->rankOf()); - 
REQUIRE_TRUE(seqDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, input->rankOf()); + REQUIRE_TRUE(seqDim < input->rankOf(), 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, input->rankOf()); auto maxElem = seqLengths->reduceNumber(reduce::Max); REQUIRE_TRUE(maxElem.e(0) <= input->sizeAt(seqDim), 0, "REVERSE_SEQUENSE operation: max element in seqLengths array must be not greater than value of seqDim dimension of input array !"); - + helpers::reverseSequence(block.launchContext(), input, seqLengths, output, seqDim, batchDim); return Status::OK(); @@ -65,15 +65,15 @@ DECLARE_SHAPE_FN(reverse_sequence) { int seqDim = INT_ARG(0); int batchDim = block.numI() > 1 ? INT_ARG(1) : 0; + REQUIRE_TRUE(batchDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, inShapeInfo[0]); + REQUIRE_TRUE(seqDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, inShapeInfo[0]); REQUIRE_TRUE(inShapeInfo[0] > 1, 0, "REVERSE_SEQUENSE operation: input array must have rank > 1, but got %i instead !", inShapeInfo[0]); REQUIRE_TRUE(seqLenShapeInfo[0] == 1, 0, "REVERSE_SEQUENSE operation: input array seqLengths must be 1D vector, that is it must have rank == 1, but got %i instead !", seqLenShapeInfo[0]); REQUIRE_TRUE(seqLenShapeInfo[1] == inShapeInfo[batchDim+1], 0, "REVERSE_SEQUENSE custom operation: the length of array seqLengths must be equal to the value of batchDim dimension of input array, but got %i and %i correspondingly !", seqLenShapeInfo[1], inShapeInfo[batchDim+1]); - REQUIRE_TRUE(batchDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: 
input integer parameter batchDim must be smaller than input array rank, but got %i and %i correspondingly !", batchDim, inShapeInfo[0]); - REQUIRE_TRUE(seqDim < inShapeInfo[0], 0, "REVERSE_SEQUENSE operation: input integer parameter seqDim must be smaller than input array rank, but got %i and %i correspondingly !", seqDim, inShapeInfo[0]); - + Nd4jLong* outShapeInfo = nullptr; COPY_SHAPE(inShapeInfo, outShapeInfo); - + return SHAPELIST(CONSTANT(outShapeInfo)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index 5b6e6122e..a7123d42f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -150,26 +151,30 @@ namespace helpers { // auto shift = 0; auto rowSize = sizeof(T) * colCount; - PRAGMA_OMP_PARALLEL_FOR - for (int n = 0; n < N; n++) { - int start = rowP->e(n); - int end = rowP->e(n+1); - int shift = n * colCount; - for (int i = start; i < end; i++) { - T const* thisSlice = dataP + colP->e(i) * colCount; - T res = 1; - for (int k = 0; k < colCount; k++) { - auto tempVal = dataP[shift + k] - thisSlice[k];//thisSlice[k]; - res += tempVal * tempVal; + auto func = PRAGMA_THREADS_FOR { + for (auto n = start; n < stop; n += increment) { + int s = rowP->e(n); + int end = rowP->e(n + 1); + int shift = n * colCount; + for (int i = s; i < end; i++) { + T const *thisSlice = dataP + colP->e(i) * colCount; + T res = 1; + + for (int k = 0; k < colCount; k++) { + auto tempVal = dataP[shift + k] - thisSlice[k];//thisSlice[k]; + res += tempVal * tempVal; + } + + res = vals[i] / res; + for (int k = 0; k < colCount; k++) + outputP[shift + k] += ((dataP[shift + k] - thisSlice[k]) * res); } - - res = vals[i] / res; - for (int k = 0; k < colCount; k++) - outputP[shift + k] += ((dataP[shift + k] - thisSlice[k]) * res); + //shift += 
colCount; } - //shift += colCount; - } + }; + + samediff::Threads::parallel_tad(func, 0, N); } void barnes_edge_forces(const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray* output, NDArray const& data) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index bd29094ec..ba0f36eb5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -44,11 +45,9 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, if (inEWS == 1 && outEWS == 1) { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); - PRAGMA_OMP_SIMD_SUM(sum) for (int i = 0; i < length; i++) { outBuff[i] = nd4j::math::nd4j_exp(inBuff[i] - max); sum += outBuff[i]; @@ -60,11 +59,9 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, } else { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i * inEWS]); - PRAGMA_OMP_SIMD_SUM(sum) for (int i = 0; i < length; i++) { T r = nd4j::math::nd4j_exp(inBuff[i * inEWS] - max); outBuff[i * outEWS] = r; @@ -89,19 +86,17 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, T sum = 0.; int length = shape::length(inShapeInfo); -PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); max = nd4j::math::nd4j_max(max, inBuff[offset]); } -PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum)) for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] = nd4j::math::nd4j_exp(inBuff[offset] - max); sum += outBuff[offset]; } -PRAGMA_OMP_SIMD + for (int i = 0; i < length; i++) { const 
Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] /= sum; @@ -151,7 +146,6 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr auto length = shape::length(inShapeInfo); if (inEWS == 1) { - PRAGMA_OMP_SIMD_MAX(max) for (int i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); @@ -212,7 +206,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra } else if(input.isSameShapeStrict(&output)) { - TadPack tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {dimension}); + TadPack tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimension); Nd4jLong* tadShapeInfo = tadPack.primaryShapeInfo(); Nd4jLong* tadOffsets = tadPack.primaryOffsets(); const uint numOfSubArrs = tadPack.numberOfTads(); @@ -220,27 +214,30 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra if(shape::elementWiseStride(tadShapeInfo) == 1){ - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfSubArrs; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { - T* inBuff = input.bufferAsT() + tadOffsets[i]; - T* outBuff = output.bufferAsT() + tadOffsets[i]; + T *inBuff = input.bufferAsT() + tadOffsets[i]; + T *outBuff = output.bufferAsT() + tadOffsets[i]; - T max = -DataTypeUtils::max(); - T sum = 0; + T max = -DataTypeUtils::max(); + T sum = 0; - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[j]); - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[j] - max); + outBuff[j] = temp; + sum += temp; + } + + for (uint j = 0; j < tadLen; ++j) + outBuff[j] /= sum; } + }; - for (uint j = 0; j < tadLen; ++j) - 
outBuff[j] /= sum; - } + samediff::Threads::parallel_tad(func,0, numOfSubArrs); } else { @@ -250,29 +247,30 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra auto offsets = new Nd4jLong[tadLen]; shape::calcOffsets(tadShapeInfo, offsets); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfSubArrs; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inBuff = input.bufferAsT() + tadOffsets[i]; + auto outBuff = output.bufferAsT() + tadOffsets[i]; - T* inBuff = input.bufferAsT() + tadOffsets[i]; - T* outBuff = output.bufferAsT() + tadOffsets[i]; + T max = -DataTypeUtils::max(); + T sum = 0.f; - T max = -DataTypeUtils::max(); - T sum = 0.f; + for (uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); + for (uint j = 0; j < tadLen; ++j) { + T temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); + outBuff[offsets[j]] = temp; + sum += temp; + } - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - - for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); - outBuff[offsets[j]] = temp; - sum += temp; + for (uint j = 0; j < tadLen; ++j) + outBuff[offsets[j]] /= sum; } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); - for (uint j = 0; j < tadLen; ++j) - outBuff[offsets[j]] /= sum; - } delete []offsets; } } @@ -299,16 +297,19 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a const Nd4jLong* inputShapeInfo = input.getShapeInfo(); const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); - PRAGMA_OMP_PARALLEL_FOR_IF(inputLen > Environment::getInstance()->elementwiseThreshold()) - for(Nd4jLong i = 0; i < inputLen; ++i) { - // FIXME: double! 
- double x = input.e(i); - if(x < 0.0) { - // FIXME: double - output.p(i, (x * alpha.e(shape::subArrayIndex(i, inputShapeInfo, alphaShapeInfo)))); - } else - output.p(i, x); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + // FIXME: double! + double x = input.e(i); + if (x < 0.0) { + // FIXME: double + output.p(i, (x * alpha.e(shape::subArrayIndex(i, inputShapeInfo, alphaShapeInfo)))); + } else + output.p(i, x); + } + }; + + samediff::Threads::parallel_for(func, 0, inputLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index 0e6e1f777..a36330fbe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -20,6 +20,7 @@ #include +#include namespace nd4j { namespace ops { @@ -62,12 +63,15 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, if(inOutAreSame) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint h = start_z; h < stop_z; h += inc_z) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); } else { @@ -76,12 +80,15 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; const Nd4jLong xStrideW = isNCHW ? 
input.stridesOf()[3] : input.stridesOf()[2]; - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint h = start_z; h < stop_z; h += inc_z) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oH, 1); } } else if(output.rankOf() == 5) { @@ -98,13 +105,16 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, if(inOutAreSame) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint d = 0; d < oD ; ++d) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint d = start_z; d < stop_z; d += inc_z) + for (uint h = 0; h < oH; ++h) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] += static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); } else { @@ -114,13 +124,16 @@ static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; const Nd4jLong xStrideW = isNCHW ? 
input.stridesOf()[4] : input.stridesOf()[3]; - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) - for(uint c = 0; c < C; ++c) - for(uint d = 0; d < oD ; ++d) - for(uint h = 0; h < oH ; ++h) - for(uint w = 0; w < oW ; ++w) - z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + d*xStrideD + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) + for (uint c = start_y; c < stop_y; c += inc_y) + for (uint d = start_z; d < stop_z; d += inc_z) + for (uint h = 0; h < oH; ++h) + for (uint w = 0; w < oW; ++w) + z[b * zStrideB + c * zStrideC + d * zStrideD + h * zStrideH + w * zStrideW] = x[b * xStrideB + c * xStrideC + d * xStrideD + h * xStrideH + w * xStrideW] + static_cast(y[c * yStrideC]); + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, C, 1, 0, oD, 1); } } else { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp index 5484d822d..ae76f0289 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp @@ -21,6 +21,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -38,50 +39,55 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T h, s, v; - T h, s, v; + rgbToHsv(x[i], x[i + 1], x[i + 2], h, s, v); - rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); + h += delta * 360; + if (h > 360) + h -= 360; + else if (h < 0) + h += 360; - h += delta * 360; - if(h > 360) - h -= 360; - else if(h < 0) - h += 360; + hsvToRgb(h, s, v, z[i], 
z[i + 1], z[i + 2]); + } + }; - hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); - } + samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC]; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < numOfTads; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { - const T* xTad = x + packX.platformOffsets()[i]; - T* zTad = z + packZ.platformOffsets()[i]; + const T *xTad = x + packX.platformOffsets()[i]; + T *zTad = z + packZ.platformOffsets()[i]; - T h, s, v; + T h, s, v; - rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); - h += delta * 360; - if(h > 360) - h -= 360; - else if(h < 0) - h += 360; + h += delta * 360; + if (h > 360) + h -= 360; + else if (h < 0) + h += 360; - hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - } + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp index 9a5141a82..d4b0de398 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp @@ -22,6 +22,7 @@ #include #include #include +#include 
namespace nd4j { @@ -39,50 +40,51 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T h, s, v; - T h, s, v; + rgbToHsv(x[i], x[i + 1], x[i + 2], h, s, v); - rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); + s *= factor; + if (s > 1.f) + s = 1.f; + else if (s < 0.f) + s = 0.f; - s *= factor; - if(s > 1.f) - s = 1.f; - else if(s < 0.f) - s = 0.f; + hsvToRgb(h, s, v, z[i], z[i + 1], z[i + 2]); + } + }; - hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); - } - } - else { - - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); + } else { + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; const Nd4jLong zDimCstride = output->stridesOf()[dimC]; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < numOfTads; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + const T *xTad = x + packX.platformOffsets()[i]; + T *zTad = z + packZ.platformOffsets()[i]; - const T* xTad = x + packX.platformOffsets()[i]; - T* zTad = z + packZ.platformOffsets()[i]; + T h, s, v; - T h, s, v; + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); - rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + s *= factor; + if (s > 1.f) + s = 
1.f; + else if (s < 0.f) + s = 0.f; - s *= factor; - if(s > 1.f) - s = 1.f; - else if(s < 0.f) - s = 0.f; + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); + } + }; - hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp index ffd75e435..b408da720 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -92,25 +93,28 @@ void bgemm_(const std::vector& vA, const std::vector& vB, st int vaSize = vA.size(); - PRAGMA_OMP_PARALLEL_FOR - for (int p = 0; p < vaSize; ++p) { - auto A = reinterpret_cast(vA.at(p)->buffer()); - auto B = reinterpret_cast(vB.at(p)->buffer()); - auto C = reinterpret_cast(vC.at(p)->buffer()); - auto alpha = alphas->e(p); - auto beta = betas->e(p); - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - T c_mnp = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto p = start; p < stop; p += increment) { + auto A = reinterpret_cast(vA.at(p)->buffer()); + auto B = reinterpret_cast(vB.at(p)->buffer()); + auto C = reinterpret_cast(vC.at(p)->buffer()); + auto alpha = alphas->e(p); + auto beta = betas->e(p); + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + T c_mnp = 0; - PRAGMA_OMP_SIMD - for (int k = 0; k < K; ++k) - c_mnp += A[tA == CblasNoTrans ? (m + k * lda) : (m * lda + k)] * B[tB == CblasNoTrans ? (k + n * ldb) : (k * ldb + n)]; + PRAGMA_OMP_SIMD + for (int k = 0; k < K; ++k) + c_mnp += A[tA == CblasNoTrans ? (m + k * lda) : (m * lda + k)] * B[tB == CblasNoTrans ? 
(k + n * ldb) : (k * ldb + n)]; - C[m + n * ldc] = alpha * c_mnp + beta * C[m + n * ldc]; + C[m + n * ldc] = alpha * c_mnp + beta * C[m + n * ldc]; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, vaSize); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index a0847f704..7a0d8b97b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -71,9 +72,8 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* if(beta != nullptr) { const T* betaBuff = beta->bufferAsT(); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - const auto threadNum = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + const auto threadNum = thread_id; Nd4jLong* inOffsets = new Nd4jLong[step]; Nd4jLong* memBuff = new Nd4jLong[2 * inShapeInfo[0]]; @@ -98,17 +98,17 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* } delete []inOffsets; delete []memBuff; - } + }; + + samediff::Threads::parallel_do(func, info._numThreads); } else { - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - const auto threadNum = omp_get_thread_num(); + auto func = PRAGMA_THREADS_DO { + const auto threadNum = thread_id; Nd4jLong* inOffsets = new Nd4jLong[step]; Nd4jLong* memBuff = new Nd4jLong[2 * inShapeInfo[0]]; for (int j = 0; j < lenSmall; ++j) { - const bool isOwner = j < info._numThreads ? 
threadNum == j : threadNum == j % info._numThreads; if (!isOwner) continue; @@ -128,7 +128,9 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* } delete []inOffsets; delete []memBuff; - } + }; + + samediff::Threads::parallel_do(func, info._numThreads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index bba3e8acb..ddd1ad892 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -84,7 +85,7 @@ static T continuedFraction(const T a, const T b, const T x) { return f; } - return 1.f / 0.f; // no convergence, more iterations is required + return std::numeric_limits::infinity(); // no convergence, more iterations is required } /////////////////////////////////////////////////////////////////// @@ -121,9 +122,12 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) - output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); + }; + + samediff::Threads::parallel_for(func, 0, xLen); } /////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index b4a54ad7a..5aad38da8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -56,64 +57,77 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp 
memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - T *col, *im; - int imRow, imCol; // if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { if (false) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *im; + int imRow, imCol; - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; + imRow = (-pH + kRow * dH) + colH * sH; + imCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; + col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * colStride4 + colW * colStride5; + im = imBuff + b * imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + + if (static_cast(imRow) < static_cast(iH) && + static_cast(imCol) < static_cast(iW)) + *im += *col; + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(firstprivate(imRow, imCol)) - for (int b = 0; b < bS; ++b) { - T* im0 = imBuff + b*imStride0; - T* col4 = colBuff + b*colStride0; - 
for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { - T* col5 = col4; - for (int colW = 0; colW < oW; ++colW, col5 += colStride5) { - T* col1 = col5; - T* im1 = im0; - for (int c = 0; c < iC; ++c, col1 += colStride1, im1 += imStride1) { - int imRow = (-pH + colH*sH); - T* col2 = col1; - T* im2 = im1 + imRow*imStride2; - for (int kRow = 0; kRow < kH; ++kRow, col2 += colStride2, imRow += dH, im2 += dH*imStride2) { - int imCol =-pW + colW*sW; - T* col3 = col2; - T* im3 = im2 + imCol*imStride3; - for (int kCol = 0; kCol < kW; ++kCol, col3 += colStride3, imCol += dW, im3 += dW*imStride3) { + auto func = PRAGMA_THREADS_FOR { + T *col, *im; - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im3 += *col3; + for (uint b = start; b < stop; b += increment) { + T *im0 = imBuff + b * imStride0; + T *col4 = colBuff + b * colStride0; + for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { + T *col5 = col4; + for (int colW = 0; colW < oW; ++colW, col5 += colStride5) { + T *col1 = col5; + T *im1 = im0; + for (int c = 0; c < iC; ++c, col1 += colStride1, im1 += imStride1) { + int imRow = (-pH + colH * sH); + T *col2 = col1; + T *im2 = im1 + imRow * imStride2; + for (int kRow = 0; + kRow < kH; ++kRow, col2 += colStride2, imRow += dH, im2 += dH * imStride2) { + int imCol = -pW + colW * sW; + T *col3 = col2; + T *im3 = im2 + imCol * imStride3; + for (int kCol = 0; + kCol < kW; ++kCol, col3 += colStride3, imCol += dW, im3 += dW * imStride3) { + + if (static_cast(imRow) < static_cast(iH) && + static_cast(imCol) < static_cast(iW)) + *im3 += *col3; + } } } } - } + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, bS); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp index 50a11f767..5f7fbf694 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp @@ -15,6 +15,7 
@@ ******************************************************************************/ #include +#include namespace nd4j { namespace ops { @@ -26,26 +27,38 @@ namespace nd4j { int elementsPerThread = length / ELEMENT_THRESHOLD; int num_threads = nd4j::math::nd4j_max(1, elementsPerThread); num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - Nd4jLong sum = 0; + Nd4jLong sumt = 0; if(isStrictlyIncreasing) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) - for (Nd4jLong i = 0; i < length - 1; i++) { - auto val0 = input->t(i); - auto val1 = input->t(i + 1); - sum += val0 >= val1 ? -1 : 0; - } + //PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong sum = 0; + for (auto i = start; i < stop; i++) { + auto val0 = input->t(i); + auto val1 = input->t(i + 1); + sum += val0 >= val1 ? -1 : 0; + } + return sum; + }; + sumt = samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) - for (Nd4jLong i = 0; i < length - 1; i++) { - auto val0 = input->t(i); - auto val1 = input->t(i + 1); - sum += val0 > val1 ? -1 : 0; - } + //PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sum) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong sum = 0; + for (auto i = start; i < stop; i++) { + auto val0 = input->t(i); + auto val1 = input->t(i + 1); + sum += val0 > val1 ? 
-1 : 0; + } + + return sum; + }; + sumt = samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, length - 1); } - output = (sum > -1); + nd4j_printf("Sum: %lld\n", sumt) + + output = (sumt > -1); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp index 859330a9d..e2d24c591 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { @@ -30,13 +31,16 @@ namespace helpers { std::unique_ptr arrs(output->allTensorsAlongDimension({1})); int lLen = labels->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(lLen > Environment::getInstance()->elementwiseThreshold()) - for (int j = 0; j < lLen; ++j){ - auto label = labels->e(j); - auto pred = predictions->e(j); - T value = (weights == nullptr ? (T)1.0f : weights->e(j)); - (*arrs->at(label)).p(pred, value); - } + auto func = PRAGMA_THREADS_FOR { + for (int j = start; j < stop; j += increment) { + auto label = labels->e(j); + auto pred = predictions->e(j); + T value = (weights == nullptr ? 
(T) 1.0f : weights->e(j)); + (*arrs->at(label)).p(pred, value); + } + }; + + samediff::Threads::parallel_for(func, 0, lLen); } void confusionFunctor(nd4j::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index 93d00220e..0829bcbe6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -62,32 +63,34 @@ namespace nd4j { T* colBuff = columns.bufferAsT(); T* volBuff = const_cast(volume).bufferAsT(); - T *col, *vol; - int volDep, volRow, volCol; - if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) + if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, vol, volDep, volRow, volCol) collapse(2)) - for (int b = 0; b < bS; ++b) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_3D { + T *col, *vol; + int volDep, volRow, volCol; - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int kDep = start_z; kDep < stop_z; kDep += inc_z) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int 
kCol = 0; kCol < kW; ++kCol) { + for (int colD = 0; colD < oD; ++colD) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) - *col = static_cast(0.); - else { - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *col = *vol; + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + + if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) + *col = static_cast(0.); + else { + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *col = *vol; + } } } } @@ -96,31 +99,36 @@ namespace nd4j { } } } - } + }; - else + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, kD, 1); - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(vol, col, volDep, volRow, volCol)) - for (int b = 0; b < bS; b++) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + } else { - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *vol; + int volDep, volRow, volCol; + for (int b = start_x; b < stop_x; b++) { + for (int 
colD = start_y; colD < stop_y; colD++) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) - *col = static_cast(0.); - else { - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *col = *vol; + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + + if (static_cast(volDep) >= static_cast(iD) || static_cast(volRow) >= static_cast(iH) || static_cast(volCol) >= static_cast(iW)) + *col = static_cast(0.f); + else { + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + *col = *vol; + } } } } @@ -129,7 +137,11 @@ namespace nd4j { } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oD, 1); + //func(0, 0, bS, 1, 0, oD, 1); + } } ////////////////////////////////////////////////////////////////////////// @@ -168,29 +180,31 @@ namespace nd4j { T* volBuff = volume.bufferAsT(); T* colBuff = const_cast(columns).bufferAsT(); - T* col, *vol; - int volDep, volRow, volCol; - if (volume.ordering() == 'c' && columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) + if (volume.ordering() == 'c' && columns.ordering() == 'c' && 
shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) { - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, vol, volDep, volRow, volCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR { + T* col, *vol; + int volDep, volRow, volCol; - volDep = -pD + kDep * dD + colD * sD; - volRow = -pH + kRow * dH + colH * sH; - volCol = -pW + kCol * dW + colW * sW; + for (int b = start; b < stop; b++) { + for (int c = 0; c < iC; c++) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colD = 0; colD < oD; ++colD) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *vol += *col; + volDep = -pD + kDep * dD + colD * sD; + volRow = -pH + kRow * dH + colH * sH; + volCol = -pW + kCol * dW + colW * sW; + + if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4; + 
*vol += *col; + } } } } @@ -199,28 +213,34 @@ namespace nd4j { } } } - } + }; - else + samediff::Threads::parallel_tad(func, 0, bS); - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(vol, col, volDep, volRow, volCol)) - for (int b = 0; b < bS; b++) { - for (int colD = 0; colD < oD; ++colD) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kDep = 0; kDep < kD; ++kDep) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + } else { - volDep = (-pD + kDep * dD) + colD*sD; - volRow = (-pH + kRow * dH) + colH*sH; - volCol = (-pW + kCol * dW) + colW*sW; + auto func = PRAGMA_THREADS_FOR { + T* col, *vol; + int volDep, volRow, volCol; - if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { - col = colBuff + b*colStride0 + c*colStride1 + kDep*colStride2 + kRow*colStride3 + kCol*colStride4 + colD*colStride5 + colH*colStride6 + colW*colStride7; - vol = volBuff + b*volStride0 + c*volStride1 + volDep*volStride2 + volRow*volStride3 + volCol*volStride4; - *vol += *col; + for (int b = start; b < stop; b++) { + for (int colD = 0; colD < oD; colD++) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kDep = 0; kDep < kD; ++kDep) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + + volDep = (-pD + kDep * dD) + colD * sD; + volRow = (-pH + kRow * dH) + colH * sH; + volCol = (-pW + kCol * dW) + colW * sW; + + if (static_cast(volDep) < static_cast(iD) && static_cast(volRow) < static_cast(iH) && static_cast(volCol) < static_cast(iW)) { + col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7; + vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + 
volCol * volStride4; + *vol += *col; + } } } } @@ -229,7 +249,10 @@ namespace nd4j { } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, bS); + } } @@ -568,22 +591,24 @@ namespace nd4j { const Nd4jLong zStride2 = output.stridesOf()[dimIH]; const Nd4jLong zStride3 = output.stridesOf()[dimIH + 1]; - uint xCoord2, xCoord3; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4) private(xCoord2, xCoord3)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint h = 0; h < oH ; ++h) { - for(uint w = 0; w < oW ; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + uint xCoord2, xCoord3; + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint h = start_z; h < stop_z; h += inc_z) { + for (uint w = 0; w < oW; ++w) { + xCoord2 = h / factorH; + xCoord3 = w / factorW; - xCoord2 = h / factorH; - xCoord3 = w / factorW; - - z[b*zStride0 + c*zStride1 + h*zStride2 + w*zStride3] = x[b*xStride0 + c*xStride1 + xCoord2*xStride2 + xCoord3*xStride3]; + z[b * zStride0 + c * zStride1 + h * zStride2 + w * zStride3] = x[b * xStride0 + c * xStride1 + xCoord2 * xStride2 + xCoord3 * xStride3]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oH, 1); } ////////////////////////////////////////////////////////////////////////// @@ -616,25 +641,31 @@ namespace nd4j { const Nd4jLong zStride3 = output.stridesOf()[dimID + 1]; const Nd4jLong zStride4 = output.stridesOf()[dimID + 2]; - uint xCoord2, xCoord3, xCoord4; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5) private(xCoord2, xCoord3, xCoord4)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint d = 0; d < oD ; ++d) { - for(uint h = 0; h < oH ; ++h) { - for(uint w = 0; w < oW ; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + uint xCoord2, xCoord3, xCoord4; - xCoord2 = d / factorD; - xCoord3 = h / factorH; - xCoord4 = w / factorW; + for (uint b = start_x; b < stop_x; b += 
inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint d = start_z; d < stop_z; d += inc_z) { + for (uint h = 0; h < oH; ++h) { + for (uint w = 0; w < oW; ++w) { - z[b*zStride0 + c*zStride1 + d*zStride2 + h*zStride3 + w*zStride4] = x[b*xStride0 + c*xStride1 + xCoord2*xStride2 + xCoord3*xStride3 + xCoord4*xStride4]; + xCoord2 = d / factorD; + xCoord3 = h / factorH; + xCoord4 = w / factorW; + + z[b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4] = x[ + b * xStride0 + c * xStride1 + xCoord2 * xStride2 + xCoord3 * xStride3 + + xCoord4 * xStride4]; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } ////////////////////////////////////////////////////////////////////////// @@ -668,23 +699,26 @@ namespace nd4j { const Nd4jLong zStride3 = gradI.stridesOf()[dimIH + 1]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint h = 0; h < iH; ++h) { - for(uint w = 0; w < iW; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint h = start_z; h < stop_z; h += inc_z) { + for (uint w = 0; w < iW; ++w) { - const auto zOffset = b*zStride0 + c*zStride1 + h*zStride2 + w*zStride3; + const auto zOffset = b * zStride0 + c * zStride1 + h * zStride2 + w * zStride3; - z[zOffset] = 0; + z[zOffset] = 0; - for(uint xh = h * factorH; xh < h * factorH + factorH; ++xh) - for(uint xw = w * factorW; xw < w * factorW + factorW; ++xw) - z[zOffset] += x[b*xStride0 + c*xStride1 + xh*xStride2 + xw*xStride3]; + for (uint xh = h * factorH; xh < h * factorH + factorH; ++xh) + for (uint xw = w * factorW; xw < w * factorW + factorW; ++xw) + z[zOffset] += x[b * xStride0 + c * xStride1 + xh * xStride2 + xw * xStride3]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iH, 1); } 
////////////////////////////////////////////////////////////////////////// @@ -723,26 +757,29 @@ namespace nd4j { const Nd4jLong zStride4 = gradI.stridesOf()[dimID + 2]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) - for(uint b = 0; b < bS; ++b) { - for(uint c = 0; c < iC; ++c) { - for(uint d = 0; d < iD; ++d) { - for(uint h = 0; h < iH; ++h) { - for(uint w = 0; w < iW; ++w) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint c = start_y; c < stop_y; c += inc_y) { + for (uint d = start_z; d < stop_z; d += inc_z) { + for (uint h = 0; h < iH; ++h) { + for (uint w = 0; w < iW; ++w) { - const auto zOffset = b*zStride0 + c*zStride1 + d*zStride2 + h*zStride3 + w*zStride4; + const auto zOffset = b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4; - z[zOffset] = 0; + z[zOffset] = 0; - for(uint xd = d * factorD; xd < d * factorD + factorD; ++xd) - for(uint xh = h * factorH; xh < h * factorH + factorH; ++xh) - for(uint xw = w * factorW; xw < w * factorW + factorW; ++xw) - z[zOffset] += x[b*xStride0 + c*xStride1 + xd*xStride2 + xh*xStride3 + xw*xStride4]; + for (uint xd = d * factorD; xd < d * factorD + factorD; ++xd) + for (uint xh = h * factorH; xh < h * factorH + factorH; ++xh) + for (uint xw = w * factorW; xw < w * factorW + factorW; ++xw) + z[zOffset] += x[b * xStride0 + c * xStride1 + xd * xStride2 + xh * xStride3 + xw * xStride4]; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iD, 1); } ////////////////////////////////////////////////////////////////////////// @@ -779,142 +816,156 @@ namespace nd4j { const Nd4jLong iStep3 = dW*iStride3; const int kProd = kH*kW; - Nd4jLong hstart, wstart, hend, wend; - T *pIn; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int 
ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T max = -DataTypeUtils::max(); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T val = pIn[kh + kw]; - if (val > 
max) - max = val; - } - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = max; + T max = -DataTypeUtils::max(); + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { + T val = pIn[kh + kw]; + if (val > max) + max = val; + } + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = max; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + 
dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T sum = static_cast(0.f); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += pIn[kh + kw]; + T sum = static_cast(0.f); - if (extraParam0 == 0) { //Exclude padding - int a = (hend-hstart)/iStep2 + ((hend-hstart) % iStep2 == 0 ? 0 : 1); - int b = (wend-wstart)/iStep3 + ((wend-wstart) % iStep3 == 0 ? 0 : 1); - sum /= static_cast(a * b); // Accounts for dilation + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += pIn[kh + kw]; + + if (extraParam0 == 0) { //Exclude padding + int a = (hend - hstart) / iStep2 + ((hend - hstart) % iStep2 == 0 ? 0 : 1); + int r = (wend - wstart) / iStep3 + ((wend - wstart) % iStep3 == 0 ? 
0 : 1); + sum /= static_cast(a * r); // Accounts for dilation + } else if (extraParam0 == 1) //Include padding + sum /= kProd; + + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } - else if (extraParam0 == 1) //Include padding - sum /= kProd; - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend; + T *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // 
(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - T sum = static_cast(0.f); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + T sum = static_cast(0.f); - sum = nd4j::math::nd4j_pow(sum, static_cast((T)1.f) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; + sum = nd4j::math::nd4j_pow(sum, static_cast((T) 1.f) / extraParam0); + + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { nd4j_printf("ConvolutionUtils::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); @@ -961,176 +1012,192 @@ namespace nd4j { const Nd4jLong iStep4 = dW*iStride4; const int kProd = kD*kH*kW; - Nd4jLong dstart, hstart, wstart, dend, hend, wend; - T sum, *pIn; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func 
= PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = -DataTypeUtils::max(); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { - T val = pIn[kd + kh + kw]; - if (val > sum) - sum = val; - } - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * 
oStride3 + ow * oStride4] = sum; + sum = -DataTypeUtils::max(); + + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { + T val = pIn[kd + kh + kw]; + if (val > sum) + sum = val; + } + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= 
iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = static_cast(0.); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += pIn[kd + kh + kw]; + sum = static_cast(0.); - if (extraParam0 == 0) //Exclude padding - sum /= nd4j::math::nd4j_ceil(static_cast(dend-dstart) / static_cast(iStep2)) * nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep3)) * nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(iStep4)); //Accounts for dilation - else if (extraParam0 == 1) //Include padding - sum /= kProd; + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += pIn[kd + kh + kw]; - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + if (extraParam0 == 0) //Exclude padding + sum /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(iStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(iStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(iStep4)); //Accounts for dilation + else if (extraParam0 == 1) //Include padding + sum /= kProd; + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } 
/*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend; + T sum, *pIn; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - sum = static_cast(0.); + dstart *= iStride2; + dend *= 
iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + sum = static_cast(0.); - sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + + out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } else { nd4j_printf("ConvolutionUtils::pooling3d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; + throw std::runtime_error("Incorrect poooling3d mode"); } } @@ -1182,191 +1249,230 @@ namespace nd4j { const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3; - Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW; - T sum, valO, *pIn, *pgI; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, sum, hstart, wstart, hend, wend, maxKH, maxKW)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + 
for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = -DataTypeUtils::max(); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - if(sameStrides) { + sum = -DataTypeUtils::max(); + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (sameStrides) { - // we set these to default values - maxKH = hstart; - maxKW = wstart; + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T valIn 
= pIn[kh + kw]; - if (valIn > sum) { - sum = valIn; - maxKH = kh; - maxKW = kw; + // we set these to default values + maxKH = hstart; + maxKW = wstart; + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { + T valIn = pIn[kh + kw]; + if (valIn > sum) { + sum = valIn; + maxKH = kh; + maxKW = kw; + } } - } - gI[pIn - in + maxKH + maxKW] += valO; - } - else { + gI[pIn - in + maxKH + maxKW] += valO; + } else { - // we set these to default values - maxKH = hstart; - maxKW = wstart; + // we set these to default values + maxKH = hstart; + maxKW = wstart; - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - T valIn = pIn[kh * iStride2 + kw * iStride3]; - if (valIn > sum) { - sum = valIn; - maxKH = kh; - maxKW = kw; + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + T valIn = pIn[kh * iStride2 + kw * iStride3]; + if (valIn > sum) { + sum = valIn; + maxKH = kh; + maxKW = kw; + } } - } - gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO; + + gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pgI, valO, hstart, wstart, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pgI = gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart 
= ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pgI = gI + b * gIStride0 + c * gIStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - hstart *= gIStride2; - hend *= gIStride2; - wstart *= gIStride3; - wend *= gIStride3; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / + dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / + dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + hstart *= gIStride2; + hend *= gIStride2; + wstart *= gIStride3; + wend *= gIStride3; - if ((int) extraParam0 == 0) //Exclude padding - valO /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(gIStep2))) * static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(gIStep3))); //Accounts for dilation - else if ((int) extraParam0 == 1) //Include padding - valO /= kProd; + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2) - 
for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3) - pgI[kh + kw] += valO; + if ((int) extraParam0 == 0) //Exclude padding + valO /= static_cast(nd4j::math::nd4j_ceil( + static_cast(hend - hstart) / static_cast(gIStep2))) * + static_cast(nd4j::math::nd4j_ceil( + static_cast(wend - wstart) / + static_cast(gIStep3))); //Accounts for dilation + else if ((int) extraParam0 == 1) //Include padding + valO /= kProd; + + for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3) + pgI[kh + kw] += valO; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, pgI, sum, hstart, wstart, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_2D { + Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; - pgI = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; + pgI = sameStrides ? 
gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1; - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * ((-wstart + dW -1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + hstart = oh * sH - pH; + wstart = ow * sW - pW; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = static_cast(0.f); - valO = gO[b*oStride0 + c*oStride1 + oh*oStride2 + ow*oStride3]; + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / + dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / + dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / + dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - if(sameStrides) { + sum = static_cast(0.f); + valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; + if (sameStrides) { - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + hstart *= iStride2; + hend *= iStride2; + wstart *= iStride3; + wend *= iStride3; - valO *= nd4j::math::nd4j_pow(sum, ((T)1. 
- extraParam0) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + sum += nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - pgI[kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(pIn[kh + kw]); - } - else { + valO *= nd4j::math::nd4j_pow(sum, + ((T) 1. - extraParam0) / extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) + pgI[kh + kw] += valO * nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * + nd4j::math::nd4j_sgn(pIn[kh + kw]); + } else { - valO *= nd4j::math::nd4j_pow(sum, ((T)1. - extraParam0) / extraParam0); + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) + sum += nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), + extraParam0); - for (Nd4jLong kh = hstart; kh < hend; kh += dH) { - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - const auto inVal = pIn[kh * iStride2 + kw * iStride3]; - pgI[kh * gIStride2 + kw * gIStride3] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); + valO *= nd4j::math::nd4j_pow(sum, + ((T) 1. 
- extraParam0) / extraParam0); + + for (Nd4jLong kh = hstart; kh < hend; kh += dH) { + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + const auto inVal = pIn[kh * iStride2 + kw * iStride3]; + pgI[kh * gIStride2 + kw * gIStride3] += valO * + nd4j::math::nd4j_pow( + nd4j::math::nd4j_abs( + inVal), + extraParam0 - 1.f) * + nd4j::math::nd4j_sgn( + inVal); + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { nd4j_printf("ConvolutionUtils::pooling2dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; + throw std::runtime_error("Incorrect pooling2dBP mode"); } } @@ -1425,226 +1531,239 @@ namespace nd4j { const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4; - Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; - T sum, valO, *pIn, *pgI; - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, valO, sum, dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / 
dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = -DataTypeUtils::max(); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - if(sameStrides) { + sum = -DataTypeUtils::max(); + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (sameStrides) { - maxKD = dstart; - maxKH = hstart; - maxKW = wstart; + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { - T valIn = pIn[kd + kh + kw]; - if (valIn > sum) { - sum = valIn; - maxKD = kd; - maxKH = kh; - maxKW = kw; + maxKD = dstart; + maxKH = hstart; + maxKW = wstart; + + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) { + T valIn = pIn[kd + kh + kw]; + if (valIn > sum) { + sum = valIn; + maxKD = kd; + maxKH = kh; + maxKW = kw; + } } - } - gI[pIn - in + maxKD + maxKH + maxKW] += valO; - 
} - else { + gI[pIn - in + maxKD + maxKH + maxKW] += valO; + } else { - // we set these to default values - maxKH = hstart; - maxKW = wstart; - maxKD = dstart; + // we set these to default values + maxKH = hstart; + maxKW = wstart; + maxKD = dstart; - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]; - if (valIn > sum) { - sum = valIn; - maxKD = kd; - maxKH = kh; - maxKW = kw; + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]; + if (valIn > sum) { + sum = valIn; + maxKD = kd; + maxKH = kh; + maxKW = kw; + } } - } - gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO; + + gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO; + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pgI, valO, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pgI = gI + b * gIStride0 + c * gIStride1; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = 
ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pgI = gI + b * gIStride0 + c * gIStride1; - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) - wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - dstart *= gIStride2; - dend *= gIStride2; - hstart *= gIStride3; - hend *= gIStride3; - wstart *= gIStride4; - wend *= gIStride4; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + dstart *= gIStride2; + dend *= gIStride2; + hstart *= gIStride3; + hend *= gIStride3; + wstart *= gIStride4; + wend *= gIStride4; - if (extraParam0 == 0) //Exclude padding - valO /= nd4j::math::nd4j_ceil(static_cast(dend-dstart) / static_cast(gIStep2)) * nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(gIStep3)) * nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(gIStep4)); //Accounts for dilation - else if (extraParam0 == 1) //Include padding - valO /= kProd; + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4) - pgI[kd + kh + kw] += valO; + if 
(extraParam0 == 0) //Exclude padding + valO /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(gIStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(gIStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(gIStep4)); //Accounts for dilation + else if (extraParam0 == 1) //Include padding + valO /= kProd; + + for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4) + pgI[kd + kh + kw] += valO; + } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } /*************************************************************************/ else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, pgI, valO, sum, dstart, hstart, wstart, dend, hend, wend)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int od = 0; od < oD; ++od) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { + auto func = PRAGMA_THREADS_FOR_3D { + Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW; + T sum, valO, *pIn, *pgI; - pIn = in + b * iStride0 + c * iStride1; - pgI = gI + (pIn - in); + for (int b = start_x; b < stop_x; b += inc_x) { + for (int c = start_y; c < stop_y; c += inc_y) { + for (int od = start_z; od < stop_z; od += inc_z) { + for (int oh = 0; oh < oH; ++oh) { + for (int ow = 0; ow < oW; ++ow) { - dstart = od * sD - pD; - hstart = oh * sH - pH; - wstart = ow * sW - pW; - dend = dstart + kDEff; - hend = hstart + kHEff; - wend = wstart + kWEff; + pIn = in + b * iStride0 + c * iStride1; + pgI = gI + (pIn - in); - if(dstart < 0) - dstart += dD * ((-dstart + dD - 1) / dD); - if(hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); - if(wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); - if(dend > iD) - dend -= dD * ((dend-iD + dD - 1) / dD); - if(hend > iH) - hend -= dH * ((hend-iH + dH - 1) / dH); - if(wend > iW) 
- wend -= dW * ((wend-iW + dW - 1) / dW); + dstart = od * sD - pD; + hstart = oh * sH - pH; + wstart = ow * sW - pW; + dend = dstart + kDEff; + hend = hstart + kHEff; + wend = wstart + kWEff; - sum = static_cast(0.); - valO = gO[b*oStride0 + c*oStride1+ od*oStride2 + oh*oStride3 + ow*oStride4]; + if (dstart < 0) + dstart += dD * ((-dstart + dD - 1) / dD); + if (hstart < 0) + hstart += dH * ((-hstart + dH - 1) / dH); + if (wstart < 0) + wstart += dW * ((-wstart + dW - 1) / dW); + if (dend > iD) + dend -= dD * ((dend - iD + dD - 1) / dD); + if (hend > iH) + hend -= dH * ((hend - iH + dH - 1) / dH); + if (wend > iW) + wend -= dW * ((wend - iW + dW - 1) / dW); - if(sameStrides) { + sum = static_cast(0.); + valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; - dstart *= iStride2; - dend *= iStride2; - hstart *= iStride3; - hend *= iStride3; - wstart *= iStride4; - wend *= iStride4; + if (sameStrides) { - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + dstart *= iStride2; + dend *= iStride2; + hstart *= iStride3; + hend *= iStride3; + wstart *= iStride4; + wend *= iStride4; - valO *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) - for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0 - (T)1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]); - } - else { + valO *= 
nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); + for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) + for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) + for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) + pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]),extraParam0 - (T) 1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]); + } else { + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); - valO *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + valO *= nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); - for (Nd4jLong kd = dstart; kd < dend; kd += dD) - for (Nd4jLong kh = hstart; kh < hend; kh += dH) - for (Nd4jLong kw = wstart; kw < wend; kw += dW) { - const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4]; - pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); - } + for (Nd4jLong kd = dstart; kd < dend; kd += dD) + for (Nd4jLong kh = hstart; kh < hend; kh += dH) + for (Nd4jLong kw = wstart; kw < wend; kw += dW) { + const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4]; + pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); + } + } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1); } else { 
nd4j_printf("ConvolutionUtils::pooling3dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp index f61a53f30..3150c0cfd 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp @@ -38,14 +38,17 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray int tads = tadsA->size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < tads; e++) { - auto a_ = tadsA->at(e); - auto b_ = tadsB->at(e); - auto o_ = tadsO->at(e); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto a_ = tadsA->at(e); + auto b_ = tadsB->at(e); + auto o_ = tadsO->at(e); - helpers::cross(context, a_, b_, o_); - } + helpers::cross(context, a_, b_, o_); + } + }; + + samediff::Threads::parallel_tad(func, 0, tads); delete tadsA; delete tadsB; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp index 55cc57d3e..f041452ab 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -44,45 +45,51 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * output_height * output_width * output_depth; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int out_idx = 0; out_idx < total_count; out_idx++) { - const int d = out_idx % output_depth; - const int out_idx2 = out_idx / output_depth; - const int w = out_idx2 % output_width; - const int out_idx3 = out_idx2 / output_width; - const int h = out_idx3 % output_height; - const int b = out_idx3 / output_height; + auto func = PRAGMA_THREADS_FOR { + for (auto out_idx = start; out_idx < stop; out_idx += increment) { + const int d = out_idx % 
output_depth; + const int out_idx2 = out_idx / output_depth; + const int w = out_idx2 % output_width; + const int out_idx3 = out_idx2 / output_width; + const int h = out_idx3 % output_height; + const int b = out_idx3 / output_height; - const int in_h = h / block_size; - const int offset_h = h % block_size; - const int in_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * output_depth; - const int in_d = d + offset_d; - const int inp_idx = in_d + input_depth * (in_w + input_width * (in_h + input_height * b)); - (output_ptr + out_idx)[0] = (input_ptr + inp_idx)[0]; - } + const int in_h = h / block_size; + const int offset_h = h % block_size; + const int in_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * output_depth; + const int in_d = d + offset_d; + const int inp_idx = in_d + input_depth * (in_w + input_width * (in_h + input_height * b)); + (output_ptr + out_idx)[0] = (input_ptr + inp_idx)[0]; + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } else { const int total_count = batch_size * input_depth_by_input_area; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int input_idx = 0; input_idx < total_count; input_idx++) { - const int n_bY_bX_oC_iY = input_idx / input_width; - const int iX = input_idx - n_bY_bX_oC_iY * input_width; + auto func = PRAGMA_THREADS_FOR { + for (int input_idx = start; input_idx < stop; input_idx += increment) { + const int n_bY_bX_oC_iY = input_idx / input_width; + const int iX = input_idx - n_bY_bX_oC_iY * input_width; - const int n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height; - const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height; + const int n_bY_bX = n_bY_bX_oC_iY / output_depth_by_input_height; + const int oC_iY = n_bY_bX_oC_iY - n_bY_bX * output_depth_by_input_height; - const int n_bY = n_bY_bX / block_size; - const int bX = n_bY_bX - n_bY * block_size; + const 
int n_bY = n_bY_bX / block_size; + const int bX = n_bY_bX - n_bY * block_size; - const int n = n_bY / block_size; - const int bY = n_bY - n * block_size; + const int n = n_bY / block_size; + const int bY = n_bY - n * block_size; - const int output_idx = bX + block_size * (iX + input_width * (bY + block_size * (oC_iY + n * output_depth_by_input_height))); + const int output_idx = bX + block_size * (iX + input_width * (bY + block_size * (oC_iY + n * output_depth_by_input_height))); - (output_ptr + output_idx)[0] = (input_ptr + input_idx)[0]; - } + (output_ptr + output_idx)[0] = (input_ptr + input_idx)[0]; + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp index 3a687981e..f2f2033c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp @@ -34,7 +34,6 @@ static void _diagFunctor(const NDArray* input, NDArray* output) { const int inLength = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(inLength > Environment::getInstance()->elementwiseThreshold()) for(int i = 0; i < inLength; ++i) output->p(i * (inLength + 1), (*input).e(i)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp index c75bbf131..f5c0fe71c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -52,33 +53,36 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const const uint oH = output->sizeAt(1); const uint oW = output->sizeAt(2); - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint oh = 0; oh < oH; ++oh) { - for (uint ow = 0; ow < oW; ++ow) { - for (uint c = 0; c < iC; ++c) { + auto func = 
PRAGMA_THREADS_FOR_2D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint oh = start_y; oh < stop_y; oh += inc_y) { + for (uint ow = 0; ow < oW; ++ow) { + for (uint c = 0; c < iC; ++c) { - X max = -DataTypeUtils::max(); + X max = -DataTypeUtils::max(); - for (uint kh = 0; kh < kH; ++kh) { - const int ih = oh * sH - pH + kh * dH; - if (ih < 0 || ih >= iH) continue; + for (uint kh = 0; kh < kH; ++kh) { + const int ih = oh * sH - pH + kh * dH; + if (ih < 0 || ih >= iH) continue; - for (uint kw = 0; kw < kW; ++kw) { - const int iw = ow * sW - pW + kw * dW; - if(iw < 0 || iw >= iW) continue; + for (uint kw = 0; kw < kW; ++kw) { + const int iw = ow * sW - pW + kw * dW; + if (iw < 0 || iw >= iW) continue; - const X val = x[shape::getOffset(xShapeInfo, {b,(uint)ih,(uint)iw,c})] + y[shape::getOffset(yShapeInfo, {kh,kw,c})]; - if (val > max) - max = val; + const X val = x[shape::getOffset(xShapeInfo, {b, (uint) ih, (uint) iw, c})] + y[shape::getOffset(yShapeInfo, {kh, kw, c})]; + if (val > max) + max = val; + } } - } - z[shape::getOffset(zShapeInfo, {b,oh,ow,c})] = static_cast(max); + z[shape::getOffset(zShapeInfo, {b, oh, ow, c})] = static_cast(max); + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } void dilation2d(nd4j::LaunchContext* context, NDArray *input, NDArray *weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index 7b40d0fa7..9db974b36 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -33,13 +34,16 @@ namespace helpers { nd4j::graph::RandomGenerator nodeRng(3019L, seed); int inLen = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(inLen > 
Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < inLen; ++e) { - float val = nodeRng.relativeT(e, T(0.f), T(1.f)); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + float val = nodeRng.relativeT(e, T(0.f), T(1.f)); - if (val < probValue) - output->p(e, input->e(e) / probValue); - } + if (val < probValue) + output->p(e, input->e(e) / probValue); + } + }; + + samediff::Threads::parallel_for(func, 0, inLen); } BUILD_SINGLE_TEMPLATE(template void dropoutSimple, (NDArray const* input, NDArray* output, double probValue, int seed), FLOAT_TYPES); @@ -59,7 +63,6 @@ namespace helpers { std::vector dims(reduceShape->lengthOf()); bool fit = true; - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit)) for( int i = 0; i < dims.size(); i++ ) { if (fit) { dims[i] = reduceShape->e(i); @@ -126,14 +129,17 @@ namespace helpers { //input->template applyRandom>(rng, nullptr, output, probValueArr); nd4j::graph::RandomGenerator nodeRng(3019L, seed); - PRAGMA_OMP_PARALLEL_FOR_IF(input->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); - float xVal = input->e(e); - output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + float randVal = nodeRng.relativeT(e, T(0.f), T(1.f)); + float xVal = input->e(e); + output->p(e, randVal >= probValue ? 
alpha * beta + alpha1 : alpha * xVal + alpha1); + } + }; - return ND4J_STATUS_OK; + samediff::Threads::parallel_for(func, 0, input->lengthOf()); + + return Status::OK(); } template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 2a2b631c8..073167f18 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -18,6 +18,7 @@ // Created by george on 05.04.18. // #include +#include namespace nd4j { namespace ops { @@ -61,14 +62,17 @@ namespace nd4j { } else { unsigned int outSize = outputList.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(outSize > Environment::getInstance()->tadThreshold()) - for (unsigned int i = 0; i < outSize; i++) { - outputs[i].first = outputList[i]; - outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) - if (indices->e(e) == i) - outputs[i].first->p(outputs[i].second++, input->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + outputs[i].first = outputList[i]; + outputs[i].second = 0; + for (int e = 0; e < indices->lengthOf(); ++e) + if (indices->e(e) == i) + outputs[i].first->p(outputs[i].second++, input->e(e)); + } + }; + + samediff::Threads::parallel_tad(func, 0, outSize); } } template @@ -165,14 +169,17 @@ namespace nd4j { auto output = outputList[0]; unsigned int gradsSize = inputGradientList.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(gradsSize > Environment::getInstance()->tadThreshold()) - for (unsigned int i = 0; i < gradsSize; i++) { - outputs[i].first = inputGradientList[i]; - outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) - if (indices->e(e) == i) - output->p(e, outputs[i].first->e(outputs[i].second++)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + outputs[i].first = inputGradientList[i]; + outputs[i].second = 0; + for (int e = 0; e < indices->lengthOf(); ++e) + 
if (indices->e(e) == i) + output->p(e, outputs[i].first->e(outputs[i].second++)); + } + }; + + samediff::Threads::parallel_tad(func, 0, gradsSize); } outputList[1]->assign(indices); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp index f450584d7..f3fe89103 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -47,37 +48,41 @@ namespace helpers { rowCast = 0; if (sizeCol * rateCol < 3) colCast = 0; - //Nd4jLong outputLastDim = output->sizeAt(3); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong batch = 0; batch < batchCount; batch++) { - auto patch = listOfMatricies->at(batch); - auto outMatrix = listOfOutputs->at(batch); - for (Nd4jLong i = 0; i < outRowDim; i++) { - for (Nd4jLong j = 0; j < outColDim; j++) { - Nd4jLong pos = 0; - //for (Nd4jLong k = 0; k < outputLastDim; k++) { - auto rowStart = i * strideRow - (theSame?rowCast:0); - auto colStart = j * strideCol - (theSame?colCast:0); - auto rowEnd = rowStart + sizeRow * rateRow; - auto colEnd = colStart + sizeCol * rateCol; - if (!theSame) { - rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim); - colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim); - } - //auto pixel = 0LL; - for (auto row = rowStart; row < rowEnd; row += rateRow) - for (auto col = colStart; col < colEnd; col += rateCol) - for (auto pixel = 0; pixel < lastDim; pixel++) { - bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame); - if (setUp) { - outMatrix->t(i, j, pos) = patch->e(row, col, pixel); - } - pos++; - } - } - } - } + auto func = PRAGMA_THREADS_FOR { + for (auto batch = 0; batch < stop; batch += increment) { + auto patch = listOfMatricies->at(batch); + auto outMatrix = listOfOutputs->at(batch); + + for (Nd4jLong i = 0; i < 
outRowDim; i++) { + for (Nd4jLong j = 0; j < outColDim; j++) { + Nd4jLong pos = 0; + //for (Nd4jLong k = 0; k < outputLastDim; k++) { + auto rowStart = i * strideRow - (theSame ? rowCast : 0); + auto colStart = j * strideCol - (theSame ? colCast : 0); + auto rowEnd = rowStart + sizeRow * rateRow; + auto colEnd = colStart + sizeCol * rateCol; + if (!theSame) { + rowEnd = math::nd4j_min(rowStart + sizeRow * rateRow, rowDim); + colEnd = math::nd4j_min(colStart + sizeCol * rateCol, colDim); + } + //auto pixel = 0LL; + for (auto row = rowStart; row < rowEnd; row += rateRow) + for (auto col = colStart; col < colEnd; col += rateCol) + for (auto pixel = 0; pixel < lastDim; pixel++) { + bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || + (!theSame); + if (setUp) { + outMatrix->t(i, j, pos) = patch->e(row, col, pixel); + } + pos++; + } + } + } + } + }; + + samediff::Threads::parallel_tad(func, 0, batchCount); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index 1a43fb250..3fb7c290d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -56,12 +57,16 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... 
axis+indices->rankOf()-1 const Nd4jLong numOfSubArrs = indices->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, dimsOut); + NDArray subArrIn = (*input)(indices->e(i), {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } else { @@ -72,12 +77,16 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* } else { // vector case const Nd4jLong numOfSubArrs = intArgs.size() - 1; - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = (*input)(intArgs[i+1], {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, {axis}); + NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp index 687dc0bde..9e3bdf885 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -46,7 +47,7 @@ namespace nd4j { Nd4jLong distance = 0; auto lengthOf = x.lengthOf(); - const int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); + int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); Nd4jLong intermediate[256]; // nullify temp 
values @@ -54,30 +55,38 @@ namespace nd4j { intermediate[e] = 0; if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(xBuffer[e]); - auto _y = static_cast(yBuffer[e]); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(xBuffer[e]); + auto _y = static_cast(yBuffer[e]); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(xBuffer[e * xEws]); - auto _y = static_cast(yBuffer[e * yEws]); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(xBuffer[e * xEws]); + auto _y = static_cast(yBuffer[e * yEws]); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < lengthOf; e++) { - auto _x = static_cast(x.e(e)); - auto _y = static_cast(y.e(e)); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto _x = static_cast(x.e(e)); + auto _y = static_cast(y.e(e)); - intermediate[omp_get_thread_num()] += hamming_distance(_x, _y); - } + intermediate[thread_id] += hamming_distance(_x, _y); + } + }; + + maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf); } // accumulate intermediate variables into output array diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index b254788a8..04df86c36 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -40,18 +41,20 @@ namespace nd4j { auto tempResult = tempBufferB; // we divide array into 32 element chunks, and store intermediate results once - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int b = 0; b < numBlocks; b++) { - auto blockBuffer = buffer + b * numBlocks; + auto func = PRAGMA_THREADS_FOR { + for (auto b = 0; b < stop; b += increment) { + auto blockBuffer = buffer + b * numBlocks; - Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { - auto v = longBytes(blockBuffer[e]); - r = 31 * r + v; + Nd4jLong r = 1; + for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempBuffer[b] = r; } - - tempBuffer[b] = r; - } + }; + samediff::Threads::parallel_tad(func, 0, numBlocks); // we replace pointer with intermediate one, and repeat only one chunk left int iterationCount = 0; @@ -60,18 +63,20 @@ namespace nd4j { numBlocks = lastLength / blockSize + ((lastLength % blockSize == 0) ? 
0 : 1); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int b = 0; b < numBlocks; b++) { - auto blockBuffer = tempBuffer + b * numBlocks; + auto func2 = PRAGMA_THREADS_FOR { + for (auto b = start; b < stop; b += increment) { + auto blockBuffer = tempBuffer + b * numBlocks; - Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { - auto v = longBytes(blockBuffer[e]); - r = 31 * r + v; + Nd4jLong r = 1; + for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempResult[b] = r; } - - tempResult[b] = r; - } + }; + samediff::Threads::parallel_tad(func2, 0, numBlocks); iterationCount++; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp index 349d0381a..1ffb59824 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp @@ -42,29 +42,17 @@ void histogramFixedWidth_(const NDArray& input, const NDArray& range, NDArray& o Nd4jLong inputLength = input.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR + // FIXME: make this one parallel without CRITICAL section for(Nd4jLong i = 0; i < inputLength; ++i) { - const T value = input.e(i); if(value < secondEdge) { - - PRAGMA_OMP_CRITICAL - { - output.p(0, output.e(0) + 1); - } + output.p(0, output.e(0) + 1); } else if(value >= lastButOneEdge) { - PRAGMA_OMP_CRITICAL - { - output.p(nbins - 1, output.e(nbins - 1) + 1); - } + output.p(nbins - 1, output.e(nbins - 1) + 1); } else { Nd4jLong currInd = static_cast((value - leftEdge) / binWidth); - - PRAGMA_OMP_CRITICAL - { - output.p(currInd, output.e(currInd) + 1); - } + output.p(currInd, output.e(currInd) + 1); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp index 002c68226..7be34e6ca 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { @@ -59,64 +60,71 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra const Nd4jLong imStride2 = imStride[2]; const Nd4jLong imStride3 = imStride[3]; - T *col, *im; - int imRow, imCol; if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + auto func = PRAGMA_THREADS_FOR_2D { + for (int b = start_x; b < stop_x; b++) { + for (int c = start_y; c < stop_y; c++) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + for (int colH = 0; colH < oH; ++colH) { + for (int colW = 0; colW < oW; ++colW) { - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + int imRow = (-pH + kRow * dH) + colH * sH; + int imCol = (-pW + kCol * dW) + colW * sW; - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; + auto col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * colStride4 + colW * colStride5; - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else { - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - *col = *im; + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) + *col = zeroPadVal; + else { + auto im = imBuff + b * imStride0 + c * imStride1 + imRow * 
imStride2 + imCol * imStride3; + *col = *im; + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(private(im, col, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + auto func = PRAGMA_THREADS_FOR_2D { + T *col, *im; + int imRow, imCol; - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; + for (int b = start_x; b < stop_x; b += inc_x) { + for (int colH = start_y; colH < stop_y; colH += inc_y) { + for (int colW = 0; colW < oW; ++colW) { + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; + imRow = (-pH + kRow * dH) + colH * sH; + imCol = (-pW + kCol * dW) + colW * sW; - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else { - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - *col = *im; + col = colBuff + b * colStride0 + c * colStride1 + kRow * colStride2 + kCol * colStride3 + colH * colStride4 + colW * colStride5; + + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) + *col = zeroPadVal; + else { + im = imBuff + b * imStride0 + c * imStride1 + imRow * imStride2 + imCol * imStride3; + *col = *im; + } } } } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index 2ac679fc5..11bc1ecaa 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -83,7 +84,7 @@ namespace helpers { return top + (bottom - top) * yVal; }; - PRAGMA_OMP_PARALLEL_FOR_SIMD + // FIXME: fix parallelism here for (Nd4jLong b = 0; b < batchSize; ++b) { for (Nd4jLong y = 0; y < outHeight; ++y) { const T *ys_input_lower_ptr = input_b_ptr + ys[y].bottomIndex * inRowSize; @@ -149,11 +150,13 @@ namespace helpers { int xsSize = xs.size(); // Scale x interpolation weights to avoid a multiplication during iteration. - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int i = 0; i < xsSize; ++i) { - xs[i].bottomIndex *= channels; - xs[i].topIndex *= channels; - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + xs[i].bottomIndex *= channels; + xs[i].topIndex *= channels; + } + }; + samediff::Threads::parallel_for(func, 0, xsSize); resizeImage(images, batchSize, inHeight, inWidth, outHeight, outWidth, channels, xs, ys, output); return ND4J_STATUS_OK; @@ -184,24 +187,22 @@ namespace helpers { double heightScale = center ? (inHeight - 1.) / double(outHeight - 1.0) : (inHeight / double(outHeight)); double widthScale = center ? (inWidth - 1.) / double(outWidth - 1.0) : (inWidth / double(outWidth)); - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (int b = 0; b < batchSize; ++b) { - for (int y = 0; y < outHeight; ++y) { - Nd4jLong inY = nd4j::math::nd4j_min( - (center) ? static_cast(nd4j::math::p_round(y * heightScale)) : static_cast(nd4j::math::p_floor( - y * heightScale)), inHeight - 1); - for (int x = 0; x < outWidth; ++x) { - Nd4jLong inX = nd4j::math::nd4j_min( - (center) ? 
static_cast(nd4j::math::p_round(x * widthScale)) : static_cast(nd4j::math::p_floor( - x * widthScale)), inWidth - 1); - for (Nd4jLong e = 0; e < channels; e++) - output->p(b, y, x, e, images->e(b, inY, inX, e)); -// std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0)); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto y = start_y; y < stop_y; y += inc_y) { + Nd4jLong inY = nd4j::math::nd4j_min((center) ? static_cast(nd4j::math::p_round(y * heightScale)) : static_cast(nd4j::math::p_floor(y * heightScale)), inHeight - 1); + + for (int x = 0; x < outWidth; ++x) { + Nd4jLong inX = nd4j::math::nd4j_min((center) ? static_cast(nd4j::math::p_round(x * widthScale)) : static_cast(nd4j::math::p_floor(x * widthScale)),inWidth - 1); + for (Nd4jLong e = 0; e < channels; e++) + output->p(b, y, x, e, images->e(b, inY, inX, e)); + } } } - } + }; + samediff::Threads::parallel_for(func, 0, batchSize, 1, 0, outHeight, 1); - return ND4J_STATUS_OK; + return Status::OK(); } void resizeImage(NDArray const *images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, @@ -263,67 +264,73 @@ namespace helpers { T heightScale = (cropHeight > 1) ? (y2 - y1) * (imageHeight - 1) / (cropHeight - 1) : T(0); T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int y = 0; y < cropHeight; ++y) { - const float inY = (cropHeight > 1) - ? 
y1 * (imageHeight - 1) + y * heightScale - : 0.5 * (y1 + y2) * (imageHeight - 1); - if (inY < 0 || inY > imageHeight - 1) { - for (int x = 0; x < cropWidth; ++x) { - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); - } - } - continue; - } - if (method == 0 /* bilinear */) { - const int topYIndex = nd4j::math::p_floor(inY); - const int bottomYIndex = nd4j::math::p_ceil(inY); - const float y_lerp = inY - topYIndex; + auto func = PRAGMA_THREADS_FOR { + for (int y = start; y < stop; y += increment) { + const float inY = (cropHeight > 1) + ? y1 * (imageHeight - 1) + y * heightScale + : 0.5 * (y1 + y2) * (imageHeight - 1); - for (int x = 0; x < cropWidth; ++x) { - const float in_x = (cropWidth > 1) - ? x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - if (in_x < 0 || in_x > imageWidth - 1) { + if (inY < 0 || inY > imageHeight - 1) { + for (int x = 0; x < cropWidth; ++x) { for (int d = 0; d < depth; ++d) { crops->p(b, y, x, d, extrapolationVal); } - continue; - } - int left_x_index = math::p_floor(in_x); - int right_x_index = math::p_ceil(in_x); - T x_lerp = in_x - left_x_index; - - for (int d = 0; d < depth; ++d) { - const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); - const float topRight(images->e(bIn, topYIndex, right_x_index, d)); - const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); - const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); - const float top = topLeft + (topRight - topLeft) * x_lerp; - const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; - crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } + continue; } - } else { // method is "nearest neighbor" - for (int x = 0; x < cropWidth; ++x) { - const float inX = (cropWidth > 1) - ? 
x1 * (imageWidth - 1) + x * widthScale - : 0.5 * (x1 + x2) * (imageWidth - 1); - if (inX < 0 || inX > imageWidth - 1) { - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, extrapolationVal); + if (method == 0 /* bilinear */) { + const int topYIndex = nd4j::math::p_floor(inY); + const int bottomYIndex = nd4j::math::p_ceil(inY); + const float y_lerp = inY - topYIndex; + + for (int x = 0; x < cropWidth; ++x) { + const float in_x = (cropWidth > 1) + ? x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (in_x < 0 || in_x > imageWidth - 1) { + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + int left_x_index = math::p_floor(in_x); + int right_x_index = math::p_ceil(in_x); + T x_lerp = in_x - left_x_index; + + for (int d = 0; d < depth; ++d) { + const float topLeft(images->e(bIn, topYIndex, left_x_index, d)); + const float topRight(images->e(bIn, topYIndex, right_x_index, d)); + const float bottomLeft(images->e(bIn, bottomYIndex, left_x_index, d)); + const float bottomRight(images->e(bIn, bottomYIndex, right_x_index, d)); + const float top = topLeft + (topRight - topLeft) * x_lerp; + const float bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; + crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } - continue; } - const int closestXIndex = roundf(inX); - const int closestYIndex = roundf(inY); - for (int d = 0; d < depth; ++d) { - crops->p(b, y, x, d, (F)images->e(bIn, closestYIndex, closestXIndex, d)); + } else { // method is "nearest neighbor" + for (int x = 0; x < cropWidth; ++x) { + const float inX = (cropWidth > 1) + ? 
x1 * (imageWidth - 1) + x * widthScale + : 0.5 * (x1 + x2) * (imageWidth - 1); + + if (inX < 0 || inX > imageWidth - 1) { + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, extrapolationVal); + } + continue; + } + const int closestXIndex = roundf(inX); + const int closestYIndex = roundf(inY); + for (int d = 0; d < depth; ++d) { + crops->p(b, y, x, d, (F) images->e(bIn, closestYIndex, closestXIndex, d)); + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, cropHeight); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp index f4fb98b2a..ab48ebb32 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp @@ -72,7 +72,8 @@ namespace helpers { for (int i = 0; i < numBoxes; ++i) { bool shouldSelect = numSelected < output->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(numSelected)) + + // FIXME: add parallelism here for (int j = numSelected - 1; j >= 0; --j) { if (shouldSelect) if (needToSuppressWithThreshold(*boxes, indices[i], indices[selectedIndices[j]], T(overlapThreshold))) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index def210457..4bc9d3304 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -144,14 +145,8 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector int span = (tads / num_threads) + 8; - PRAGMA_OMP_PARALLEL_THREADS(num_threads) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int end = span * (tid + 1); - if (end > tads) end = tads; - - for (int r = start; r < end; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { 
auto rX = const_cast(input)->bufferAsT() + tadOffsets[r]; auto rZ = output->bufferAsT() + zOfsets[r]; @@ -198,7 +193,9 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, tads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp index 09cb2df2e..62f8316ce 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index 0d0705104..c9b833cf5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,76 +61,80 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const T* x = inBuff + inTadOffsets[i]; - T* y = outBuff + outTadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const T *x = inBuff + inTadOffsets[i]; + T *y = outBuff + outTadOffsets[i]; - T prev = 0; + T prev = 0; - // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - for (uint s = begin; s < end; ++s) - prev = prev + x[s] * x[s]; - y[j] = prev; + // calculate squared sum of elements per each j-th element range 
[j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + for (uint s = begin; s < end; ++s) + prev = prev + x[s] * x[s]; + y[j] = prev; + } else if (begin == 0 && last <= tadLen) + y[j] = prev + x[end - 1] * x[end - 1]; + else if (begin > 0 && last <= tadLen) + y[j] = prev + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; + else if (begin > 0 && last > tadLen) + y[j] = prev - x[begin - 1] * x[begin - 1]; + else + y[j] = prev; + + if (j != 0) + prev = y[j]; + + y[j] = x[j] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); } - else if (begin == 0 && last <= tadLen) - y[j] = prev + x[end - 1] * x[end - 1]; - else if (begin > 0 && last <= tadLen) - y[j] = prev + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; - else if (begin > 0 && last > tadLen) - y[j] = prev - x[begin - 1] * x[begin - 1]; - else - y[j] = prev; + } + }; - if(j != 0) - prev = y[j]; - - y[j] = x[j] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); - } - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const T* x = inBuff + inTadOffsets[i]; - T* y = outBuff + outTadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = 0; i < numOfTads; ++i) { + const T *x = inBuff + inTadOffsets[i]; + T *y = outBuff + outTadOffsets[i]; - T prev = 0; + T prev = 0; - // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - for (uint s = begin; s < end; ++s) - prev = 
prev + x[s*inTadEws] * x[s*inTadEws]; - y[j*outTadEws] = prev; + // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + for (uint s = begin; s < end; ++s) + prev = prev + x[s * inTadEws] * x[s * inTadEws]; + y[j * outTadEws] = prev; + } else if (begin == 0 && last <= tadLen) + y[j * outTadEws] = prev + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws]; + else if (begin > 0 && last <= tadLen) + y[j * outTadEws] = prev + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws] - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else if (begin > 0 && last > tadLen) + y[j * outTadEws] = prev - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else + y[j * outTadEws] = prev; + + if (j != 0) + prev = y[j * outTadEws]; + + y[j * outTadEws] = x[j * inTadEws] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); } - else if (begin == 0 && last <= tadLen) - y[j*outTadEws] = prev + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws]; - else if (begin > 0 && last <= tadLen) - y[j*outTadEws] = prev + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else if (begin > 0 && last > tadLen) - y[j*outTadEws] = prev - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else - y[j*outTadEws] = prev; + } + }; - if(j != 0) - prev = y[j*outTadEws]; - - y[j*outTadEws] = x[j*inTadEws] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); - } - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } return Status::OK(); } @@ -173,141 +178,146 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { 
- const X* x = inBuff + inTadOffsets[i]; - Y* y = gradIBuff + gradITadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const X *x = inBuff + inTadOffsets[i]; + Y *y = gradIBuff + gradITadOffsets[i]; - // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - y[0] = 0; - for (uint s = begin; s < end; ++s) - y[0] = y[0] + x[s] * x[s]; + // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + y[0] = 0; + for (uint s = begin; s < end; ++s) + y[0] = y[0] + x[s] * x[s]; + } else if (begin == 0 && last <= tadLen) + y[j] = y[j - 1] + x[end - 1] * x[end - 1]; + else if (begin > 0 && last <= tadLen) + y[j] = y[j - 1] + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; + else if (begin > 0 && last > tadLen) + y[j] = y[j - 1] - x[begin - 1] * x[begin - 1]; + else + y[j] = y[j - 1]; } - else if (begin == 0 && last <= tadLen) - y[j] = y[j - 1] + x[end - 1] * x[end - 1]; - else if (begin > 0 && last <= tadLen) - y[j] = y[j - 1] + x[end - 1] * x[end - 1] - x[begin - 1] * x[begin - 1]; - else if (begin > 0 && last > tadLen) - y[j] = y[j - 1] - x[begin - 1] * x[begin - 1]; - else - y[j] = y[j - 1]; + + Y *factor = new Y[tadLen]; + + Y prev = 0; + // second loop calculates derivatives using information gained in first loop above + for (uint j = 0; j < tadLen; ++j) { + const uint begin = 
nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + Y init = tbias + talpha * y[j]; + + if (j == 0) { + for (uint s = begin; s < end; ++s) { + factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); + prev = prev + x[s] * factor[s]; + } + y[0] = prev; + } else if (begin == 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + y[j] = prev + x[end - 1] * factor[end - 1]; + } else if (begin > 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + y[j] = prev + x[end - 1] * factor[end - 1] - x[begin - 1] * factor[begin - 1]; + } else if (begin > 0 && last > tadLen) + y[j] = prev - x[begin - 1] * factor[begin - 1]; + else + y[j] = prev; + + if (j != 0) + prev = y[j]; + + y[j] = factor[j] * init - 2 * x[j] * coeff * prev; + } + + delete[]factor; } + }; - Y* factor = new Y[tadLen]; - - Y prev = 0; - // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - Y init = tbias + talpha * y[j]; - - if (j == 0) { - for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); - prev = prev + x[s] * factor[s]; - } - y[0] = prev; - } - else if(begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); - y[j] = prev + x[end - 1] * factor[end - 1]; - } - else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); - y[j] = prev + x[end - 1] * factor[end - 1] - x[begin - 1] * factor[begin - 1]; - } - else if (begin > 0 && last > tadLen) - y[j] = prev - x[begin - 1] * factor[begin - 1]; - else - y[j] = prev; - - if(j != 
0) - prev = y[j]; - - y[j] = factor[j] * init - 2 * x[j] * coeff * prev; - } - - delete []factor; - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - const X* x = inBuff + inTadOffsets[i]; - Y* y = gradIBuff + gradITadOffsets[i]; + auto func = PRAGMA_THREADS_FOR { + for (uint i = start; i < stop; i += increment) { + const X *x = inBuff + inTadOffsets[i]; + Y *y = gradIBuff + gradITadOffsets[i]; - // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] - // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - if (j == 0) { - y[0] = 0; - for (uint s = begin; s < end; ++s) - y[0] = y[0] + x[s*inTadEws] * x[s*inTadEws]; + // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] + // we store each squared sum in corresponding element of y array + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + if (j == 0) { + y[0] = 0; + for (uint s = begin; s < end; ++s) + y[0] = y[0] + x[s * inTadEws] * x[s * inTadEws]; + } else if (begin == 0 && last <= tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws]; + else if (begin > 0 && last <= tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] + x[(end - 1) * inTadEws] * x[(end - 1) * inTadEws] - + x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else if (begin > 0 && last > tadLen) + y[j * gradITadEws] = + y[(j - 1) * gradITadEws] - x[(begin - 1) * inTadEws] * x[(begin - 1) * inTadEws]; + else + y[j * gradITadEws] = y[(j - 1) * gradITadEws]; } - else if 
(begin == 0 && last <= tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws]; - else if (begin > 0 && last <= tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] + x[(end - 1)*inTadEws] * x[(end - 1)*inTadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else if (begin > 0 && last > tadLen) - y[j*gradITadEws] = y[(j - 1)*gradITadEws] - x[(begin - 1)*inTadEws] * x[(begin - 1)*inTadEws]; - else - y[j*gradITadEws] = y[(j - 1)*gradITadEws]; + + Y *factor = new Y[tadLen]; + + Y prev = 0; + // second loop calculates derivatives using information gained in first loop above + for (uint j = 0; j < tadLen; ++j) { + const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint last = depth + j + 1; + const uint end = nd4j::math::nd4j_min(last, tadLen); + + Y init = tbias + talpha * y[j * gradITadEws]; + + if (j == 0) { + for (uint s = begin; s < end; ++s) { + factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s * gradITadEws], -tbeta - 1); + prev = prev + x[s * inTadEws] * factor[s]; + } + y[0] = prev; + } else if (begin == 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + -tbeta - 1); + y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1]; + } else if (begin > 0 && last <= tadLen) { + factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + -tbeta - 1); + y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1] - + x[(begin - 1) * inTadEws] * factor[begin - 1]; + } else if (begin > 0 && last > tadLen) + y[j * gradITadEws] = prev - x[(begin - 1) * inTadEws] * factor[begin - 1]; + else + y[j * gradITadEws] = prev; + + if (j != 0) + prev = y[j * gradITadEws]; + + y[j * gradITadEws] = factor[j] * init - 2 * x[j * inTadEws] * coeff * prev; + } + + delete[]factor; } + }; - Y* factor = new Y[tadLen]; - - Y prev = 0; - // second loop calculates derivatives using information gained in first loop above - 
for (uint j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); - const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); - - Y init = tbias + talpha * y[j*gradITadEws]; - - if (j == 0) { - for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s*gradITadEws], -tbeta - 1); - prev = prev + x[s*inTadEws] * factor[s]; - } - y[0] = prev; - } - else if(begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1)*gradITadEws], -tbeta - 1); - y[j*gradITadEws] = prev + x[(end - 1)*inTadEws] * factor[end - 1]; - } - else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1)*gradITadEws], -tbeta - 1); - y[j*gradITadEws] = prev + x[(end - 1)*inTadEws] * factor[end - 1] - x[(begin - 1)*inTadEws] * factor[begin - 1]; - } - else if (begin > 0 && last > tadLen) - y[j*gradITadEws] = prev - x[(begin - 1)*inTadEws] * factor[begin - 1]; - else - y[j*gradITadEws] = prev; - - if(j != 0) - prev = y[j*gradITadEws]; - - y[j*gradITadEws] = factor[j] * init - 2 * x[j*inTadEws] * coeff * prev; - } - - delete []factor; - } + samediff::Threads::parallel_tad(func, 0, numOfTads); } gradI *= gradO; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 743aab40a..922fdc3a9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -34,6 +34,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -122,11 +123,14 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto cLast_ = cLast->bufferAsT(); auto h_ = h->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint e = 0; e < uLen; e++) { - c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); - h_[e] = nd4j::math::nd4j_tanh(c_[e]); - } + auto func = PRAGMA_THREADS_FOR { + for (uint 
e = start; e < stop; e += increment) { + c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); + h_[e] = nd4j::math::nd4j_tanh(c_[e]); + } + }; + + samediff::Threads::parallel_for(func, 0, uLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp index 9a2034fd0..25605d77e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp @@ -20,6 +20,7 @@ #include "ResultSet.h" #include +#include namespace nd4j { namespace ops { @@ -47,22 +48,22 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp const int xRank = input.rankOf(); const auto xLen = input.lengthOf(); - std::vector coords(xRank); // we use the same coordinates storage both for input and output since their ranks are the same + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (Nd4jLong i = 0; i < xLen; ++i) { + shape::index2coords(i, xShapeInfo, coords); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for (Nd4jLong i = 0; i < xLen; ++i) { + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords); - shape::index2coords(i, xShapeInfo, coords.data()); - - const auto xOffset = shape::getOffset(xShapeInfo, coords.data()); - const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords.data()); - - // condition to be on diagonal of innermost matrix - if(coords[xRank - 2] == coords[xRank - 1]) - z[zOffset] = y[shape::getOffset(yShapeInfo, coords.data())]; - else - z[zOffset] = zeroPad ? static_cast(0) : x[xOffset]; - } + // condition to be on diagonal of innermost matrix + if (coords[xRank - 2] == coords[xRank - 1]) + z[zOffset] = y[shape::getOffset(yShapeInfo, coords)]; + else + z[zOffset] = zeroPad ? 
static_cast(0) : x[xOffset]; + } + }; + samediff::Threads::parallel_for(func, 0, xLen); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index c26637bd8..e0e487e82 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -21,6 +21,7 @@ #include "ResultSet.h" #include #include +#include namespace nd4j { namespace ops { @@ -43,10 +44,14 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { int lastDimension = nd4j::math::nd4j_min(input->sizeAt(-2), input->sizeAt(-1)); // TODO: tune this properlys int lO = listOut->size(); - PRAGMA_OMP_PARALLEL_FOR_IF(lO > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < lO; ++i) - for(int j = 0; j < lastDimension; ++j) - listOut->at(i)->p(j, listDiag->at(i)->e(j, j)); + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + for (int j = 0; j < lastDimension; ++j) + listOut->at(i)->p(j, listDiag->at(i)->e(j, j)); + }; + + samediff::Threads::parallel_tad(func, 0, lO); delete listOut; delete listDiag; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index daffa8f17..8c5332be6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -53,11 +54,14 @@ namespace helpers { std::unique_ptr rows(sortedVals.allTensorsAlongDimension(lastDims)); Nd4jLong oL = output->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < oL; e++) { - auto row = rows->at(e); - output->p(e, row->e(n)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { 
+ auto row = rows->at(e); + output->p(e, row->e(n)); + } + }; + + samediff::Threads::parallel_for(func, 0, oL); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index a83518899..3e18d6d14 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -20,6 +20,7 @@ #include #include +#include #include "../one_hot.h" namespace nd4j { @@ -47,41 +48,47 @@ namespace nd4j { Z one = static_cast(on); if (tadEws >= 1) { - PRAGMA_OMP_PARALLEL_FOR - for (unsigned int e = 0; e < numTads; e++) { - auto cO = output + tadPack.primaryOffsets()[e]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = 0; e < stop; e += increment) { + auto cO = output + tadPack.primaryOffsets()[e]; - auto idx = static_cast(indices[e]); - if (idx < 0 || idx >= tLen) { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[t * tadEws] = zero; - } - } else { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[t * tadEws] = idx == t ? one : zero; + auto idx = static_cast(indices[e]); + if (idx < 0 || idx >= tLen) { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[t * tadEws] = zero; + } + } else { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[t * tadEws] = idx == t ? 
one : zero; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } else { - PRAGMA_OMP_PARALLEL_FOR - for (unsigned int e = 0; e < numTads; e++) { - auto cO = output + tadPack.primaryOffsets()[e]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto cO = output + tadPack.primaryOffsets()[e]; - auto idx = static_cast(indices[e]); - if (idx < 0 || idx >= tLen) { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; - } - } else { - PRAGMA_OMP_SIMD - for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; + auto idx = static_cast(indices[e]); + if (idx < 0 || idx >= tLen) { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; + } + } else { + PRAGMA_OMP_SIMD + for (unsigned int t = 0; t < tLen; t++) { + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; + } } } - } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp index 6ebbb784b..5c1f3c28d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp @@ -66,7 +66,7 @@ static void _percentile(const NDArray& input, NDArray& output, std::vector& position = len - position - 1; // FIXME: our sort impl should be used instead, so this operation might be implemented as generic - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(flattenedArr)) + // FIXME: parallelism ! 
for(int i=0; isize(); ++i) { T* buff = reinterpret_cast(flattenedArr.getBuffer()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index 6290de6ad..cb97ffe1e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -39,7 +40,6 @@ static FORCEINLINE T getFactorial(const int n) { T result = (T)1.f; - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(prodT : result) for(int i = 2; i <= n; ++i) result *= i; @@ -74,9 +74,12 @@ static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const ND NDArray& result = output; int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < x.lengthOf(); ++i) - result.p(i, polyGammaScalar(context, n.e(i), x.e(i))); + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + result.p(i, polyGammaScalar(context, n.e(i), x.e(i))); + }; + samediff::Threads::parallel_for(func, 0, x.lengthOf()); // return result; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index 33ba9575d..bb0e7e24e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -20,6 +20,7 @@ #include +#include namespace nd4j { namespace ops { @@ -37,10 +38,11 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto auto s = start.e(0); auto d = delta.e(0); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(Nd4jLong i = 0; i < len; ++i) - buff[i] = s + i * d; - + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + buff[i] = s + i * d; + }; + samediff::Threads::parallel_for(func, 0, len); } void range(nd4j::LaunchContext * context, const 
NDArray& start, const NDArray& delta, NDArray& outVector) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 83deeca88..9f424606d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { @@ -52,36 +53,36 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * // two step phase here if (inArr == outArr) { if (inEWS == 1) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - auto idx = sLength - e; - swap(inArr, e, idx); -// T tmp = inArr[e]; -// inArr[e] = inArr[idx]; -// inArr[idx] = tmp; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto idx = sLength - e; + swap(inArr, e, idx); + } + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } else if (inEWS > 1) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - auto idx1 = (sLength - e) * inEWS; - Nd4jLong idx2 = e * inEWS; -// T tmp = inArr[idx2]; -// inArr[idx2] = inArr[idx1]; -// inArr[idx1] = tmp; - swap(inArr, idx1, idx2); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto idx1 = (sLength - e) * inEWS; + Nd4jLong idx2 = e * inEWS; + swap(inArr, idx1, idx2); + } + }; + + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); + swap(outArr, inOffset, outOffset); + } + }; - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = 
shape::getIndexOffset(sLength - e, inShapeBuffer); - //outArr[outOffset] = inArr[inOffset]; - swap(outArr, inOffset, outOffset); - } + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse / 2); } } else { @@ -91,47 +92,57 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[sLength - e] = inArr[e]; + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong e = start; e < stop; e += increment) + outArr[sLength - e] = inArr[e]; + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) - outArr[e] = inArr[e]; + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[e] = inArr[e]; + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) - outArr[e * outEWS] = inArr[e * inEWS]; + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + outArr[e * outEWS] = inArr[e * inEWS]; + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } else { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); - 
outArr[outOffset] = inArr[inOffset]; - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; + } + }; + samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer); - auto outOffset = shape::getIndexOffset(e, outShapeBuffer); - outArr[outOffset] = inArr[inOffset]; - } + auto f2 = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; + } + }; + samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); } } } @@ -140,7 +151,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * /////////////////////////////////////////////////////////////////// template -static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ +static void reverseSequence_(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ int posOfNonUnityDim = -1; if(input->isVector() || shape::isLikeVector(input->getShapeInfo(), posOfNonUnityDim)) { @@ -184,7 +195,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input } void reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { - BUILD_SINGLE_SELECTOR(input->dataType(), _reverseSequence, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); + 
BUILD_SINGLE_SELECTOR(input->dataType(), reverseSequence_, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); } ////////////////////////////////////////////////////////////////////////// @@ -208,7 +219,7 @@ void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* outpu delete listIn; } -BUILD_SINGLE_TEMPLATE(template void _reverseSequence, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void reverseSequence_, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); BUILD_SINGLE_TEMPLATE(template void reverseArray, (nd4j::LaunchContext * context, void *inArr, Nd4jLong *inShapeBuffer, void *outArr, Nd4jLong *outShapeBuffer, int numOfElemsToReverse), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 5b4c44874..5422d04c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -20,6 +20,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -53,21 +54,22 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop const uint iC = xShapeInfo[4]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint h = cropBottom; h < iH - cropTop; ++h) { - for (uint w = cropLeft; w < iW - cropRight; ++w) { - for (uint c = 0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_3D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint h = start_y; h < stop_y; h += inc_y) { + for (uint w = start_z; w < stop_z; w += inc_z) { + for (uint c = 0; c < iC; ++c) { + const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; + 
const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; - const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; - - const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; - - z[zOffset] = x[xOffset]; + z[zOffset] = x[xOffset]; + } } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, cropBottom, iH - cropTop, 1, cropLeft, iW - cropRight, 1); } BUILD_SINGLE_TEMPLATE(template void batchToSpace_, (const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight), LIBND4J_TYPES); @@ -109,23 +111,24 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& const int rank = input.rankOf(); const Nd4jLong zLen = output.lengthOf(); - std::vector coords(rank); - // loop through input array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { - for (Nd4jLong i = 0; i < zLen; ++i) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + // evaluate spatial coordinates for x + for (uint j = 1; j <= numOfSpatialDims; ++j) + coords[j] += crop.e(j - 1, 0); // add crop left - // evaluate spatial coordinates for x - for(uint j = 1; j <= numOfSpatialDims; ++j) - coords[j] += crop.e(j - 1, 0); // add crop left + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + } + }; - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void 
batchToSpaceND_, (const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); @@ -212,24 +215,26 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB const uint iC = zShapeInfo[4]; // loop through output array - PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(collapse(4)) - for (uint b = 0; b < bS; ++b) { - for (uint h = 0; h < oH; ++h) { - for (uint w = 0; w < oW; ++w) { - for (uint c = 0; c < iC; ++c) { + auto func = PRAGMA_THREADS_FOR_2D { + for (uint b = start_x; b < stop_x; b += inc_x) { + for (uint h = start_y; h < stop_y; h += inc_y) { + for (uint w = 0; w < oW; ++w) { + for (uint c = 0; c < iC; ++c) { - const Nd4jLong zOffset = b * zShapeInfo[5] + h * zShapeInfo[6] + w * zShapeInfo[7] + c * zShapeInfo[8]; + const Nd4jLong zOffset = b * zShapeInfo[5] + h * zShapeInfo[6] + w * zShapeInfo[7] + c * zShapeInfo[8]; - if(h >= padBottom && h < oH - padTop && w >= padLeft && w < oW - padRight) { - const Nd4jLong xOffset = b * xShapeInfo[5] + (h - padBottom) * xShapeInfo[6] + (w - padLeft) * xShapeInfo[7] + c * xShapeInfo[8]; - z[zOffset] = x[xOffset]; + if (h >= padBottom && h < oH - padTop && w >= padLeft && w < oW - padRight) { + const Nd4jLong xOffset = b * xShapeInfo[5] + (h - padBottom) * xShapeInfo[6] + (w - padLeft) * xShapeInfo[7] + c * xShapeInfo[8]; + z[zOffset] = x[xOffset]; + } else + z[zOffset] = 0.f; } - else - z[zOffset] = 0.f; } } } - } + }; + + samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } BUILD_SINGLE_TEMPLATE(template void spaceToBatch_, (const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight), LIBND4J_TYPES); @@ -292,36 +297,37 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra const int rank = input.rankOf(); const Nd4jLong zLen = output.lengthOf(); - std::vector coords(rank); - // loop through output array - PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) 
firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + bool within = true; - bool within = true; + for (uint j = 1; j <= numOfSpatialDims; ++j) { - for(uint j = 1; j <= numOfSpatialDims; ++j) { + const auto padLeft = padding.e(j - 1, 0); + const auto padRight = padding.e(j - 1, 1); - const auto padLeft = padding.e(j - 1, 0); - const auto padRight = padding.e(j - 1, 1); + within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight); - within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight); + if (!within) + break; - if(!within) - break; + coords[j] -= padLeft; // get coordinates for x + } - coords[j] -= padLeft; // get coordinates for x + if (within) + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + else + z[zOffset] = 0.f; } + }; - if(within) - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - else - z[zOffset] = 0.f; - } + samediff::Threads::parallel_tad(func, 0, zLen); } BUILD_SINGLE_TEMPLATE(template void spaceToBatchND_, (const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index af9a74b68..fd285ed9c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -46,47 +47,53 @@ namespace helpers { if (isNHWC) { const int total_count = batch_size * input_height * input_width * input_depth; 
- PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int inp_idx = 0; inp_idx < total_count; inp_idx++){ - // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) - const int d = inp_idx % input_depth; - const int inp_idx2 = inp_idx / input_depth; - const int w = inp_idx2 % input_width; - const int inp_idx3 = inp_idx2 / input_width; - const int h = inp_idx3 % input_height; - const int b = inp_idx3 / input_height; + auto func = PRAGMA_THREADS_FOR { + for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) + const int d = inp_idx % input_depth; + const int inp_idx2 = inp_idx / input_depth; + const int w = inp_idx2 % input_width; + const int inp_idx3 = inp_idx2 / input_width; + const int h = inp_idx3 % input_height; + const int b = inp_idx3 / input_height; - const int out_h = h / block_size; - const int offset_h = h % block_size; - const int out_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * input_depth; - const int out_d = d + offset_d; - - const int out_idx = out_d + output_depth * (out_w + output_width * (out_h + output_height * b)); - *(output_ptr + out_idx) = *(input_ptr + inp_idx); - } + const int out_h = h / block_size; + const int offset_h = h % block_size; + const int out_w = w / block_size; + const int offset_w = w % block_size; + const int offset_d = (offset_h * block_size + offset_w) * input_depth; + const int out_d = d + offset_d; + + const int out_idx = out_d + output_depth * (out_w + output_width * (out_h + output_height * b)); + *(output_ptr + out_idx) = *(input_ptr + inp_idx); + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } else { const int total_count = batch_size * output_depth_by_output_area; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int inp_idx = 0; inp_idx < total_count; inp_idx++) { - const int n_iC_oY_bY_oX = inp_idx / block_size; - const int bX = inp_idx - 
n_iC_oY_bY_oX * block_size; + auto func = PRAGMA_THREADS_FOR { + for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + const int n_iC_oY_bY_oX = inp_idx / block_size; + const int bX = inp_idx - n_iC_oY_bY_oX * block_size; - const int n_iC_oY_bY = n_iC_oY_bY_oX / output_width; - const int oX = n_iC_oY_bY_oX - n_iC_oY_bY * output_width; + const int n_iC_oY_bY = n_iC_oY_bY_oX / output_width; + const int oX = n_iC_oY_bY_oX - n_iC_oY_bY * output_width; - const int n_iC_oY = n_iC_oY_bY / block_size; - const int bY = n_iC_oY_bY - n_iC_oY * block_size; + const int n_iC_oY = n_iC_oY_bY / block_size; + const int bY = n_iC_oY_bY - n_iC_oY * block_size; - const int n = n_iC_oY / input_depth_by_output_height; - const int iC_oY = n_iC_oY - n * input_depth_by_output_height; + const int n = n_iC_oY / input_depth_by_output_height; + const int iC_oY = n_iC_oY - n * input_depth_by_output_height; - const int output_idx = oX + (((n * block_size + bY) * block_size + bX) * input_depth_by_output_height + iC_oY) * output_width; - - *(output_ptr + output_idx) = *(input_ptr + inp_idx); - } + const int output_idx = oX + (((n * block_size + bY) * block_size + bX) * input_depth_by_output_height + iC_oY) * output_width; + + *(output_ptr + output_idx) = *(input_ptr + inp_idx); + } + }; + + samediff::Threads::parallel_for(func, 0, total_count); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index 0b16ac989..99605e7cc 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -34,16 +35,16 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind const Nd4jLong indLen = indices.lengthOf(); if(outRank == 1) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + Nd4jLong idx = 
indices.e(i); + NDArray out = output({idx, idx + 1}); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + out.applyPairwiseTransform(op, updates.e(i), nullptr); + } + }; - Nd4jLong idx = indices.e(i); - NDArray out = output({idx, idx+1}); - - out.applyPairwiseTransform(op, updates.e(i), nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } else { // outRank > 1 @@ -54,17 +55,16 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) std::vector dimsToExcludeUpd(sizeOfDims); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); - shape::printIntArray(dimsToExcludeUpd.data(),dimsToExcludeUpd.size()); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray outSubArr = output(indices.e(i), std::vector({0})); + NDArray updSubArr = updates(i, dimsToExcludeUpd); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) // causes known openMP asan bug ! -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); + } + }; - NDArray outSubArr = output(indices.e(i), std::vector({0})); - NDArray updSubArr = updates(i, dimsToExcludeUpd); - - outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 
1 : nd4j::Environment::getInstance()->maxThreads()); } } @@ -77,40 +77,41 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i const Nd4jLong indLastDim = indices.sizeAt(-1); if(outRank == 1) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + Nd4jLong idx = indices.e(i); + NDArray out = output({idx, idx + 1}); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen > Environment::getInstance()->elementwiseThreshold()) schedule(guided)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided)) - for(Nd4jLong i = 0; i < indLen; ++i) { + out.applyPairwiseTransform(op, updates.e(i), nullptr); + } + }; - Nd4jLong idx = indices.e(i); - NDArray out = output({idx, idx+1}); - - out.applyPairwiseTransform(op, updates.e(i), nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } else { - std::vector dimsToExcludeInd = ShapeUtils::evalDimsToExclude(indRank, {indRank-1}); std::vector dimsToExcludeUpd(indRank - 1); std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); - std::vector idxRangeOut(2*outRank, 0); -// PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(indLen/indLastDim > Environment::getInstance()->elementwiseThreshold()) schedule(guided) firstprivate(idxRangeOut)) -PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(!lock) schedule(guided) firstprivate(idxRangeOut)) - for(Nd4jLong i = 0; i < indLen/indLastDim; ++i) { + auto func = PRAGMA_THREADS_FOR { + std::vector idxRangeOut(2*outRank, 0); - NDArray indSubArr = indices(i, dimsToExcludeInd); + for (auto i = start; i < stop; i += increment) { + NDArray indSubArr = indices(i, dimsToExcludeInd); - for(Nd4jLong j = 0; j < indLastDim; ++j) { - idxRangeOut[2*j] = indSubArr.e(j); - idxRangeOut[2*j + 1] = idxRangeOut[2*j] + 1; + for (Nd4jLong j = 0; j < indLastDim; ++j) { + idxRangeOut[2 * j] = indSubArr.e(j); + idxRangeOut[2 * j + 1] = idxRangeOut[2 * j] + 1; + } + + NDArray outSubArr = output(idxRangeOut); 
+ NDArray updSubArr = updates(i, dimsToExcludeUpd); + + outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); } + }; - NDArray outSubArr = output(idxRangeOut); - NDArray updSubArr = updates(i, dimsToExcludeUpd); - - outSubArr.applyPairwiseTransform(op, updSubArr, nullptr); - } + samediff::Threads::parallel_tad(func, 0, indLen / indLastDim, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); } } @@ -125,20 +126,24 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(updates.rankOf(), {-1}); if(!calcGrad) { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) - for(Nd4jLong i = 0; i < indicesLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto subArr = updates(i, dimsToExclude); + output.p(i, subArr.e(indices.e(i))); + } + }; - auto subArr = updates(i, dimsToExclude); - output.p(i, subArr.e(indices.e(i))); - } + samediff::Threads::parallel_for(func, 0, indicesLen); } else { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided)) - for(Nd4jLong i = 0; i < indicesLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto subArr = updates(i, dimsToExclude); + auto ind = indices.e(i); + subArr.p(ind, subArr.e(ind) - 1.); + } + }; - auto subArr = updates(i, dimsToExclude); - auto ind = indices.e(i); - subArr.p(ind, subArr.e(ind) - 1.); - } + samediff::Threads::parallel_for(func, 0, indicesLen); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index e13cfb177..2884107f3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -21,6 +21,9 @@ #include #include +#include +#include + namespace nd4j { namespace ops { namespace helpers { @@ -167,10 +170,13 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if 
(indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < meanT->lengthOf(); e++) { - meanV->p(e, meanV->e(e) + listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + meanV->p(e, meanV->e(e) + listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, meanT->lengthOf()); + count++; } else { @@ -221,10 +227,12 @@ namespace helpers { for (int i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < sumT->lengthOf(); e++) { - sumT->p(e, sumT->e(e) + listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + sumT->p(e, sumT->e(e) + listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, sumT->lengthOf()); } else { idx = indices->e(i); @@ -270,10 +278,12 @@ namespace helpers { sumT->assign(listOfTensors->at(0)); for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < sumT->lengthOf(); e++) { - sumT->p(e, sumT->e(e) * listOfTensors->at(i)->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + sumT->p(e, sumT->e(e) * listOfTensors->at(i)->e(e)); + } + }; + samediff::Threads::parallel_for(func, 0, sumT->lengthOf()); } else { idx = indices->e(i); @@ -463,7 +473,8 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); int loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(+:sumValue) + + // FIXME: parallelism here? for (size_t idx = 1; idx < loop_size; ++idx) { sumValue += input->e(fi->second.at(idx)); } @@ -477,11 +488,12 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); + // FIXME: parallelism here? 
for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors->at(fi->first); outputT->assign(listOfTensors->at(fi->second.at(0))); Nd4jLong loopSize = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR + for (Nd4jLong idx = 1; idx < loopSize; ++idx) { auto current = listOfTensors->at(fi->second.at(idx)); *outputT += *current; @@ -501,7 +513,8 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); Nd4jLong loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR_REDUCTION(+:sumValue) + + // FIXME: parallelism here? for (Nd4jLong idx = 1; idx < loop_size; ++idx) { sumValue += input->e(fi->second.at(idx)); } @@ -518,7 +531,8 @@ namespace helpers { auto outputT = listOfOutTensors->at(fi->first); outputT->assign(listOfTensors->at(fi->second.at(0))); Nd4jLong loop_size = fi->second.size(); - PRAGMA_OMP_PARALLEL_FOR + + // FIXME: parallelism here? for (Nd4jLong idx = 1; idx < loop_size; ++idx) { auto current = listOfTensors->at(fi->second.at(idx)); *(outputT) += *current; @@ -619,12 +633,15 @@ namespace helpers { segmentMaxFunctor_(input, indices, tempRes); if (input->isVector()) { Nd4jLong loop_size = input->lengthOf(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < loop_size; ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) <= T(1.e-6)) - output->p(e, gradOut->e(classNum)); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) <= T(1.e-6)) + output->p(e, gradOut->e(classNum)); + } + }; + samediff::Threads::parallel_for(func, 0, loop_size); } else { std::vector restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -637,18 +654,21 @@ namespace helpers { //int numOfClasses = tempRes->sizeAt(0); // number of classes //std::vector> outputs(numOfClasses); - PRAGMA_OMP_PARALLEL_FOR 
- for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - for (Nd4jLong e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) - currentOut->p(e, currentGradOut->e(e)); + for (uint64_t e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) + currentOut->p(e, currentGradOut->e(e)); + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -664,12 +684,14 @@ namespace helpers { std::unique_ptr tempRes(gradOut->dup()); segmentMinFunctor(context, input, indices, tempRes.get()); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) < 1.e-5) - output->p(e, gradOut->e(classNum)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->e(classNum) - input->e(e)) < 1.e-5) + output->p(e, gradOut->e(classNum)); + } + }; + samediff::Threads::parallel_for(func, 0, input->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -684,17 +706,22 @@ namespace helpers { output->assign(0.); int pos = 0; - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = 
listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) < 1.e-5) - currentOut->p(e, currentGradOut->e(e)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + + for (int e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->e(e) - current->e(e)) < + 1.e-5) + currentOut->p(e, currentGradOut->e(e)); + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices->lengthOf()); } return ND4J_STATUS_OK; } @@ -730,17 +757,20 @@ namespace helpers { //std::vector> outputs(numOfClasses); int pos = 0; - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + //auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { - currentOut->p(e, currentGradOut->e(e) / classCount[classNum]); + for (int e = 0; e < current->lengthOf(); e++) { + currentOut->p(e, currentGradOut->e(e) / classCount.at(classNum)); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } return ND4J_STATUS_OK; } @@ -762,16 +792,20 @@ namespace helpers { std::unique_ptr 
listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - currentOut->assign(currentGradOut); - } + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + + currentOut->assign(currentGradOut); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } int segmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { @@ -794,16 +828,19 @@ namespace helpers { //int numOfClasses = tempRes->sizeAt(0); // number of classes //std::vector> outputs(numOfClasses); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - NDArray* currentFFOut = listOfBPTensors->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + auto currentFFOut = listOfBPTensors->at(classNum); - currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); - } + currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); + } + //}; + + //samediff::Threads::parallel_for(func, 0, 
indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -861,12 +898,15 @@ namespace helpers { unsortedSegmentMinFunctor(context, input, indices, numOfClasses, tempRes); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes->t(classNum) - input->t(e)) < 1.e-6) - output->t(e) = gradOut->t(classNum); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + if (nd4j::math::nd4j_abs(tempRes->t(classNum) - input->t(e)) < 1.e-6) + output->t(e) = gradOut->t(classNum); + } + }; + + samediff::Threads::parallel_for(func, 0, input->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -876,21 +916,21 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - //int numOfClasses = tempRes->sizeAt(0); // number of classes - //std::vector> outputs(numOfClasses); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - - for (int e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->t(e) - current->t(e)) < 1.e-6) - currentOut->t(e) = currentGradOut->t(e); + for (int e = 0; e < current->lengthOf(); e++) { + if (nd4j::math::nd4j_abs(listOfBPTensors->at(classNum)->t(e) - current->t(e)) < 1.e-6) + currentOut->t(e) = 
currentGradOut->t(e); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; return ND4J_STATUS_OK; @@ -955,17 +995,19 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - //NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - currentOut->assign(currentGradOut); - } + currentOut->assign(currentGradOut); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } int unsortedSegmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { @@ -973,11 +1015,14 @@ namespace helpers { unsortedSegmentProdFunctor(context, input, indices, numOfClasses, tempRes); if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - output->p(e, gradOut->e(classNum) * tempRes->e(classNum)/ input->e(e)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto classNum = indices->e(e); + output->p(e, gradOut->e(classNum) * tempRes->e(classNum) / input->e(e)); + } + }; + + samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -987,19 +1032,22 @@ namespace helpers { std::unique_ptr 
listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - auto currentFFOut = listOfBPTensors->at(classNum); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); + auto currentFFOut = listOfBPTensors->at(classNum); - currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); - } + currentOut->assign((*currentFFOut) * (*currentGradOut) / (*current)); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } delete tempRes; - return ND4J_STATUS_OK; + return Status::OK(); } // template @@ -1016,11 +1064,14 @@ namespace helpers { // if input is a vector: (as if in doc sample) if (input->isVector()) { - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) { - Nd4jLong classNum = indices->e(e); - output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); - } + //auto func = PRAGMA_THREADS_FOR { + for (auto e = 0; e < indices->lengthOf(); e++) { + auto classNum = indices->e(e); + output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); + } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { auto restDims = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); @@ -1029,22 +1080,22 @@ namespace helpers { std::unique_ptr listOfTensors(input->allTensorsAlongDimension(restDims)); std::unique_ptr listOfOutTensors(output->allTensorsAlongDimension(restDims)); - //int numOfClasses = tempRes->sizeAt(0); // 
number of classes - //std::vector> outputs(numOfClasses); + //auto func = PRAGMA_THREADS_FOR { + for (auto i = 0; i < indices->lengthOf(); i++) { + auto classNum = indices->e(i); + auto current = listOfTensors->at(i); + auto currentOut = listOfOutTensors->at(i); + auto currentGradOut = listOfGradOuts->at(classNum); - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < indices->lengthOf(); i++) { - Nd4jLong classNum = indices->e(i); - NDArray* current = listOfTensors->at(i); - NDArray* currentOut = listOfOutTensors->at(i); - NDArray* currentGradOut = listOfGradOuts->at(classNum); - - for (int e = 0; e < current->lengthOf(); e++) { - currentOut->p(e, currentGradOut->e(e) / nd4j::math::nd4j_sqrt(classCount[classNum])); + for (int e = 0; e < current->lengthOf(); e++) { + currentOut->p(e, currentGradOut->e(e) / nd4j::math::nd4j_sqrt(classCount[classNum])); + } } - } + //}; + + //samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } - return ND4J_STATUS_OK; + return Status::OK(); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp index 03f61d453..bf3463afe 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -26,11 +27,14 @@ namespace helpers { template static void sequenceMask_(NDArray* input, NDArray* output, int maxIndex) { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (Nd4jLong i = 0; i < maxIndex; i++) - for(Nd4jLong k = 0; k < input->lengthOf(); k++) - if (i < input->t(k)) - output->t(k * maxIndex + i) = B(true); //, T(1.0f)); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto i = start_x; i < stop_x; i += inc_x) + for (auto k = start_y; k < stop_y; k += inc_y) + if (i < input->t(k)) + output->t(k * maxIndex + i) = B(true); //, T(1.0f)); + }; + + samediff::Threads::parallel_for(func, 0, maxIndex, 1, 0, 
input->lengthOf(), 1); } void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index b0fd449c7..59c257c28 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -20,6 +20,7 @@ #include #include +#include #define HS_MAX_EXP 6.0f @@ -350,8 +351,6 @@ namespace nd4j { const auto negTable = reinterpret_cast(vnegTable); const auto infVector = reinterpret_cast(vinfVector); - T sneu1e[600]; - //const auto numThreads = omp_get_max_threads(); const auto idxShift = indices.isEmpty() ? 0 : indices.sizeAt(1); const auto hsRounds = codes.isEmpty() ? 0 : codes.sizeAt(1); @@ -362,64 +361,71 @@ namespace nd4j { auto bIndices = indices.bufferAsT(); auto bCodes = codes.bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(numThreads) private(sneu1e)) - for (int t = 0; t < numTargets; t++) { - T* neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; - memset(neu1e, 0, vectorLength * sizeof(T)); + auto func = PRAGMA_THREADS_FOR { + T sneu1e[600]; - auto target = bTarget[t]; - auto alpha = lr.e(t); - unsigned long long randomValue = nextRandom.e(t); + for (auto t = start; t < stop; t += increment) { + T *neu1e = vectorLength <= 600 ? 
sneu1e : new T[vectorLength]; + memset(neu1e, 0, vectorLength * sizeof(T)); - auto syn0row = reinterpret_cast(s0.bufferWithOffset(target * vectorLength)); + auto target = bTarget[t]; + auto alpha = lr.e(t); + unsigned long long randomValue = nextRandom.e(t); - if (hsRounds > 0) { - int irow = 0; - auto cShift = t * idxShift; + auto syn0row = reinterpret_cast(s0.bufferWithOffset(target * vectorLength)); - for (int e = 0; e < hsRounds; e++) { - irow = bIndices[e + cShift]; - if (irow < 0 || irow >= vocabSize) - continue; + if (hsRounds > 0) { + int irow = 0; + auto cShift = t * idxShift; - auto syn1row = s1.bufferWithOffset(irow * vectorLength); - auto code = bCodes[e + cShift]; + for (int e = 0; e < hsRounds; e++) { + irow = bIndices[e + cShift]; + if (irow < 0 || irow >= vocabSize) + continue; + + auto syn1row = s1.bufferWithOffset(irow * vectorLength); + auto code = bCodes[e + cShift]; //nd4j_printf("syn0: [%i]; syn1: [%i]; code: [%i]\n", target, irow, code); - hSoftmax_(syn0row, syn1row, expTable, neu1e, alpha, vectorLength, code, expLength, false); - } - } - - - if (nsRounds > 0) { - int irow = negStarters.e(t); - int nsStarter = irow; - for (int r = 0; r < nsRounds + 1; r++) { - if (r == 0) { - // target is known in advance - } else { - randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); - irow = idx >= negLength ? -1 : static_cast(negTable[idx]); - - if (irow < 0 || irow >= vocabSize) - irow = randomValue % (vocabSize - 1) + 1; - - if (irow == nsStarter) - continue; + hSoftmax_(syn0row, syn1row, expTable, neu1e, alpha, vectorLength, code, + expLength, false); } - - nSampling_(syn0row, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 
1 : 0, expLength, infVector != nullptr); } + + + if (nsRounds > 0) { + int irow = negStarters.e(t); + int nsStarter = irow; + for (int r = 0; r < nsRounds + 1; r++) { + if (r == 0) { + // target is known in advance + } else { + randomValue = randomValue * (unsigned long long) 25214903917 + 11; + auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + + if (irow < 0 || irow >= vocabSize) + irow = randomValue % (vocabSize - 1) + 1; + + if (irow == nsStarter) + continue; + } + + nSampling_(syn0row, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } + } + + for (int e = 0; e < vectorLength; e++) + syn0row[e] += neu1e[e]; + + // optionally release temp arrays + if (vectorLength > 600) + delete[] neu1e; } + }; - for (int e = 0; e < vectorLength; e++) - syn0row[e] += neu1e[e]; - - // optionally release temp arrays - if (vectorLength > 600) - delete[] neu1e; - } + samediff::Threads::parallel_tad(func, 0, numTargets, 1, numThreads); } BUILD_SINGLE_TEMPLATE(template void skipgramBatchExec_, (NDArray &s0, NDArray &s1, NDArray &s1n, void *vexpTable, void *vnegTable, void *vinfVector, NDArray &targets, NDArray &negStarters, NDArray &indices, NDArray &codes, NDArray &lr, NDArray &nextRandom, const int nsRounds, const int vocabSize, const int vectorLength, const int expLength, const int negLength, const bool preciseMode, const int numThreads), FLOAT_TYPES); @@ -434,9 +440,6 @@ namespace nd4j { const auto negTable = reinterpret_cast(vnegTable); const auto infVector = reinterpret_cast(vinfVector); - T sneu1[600]; - T sneu1e[600]; - //const auto numThreads = omp_get_max_threads(); const auto idxShift = indices.isEmpty() ? 0 : indices.sizeAt(1); const auto hsRounds = codes.isEmpty() ? 
0 : codes.sizeAt(1); @@ -450,122 +453,131 @@ namespace nd4j { const auto bStarters = negStarters.bufferAsT(); const auto numIndices = indices.isEmpty() ? 0 : indices.sizeAt(1); - PRAGMA_OMP_PARALLEL_FOR_ARGS(num_threads(numThreads) private(sneu1, sneu1e)) - for (int e = 0; e < numTargets; e++){ - T* neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; - T* neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; + auto func = PRAGMA_THREADS_FOR { + T sneu1[600]; + T sneu1e[600]; - // optionally we nullify temp arrays after successful (and on first) cycle - memset(neu1, 0, sizeof(T) * vectorLength); - memset(neu1e, 0, sizeof(T) * vectorLength); + for (int e = start; e < stop; e += increment) { + T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; + T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; - auto alpha = lr.e(e); - auto numLabels = nLabels.isEmpty() ? 0 : nLabels.e(e); + // optionally we nullify temp arrays after successful (and on first) cycle + memset(neu1, 0, sizeof(T) * vectorLength); + memset(neu1e, 0, sizeof(T) * vectorLength); - int actualContext = 0; + auto alpha = lr.e(e); + auto numLabels = nLabels.isEmpty() ? 
0 : nLabels.e(e); - // building neu1 for current window - for (int c = 0; c < contextWidth; c++) { - // getting next context word - auto cContext = bContext[c + (e * contextWidth)]; + int actualContext = 0; - // skipping padded values - if (cContext < 0) - continue; + // building neu1 for current window + for (int c = 0; c < contextWidth; c++) { + // getting next context word + auto cContext = bContext[c + (e * contextWidth)]; - if (cContext >= vocabSize) - throw std::runtime_error("ContextID can't be >= vocab size"); - - T *syn0word = syn0 + (cContext * vectorLength); - - for (int i = 0; i < vectorLength; i++) - neu1[i] += syn0word[i]; - - actualContext++; - } - - if (infVector != nullptr) - actualContext++; - - if (actualContext > 1) { - for (int i = 0; i < vectorLength; i++) - neu1[i] /= actualContext; - } - - // hierarchic softmax step - if (!indices.isEmpty()) { - for (int i = 0; i < numIndices; i++) { - const int cIndex = bIndices[(e * numIndices) + i]; - const int cCode = bCodes[(e * numIndices) + i]; - - // we're skipping padded values - if (cIndex < 0) + // skipping padded values + if (cContext < 0) continue; - if (cIndex >= vocabSize) - throw std::runtime_error("Index can't be > vocab size"); + if (cContext >= vocabSize) + throw std::runtime_error("ContextID can't be >= vocab size"); - hSoftmax_(neu1, syn1 + (cIndex * vectorLength), expTable, neu1e, alpha, vectorLength, cCode, expLength, false); + T *syn0word = syn0 + (cContext * vectorLength); + + for (int i = 0; i < vectorLength; i++) + neu1[i] += syn0word[i]; + + actualContext++; } - } - // negative sampling step - if (!negStarters.isEmpty() && nsRounds > 0) { - int irow = bStarters[e]; - const int nsStarter = irow; - unsigned long long randomValue = nextRandom.e(e); + if (infVector != nullptr) + actualContext++; - for (int r = 0; r < nsRounds + 1; r++) { - // we're skipping rng on 0 step - if (r != 0) { - randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = 
nd4j::math::nd4j_abs((randomValue >> 16) % negLength); - irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + if (actualContext > 1) { + for (int i = 0; i < vectorLength; i++) + neu1[i] /= actualContext; + } - if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; - if (irow == nsStarter) + // hierarchic softmax step + if (!indices.isEmpty()) { + for (int i = 0; i < numIndices; i++) { + const int cIndex = bIndices[(e * numIndices) + i]; + const int cCode = bCodes[(e * numIndices) + i]; + + // we're skipping padded values + if (cIndex < 0) continue; - nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); - } else { - nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); - } + if (cIndex >= vocabSize) + throw std::runtime_error("Index can't be > vocab size"); - //nd4j_printf("Thread <%i>: syn0: [%i]; s1n: [%i];\n", omp_get_thread_num(), 0, irow); + hSoftmax_(neu1, syn1 + (cIndex * vectorLength), expTable, neu1e, alpha, vectorLength, + cCode, expLength, false); + } + } + + // negative sampling step + if (!negStarters.isEmpty() && nsRounds > 0) { + int irow = bStarters[e]; + const int nsStarter = irow; + unsigned long long randomValue = nextRandom.e(e); + + for (int r = 0; r < nsRounds + 1; r++) { + // we're skipping rng on 0 step + if (r != 0) { + randomValue = randomValue * (unsigned long long) 25214903917 + 11; + auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + irow = idx >= negLength ? -1 : static_cast(negTable[idx]); + + if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; + if (irow == nsStarter) + continue; + + nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 
1 : 0, expLength, infVector != nullptr); + } else { + nSampling_(neu1, s1n.bufferWithOffset(irow * vectorLength), expTable, neu1e, + alpha, vectorLength, r == 0 ? 1 : 0, expLength, infVector != nullptr); + } + + //nd4j_printf("Thread <%i>: syn0: [%i]; s1n: [%i];\n", omp_get_thread_num(), 0, irow); + } + } + + + // if we're skipping labels + int starter = trainWords == 1 ? 0 : contextWidth - numLabels; + + // applying previously averaged results + for (int c = starter; c < contextWidth; c++) { + // getting context + auto cContext = bContext[c + (e * contextWidth)]; + auto cLock = bLocker[c + (e * contextWidth)]; + + // skipping padded values + if (cContext < 0 || cLock == 1) + continue; + + if (cContext >= vocabSize) + throw std::runtime_error("ContextID can't be > vocab size"); + + // one word from context + T *syn0word = syn0 + (cContext * vectorLength); + + for (int i = 0; i < vectorLength; i++) + syn0word[i] += neu1e[i]; + + } + + // optionally release temp arrays + if (vectorLength > 600) { + delete[] neu1; + delete[] neu1e; } } + }; - - // if we're skipping labels - int starter = trainWords == 1 ? 
0 : contextWidth - numLabels; - - // applying previously averaged results - for (int c = starter; c < contextWidth; c++) { - // getting context - auto cContext = bContext[c + (e * contextWidth)]; - auto cLock = bLocker[c + (e * contextWidth)]; - - // skipping padded values - if (cContext < 0 || cLock == 1) - continue; - - if (cContext >= vocabSize) - throw std::runtime_error("ContextID can't be > vocab size"); - - // one word from context - T *syn0word = syn0 + (cContext * vectorLength); - - for (int i = 0; i < vectorLength; i++) - syn0word[i] += neu1e[i]; - - } - - // optionally release temp arrays - if (vectorLength > 600) { - delete[] neu1; - delete[] neu1e; - } - } + samediff::Threads::parallel_tad(func, 0, numTargets, 1, numThreads); } BUILD_SINGLE_TEMPLATE(template void cbowBatchExec_, (NDArray &s0, NDArray &s1, NDArray &s1n, void *vexpTable, void *vnegTable, void *vinfVector, NDArray &context, NDArray &lockedWords, NDArray &targets, NDArray &negStarters, NDArray &indices, NDArray &codes, NDArray &lr, NDArray &nextRandom, NDArray &nLabels, const int nsRounds, const int vocabSize, const int vectorLength, const int expLength, const int negLength, const bool trainWords, const int numThreads), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index a80e65999..1fea14824 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -141,47 +142,49 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray T* pHt = ht->bufferAsT(); T* pCt = ct->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong col = 0; col < ncols; ++col) { + auto func = PRAGMA_THREADS_FOR { + for (auto col = start; col < stop; col += increment) { + const auto colNum = col % d2; + bool flip = colNum >= K; + T maskVal = mask ? 
*(pMask + col) : T(1); + T cur = *(pInit + col); + T bF = *(pBias + colNum); + T bR = *(pBias + colNum + d2); + T *pWiVal = pWi + 3 * col; + T *pIVal = pI + col; + T *pHtVal = pHt + col; + T *pCtVal = pCt + col; - const auto colNum = col % d2; - bool flip = colNum >= K; - T maskVal = mask ? *(pMask + col) : T(1); - T cur = *(pInit + col); - T bF = *(pBias + colNum); - T bR = *(pBias + colNum + d2); - T* pWiVal = pWi + 3*col; - T* pIVal = pI + col; - T* pHtVal = pHt + col; - T* pCtVal = pCt + col; + if (flip) { + const auto step = (time - 1) * ncols; + pIVal += step; + pHtVal += step; + pCtVal += step; + pWiVal += (time - 1) * ncolsWi; + } - if (flip) { - const auto step = (time - 1) * ncols; - pIVal += step; - pHtVal += step; - pCtVal += step; - pWiVal += (time - 1) * ncolsWi; + auto ncolsRev = flip ? -ncols : ncols; + auto ncolsWiRev = flip ? -ncolsWi : ncolsWi; + + for (Nd4jLong t = 0; t < time; ++t) { + // evaluate sigmoids + T ft = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[1] + bF))); + T rt = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[2] + bR))); + + cur = (cur - *pWiVal) * ft + *pWiVal; + *pCtVal = cur; + T val = nd4j::math::nd4j_tanh(cur); + *pHtVal = (val * maskVal - *pIVal) * rt + *pIVal; + + pIVal += ncolsRev; + pWiVal += ncolsWiRev; + pCtVal += ncolsRev; + pHtVal += ncolsRev; + } } + }; - auto ncolsRev = flip ? -ncols : ncols; - auto ncolsWiRev = flip ? -ncolsWi : ncolsWi; - - for (Nd4jLong t = 0; t < time; ++t) { - // evaluate sigmoids - T ft = (1.)/(1. + nd4j::math::nd4j_exp(-(pWiVal[1] + bF))); - T rt = (1.)/(1. 
+ nd4j::math::nd4j_exp(-(pWiVal[2] + bR))); - - cur = (cur - *pWiVal)*ft + *pWiVal; - *pCtVal = cur; - T val = nd4j::math::nd4j_tanh(cur); - *pHtVal = (val*maskVal - *pIVal)*rt + *pIVal; - - pIVal += ncolsRev; - pWiVal += ncolsWiRev; - pCtVal += ncolsRev; - pHtVal += ncolsRev; - } - } + samediff::Threads::parallel_tad(func, 0, ncols); } ////////////////////////////////////////////////////////////////////////// @@ -232,72 +235,75 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr T* pGradBias = gradBias.bufferAsT(); T* pGradInit = gradC0->bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong col = 0; col < ncols; ++col) { - T gbF = 0.f; - T gbR = 0.f; - const auto colNum = col % d2; - const bool flip = colNum >= K; - T maskVal = mask ? *(pMask + col) : T(1.); - T cur = *(pInGradCt + col); - T bF = *(pBias + colNum); - T bR = *(pBias + colNum + d2); - T* pWiVal = pWi + 3*col; - T* pInputVal = pInput + col; - T* pStateVal = pState + col; - T* pInGradHtVal = pInGradHt + col; - T* pGradWiVal = pGradWi + 3*col; - T* pGradInputVal = pGradInput + col; + auto func = PRAGMA_THREADS_FOR { + for (auto col = start; col < stop; col += increment) { + T gbF = 0.f; + T gbR = 0.f; + const auto colNum = col % d2; + const bool flip = colNum >= K; + T maskVal = mask ? 
*(pMask + col) : T(1.); + T cur = *(pInGradCt + col); + T bF = *(pBias + colNum); + T bR = *(pBias + colNum + d2); + T *pWiVal = pWi + 3 * col; + T *pInputVal = pInput + col; + T *pStateVal = pState + col; + T *pInGradHtVal = pInGradHt + col; + T *pGradWiVal = pGradWi + 3 * col; + T *pGradInputVal = pGradInput + col; - if (!flip) { - const auto stepI = (time - 1) * ncols; - const auto stepW = (time - 1) * ncolsWi; - pInputVal += stepI; - pStateVal += stepI; - pInGradHtVal += stepI; - pGradInputVal += stepI; - pWiVal += stepW; - pGradWiVal += stepW; + if (!flip) { + const auto stepI = (time - 1) * ncols; + const auto stepW = (time - 1) * ncolsWi; + pInputVal += stepI; + pStateVal += stepI; + pInGradHtVal += stepI; + pGradInputVal += stepI; + pWiVal += stepW; + pGradWiVal += stepW; + } + + Nd4jLong ncolsRev = flip ? -ncols : ncols; + Nd4jLong ncolsWiRev = flip ? -ncolsWi : ncolsWi; + + for (Nd4jLong t = 0; t < time; ++t) { + // evaluate sigmoids + T ft = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 1) + bF))); + T rt = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 2) + bR))); + + T val = nd4j::math::nd4j_tanh(*pStateVal); + T prevVal = (t < time - 1) ? 
(*(pStateVal - ncolsRev)) : (*(pInit + col)); + // grad wrt input + *pGradInputVal = *pInGradHtVal - (*pInGradHtVal) * rt; + // grad wrt rt, wiR and bR + T grt = (*pInGradHtVal) * (val * maskVal - *pInputVal) * (rt - rt * rt); + *(pGradWiVal + 2) = grt; + gbR += grt; + // grad wrt state + T gradSateVal = (*pInGradHtVal) * maskVal * (rt - rt * val * val) + cur; + // grad wrt wi0 + *pGradWiVal = gradSateVal - gradSateVal * ft; + // grad wrt ft, wi1, and bF + T gft = gradSateVal * (prevVal - *pWiVal) * (ft - ft * ft); + *(pGradWiVal + 1) = gft; + gbF += gft; + // grad wrt c_previous + cur = gradSateVal * ft; + + pInputVal -= ncolsRev; + pWiVal -= ncolsWiRev; + pStateVal -= ncolsRev; + pGradWiVal -= ncolsWiRev; + pGradInputVal -= ncolsRev; + pInGradHtVal -= ncolsRev; + } + *(pGradBias + col) = gbF; + *(pGradBias + col + ncols) = gbR; + *(pGradInit + col) = cur; } + }; - Nd4jLong ncolsRev = flip ? -ncols : ncols; - Nd4jLong ncolsWiRev = flip ? -ncolsWi : ncolsWi; - - for (Nd4jLong t = 0; t < time; ++t) { - // evaluate sigmoids - T ft = ((T)1.)/((T)1. + nd4j::math::nd4j_exp(-(*(pWiVal + 1) + bF))); - T rt = ((T)1.)/((T)1. + nd4j::math::nd4j_exp(-(*(pWiVal + 2) + bR))); - - T val = nd4j::math::nd4j_tanh(*pStateVal); - T prevVal = (t < time-1) ? 
(*(pStateVal - ncolsRev)) : (*(pInit + col)); - // grad wrt input - *pGradInputVal = *pInGradHtVal - (*pInGradHtVal)*rt ; - // grad wrt rt, wiR and bR - T grt = (*pInGradHtVal) * (val*maskVal - *pInputVal) * (rt - rt*rt); - *(pGradWiVal + 2) = grt; - gbR += grt; - // grad wrt state - T gradSateVal = (*pInGradHtVal) * maskVal * (rt - rt*val*val) + cur; - // grad wrt wi0 - *pGradWiVal = gradSateVal - gradSateVal*ft; - // grad wrt ft, wi1, and bF - T gft = gradSateVal * (prevVal - *pWiVal) * (ft - ft*ft); - *(pGradWiVal + 1) = gft; - gbF += gft; - // grad wrt c_previous - cur = gradSateVal * ft; - - pInputVal -= ncolsRev; - pWiVal -= ncolsWiRev; - pStateVal -= ncolsRev; - pGradWiVal -= ncolsWiRev; - pGradInputVal -= ncolsRev; - pInGradHtVal -= ncolsRev; - } - *(pGradBias + col) = gbF; - *(pGradBias + col + ncols) = gbR; - *(pGradInit + col) = cur; - } + samediff::Threads::parallel_tad(func, 0, ncols); // gradB gradBias.reduceAlongDimension(reduce::Sum, gradB, {0}); // [4*K] diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index 55de117a5..b974a236b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { @@ -35,9 +36,12 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c if(inArrs[0]->rankOf() == 0) { int inSize = inArrs.size(); - PRAGMA_OMP_PARALLEL_FOR_IF(inSize > Environment::getInstance()->tadThreshold()) - for(int i=0; i < inSize; ++i) - outArr->p(i, inArrs[i]->t(0)); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + outArr->p(i, inArrs[i]->t(0)); + }; + + samediff::Threads::parallel_for(func, 0, inSize); } else { @@ -45,9 +49,11 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c auto list = outArr->allTensorsAlongDimension(dimsToExclude); // list.size() == block.width() int listSize = 
list->size(); - PRAGMA_OMP_PARALLEL_FOR_IF(listSize > Environment::getInstance()->tadThreshold()) - for(int i=0; iat(i)->assign(inArrs[i]); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + list->at(i)->assign(inArrs[i]); + }; + samediff::Threads::parallel_tad(func, 0, listSize); delete list; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index f05647589..e38232928 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -148,19 +149,21 @@ namespace helpers { int status = topKFunctor(context, input, values, indices.get(), k, true); result->assign(0); if (status == ND4J_STATUS_OK) { - bool condition = target->lengthOf() > Environment::getInstance()->tadThreshold(); - PRAGMA_OMP_PARALLEL_FOR_IF(condition) - for (int e = 0; e < target->lengthOf(); e++) { - bool found = false; - for (int j = 0; j < k; j++) { - if (target->e(e) == indices->e(e * k + j)) { - found = true; - break; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + bool found = false; + for (int j = 0; j < k; j++) { + if (target->e(e) == indices->e(e * k + j)) { + found = true; + break; + } } + if (found) + result->p(e, true); } - if (found) - result->p(e, true); - } + }; + + samediff::Threads::parallel_tad(func, 0, target->lengthOf()); } return status; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 9e04ed4df..ea2fb348a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -42,11 +42,13 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N const_cast(input).fillAsTriangular(0, diagonal, dOdI.sizeAt(-1), 
'b', &dOdI); int dLen = dOdI.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(dLen > Environment::getInstance()->elementwiseThreshold()) - for(Nd4jLong i = 0; i < dLen; ++i) { - if(dOdI.t(i) != static_cast(0.f)) - dOdI.t(i) = static_cast(1.f); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + if (dOdI.t(i) != static_cast(0.f)) + dOdI.t(i) = static_cast(1.f); + } + }; + samediff::Threads::parallel_for(func, 0, dLen); // FIXME: !!! gradI.assign(dOdI * gradO); // chain rule: dLoss/dI = dO/dI * dLoss/dO @@ -59,14 +61,14 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N ////////////////////////////////////////////////////////////////////////// template static void trace_(const NDArray& input, NDArray& output) { - const int inRank = input.rankOf(); - auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); - PRAGMA_OMP_PARALLEL_FOR_IF(setOfSubArrs->size() > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < setOfSubArrs->size(); ++i) - output.p(i, setOfSubArrs->at(i)->getTrace()); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + output.p(i, setOfSubArrs->at(i)->getTrace()); + }; + samediff::Threads::parallel_for(func, 0, setOfSubArrs->size()); delete setOfSubArrs; } @@ -107,7 +109,8 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::graph::RandomGenerato std::vector indices(firstDim); std::iota(indices.begin(), indices.end(), 0); output.p(Nd4jLong(0), input.e(0)); - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + + // FIXME: parallelism!! 
for(int i = firstDim-1; i > 0; --i) { int r = rng.relativeInt(i) % i; output.t(i) = input.t(indices[r]); @@ -184,54 +187,61 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray const auto zLen = output.lengthOf(); - std::vector coords(rank); // we use the same coordinates storage both for input and output since their ranks are the same - if(mode == 0) { // CONSTANT case const T padVal = padValue.e(0); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for(uint i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + bool within = true; + for (int j = rankMinusOne; j >= 0; --j) { + if (xShape[j] == zShape[j]) continue; + const auto left = paddings.e(j, 0); + if (coords[j] < left || coords[j] >= left + xShape[j]) { + within = false; + break; + } + else { coords[j] = coords[j] - left; } + } - bool within = true; - for(int j = rankMinusOne; j >= 0; --j) { - if(xShape[j] == zShape[j]) continue; - const auto left = paddings.e(j, 0); - if(coords[j] < left || coords[j] >= left + xShape[j]) {within = false; break;} - else {coords[j] = coords[j] - left;} + if (within) + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)]; + else + z[zOffset] = padVal; } + }; - if(within) - z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; - else - z[zOffset] = padVal; - } + samediff::Threads::parallel_tad(func, 0, zLen); } else { // REFLECT and SYMMETRIC cases const Nd4jLong shift1 = mode == 1 ? 0 : 1; // REFLECT : SYMMETRIC const Nd4jLong shift2 = mode == 1 ? 
2 : 1; // REFLECT : SYMMETRIC - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) - for(uint i = 0; i < zLen; ++i) { + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK]; + for (auto i = start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); - shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); + for (int j = rankMinusOne; j >= 0; --j) { - for(int j = rankMinusOne; j >= 0; --j) { + if (xShape[j] == zShape[j]) continue; + coords[j] = coords[j] - paddings.e(j, 0); // are ready to fill middle (within input dimension range) + if (coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left + else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right + } - if(xShape[j] == zShape[j]) continue; - coords[j] = coords[j] - paddings.e(j, 0); // are ready to fill middle (within input dimension range) - if(coords[j] < 0) coords[j] = -coords[j] - shift1; // means fill from left - else if(coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right + const auto xOffset = shape::getOffset(input.getShapeInfo(), coords); + z[zOffset] = x[xOffset]; } + }; - const auto xOffset = shape::getOffset(input.getShapeInfo(), coords.data()); - z[zOffset] = x[xOffset]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } } @@ -558,50 +568,49 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { const int yLastDim = indices.sizeAt(-1); - std::vector coords(maxRank); + auto func = PRAGMA_THREADS_FOR { + Nd4jLong coords[MAX_RANK * 3]; + for (auto i = start; i < stop; i += increment) { + Nd4jLong *zCoordStart, *xCoordStart; - PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) - for (Nd4jLong i = 0; i < zLen; 
++i) { + if (yLastDim == xRank) { + zCoordStart = coords; + xCoordStart = coords; + } else if (zRank >= xRank) { + zCoordStart = coords; + xCoordStart = coords + zRank - xRank; + } else { + zCoordStart = coords + xRank - zRank; + xCoordStart = coords; + } - Nd4jLong *zCoordStart, *xCoordStart; + shape::index2coords(i, output.getShapeInfo(), zCoordStart); - if(yLastDim == xRank) { - zCoordStart = coords.data(); - xCoordStart = coords.data(); - } - else if(zRank >= xRank) { - zCoordStart = coords.data(); - xCoordStart = coords.data() + zRank - xRank; - } - else { - zCoordStart = coords.data() + xRank - zRank; - xCoordStart = coords.data(); + const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); + + // last y coordinate + uint coordToRestore; + if (yLastDim != xRank) + coordToRestore = static_cast(zCoordStart[yRank - 1]); + + zCoordStart[yRank - 1] = 0; + const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); + + //restore z coordinate + if (yLastDim != xRank) + zCoordStart[yRank - 1] = coordToRestore; + + // construct coordinates for x + for (uint j = 0; j < yLastDim; ++j) + xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride + + const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); + + z[zOffset] = x[xOffset]; } + }; - shape::index2coords(i, output.getShapeInfo(), zCoordStart); - - const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); - - // last y coordinate - uint coordToRestore; - if(yLastDim != xRank) - coordToRestore = static_cast(zCoordStart[yRank - 1]); - - zCoordStart[yRank - 1] = 0; - const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); - - //restore z coordinate - if(yLastDim != xRank) - zCoordStart[yRank - 1] = coordToRestore; - - // construct coordinates for x - for(uint j = 0; j < yLastDim; ++j) - xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride - - const auto xOffset = 
shape::getOffset(input.getShapeInfo(), xCoordStart); - - z[zOffset] = x[xOffset]; - } + samediff::Threads::parallel_tad(func, 0, zLen); } //////////////////////////////////////////////////////////////////////// @@ -644,21 +653,28 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con } else if (input->rankOf() == 1 && indices->isVector()) { // special case - PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->tadThreshold()) - for (int e = 0; e < indices->lengthOf(); e++) - output->p(e, input->e(indices->e(e))); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) + output->p(e, input->e(indices->e(e))); + }; + + samediff::Threads::parallel_for(func, 0, indices->lengthOf()); } else { std::vector dimsOut(indices->rankOf()); std::iota(dimsOut.begin(), dimsOut.end(), axis); // fill with axis, axis+1, ... indices->rankOf()-1 const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, dimsOut); - NDArray subArrIn = (*input)(indices->e(i), {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, dimsOut); + NDArray subArrIn = (*input)(indices->e(i), {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } else { @@ -673,12 +689,16 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con } else { // vector case const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); - PRAGMA_OMP_PARALLEL_FOR_IF(numOfSubArrs > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < numOfSubArrs; ++i) { - NDArray subArrOut = (*output)(i, {axis}); - NDArray subArrIn = 
(*input)(intArgs[i+1], {axis}); - subArrOut.assign(subArrIn); - } + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + NDArray subArrOut = (*output)(i, {axis}); + NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); + subArrOut.assign(subArrIn); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfSubArrs); } } } @@ -693,9 +713,12 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { const int rank = output.rankOf(); auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); - PRAGMA_OMP_PARALLEL_FOR_IF(arrs->size() > Environment::getInstance()->tadThreshold()) - for(int i = 0; i < arrs->size(); ++i) - arrs->at(i)->setIdentity(); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + arrs->at(i)->setIdentity(); + }; + + samediff::Threads::parallel_tad(func, 0, arrs->size()); delete arrs; } @@ -719,41 +742,43 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat for (; e < intArgs->size(); e++) indices.push_back((*intArgs)[e]); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong i = 0; i < indices.size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inSubArr = input(indices[i], dimsToExclude, true); + auto updSubArr = updates(i, dimsToExclude, true); - auto inSubArr = input(indices[i], dimsToExclude, true); - auto updSubArr = updates(i, dimsToExclude, true); - - if (inSubArr.lengthOf() != updSubArr.lengthOf()) - continue; - - switch (opCode) { - case 0: - inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); - break; - case 1: - inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); - break; - case 2: - inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); - break; - case 3: - inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); - break; - case 4: - 
inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); - break; - case 5: - inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); - break; - case 6: - inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); - break; - default: + if (inSubArr.lengthOf() != updSubArr.lengthOf()) continue; + + switch (opCode) { + case 0: + inSubArr.applyPairwiseTransform(pairwise::Add, &updSubArr, &inSubArr, nullptr); + break; + case 1: + inSubArr.applyPairwiseTransform(pairwise::Subtract, &updSubArr, &inSubArr, nullptr); + break; + case 2: + inSubArr.applyPairwiseTransform(pairwise::Multiply, &updSubArr, &inSubArr, nullptr); + break; + case 3: + inSubArr.applyPairwiseTransform(pairwise::Divide, &updSubArr, &inSubArr, nullptr); + break; + case 4: + inSubArr.applyPairwiseTransform(pairwise::ReverseSubtract, &updSubArr, &inSubArr, nullptr); + break; + case 5: + inSubArr.applyPairwiseTransform(pairwise::ReverseDivide, &updSubArr, &inSubArr, nullptr); + break; + case 6: + inSubArr.applyPairwiseTransform(pairwise::CopyPws, &updSubArr, &inSubArr, nullptr); + break; + default: + continue; + } } - } + }; + + samediff::Threads::parallel_tad(func, 0, indices.size()); } @@ -766,11 +791,14 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input switch (opId) { case 6: { // copy - PRAGMA_OMP_PARALLEL_FOR_IF(len > Environment::getInstance()->elementwiseThreshold()) - for(uint i = 0; i < len; ++i) { - auto inSubArr = input(i, dimensions); - inSubArr.p(indices.t(i), updates.e(i)); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inSubArr = input(i, dimensions); + inSubArr.p(indices.t(i), updates.e(i)); + } + }; + + samediff::Threads::parallel_for(func, 0, len); } break; @@ -786,70 +814,79 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - 
PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T max = -DataTypeUtils::max(); - Nd4jLong idx = 0; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T max = -DataTypeUtils::max(); + Nd4jLong idx = 0; - for (int i = 0; i < numArgs; i++){ - - T v = inArrs[i]->e(e); - if (v > max) { - max = v; - idx = i; + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + if (v > max) { + max = v; + idx = i; + } } + output.p(e, idx); } - output.p(e, idx); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES); } - void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// template static void mergeMax_(const std::vector& inArrs, NDArray& output) { - const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T max = -DataTypeUtils::max(); - for (int i = 0; i < numArgs; i++) { - T v = inArrs[i]->e(e); - if (v > max) - max = v; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T max = -DataTypeUtils::max(); + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + if (v > max) + max = v; + } + output.p(e, max); } - output.p(e, max); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + 
BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (inArrs, output), LIBND4J_TYPES); } - void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// template static void mergeAvg_(const std::vector& inArrs, NDArray& output) { - const Nd4jLong numArgs = inArrs.size(); const T factor = 1.f / numArgs; auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { - T sum = 0.; - for (int i = 0; i < numArgs; i++) { - T v = inArrs[i]->e(e); - sum += v; + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T sum = 0.; + for (int i = 0; i < numArgs; i++) { + T v = inArrs[i]->e(e); + sum += v; + } + output.p(e, sum * factor); } - output.p(e, sum * factor); - } + }; + + samediff::Threads::parallel_for(func, 0, x->lengthOf()); +} + +void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (inArrs, output), LIBND4J_TYPES); } - void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { - BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (inArrs, output), LIBND4J_TYPES); - } ////////////////////////////////////////////////////////////////////////// @@ -859,16 +896,17 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { const Nd4jLong numArgs = inArrs.size(); auto x = inArrs[0]; - PRAGMA_OMP_PARALLEL_FOR_IF(x->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (Nd4jLong e = 0; e < x->lengthOf(); e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + T sum = (T) 0.f; + for (int i = 0; i < numArgs; i++) + sum += inArrs[i]->e(e); - T sum = (T) 0.f; + 
output.p(e, sum); + } + }; - for (int i = 0; i < numArgs; i++) - sum += inArrs[i]->e(e); - - output.p(e, sum); - } + samediff::Threads::parallel_for(func, 0, x->lengthOf()); } void mergeAdd(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (inArrs, output), LIBND4J_TYPES); @@ -895,14 +933,15 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < listOfInSubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + const T iNormActual = norm2.e(i); + if (iNormActual > normClip) + *listOfInSubArrs->at(i) *= normClip / iNormActual; + } + }; + samediff::Threads::parallel_tad(func, 0, listOfInSubArrs->size()); - const T iNormActual = norm2.e(i); - - if (iNormActual > normClip) - *listOfInSubArrs->at(i) *= normClip / iNormActual; - } delete listOfInSubArrs; } } @@ -920,18 +959,19 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < listOfInSubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto inputSubArr = listOfInSubArrs->at(i); + auto outputSubArr = listOfOutSubArrs->at(i); + outputSubArr->assign(inputSubArr); - auto inputSubArr = listOfInSubArrs->at(i); - auto outputSubArr = listOfOutSubArrs->at(i); - outputSubArr->assign(inputSubArr); + const T iNormActual = norm2.e(i); - const T iNormActual = norm2.e(i); - - if (iNormActual > clipNorm.e(0)) - *outputSubArr *= clipNorm / iNormActual; - } + if (iNormActual > clipNorm.e(0)) + *outputSubArr *= clipNorm / iNormActual; + } + }; + samediff::Threads::parallel_tad(func, 0, 
listOfInSubArrs->size()); delete listOfInSubArrs; delete listOfOutSubArrs; @@ -1028,31 +1068,29 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g auto cn = clipNorm.e(0); - PRAGMA_OMP_PARALLEL_FOR - for(Nd4jLong i = 0; i < gradISubArrs->size(); ++i) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + T N = norm2.e(i); - T N = norm2.e(i); + auto gradOSubArr = gradOSubArrs->at(i); + auto gradISubArr = gradISubArrs->at(i); - auto gradOSubArr = gradOSubArrs->at(i); - auto gradISubArr = gradISubArrs->at(i); + if (N > cn) { + auto inputSubArr = inputSubArrs->at(i); + const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e(0); // reduce to scalar + const T factor1 = static_cast(1.f) / N; + const T factor3 = factor1 / (N * N); // 1 / (N*N*N) - if (N > cn) { + auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) { + return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd); + }; - auto inputSubArr = inputSubArrs->at(i); - - const T sumOfProd = (*inputSubArr * *gradOSubArr).reduceNumber(reduce::Sum).e(0); // reduce to scalar - const T factor1 = static_cast(1.f) / N; - const T factor3 = factor1 / (N * N) ; // 1 / (N*N*N) - - auto lambda = LAMBDA_TT(elem1, elem2, cn, sumOfProd, factor1, factor3) { - return cn * (factor1 * elem2 - factor3 * elem1 * sumOfProd); - }; - - inputSubArr->applyPairwiseLambda(gradOSubArr, lambda, gradISubArr); + inputSubArr->applyPairwiseLambda(gradOSubArr, lambda, gradISubArr); + } else + gradISubArr->assign(gradOSubArr); } - else - gradISubArr->assign(gradOSubArr); - } + }; + samediff::Threads::parallel_tad(func, 0, gradISubArrs->size()); delete gradISubArrs; delete gradOSubArrs; @@ -1165,34 +1203,35 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o } else { - std::vector inIdx(rank), outIdx(rank); + auto func = PRAGMA_THREADS_FOR { + Nd4jLong inIdx[MAX_RANK]; + Nd4jLong outIdx[MAX_RANK]; + for (auto i = 
start; i < stop; i += increment) { + shape::index2coords(i, output.getShapeInfo(), outIdx); - PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(inIdx, outIdx)) - for(int i = 0; i < outLen; ++i) { + for (int j = 0; j < rank; ++j) { + const Nd4jLong inLen = input.sizeAt(j); + const auto leftSide = paddings.e(j, 0); + const auto leftSideCorrected = leftSide - reflBorder; + const Nd4jLong len = 2 * (inLen - 1) + leftSide + reflBorder; - shape::index2coords(i, output.getShapeInfo(), outIdx.data()); + if (outIdx[j] < leftSide) // left side + inIdx[j] = leftSideCorrected - outIdx[j]; - for(int j = 0; j < rank; ++j) { + else if (outIdx[j] >= leftSide && outIdx[j] < leftSide + inLen) // middle + inIdx[j] = outIdx[j] - leftSide; - const Nd4jLong inLen = input.sizeAt(j); - const auto leftSide = paddings.e(j, 0); - const auto leftSideCorrected = leftSide - reflBorder; - const Nd4jLong len = 2*(inLen-1) + leftSide + reflBorder; + else // right side + inIdx[j] = len - outIdx[j]; + } - if(outIdx[j] < leftSide) // left side - inIdx[j] = leftSideCorrected - outIdx[j]; - - else if(outIdx[j] >= leftSide && outIdx[j] < leftSide + inLen) // middle - inIdx[j] = outIdx[j] - leftSide; - - else // right side - inIdx[j] = len - outIdx[j]; + auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx); + auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx); + reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; } + }; - auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx.data()); - auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx.data()); - reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; - } + samediff::Threads::parallel_for(func, 0, outLen); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index a365d8135..5d4ed9f2e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -62,9 +63,12 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray //auto result = NDArray(&x, false, context); int xLen = x.lengthOf(); - PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) - z.p(i, zetaScalar(x.e(i), q.e(i))); + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) + z.p(i, zetaScalar(x.e(i), q.e(i))); + }; + + samediff::Threads::parallel_for(func, 0, xLen); } void zeta(nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z) { diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 27caedd0c..d087a4849 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -19,6 +19,7 @@ // #include +#include namespace nd4j { namespace ops { @@ -66,14 +67,17 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND int tads = tadsA->size(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < tads; e++) { - auto a_ = tadsA->at(e); - auto b_ = tadsB->at(e); - auto o_ = tadsO->at(e); + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + auto a_ = tadsA->at(e); + auto b_ = tadsB->at(e); + auto o_ = tadsO->at(e); - helpers::cross(context, a_, b_, o_); - } + helpers::cross(context, a_, b_, o_); + } + }; + + samediff::Threads::parallel_tad(func, 0, tads); delete tadsA; delete tadsB; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc deleted file mode 100644 index 63e406cc6..000000000 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc +++ /dev/null @@ -1,138 +0,0 @@ 
-/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by raver119 on 30.11.17. -// - -#include - -namespace nd4j { -namespace ops { -namespace helpers { - -// [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] -template -void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { - - auto imBuff = output.bufferAsT(); - auto colBuff = input.bufferAsT(); - auto imShapeBuffer = output.getShapeInfo(); - auto colShapeBuffer = input.getShapeInfo(); - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int kH = colShape[2]; - const int kW = colShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong 
imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - // initial zeroing of image content - const auto imEWS = shape::elementWiseStride(imShapeBuffer); - if(imEWS == 1) { - memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - } - else if (imEWS > 1) { -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS) - imBuff[i] = static_cast(0.f); - } - else { - const auto len = shape::length(imShapeBuffer); -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < len; i++) - imBuff[shape::getIndexOffset(i, imShapeBuffer)] = static_cast(0.f); - } - - T *col, *im; - int imRow, imCol; - - if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - else { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for 
(int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } -} - - -void col2im(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { - BUILD_SINGLE_SELECTOR(input.dataType(), col2im_, (context, input, output, sH, sW, pH, pW, iH, iW, dH, dW), LIBND4J_TYPES); -} - -BUILD_SINGLE_TEMPLATE(template void col2im_, (nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW), LIBND4J_TYPES); - -} -} -} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc deleted file mode 100644 index 67f5650bd..000000000 --- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cppc +++ /dev/null @@ -1,129 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author Yurii Shyrma (iuriish@yahoo.com), created on 19.09.2018 -// - -#include - - -namespace nd4j { -namespace ops { -namespace helpers { - -// input [bS, iC, iH, iW] is convoluted to output [bS, iC, kH, kW, oH, oW] -template -static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { - - auto imBuff = static_cast(input.getBuffer()); - auto colBuff = static_cast(output.getBuffer()); - auto imShapeBuffer = input.getShapeInfo(); - auto colShapeBuffer = output.getShapeInfo(); - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const T zeroPadVal = arrZeroPadVal.e(0); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int iH = imShape[2]; - const int iW = imShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - T *col, *im; - int imRow, imCol; - - if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - 
-PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - else { - -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } -} - - -void im2col(nd4j::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { - BUILD_SINGLE_SELECTOR(im.dataType(), im2col_, (context, im, col, kH, kW, sH, sW, pH, pW, dH, dW, arrZeroPadVal), LIBND4J_TYPES); -} - -BUILD_SINGLE_TEMPLATE(template void im2col_, (nd4j::LaunchContext & context, const 
NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal), LIBND4J_TYPES); - - -} -} -} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu index c2dd4919d..753c8ae64 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu @@ -19,6 +19,7 @@ // #include +#include #include #include diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu index 017180b38..3a09f9a80 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu @@ -20,6 +20,7 @@ #include #include +#include #include namespace nd4j { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu index 8db1f66d4..fa97a3de2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu @@ -21,6 +21,7 @@ #include #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu index 0a707ffb3..8a9986e23 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu @@ -644,7 +644,6 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr // apply Fisher-Yates shuffle if(isInplace) { - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->elementwiseThreshold()) for(int i = firstDim - 1; i > 0; --i) { int r = rng.relativeInt(i) % i; @@ -658,7 +657,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const 
NDArray& input, const NDAr std::vector indices(firstDim); std::iota(indices.begin(), indices.end(), 0); bool isZeroShuffled = false; - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + for(int i = firstDim - 1; i > 0; --i) { int r = rng.relativeInt(i) % i; subArrsListOut->at(i)->assign(subArrsListIn->at(indices[r])); diff --git a/libnd4j/include/ops/declarable/helpers/helpers.h b/libnd4j/include/ops/declarable/helpers/helpers.h index 0914d2d49..f2e19063e 100644 --- a/libnd4j/include/ops/declarable/helpers/helpers.h +++ b/libnd4j/include/ops/declarable/helpers/helpers.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #endif // CUDACC diff --git a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp index 47ca64d3b..4fb32e2f8 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 5a73e0a00..8ef63101e 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -20,6 +20,7 @@ #include #include +#include namespace nd4j { namespace ops { @@ -67,12 +68,14 @@ namespace helpers { } } - PRAGMA_OMP_PARALLEL_FOR_IF(values->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (int e = 0; e < values->lengthOf(); e++) { - values->p(e, static_cast(valuesVector[e])); - if (counts != nullptr) - counts->p(e, countsMap[valuesVector[e]]); - } + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + values->p(e, static_cast(valuesVector[e])); + if (counts != nullptr) + counts->p(e, countsMap[valuesVector[e]]); + } + }; + samediff::Threads::parallel_for(func, 0, 
values->lengthOf()); for (int e = 0; e < indices->lengthOf(); e++) { auto posI = std::find(valuesVector.begin(), valuesVector.end(), input->e(e)); diff --git a/libnd4j/include/ops/declarable/helpers/matmul.h b/libnd4j/include/ops/declarable/helpers/matmul.h index 8d253cabf..2e7cce13f 100644 --- a/libnd4j/include/ops/declarable/helpers/matmul.h +++ b/libnd4j/include/ops/declarable/helpers/matmul.h @@ -22,7 +22,6 @@ #define LIBND4J_HELPERS_MATMUL_H #include -#include namespace nd4j { namespace ops { diff --git a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp index 579fdf394..436cddda3 100644 --- a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp @@ -29,10 +29,6 @@ namespace nd4j { // } - BooleanOp::~BooleanOp() { - // - } - /** * Output shape of any BooleanOp is ALWAYS scalar */ diff --git a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp index 71c722bca..7d696c8ef 100644 --- a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp @@ -29,10 +29,6 @@ namespace nd4j { // } - BroadcastableOp::~BroadcastableOp() { - // no-op - } - ShapeList *BroadcastableOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { auto shapeList = SHAPELIST(); auto x = inputShape->at(0); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp index 691a3154d..1fd57c867 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp @@ -26,9 +26,5 @@ namespace nd4j { DeclarableCustomOp::DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs) : nd4j::ops::DeclarableOp(numInputs, numOutputs, opName, allowsInplace, tArgs, iArgs) { // 
} - - DeclarableCustomOp::~DeclarableCustomOp() { - // - } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp index 7cb28e76d..624d6dbef 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp @@ -26,10 +26,6 @@ namespace nd4j { namespace ops { - DeclarableListOp::~DeclarableListOp() { - // - } - DeclarableListOp::DeclarableListOp(int numInputs, int numOutputs, const char* opName, int tArgs, int iArgs) : DeclarableOp::DeclarableOp(numInputs, numOutputs, opName, false, tArgs, iArgs) { // This kind of operations work with sets: NDArrayList this->getOpDescriptor()->setInputType(InputType_NUMERIC_SET); diff --git a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp index ef3b04d30..98a60b28b 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -30,11 +31,6 @@ namespace nd4j { // } - DeclarableReductionOp::~DeclarableReductionOp() { - // - } - - nd4j::ShapeList* DeclarableReductionOp::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { // int numDims = INT_ARG(0); std::vector dims; @@ -55,7 +51,7 @@ namespace nd4j { std::sort(dims.begin(), dims.end()); // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == MAX_INT)) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { auto newShape = ConstantShapeHelper::getInstance()->scalarShapeInfo(block.dataType()); return SHAPELIST(newShape); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp index 
2b83b200a..684f09262 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -39,7 +40,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(), "LegacyReduce3Op"); - if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT))) { + if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()))) { // reduce3 to scalar NativeOpExecutioner::execReduce3Scalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), @@ -97,7 +98,7 @@ namespace nd4j { Nd4jLong *zShape = nullptr; - if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT))) { + if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()))) { // reduce3 to scalar case ALLOCATE(zShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); zShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp index ac4bb33b7..12a25537d 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == MAX_INT) || allAxes) { + (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { // scalar 
NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -100,7 +101,7 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp index e1da0621e..2765e1b3f 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; // _axis.(block.getIArguments()->size() == 0) || - // (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) + // (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) if (block.getAxis()->empty() || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), @@ -101,7 +102,7 @@ namespace nd4j { dims[e] = f >= 0 ? 
f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp index 3c83df702..836564c79 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -60,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == MAX_INT) || allAxes) { + (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -103,7 +104,7 @@ namespace nd4j { dims[e] = f >= 0 ? 
f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp index 09a225b19..2340f39b0 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -98,7 +99,7 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceSameScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { diff --git a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp index bb4dda4d4..08ebb80de 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { @@ -43,7 +44,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(),"LegacyStatsOp"); - if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == MAX_INT)) 
{ + if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == nd4j::DataTypeUtils::max())) { // scalar NativeOpExecutioner::execSummaryStatsScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), biasCorrected); @@ -92,7 +93,7 @@ namespace nd4j { auto inShape = inputShape->at(0); Nd4jLong *newShape; - if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == MAX_INT)) { + if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max())) { // in this case we just return scalar ALLOCATE(newShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); newShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp index 3c334e726..3d9a79535 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp @@ -213,6 +213,9 @@ PLATFORM_IMPL(conv3dnew_bp) { ConvolutionUtils::getSizesAndIndexesConv3d(isNDHWC, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWiC, indWoC, indWkD); + if(isSameMode) // SAME + ConvolutionUtils::calcPadding3D(pD, pH, pW, oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, dD, dH, dW); + int trueoD, trueoH, trueoW; // true output depth/height/width ConvolutionUtils::calcOutSizePool3D(trueoD, trueoH, trueoW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, isSameMode); diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index e004dc379..74b832b4a 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace nd4j { namespace blas { @@ -32,15 
+33,18 @@ namespace nd4j { auto source = reinterpret_cast(vsource); // handle transpose in parallel - PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) - for (int r = 0; r < rows; r++) { - for (int c = 0; c < cols; c++) { - int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); - int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + for (int c = 0; c < cols; c++) { + int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); + int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); - ret[zIdx] = source[xIdx]; + ret[zIdx] = source[xIdx]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, rows); return ret; } @@ -62,44 +66,49 @@ namespace nd4j { bool transBFlag = TransB == CblasTrans; if (beta == 0.0) { + Z z = 0.f; int length = M*N; if (length <= Environment::getInstance()->elementwiseThreshold()) { - PRAGMA_OMP_SIMD for (int r = 0; r < length; r++) - C[r] = static_cast(0.0f); + C[r] = z; } else { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int r = 0; r < length; r++) - C[r] = static_cast(0.0f); + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) + C[r] = z; + }; + samediff::Threads::parallel_for(func, 0, length); } } - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (int r = 0; r < M; r++) { - for (int c = 0; c < N; c++) { - int zIdx = linearIndexF(M, N, r, c); + auto func = PRAGMA_THREADS_FOR_2D { + for (auto r = start_x; r < stop_x; r += inc_x) { + for (auto c = start_y; c < stop_y; c += inc_y) { + int zIdx = linearIndexF(M, N, r, c); - Z dot = static_cast(0.0f); + Z dot = static_cast(0.0f); - if (alpha != 0.0) { - int bIdx; // = linearIndexF(K, N, 0, c); - int aIdx; + if (alpha != 0.0) { + int bIdx; // = linearIndexF(K, N, 0, c); + int aIdx; - for (int k = 
0; k < K; k++) { - aIdx = (transAFlag ? linearIndexC(M, K, r, k) : linearIndexF(M, K, r, k)); - bIdx = (transBFlag ? linearIndexC(K, N, k, c) : linearIndexF(K,N, k, c)); - dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]nd4j::math::nd4j_dot(aX, bX, K) * alpha; + for (int k = 0; k < K; k++) { + aIdx = (transAFlag ? linearIndexC(M, K, r, k) : linearIndexF(M, K, r, k)); + bIdx = (transBFlag ? linearIndexC(K, N, k, c) : linearIndexF(K, N, k, c)); + dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]nd4j::math::nd4j_dot(aX, bX, K) * alpha; + } + } + + if (beta != 0.0) { + C[zIdx] = static_cast(dot + beta * C[zIdx]); + } else { + C[zIdx] = static_cast(dot); } } - - if (beta != 0.0) { - C[zIdx] = static_cast(dot + beta * C[zIdx]); - } else { - C[zIdx] = static_cast(dot); - } } - } + }; + + samediff::Threads::parallel_for(func, 0, M, 1, 0, N, 1); } @@ -120,14 +129,16 @@ namespace nd4j { auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int r = 0; r < M; r++) { - int aIdx = linearIndexC(M, N, r, 0); - auto aX = aT + aIdx; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + int aIdx = linearIndexC(M, N, r, 0); + auto aX = aT + aIdx; - auto dot = nd4j::math::nd4j_dot(aX, y, lda) * alpha; - z[r] = beta == 0.0f ? dot : dot + beta * z[r]; - } + auto dot = nd4j::math::nd4j_dot(aX, y, lda) * alpha; + z[r] = beta == 0.0f ? 
dot : dot + beta * z[r]; + } + }; + samediff::Threads::parallel_for(func, 0, M); if (TRANS == CblasTrans) delete[] aT; diff --git a/libnd4j/include/ops/impl/specials.cpp b/libnd4j/include/ops/impl/specials.cpp index 85642d6c8..11cca1b15 100644 --- a/libnd4j/include/ops/impl/specials.cpp +++ b/libnd4j/include/ops/impl/specials.cpp @@ -63,22 +63,24 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND T* outBuff = output.bufferAsT(); - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint r = 0; r < numOfArrs; r++) { + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + const Nd4jLong arrLen = inArrs[r]->lengthOf(); + const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; - const Nd4jLong arrLen = inArrs[r]->lengthOf(); - const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + T *z = outBuff + zOffset[r]; + T *x = inArrs[r]->bufferAsT(); - T *z = outBuff + zOffset[r]; - T *x = inArrs[r]->bufferAsT(); + if (outEws == 1 && xEws == 1) + for (Nd4jLong e = 0; e < arrLen; e++) + z[e] = x[e]; + else + for (Nd4jLong e = 0; e < arrLen; e++) + z[e * outEws] = x[e * xEws]; + } + }; - if(outEws == 1 && xEws == 1) - for (Nd4jLong e = 0; e < arrLen; e++) - z[e] = x[e]; - else - for (Nd4jLong e = 0; e < arrLen; e++) - z[e * outEws] = x[e * xEws]; - } + samediff::Threads::parallel_tad(func, 0, numOfArrs); return; } } @@ -96,11 +98,14 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) } - PRAGMA_OMP_PARALLEL_FOR_SIMD - for(int i = 0; i < numOfArrs; ++i) { - auto temp = output(indices[i], true); - nd4j::TransformLoops::template loopTransform, false>(inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr); - } + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + auto temp = output(indices[i], 
true); + nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); + } + }; + + samediff::Threads::parallel_tad(func, 0, numOfArrs); } /** @@ -137,21 +142,15 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint auto z = reinterpret_cast(vz); auto x = reinterpret_cast(vx); - // aggregation step -#ifdef _OPENMP - int _threads = omp_get_max_threads(); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 0; ar < n; ar++) { - z[i] += x[ar][i]; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (auto ar = 0L; ar < n; ar++) { + z[i] += x[ar][i]; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, length); } @@ -175,24 +174,18 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint z = x[0]; PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < length; i++) { + for (uint64_t i = 0; i < length; i++) { z[i] /= n; } -#ifdef _OPENNMP - int _threads = omp_get_max_threads(); //nd4j::math::nd4j_min(omp_get_max_threads() / 2, 4); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 1; ar < n; ar++) { - z[i] += x[ar][i] / n; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (Nd4jLong ar = 1; ar < n; ar++) { + z[i] += x[ar][i] / n; + } } - } + }; + samediff::Threads::parallel_for(func, 0, length); // instead of doing element-wise propagation, we just issue memcpy to propagate data for (Nd4jLong ar = 1; ar < n; ar++) { @@ -205,20 +198,14 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint memset(z, 0, length * 
sizeof(T)); // aggregation step -#ifdef _OPENNMP - int _threads = omp_get_max_threads(); //nd4j::math::nd4j_min(omp_get_max_threads() / 2, 4); -#else - // we can use whatever we want here, this value won't be used if there's no omp - int _threads = 4; -#endif - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong i = 0; i < length; i++) { - - for (Nd4jLong ar = 0; ar < n; ar++) { - z[i] += x[ar][i] / n; + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { + for (Nd4jLong ar = 0; ar < n; ar++) { + z[i] += x[ar][i] / n; + } } - } + }; + samediff::Threads::parallel_for(func, 0, length); // instead of doing element-wise propagation, we just issue memcpy to propagate data for (Nd4jLong ar = 0; ar < n; ar++) { @@ -348,12 +335,14 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) Nd4jLong xTadLength = shape::tadLength(xShapeInfo, dimension, dimensionLength); int numTads = xLength / xTadLength; - PRAGMA_OMP_PARALLEL_FOR - for (int r = 0; r < numTads; r++) { - T *dx = x + tadOffsets[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + T *dx = x + tadOffsets[r]; - quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); - } + quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); + } + }; + samediff::Threads::parallel_tad(func, 0, numTads); } @@ -368,23 +357,25 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) float threshold = fb.f_; - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 4; e < lim; e++) { + auto func = PRAGMA_THREADS_FOR { + for (auto e = start; e < stop; e += increment) { + for (int bitId = 0; bitId < 16; bitId++) { + bool hasBit = (x[e] & 1 << (bitId)) != 0; + bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; - for (int bitId = 0; bitId < 16; bitId++) { - bool hasBit = (x[e] & 1 << (bitId) ) != 0; - bool hasSign = (x[e] & 1 << (bitId + 16) ) != 0; - - if (hasBit) { - if (hasSign) - dz[(e - 4) * 16 + bitId] -= threshold; - else - dz[(e - 4) * 16 + bitId] += threshold; - } else if (hasSign) { - dz[(e - 4) * 16 + 
bitId] -= threshold / 2; + if (hasBit) { + if (hasSign) + dz[(e - 4) * 16 + bitId] -= threshold; + else + dz[(e - 4) * 16 + bitId] += threshold; + } else if (hasSign) { + dz[(e - 4) * 16 + bitId] -= threshold / 2; + } } } - } + }; + + samediff::Threads::parallel_for(func, 4, lim); } template @@ -392,17 +383,14 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto x = reinterpret_cast(dx); auto z = reinterpret_cast(dz); - if (N < nd4j::Environment::getInstance()->elementwiseThreshold()) { - for (int i = 0; i < N; i++) { - z[i] = static_cast(x[i]); - } - } else { - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < N; i++) { + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i += increment) { z[i] = static_cast(x[i]); } - } + }; + + samediff::Threads::parallel_for(func, 0, N); }; BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); @@ -410,49 +398,49 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { auto dx = reinterpret_cast(vx); - Nd4jLong retVal = 0L; +//PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) + auto func = PRAGMA_REDUCE_LONG { + Nd4jLong retVal = 0L; -PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) proc_bind(close) reduction(+:retVal)) - for (Nd4jLong x = 0; x < N; x += 16) { + for (auto x = start; x < stop; x += increment) { + int byte = 0; + int byteId = x / 16 + 4; - int byte = 0; - int byteId = x / 16 + 4; + for (int f = 0; f < 16; f++) { + Nd4jLong e = x + f; - for (int f = 0; f < 16; f++) { - Nd4jLong e = x + f; + if (e >= N) + continue; - if (e >= N) - continue; + T val = dx[e]; + T abs = nd4j::math::nd4j_abs(val); - T val = dx[e]; - T abs = nd4j::math::nd4j_abs(val); + int bitId = e % 16; - int bitId = e % 16; + if (abs >= (T) threshold) { + byte |= 1 << (bitId); + retVal++; - if (abs >= (T) threshold) { - byte 
|= 1 << (bitId); - - retVal++; - - - if (val < (T) 0.0f) { + if (val < (T) 0.0f) { + byte |= 1 << (bitId + 16); + dx[e] += threshold; + } else { + dx[e] -= threshold; + } + } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { byte |= 1 << (bitId + 16); - dx[e] += threshold; - } else { - dx[e] -= threshold; - } - } else if (abs >= (T) threshold / (T) 2.0f && val < (T) 0.0f) { - byte |= 1 << (bitId + 16); - dx[e] += threshold / 2; + dx[e] += threshold / 2; - retVal++; + retVal++; + } } + + dz[byteId] = byte; } - dz[byteId] = byte; - } - - return retVal; + return retVal; + }; + return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); } template @@ -637,13 +625,16 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto xTadLength = shape::length(packX.primaryShapeInfo()); auto numTads = packX.numberOfTads(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong r = 0; r < numTads; r++) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; - quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } + quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } template @@ -658,13 +649,16 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto xTadLength = shape::length(packX.primaryShapeInfo()); auto numTads = packX.numberOfTads(); - PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong r = 0; r < numTads; r++) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r += increment) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; - quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, 
packY.primaryShapeInfo(), xTadLength, 1, descending); - } + quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); } BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES); diff --git a/libnd4j/include/ops/ops.h b/libnd4j/include/ops/ops.h index 601481b21..ab4bfca90 100644 --- a/libnd4j/include/ops/ops.h +++ b/libnd4j/include/ops/ops.h @@ -77,42 +77,6 @@ #define SELU_ALPHA 1.6732632423543772848170429916717 #define SELU_LAMBDA 1.0507009873554804934193349852946 -#ifdef _OPENMP -#pragma omp declare reduction(maxTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ - initializer (omp_priv=-MAX_FLOAT) - -#pragma omp declare reduction(minTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ - initializer (omp_priv=MAX_FLOAT) - -#pragma omp declare reduction(maxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ - initializer (omp_priv=0) - -#pragma omp declare reduction(minT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ - initializer (omp_priv=0) - -#pragma omp declare reduction(amaxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) - -#pragma omp declare reduction(aminT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) - -#pragma omp declare reduction(asumT : 
float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_abs(omp_in) + nd4j::math::nd4j_abs(omp_out))\ - initializer (omp_priv=0) - -#pragma omp declare reduction(sumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = omp_in + omp_out)\ - initializer (omp_priv=0) - -#pragma omp declare reduction(prodT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = omp_in * omp_out)\ - initializer (omp_priv=1) -#endif - namespace functions { namespace indexreduce { diff --git a/libnd4j/include/ops/special_accumulation_ops.h b/libnd4j/include/ops/special_accumulation_ops.h deleted file mode 100644 index 3f2b2ed1d..000000000 --- a/libnd4j/include/ops/special_accumulation_ops.h +++ /dev/null @@ -1,213 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author raver119@gmail.com -// - -#ifndef LIBND4J_SPECIAL_ACCUMULATION_OPS_H -#define LIBND4J_SPECIAL_ACCUMULATION_OPS_H - -#include -#include -#include -//#include -//#include - -namespace simdOps { - - template - class LogSumExp { - public: - static const bool requiresSpecialAccumulation = true; - - constexpr static functions::ReduceType reduceType = functions::ReduceType::SUM; - - op_def static T startingValue(const T *input) { - return (T) 0.0f; - } - - op_def static Z merge(T old, T opOutput, Z *extraParams) { - return opOutput + old; - } - - op_def static T update(T old, T opOutput, Z *extraParams) { - return opOutput + old; - } - - op_def static Z op(T d1, T d2) { - return nd4j::math::nd4j_exp(d1 - d2); - } - - op_def static Z op(T d1, Z* extraParams) { - return nd4j::math::nd4j_exp(static_cast(d1) - extraParams[0]); - } - - op_def static Z postProcess(T reduction, Nd4jLong n, Z *extraParams) { - return extraParams[0] + nd4j::math::nd4j_log(reduction); - } - -#ifdef __CUDACC__ - __device__ static inline void aggregatePartials(Z *sPartials, int tid, int numItems, Z *extraParams) { - // start the shared memory loop on the next power of 2 less - // than the block size. If block size is not a power of 2, - // accumulate the intermediate sums in the remainder range. 
- int floorPow2 = numItems; - - if (floorPow2 & (floorPow2 - 1)) { - while (floorPow2 & (floorPow2 - 1)) { - floorPow2 &= floorPow2 - 1; - } - if (tid >= floorPow2) { - sPartials[tid - floorPow2] = update(sPartials[tid - floorPow2], sPartials[tid], extraParams); - } - - __syncthreads(); - } - - - for (int activeThreads = floorPow2 >> 1; activeThreads; activeThreads >>= 1) { - if (tid < activeThreads && tid + activeThreads < numItems) { - sPartials[tid] = update(sPartials[tid], sPartials[tid + activeThreads], extraParams); - } - __syncthreads(); - } - } - - static inline __device__ void execSpecialCuda( - T *dx, - Nd4jLong *xShapeInfo, - Z *extraParams, - Z *result, - Nd4jLong *resultShapeInfo, - int *dimension, - int dimensionLength, - Z *reductionBuffer, - Nd4jLong *tadOnlyShapeInfo, - Nd4jLong *tadOffsets) { - - // we assume that RESULT already holds max values - - //shared memory space for storing intermediate results - __shared__ Z *sPartials; - - // __shared__ shape::TAD *tad; - __shared__ Nd4jLong tadLength; - __shared__ Nd4jLong numTads; - - if (threadIdx.x == 0) { - extern __shared__ unsigned char shmem[]; - sPartials = (Z *) shmem; - tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); - numTads = shape::length(xShapeInfo) / tadLength; - } - __syncthreads(); - - for (int r = blockIdx.x; r < numTads; r += gridDim.x) { - auto tadOffsetForBlock = tadOffsets[r]; - - sPartials[threadIdx.x] = startingValue(dx + tadOffsetForBlock); - - for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); - sPartials[threadIdx.x] = update(sPartials[threadIdx.x], op(dx[xOffset], result[r]), extraParams); - } - __syncthreads(); - - // aggregate. 
do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), &result[r]); - - __syncthreads(); - if (threadIdx.x == 0) - result[r] = postProcess(sPartials[threadIdx.x], tadLength, &result[r]); - } - } -#endif - - static void execSpecial(T *x, - Nd4jLong *xShapeInfo, - Z *extraParams, - Z *result, - Nd4jLong *resultShapeInfoBuffer, - int *dimension, - int dimensionLength, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffset) { - Nd4jLong resultLength = shape::length(resultShapeInfoBuffer); - - auto tadOnlyShapeInfo = tadShapeInfo; - auto tadOffsets = tadOffset; - - if (tadOnlyShapeInfo == nullptr || tadOffsets == nullptr) { - if (dimensionLength < 1) - return; - - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - tadOnlyShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - } - - - const Nd4jLong tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); - auto numTads = shape::length(xShapeInfo) / tadLength; - auto tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); - - int tadsPerThread = resultLength / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - if (tadEWS > 0 && (numTads == 1 || shape::isVector(tadOnlyShapeInfo) || shape::isScalar(tadOnlyShapeInfo))) { - - PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (int i = 0; i < resultLength; i++) { - - T *iter = x + tadOffsets[i]; - T start = startingValue(iter); - if (tadEWS == 1) { - for (int j = 0; j < tadLength; j++) { - start = update(start, op(iter[j], result[i]), extraParams); - - } - } - else { - for (int j = 0; j < tadLength; j++) { - start = update(start, op(iter[j * tadEWS], result[i]), extraParams); - } - } - result[i] = postProcess(start, tadLength, &result[i]); - } - } - else { - - 
PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) - for (int i = 0; i < resultLength; i++) { - - auto offset = tadOffsets[i]; - T start = startingValue(x + offset); - - for (int j = 0; j < tadLength; j++) { - auto xOffset = offset + shape::getIndexOffset(j, tadOnlyShapeInfo); - start = update(start, op(x[xOffset], result[i]), extraParams); - } - - result[i] = postProcess(start, tadLength, &result[i]);; - } - } - } - }; -} - -#endif //LIBND4J_SPECIAL_ACCUMULATION_OPS_H diff --git a/libnd4j/include/ops/special_ops.h b/libnd4j/include/ops/special_ops.h deleted file mode 100644 index 8f6ef6b5b..000000000 --- a/libnd4j/include/ops/special_ops.h +++ /dev/null @@ -1,2293 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __CUDACC__ -#include -#include -#include -#endif - -namespace functions { - namespace broadcast { - template - class Broadcast; - } - - namespace transform { - template - class TransformStrict; - } - - namespace scalar { - } - - namespace reduce { - template - class ReduceFloatFunction; - - template - class ReduceSameFunction; - } -} - -namespace simdOps { - - template - class Pooling2D { - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - inline __host__ __device__ -#elif defined(__GNUC__) - -#endif - static int outSize(int size, int k, int s, int p, bool coverAll) { - if (coverAll) - return (size + p * 2 - k + s - 1) / s + 1; - else - return (size + p * 2 - k) / s + 1; - } - -#ifdef __CUDACC__ - /** - * Based on: https://github.com/pjreddie/darknet/blob/master/src/im2col_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ int kH; - __shared__ int kW; - __shared__ int sH; - __shared__ int sW; - __shared__ int pH; - __shared__ int pW; - __shared__ int dH; - __shared__ int dW; - __shared__ int poolingMode; - __shared__ Z extraParam0; - - __shared__ int batchSize; - __shared__ int inChannels; - __shared__ int outH; - __shared__ int outW; - __shared__ int inH; - __shared__ int inW; - - //__shared__ int *strideIn; - //__shared__ int *strideOut; - __shared__ int strideB; - __shared__ int strideC; - __shared__ int strideY; - __shared__ int strideX; - - __shared__ int strideOB; - __shared__ int strideOC; - __shared__ int strideOY; - __shared__ int strideOX; - - __shared__ int length; - 
__shared__ int kHEff; - __shared__ int kWEff; - __shared__ bool fOrder; - - - if (threadIdx.x == 0) { - kH = (int)extraParams[0]; - kW = (int)extraParams[1]; - sH = (int)extraParams[2]; - sW = (int)extraParams[3]; - pH = (int)extraParams[4]; - pW = (int)extraParams[5]; - dH = (int)extraParams[6]; //Dilation, height dimension - dW = (int)extraParams[7]; //Dilation, width dimension - poolingMode = (int)extraParams[9]; - extraParam0 = extraParams[10]; - - batchSize = shape::sizeAt(xShapeBuffer, 0); - inChannels = shape::sizeAt(xShapeBuffer, 1); - outH = shape::sizeAt(zShapeBuffer, 2); - outW = shape::sizeAt(zShapeBuffer, 3); - inH = shape::sizeAt(xShapeBuffer, 2); - inW = shape::sizeAt(xShapeBuffer, 3); - - strideB = shape::stride(xShapeBuffer)[0]; - strideC = shape::stride(xShapeBuffer)[1]; - strideY = shape::stride(xShapeBuffer)[2]; - strideX = shape::stride(xShapeBuffer)[3]; - - strideOB = shape::stride(zShapeBuffer)[0]; - strideOC = shape::stride(zShapeBuffer)[1]; - strideOY = shape::stride(zShapeBuffer)[2]; - strideOX = shape::stride(zShapeBuffer)[3]; - - length = shape::length(zShapeBuffer); - - //Replace kernel H/W with *effective* kernel H/W accounting for dilatyon - kHEff = kH + (kH-1)*(dH-1); - kWEff = kW + (kW-1)*(dW-1); - - fOrder = shape::order(zShapeBuffer) == 'f'; -/* - if (blockIdx.x == 0) { - printf("kH: %i; kW: %i; sH: %i; sW: %i; pH: %i; pW: %i; dH: %i; dW: %i; poolingMode: %i; extraParam0: %f;\n", kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, (float) extraParam0); - printf("batchSize: %i; inChannels: %i; outH: %i; outW: %i; inH: %i; inW: %i; strideB: %i; strideC: %i; strideY: %i; strideX: %i;\n", batchSize, inChannels, outH, outW, inH, inW, strideB, strideC, strideY, strideX); - } -*/ - } - __syncthreads(); - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - for (int index = tid; index < length; index += blockDim.x * gridDim.x) { - const int pw = index % outW; - const int ph = (index / outW) % outH; - const int c = (index / outW / outH) % 
inChannels; - const int n = index / outW / outH / inChannels; - int hstart = sH * ph - pH; - int wstart = sW * pw - pW; - int hend = hstart + kHEff; - int wend = wstart + kWEff; - -// const int hSO = hstart; -// const int hEO = hend; - - if(hstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -hstart / (Z)dH); - hstart += f * dH; - } - if(wstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -wstart / (Z) dW); - wstart += f * dW; - } - if(hend > inH){ - int f = nd4j::math::nd4j_ceil((Z) (hend-inH) / (Z) dH); - hend -= f * dH; - } - if(wend > inW){ - int f = nd4j::math::nd4j_ceil((Z) (wend-inW) / (Z) dW); - wend -= f * dW; - } - //Accounts for dilation - int pool_size = nd4j::math::nd4j_ceil((double) (hend-hstart) / (double) dH) * nd4j::math::nd4j_ceil((double) (wend-wstart) / (double) dW); - - Z sum = poolingMode == 0 ? -nd4j::DataTypeUtils::max() : static_cast(0.f); - - T *input_slice = dx + (n * strideB + c * strideC); - if (poolingMode == 0) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - Z v = static_cast(input_slice[h * strideY + w * strideX]); - if (v > sum) - sum = v; - } - } - } else if (poolingMode == 1) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - sum += static_cast(input_slice[h * strideY + w * strideX]); - } - } - } else if (poolingMode == 2) { - for (int h = hstart; h < hend; h += dH) { - for (int w = wstart; w < wend; w += dW) { - sum += nd4j::math::nd4j_pow(static_cast(nd4j::math::nd4j_abs(input_slice[h * strideY + w * strideX])), extraParam0); - } - } - } - - Z res; - - if (poolingMode == 0) { - res = sum; - } else if (poolingMode == 1) { - int divide_factor = pool_size; //Case 0: exclude padding - if ((int) extraParam0 == 1) //Case 1: include padding - divide_factor = kH * kW; - - res = sum / static_cast(divide_factor); - } else if (poolingMode == 2) { - res = nd4j::math::nd4j_pow(sum, (Z) 1.0f / extraParam0); - } - - - if (!fOrder) { - result[index] = res; - } else { 
- result[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = res; - } -/* - if (index >= 0 && index < 400000) { - printf("index: %i; hstart: %i; hend: %i; wstart: %i; wend: %i; ph: %i; pw: %i; hstart_orig: %i; hend_orig: %i;\n", index, hstart, hend, wstart, wend, ph, pw, hSO, hEO); - } -*/ - } - - __syncthreads(); - } -#endif - - -static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outShapeBuffer, Z *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - // input is [bS, iC, iH, iW] - // output is [bS, iC, oH, oW] - - const Nd4jLong kH = (int)extraParams[0]; - const Nd4jLong kW = (int)extraParams[1]; - const Nd4jLong sH = (int)extraParams[2]; - const Nd4jLong sW = (int)extraParams[3]; - const Nd4jLong pH = (int)extraParams[4]; - const Nd4jLong pW = (int)extraParams[5]; - const Nd4jLong dH = (int)extraParams[6]; - const Nd4jLong dW = (int)extraParams[7]; - Nd4jLong poolingMode = (int)extraParams[9]; - T extraParam0 = extraParams[10]; - - if(dH == 0 || dW == 0) { - printf("Special_ops pooling2d:: dilation must not be zero, but got instead {%lld, %lld} \n", dH, dW); - throw ""; - } - - const Nd4jLong kHEff = kH + (kH-1)*(dH-1); - const Nd4jLong kWEff = kW + (kW-1)*(dW-1); - - const int bS = shape::sizeAt(inShapeBuffer, 0); - const int iC = shape::sizeAt(inShapeBuffer, 1); - const int iH = shape::sizeAt(inShapeBuffer, 2); - const int iW = shape::sizeAt(inShapeBuffer, 3); - const int oH = shape::sizeAt(outShapeBuffer, 2); - const int oW = shape::sizeAt(outShapeBuffer, 3); - const Nd4jLong iStride0 = shape::stride(inShapeBuffer)[0]; - const Nd4jLong iStride1 = shape::stride(inShapeBuffer)[1]; - const Nd4jLong iStride2 = shape::stride(inShapeBuffer)[2]; - const Nd4jLong iStride3 = shape::stride(inShapeBuffer)[3]; - const Nd4jLong oStride0 = shape::stride(outShapeBuffer)[0]; - const Nd4jLong oStride1 = shape::stride(outShapeBuffer)[1]; - const Nd4jLong oStride2 = shape::stride(outShapeBuffer)[2]; - const Nd4jLong oStride3 = 
shape::stride(outShapeBuffer)[3]; - - const Nd4jLong iStep2 = dH*iStride2; - const Nd4jLong iStep3 = dW*iStride3; - const int kProd = kH*kW; - const T iStep2Inv = 1./iStep2; - const T iStep3Inv = 1./iStep3; - - Nd4jLong hstart, wstart, hend, wend; - T sum, *pIn; - - if(poolingMode == 0) { // max - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = -nd4j::DataTypeUtils::max(); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { - T val = pIn[kh + kw]; - if (val > sum) - sum = val; - } - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } -/*************************************************************************/ - else if(poolingMode == 1) { // avg - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * 
(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += pIn[kh + kw]; - - if ((int) extraParam0 == 0) //Exclude padding - sum /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep2))) * static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(iStep3))); //Accounts for dilation - else if ((int) extraParam0 == 1) //Include padding - sum /= kProd; - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } -/*************************************************************************/ - else if(poolingMode == 2) { // pnorm - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) - for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { - for(int oh = 0; oh < oH; ++oh) { - for(int ow = 0; ow < oW; ++ow) { - - pIn = in + b * iStride0 + c * iStride1; - - hstart = oh * sH - pH; - wstart = ow * sW - pW; - hend = hstart + kHEff; - wend = wstart + kWEff; - - if(hstart < 0) - hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); - if(wstart < 0) - wstart += dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); - if(hend > iH) - hend -= dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); - if(wend > iW) - wend -= dW * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); - - hstart *= iStride2; - hend *= 
iStride2; - wstart *= iStride3; - wend *= iStride3; - - sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) - for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - - sum = nd4j::math::nd4j_pow(sum, (T) 1. / extraParam0); - - out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; - } - } - } - } - } - else { - nd4j_printf("Special_ops::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode); - throw ""; - } -} - - op_def static T op(T d1, Z *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - - /** - * A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - - FORCEINLINE bool is_a_ge_zero_and_a_lt_b(int a, int b) { - 
return static_cast(a) < static_cast(b); - } - - template - class - Im2col { - public: - static const bool requiresSpecial = true; - - static _CUDA_HD int outSize(int size, int k, int s, int p, bool coverAll) { - if (coverAll) - return (size + p * 2 - k + s - 1) / s + 1; - else - return (size + p * 2 - k) / s + 1; - } - -#ifdef __CUDACC__ - /** - * Based on: https://github.com/pjreddie/darknet/blob/master/src/im2col_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - T *result, Nd4jLong *zShapeBuffer, - T *extraParams, - int *allocationPointer, T *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - __shared__ int kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dY, dX, kSize, samples, depth, height, width, strideex, stridech, strideh, stridew, height_col, width_col, n; - __shared__ T zeroPadVal; - __shared__ Nd4jLong *outShape, *outStride, *inShape, *inStride; - __shared__ char resultOrder; - - if (threadIdx.x == 0) { - kernelHeight = (int) extraParams[0]; - kernelWidth = (int) extraParams[1]; - strideY = (int) extraParams[2]; - strideX = (int) extraParams[3]; - padHeight = (int) extraParams[4]; - padWidth = (int) extraParams[5]; - dY = (int) extraParams[6]; //Dilation, height/y dimension - dX = (int) extraParams[7]; //Dilation, width/x dimension - kSize = kernelWidth * kernelHeight; - zeroPadVal = (T) extraParams[9]; //Value to use when value is padding. 
Usually 0 but not always - - outShape = shape::shapeOf(zShapeBuffer); - resultOrder = shape::order(zShapeBuffer); - outStride = shape::stride(zShapeBuffer); - - inShape = shape::shapeOf(xShapeBuffer); - inStride = shape::stride(xShapeBuffer); - - samples = (int) inShape[0]; - depth = (int) inShape[1]; - height = (int) inShape[2]; - width = (int) inShape[3]; - - - strideex = (int) inStride[0]; - stridech = (int) inStride[1]; - strideh = (int) inStride[2]; - stridew = (int) inStride[3]; - - // (height + 2 * padHeight - kernelHeight) / strideX + 1; // - // (width + 2 * padWidth - kernelWidth) / strideY + 1; // - height_col = (int) outShape[4]; - width_col = (int) outShape[5]; - - n = samples * depth * height_col * width_col; - } - __syncthreads(); - - int index = blockIdx.x * blockDim.x + threadIdx.x; - for (; index < n; index += blockDim.x*gridDim.x) { - int h_index = index / width_col; - int h_col = h_index % height_col; - int w_col = index % width_col; - - int c_im = h_index / height_col; - int c_col = c_im * kSize; - - int depth_im = c_im % depth; - int num_im = c_im / depth; - int h_offset = h_col * strideY - padHeight; - int w_offset = w_col * strideX - padWidth; - - T* data_col_ptr = result; - - int i_c = (c_col * height_col + h_col) * width_col + w_col; - data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; - - T* data_im_ptr = dx; - - data_im_ptr += num_im * strideex + depth_im * stridech + h_offset * strideh + w_offset*stridew; - - for (int i = 0; i < kernelHeight; ++i) { - for (int j = 0; j < kernelWidth; ++j) { - int h_im = h_offset + i * dY; - int w_im = w_offset + j * dX; - int i_f = 0; - int i_c_temp = i_c; - for (int dim = 5; dim >= 0; dim--) { - i_f += (i_c_temp % outShape[dim]) * outStride[dim]; - i_c_temp = i_c_temp / outShape[dim]; - } - if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width){ - result[i_f] = data_im_ptr[i * dY * strideh + j * dX * stridew]; - } else result[i_f] = zeroPadVal; - - //result[i_f] = (h_im >= 0 && w_im 
>= 0 && h_im < height && w_im < width) ? data_im_ptr[i * strideh + j*stridew] : 0; - data_col_ptr += height_col * width_col; - i_c += height_col * width_col; - } - } - } - } -#endif - - - static void execSpecial( - T *imBuff, - Nd4jLong *imShapeBuffer, - T *colBuff, - Nd4jLong *colShapeBuffer, - T *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - - // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] - - int kH = (int)extraParams[0]; - int kW = (int)extraParams[1]; - int sH = (int)extraParams[2]; - int sW = (int)extraParams[3]; - int pH = (int)extraParams[4]; - int pW = (int)extraParams[5]; - int dH = (int)extraParams[6]; //Dilation, height/y dimension - int dW = (int)extraParams[7]; //Dilation, width/x dimension - T zeroPadVal = extraParams[9]; - - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int iH = imShape[2]; - const int iW = imShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - T *col, *im; - int imRow, imCol; - - if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < 
bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) - *col = zeroPadVal; - else - *col = *im; - } - } - } - } - } - } - } - } - - op_def static T op(T d1, T *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * 
stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - - /** - * A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - template - class Histogram { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - static inline __device__ void execSpecialCuda( - T *dx, Nd4jLong *xShapeBuffer, - Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - - - }; -#endif - - static void execSpecial( - T *dx, - Nd4jLong *xShapeBuffer, - Z *result, - Nd4jLong *zShapeBuffer, - Z *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - - - } - - - op_def static T op(T d1, Z *params) { - return d1; - } - }; - - template - class Col2Im { - - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - /** - * https://github.com/pjreddie/darknet/blob/master/src/col2im_kernels.cu - */ - - static inline __device__ void execSpecialCuda( - X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ int strideex, stridech, stridekrow, stridekcol, striderow, stridecol, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, imgHeight, imgWidth, dY, dX, samples, depth, imgH, imgW, 
height_col, width_col, n, kEffectiveW, kEffectiveH; - __shared__ Nd4jLong *inShape, *inStride, *outShape, *outStride; - __shared__ char resultOrder; - - if (threadIdx.x == 0) { - inShape = shape::shapeOf(xShapeBuffer); - inStride = shape::stride(xShapeBuffer); - - strideex = (int) inStride[0]; - stridech = (int) inStride[1]; - stridekrow = (int) inStride[2]; - stridekcol = (int) inStride[3]; - striderow = (int) inStride[4]; - stridecol = (int) inStride[5]; - - kernelHeight = (int) inShape[2]; - kernelWidth = (int) inShape[3]; - - strideY = (int) extraParams[0]; - strideX = (int) extraParams[1]; - padHeight = (int) extraParams[2]; - padWidth = (int) extraParams[3]; - imgHeight = (int) extraParams[4]; - imgWidth = (int) extraParams[5]; - dY = (int) extraParams[6]; //Dilation in height/y dimension - dX = (int) extraParams[7]; //Dilation in width/x dimension - - outShape = shape::shapeOf(zShapeBuffer); - resultOrder = shape::order(zShapeBuffer); - outStride = shape::stride(zShapeBuffer); - - samples = (int) outShape[0]; - depth = (int) outShape[1]; - imgH = (int) outShape[2]; - imgW = (int) outShape[3]; - - height_col = inShape[4];//(imgHeight + 2 * padHeight - kernelHeight) / strideX + 1; - width_col = inShape[5];//(imgWidth + 2 * padWidth - kernelWidth) / strideY + 1; - - n = samples * depth * imgHeight * imgWidth; - - //Effective kernel size, accounting for dilation - kEffectiveW = kernelWidth + (kernelWidth - 1) * (dX - 1); - kEffectiveH = kernelHeight + (kernelHeight - 1) * (dY - 1); - } - __syncthreads(); - - for (int i = (blockDim.x * blockIdx.x) + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { - X val = 0; - int w_im = i % imgWidth + padWidth; - int h_im = (i / imgWidth) % imgHeight + padHeight; - int c_im = i / (imgWidth * imgHeight); - - int num_im = c_im / depth; - int depth_im = c_im % depth; - - // compute the start and end of the output - // These are the indexes for dimensions ??? in the 6d col matrix - int w_col_start = (w_im < kEffectiveW) ? 
0 : (w_im - kEffectiveW) / strideX + 1; - int w_col_end = nd4j::math::nd4j_min(w_im / strideX + 1, width_col); - - int h_col_start = (h_im < kEffectiveH) ? 0 : (h_im - kEffectiveH) / strideY + 1; - int h_col_end = nd4j::math::nd4j_min(h_im / strideY + 1, height_col); - - - //Iterate over col entries in the 6d array... these are added up - for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { - for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { - int h_k = (h_im - h_col * strideY); - int w_k = (w_im - w_col * strideX); - - if(h_k % dY == 0 && w_k % dX == 0){ - h_k /= dY; - w_k /= dX; - - int data_col_index = num_im * strideex + depth_im * stridech + h_k * stridekrow + w_k * stridekcol + h_col * striderow + w_col * stridecol; - val += dx[data_col_index]; - } - } - } - int i_f = 0; - int i_c = i; - for (int dim = 3; dim >= 0; dim--) - { - i_f += (i_c % outShape[dim]) * outStride[dim]; - i_c = i_c / outShape[dim]; - } - result[i_f] = val; - } - } -#endif - - static void execSpecial( - X *colBuff, - Nd4jLong *colShapeBuffer, - X *imBuff, - Nd4jLong *imShapeBuffer, - X *extraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] - - auto colShape = shape::shapeOf(colShapeBuffer); - auto colStride = shape::stride(colShapeBuffer); - auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); - - const int sH = (int)extraParams[0]; - const int sW = (int)extraParams[1]; - const int pH = (int)extraParams[2]; - const int pW = (int)extraParams[3]; - const int iH = (int)extraParams[4]; - const int iW = (int)extraParams[5]; - const int dH = (int)extraParams[6]; - const int dW = (int)extraParams[7]; - - const int bS = imShape[0]; - const int iC = imShape[1]; - const int kH = colShape[2]; - const int kW = colShape[3]; - const int oH = colShape[4]; - const int oW = colShape[5]; - const Nd4jLong colStride0 = colStride[0]; - const Nd4jLong colStride1 = 
colStride[1]; - const Nd4jLong colStride2 = colStride[2]; - const Nd4jLong colStride3 = colStride[3]; - const Nd4jLong colStride4 = colStride[4]; - const Nd4jLong colStride5 = colStride[5]; - const Nd4jLong imStride0 = imStride[0]; - const Nd4jLong imStride1 = imStride[1]; - const Nd4jLong imStride2 = imStride[2]; - const Nd4jLong imStride3 = imStride[3]; - - auto zLength = shape::length(imShapeBuffer); - - // initial zeroing of image content - memset(imBuff, 0, zLength * sizeof(X)); - - - X *col, *im; - int imRow, imCol; - - if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - - imRow = (-pH + kRow * dH) + colH*sH; - imCol = (-pW + kCol * dW) + colW*sW; - - col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + 
imRow*imStride2 + imCol*imStride3; - - if (static_cast(imRow) < static_cast(iH) && static_cast(imCol) < static_cast(iW)) - *im += *col; - } - } - } - } - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - - - /** Calculate buffer offset (like Shape.getOffset) without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 4 - */ - static _CUDA_HD int getOffsetUnsafe4(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[2] != 1) offset += indices[2] * stride[2]; - if (shape[3] != 1) offset += indices[3] * stride[3]; - return offset; - } - - /** A version of Shape.getOffset without checking on input for negative indices etc - * normally negative indices are bad, OK here because of other checks on input indices - * Uses unrolled loop specifically for length 6, where indices[2] and indices[3] are zero (always are here) - */ - static _CUDA_HD int getOffsetUnsafe6(int baseOffset, int *shape, int *stride, int *indices) { - int offset = baseOffset; - if (shape[0] != 1) offset += indices[0] * stride[0]; - if (shape[1] != 1) offset += indices[1] * stride[1]; - if (shape[4] != 1) offset += indices[4] * stride[4]; - if (shape[5] != 1) offset += indices[5] * stride[5]; - return offset; - } - - }; - - - template - class Reverse { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - static inline __device__ void execSpecialCuda(X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - __shared__ Nd4jLong xLength; - __shared__ int xEWS; - __shared__ char xOrder; - __shared__ Nd4jLong sLength; - __shared__ X *shmem; - int tid = 
threadIdx.x + blockIdx.x * blockDim.x; - - if (threadIdx.x == 0) { - xLength = shape::length(xShapeBuffer); - xEWS = shape::elementWiseStride(xShapeBuffer); - xOrder = shape::order(xShapeBuffer); - sLength = xLength - 1; - - extern __shared__ unsigned char shrd[]; - shmem = (X *) shrd; - } - __syncthreads(); - - - - if (dx == result) { - - if (xEWS == 1) { - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - Nd4jLong idx = sLength - e; - X tmp = dx[e]; - dx[e] = dx[idx]; - dx[idx] = tmp; - } - } else if (xEWS >= 1) { - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - Nd4jLong idx1 = (sLength - e) * xEWS; - Nd4jLong idx2 = e * xEWS; - X tmp = dx[idx2]; - dx[idx2] = dx[idx1]; - dx[idx1] = tmp; - } - } - else { - - for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - - } else { - __shared__ int zEWS; - __shared__ char zOrder; - - if (threadIdx.x == 0) { - zEWS = shape::elementWiseStride(zShapeBuffer); - zOrder = shape::order(zShapeBuffer); - } - __syncthreads(); - - if (xEWS == 1 && zEWS == 1 && xOrder == zOrder) { - // loop for whole array - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - result[sLength - e] = dx[e]; - } - } else if (xEWS >= 1 && zEWS >= 1 && xOrder == zOrder) { - - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - result[(sLength - e) * zEWS] = dx[e * xEWS]; - } - } - else { - - for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - } - } - -#endif - - - static void execSpecial(X *dx, Nd4jLong *xShapeBuffer, X *result, Nd4jLong *zShapeBuffer, X *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - Nd4jLong xLength = 
shape::length(xShapeBuffer); - int xEWS = shape::elementWiseStride(xShapeBuffer); - char xOrder = shape::order(xShapeBuffer); - Nd4jLong sLength = xLength - 1; - - // two step phase here - if (dx == result) { - if (xEWS == 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - Nd4jLong idx = sLength - e; - auto tmp = dx[e]; - dx[e] = dx[idx]; - dx[idx] = tmp; - } - } else if (xEWS > 1) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - Nd4jLong idx1 = (sLength - e) * xEWS; - Nd4jLong idx2 = e * xEWS; - auto tmp = dx[idx2]; - dx[idx2] = dx[idx1]; - dx[idx1] = tmp; - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); - - result[zOffset] = dx[xOffset]; - } - } - } else { - // single step phase here - auto zEWS = shape::elementWiseStride(zShapeBuffer); - auto zOrder = shape::order(zShapeBuffer); - - if (xEWS == 1 && zEWS == 1 && xOrder == zOrder) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - result[sLength - e] = dx[e]; - } - } else if (xEWS >= 1 && zEWS >= 1 && xOrder == zOrder) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - result[(sLength - e) * zEWS] = dx[e * xEWS]; - } - } - else { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer); - auto zOffset = shape::getIndexOffset(sLength - e, zShapeBuffer); - result[zOffset] = dx[xOffset]; - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - template - class SoftMax { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int 
*allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto shape = shape::shapeOf(xShapeBuffer); - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - - auto length = shape::length(xShapeBuffer); - - auto stride = shape::stride(xShapeBuffer); - //compute the row wise maxes - - __shared__ Nd4jLong maxShape[2]; - - // it's always 2d here - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) { - maxResult = (X) 0.0; - maxShape[0] = shape[0]; - maxShape[1] = 1; - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - } - __syncthreads(); - - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - } -#endif - - static void execSpecial( - void *vx, - Nd4jLong *xShapeInfo, - void 
*vz, - Nd4jLong *zShapeInfo, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto x = reinterpret_cast(vx); - auto z = reinterpret_cast(vz); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeInfo)) { - - if(shape::equalsStrict(xShapeInfo, zShapeInfo)) { - if (tadShapeInfo == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, 1); - tadShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - } - - const uint tadLen = shape::length(tadShapeInfo); - const uint numOfTads = shape::length(xShapeInfo) / tadLen; - - if(shape::elementWiseStride(tadShapeInfo) == 1) { - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - - X* inBuff = x + tadOffsets[i]; - X* outBuff = z + tadOffsets[i]; - - X max = -nd4j::DataTypeUtils::max(); - X sum = 0; - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - - for (uint j = 0; j < tadLen; ++j) { - X temp = nd4j::math::nd4j_exp(inBuff[j] - max); - outBuff[j] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; - } - } - else { - - uint xShapeInfoCast[MAX_RANK]; - bool canCast = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, xShapeInfoCast); - - auto offsets = new Nd4jLong[tadLen]; - shape::calcOffsets(tadShapeInfo, offsets); - - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { - - X* inBuff = x + tadOffsets[i]; - X* outBuff = z + tadOffsets[i]; - - X max = -nd4j::DataTypeUtils::max(); - X sum = 0.f; - - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - - for (uint j = 0; j < tadLen; ++j) { - X temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); - outBuff[offsets[j]] = temp; - sum += temp; - } - - for (uint j = 0; j < tadLen; ++j) - outBuff[offsets[j]] /= sum; - } - delete []offsets; - } - } - else { - - auto shape = shape::shapeOf(xShapeInfo); - //iterate along 
rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0; - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, x, xShapeInfo, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, x, xShapeInfo, maxResult, maxResultShapeBuffer, z, zShapeInfo, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, z, zShapeInfo, z, zShapeInfo, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, z, zShapeInfo, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum - functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, z, zShapeInfo, maxResult, maxResultShapeBuffer, z, zShapeInfo, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - delete[] maxResultShapeBuffer; - delete[] maxResult; - } - } - else if (shape::isVector(xShapeInfo)) { - auto max = -nd4j::DataTypeUtils::max(); - X sum = 0; - int elementWiseStride = shape::elementWiseStride(xShapeInfo); - int resultElementWiseStride = shape::elementWiseStride(zShapeInfo); - int length = shape::length(xShapeInfo); - if (elementWiseStride >= 1 && resultElementWiseStride >= 1) { - if (elementWiseStride == 1 && resultElementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, x[i]); - } - - for (int i = 0; i < length; i++) { - z[i] = nd4j::math::nd4j_exp(x[i] - max); - sum += z[i]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; 
i++) { - z[i] /= sum; - } - } - else { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, x[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - auto r = nd4j::math::nd4j_exp(x[i * elementWiseStride] - max); - z[i * resultElementWiseStride] = r; - sum += r; - } - - for (int i = 0; i < length; i++) { - z[i * resultElementWiseStride] /= sum; - } - } - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - - template - class LogSoftMax { - public: - static const bool requiresSpecial = true; -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto shape = shape::shapeOf(xShapeBuffer); - auto stride = shape::stride(xShapeBuffer); - //iterate along rows - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - if (threadIdx.x == 0) { - maxResult = (X) 0.0; - } - __syncthreads(); - //compute the row wise maxes - - Nd4jLong maxShape[2] = { shape[0], 1 }; - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - __syncthreads(); - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - 
functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Log, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - - } -#endif - - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeBuffer, 2)) { - auto shape = shape::shapeOf(xShapeBuffer); - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0; - - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, dx, xShapeBuffer, maxResult, maxResultShapeBuffer, result, 
zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum - functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - functions::transform::TransformStrict::exec(nd4j::transform::Log, result, zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - - delete[] maxResultShapeBuffer; - } - else if (shape::isVector(xShapeBuffer, 2)) { - auto max = -FLOAT_MAX_VALUE; - X sum = 0; - - auto elementWiseStride = shape::elementWiseStride(xShapeBuffer); - auto length = shape::length(xShapeBuffer); - if (elementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i]); - } - - - for (int i = 0; i < length; i++) { - result[i] = nd4j::math::nd4j_exp(dx[i] - max); - sum += result[i]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i] /= sum; - result[i] = nd4j::math::nd4j_log(result[i]); - } - } - else if (elementWiseStride > 1) { - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] = nd4j::math::nd4j_exp(dx[i * elementWiseStride] - max); - sum += result[i * elementWiseStride]; - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] /= sum; - result[i * elementWiseStride] = nd4j::math::nd4j_log(result[i * elementWiseStride]); - } - } - } - } - - op_def static X op(X d1, X 
*params) { - return d1; - } - }; - - - /** - * softmax(x) - */ - template - class SoftMaxDerivative { - public: - static const bool requiresSpecial = true; - -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto shape = shape::shapeOf(xShapeBuffer); - __shared__ X maxResult; - __shared__ Nd4jLong *maxResultShapeBuffer; - __shared__ Nd4jLong resultEWS; - - auto length = shape::length(xShapeBuffer); - - if (threadIdx.x == 0) { - resultEWS = shape::elementWiseStride(zShapeBuffer); - - maxResult = (X) 0.0; - } - __syncthreads(); - - auto tride = shape::stride(xShapeBuffer); - Nd4jLong maxShape[2] = { shape[0], 1 }; - - __shared__ Nd4jLong tempBuffer[8]; - - if (threadIdx.x == 0) - maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape, tempBuffer); - __syncthreads(); - - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //subtract max of each row - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Subtract, &maxResult, dx, xShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrictInplace::transformCudaLegacy(nd4j::transform::Exp, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - __syncthreads(); - - //take the sum for the exponential - functions::reduce::ReduceSameInplace::execScalarCudaLegacy(nd4j::reduce::Sum, result, 
zShapeBuffer, extraParams, &maxResult, maxResultShapeBuffer, reductionPointer, nullptr); - __syncthreads(); - - //divide by the sum - functions::scalar::ScalarInplace::transformCudaLegacy(nd4j::scalar::Divide, &maxResult, result, zShapeBuffer, extraParams, result, zShapeBuffer, allocationPointer); - __syncthreads(); - - if (resultEWS >= 1) { - for (int i = threadIdx.x; i < length; i += blockDim.x) { - result[i * resultEWS] = result[i * resultEWS] * ((X) 1.0 - result[i * resultEWS]); - } - } - else { - printf("Non element wise stride not supported right now\n"); - } - - } -#endif - - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - if (shape::isMatrix(xShapeBuffer, 2)) { - auto shape = shape::shapeOf(xShapeBuffer); - - auto resultEleStide = shape::elementWiseStride(zShapeBuffer); - - //iterate along rows - int dimension[1] = { 0 }; - int maxDimension[1] = { 1 }; - auto len = shape::length(xShapeBuffer); - //compute the row wise maxes - auto maxResult = new X[shape[0]]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < shape[0]; i++) - maxResult[i] = 0.0f; - - Nd4jLong maxShape[2] = { shape[0], 1 }; - auto maxResultShapeBuffer = shape::shapeBuffer(2, nd4j::DataTypeUtils::fromT(), maxShape); - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Max, dx, xShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //subtract max of each row - functions::broadcast::Broadcast::exec(nd4j::broadcast::Subtract, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - //after subtracting the row wise maxes take the exp - functions::transform::TransformStrict::exec(nd4j::transform::Exp, result, 
zShapeBuffer, result, zShapeBuffer, extraParams, tadShapeInfo, tadOffsets); - - //take the sum for the exponential - functions::reduce::ReduceSameFunction::exec(nd4j::reduce::Sum, result, zShapeBuffer, extraParams, maxResult, maxResultShapeBuffer, maxDimension, 1, nullptr, nullptr); - - //divide by the sum - functions::broadcast::Broadcast::exec(nd4j::broadcast::Divide, result, zShapeBuffer, maxResult, maxResultShapeBuffer, result, zShapeBuffer, dimension, 1, nullptr, nullptr, nullptr, nullptr); - - if (resultEleStide >= 1) { - if (resultEleStide == 1) { - PRAGMA_OMP_SIMD - for (int i = 0; i < len; i++) { - result[i] = result[i] * (static_cast(1.0f) - result[i]); - } - - } - else { - PRAGMA_OMP_SIMD - for (int i = 0; i < len; i++) { - result[i * resultEleStide] = result[i * resultEleStide] * (static_cast(1.0f) - result[i * resultEleStide]); - } - - } - } - else { - - for (int i = 0; i < len; i++) { - Nd4jLong zOffset = shape::getIndexOffset(i, zShapeBuffer); - result[zOffset] = result[zOffset] * ((X) 1.0f - result[zOffset]); - } - } - - - delete[] maxResultShapeBuffer; - delete[] maxResult; - } - else if (shape::isVector(xShapeBuffer, 2)) { - auto max = -nd4j::DataTypeUtils::max(); - X sum = 0; - - auto elementWiseStride = shape::elementWiseStride(xShapeBuffer); - auto length = shape::length(xShapeBuffer); - if (elementWiseStride == 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i]); - } - - for (int i = 0; i < length; i++) { - result[i] -= max; - result[i] = nd4j::math::nd4j_exp(result[i]); - sum += result[i]; - } - - for (int i = 0; i < length; i++) { - result[i] /= sum; - } - - for (int i = 0; i < length; i++) { - result[i] = result[i] * ((X) 1.0f - result[i]); - } - } else if (elementWiseStride >= 1) { - - for (int i = 0; i < length; i++) { - max = nd4j::math::nd4j_max(max, result[i * elementWiseStride]); - } - - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] -= max; - result[i * elementWiseStride] = 
nd4j::math::nd4j_exp(result[i * elementWiseStride]); - sum += result[i * elementWiseStride]; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] /= sum; - } - - PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { - result[i * elementWiseStride] = result[i * elementWiseStride] * ((X) 1.0f - result[i * elementWiseStride]); - } - } else { - printf("non-ews access on row not implemented yet"); - } - } - } - - op_def static X op(X d1, X *params) { - return d1; - } - }; - - - template - class IsMax { - public: - static const bool requiresSpecial = true; - - -#ifdef __CUDACC__ - - static inline __device__ void doAllCuda( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - int *allocationPointer, void *reductionPointer) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - -// this code is safe to delete, it's never used -/* - __shared__ int maxIdx; - __shared__ int length; - if (threadIdx.x == 0) { - length = shape::length(zShapeBuffer); - } - __syncthreads(); - - functions::indexreduce::IndexReduce::template transform>( - dx, - xShapeBuffer, - extraParams, - result, - zShapeBuffer, - nullptr, - 1, - 1, allocationPointer, reductionPointer, nullptr, nullptr); - - __syncthreads(); - if (threadIdx.x == 0) - maxIdx = (int)result[0]; - __syncthreads(); - - for (int i = threadIdx.x; i < length; i += blockDim.x) - result[i] = 0; - __syncthreads(); - - if (threadIdx.x == 0) { - result[maxIdx] = 1.0; - } - */ - } -#endif - -#ifdef __CUDACC__ - inline __host__ - -#elif defined(__GNUC__) - - -#endif - static void doAll( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - auto length = shape::length(xShapeBuffer); - auto 
eleStride = shape::elementWiseStride(xShapeBuffer); - auto resultEleStride = shape::elementWiseStride(zShapeBuffer); - auto xOrder = shape::order(xShapeBuffer); - auto resultOrder = shape::order(zShapeBuffer); - - if (xOrder == resultOrder && xOrder == 'c') { - if (eleStride == 1 && resultEleStride == 1) { - if (length < ELEMENT_THRESHOLD) { - int maxIdx = 0; - auto currMax = dx[0]; - - for (int i = 0; i < length; i++) { - if (currMax < dx[i]) { - currMax = dx[i]; - maxIdx = i; - } - - result[i] = static_cast(0); - - } - - result[maxIdx] = static_cast(1); - - } - else { - int maxIdx = 0; - auto currMax = dx[0]; - - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i]) { - currMaxLocal = dx[i]; - maxIdxLocal = i; - } - result[i] = static_cast(0); - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - result[maxIdx] = static_cast(1); - } - - } - else { - if (length < ELEMENT_THRESHOLD) { - int maxIdx = 0; - auto currMax = dx[0]; - - for (int i = 0; i < length; i++) { - result[i * resultEleStride] = static_cast(0); - if (currMax < dx[i * eleStride]) { - currMax = dx[i * eleStride]; - maxIdx = i; - } - } - - result[maxIdx * resultEleStride] = static_cast(1); - - } - else { - int maxIdx = 0; - auto currMax = dx[0]; - - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - result[i * resultEleStride] = static_cast(0); - if (currMaxLocal < dx[i * eleStride]) { - currMaxLocal = dx[i * eleStride]; - maxIdxLocal = i; - } - } - -PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - result[maxIdx * resultEleStride] = static_cast(1); - } - - } - } - - - else { - Nd4jLong shapeIter[MAX_RANK]; - Nd4jLong coord[MAX_RANK]; - int dim; - Nd4jLong xStridesIter[MAX_RANK]; - Nd4jLong resultStridesIter[MAX_RANK]; - auto xShape = 
shape::shapeOf(xShapeBuffer); - auto xStride = shape::stride(xShapeBuffer); - auto resultStride = shape::stride(zShapeBuffer); - auto rank = shape::rank(xShapeBuffer); - auto originalResult = result; - if (PrepareTwoRawArrayIter(rank, - xShape, - dx, - xStride, - result, - resultStride, - &rank, - shapeIter, - &dx, - xStridesIter, - &result, - resultStridesIter) >= 0) { - auto value = dx[0]; - int idx = 0; - int maxIdx = 0; - ND4J_RAW_ITER_START(dim, rank, coord, shapeIter); { - if (dx[0] > value) { - value = dx[0]; - maxIdx = idx; - } - - idx++; - result[0] = static_cast(0); - - } - ND4J_RAW_ITER_TWO_NEXT( - dim, - rank, - coord, - shapeIter, - dx, - xStridesIter, - result, - resultStridesIter); - - //pointer to where max value would be - if (shape::order(zShapeBuffer) == 'c' || (shape::order(zShapeBuffer) == 'f' && - maxIdx * shape::stride(zShapeBuffer)[shape::rank(zShapeBuffer) - 1] >= - shape::length(zShapeBuffer))) - originalResult[maxIdx] = static_cast(1); - else - originalResult[maxIdx * shape::stride(zShapeBuffer)[shape::rank(zShapeBuffer) - 1]] = static_cast(1); - } - } - - - } - public: - - -#ifdef __CUDACC__ - /** - * - */ - - static inline __device__ void execSpecialCuda( - void *vx, Nd4jLong *xShapeBuffer, - void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, int *allocationPointer, - void *reductionPointer, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); - auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - // FIXME: MAX_DIMENSION is lower then FP16 frame - if (extraParams == nullptr || (int) extraParams[0] == MAX_DIMENSION) { - doAllCuda(dx, xShapeBuffer, result, zShapeBuffer, extraParams, allocationPointer, reductionPointer); - } - } -#endif - - static void execSpecial( - void *vx, - Nd4jLong *xShapeBuffer, - void *vresult, - Nd4jLong *zShapeBuffer, - void *vextraParams, - Nd4jLong *tadShapeInfo, - Nd4jLong *tadOffsets) { - - auto dx = reinterpret_cast(vx); 
- auto result = reinterpret_cast(vresult); - auto extraParams = reinterpret_cast(vextraParams); - - //FIXME: this op should be moved to CustomOps - if (extraParams == nullptr || (int)extraParams[0] == 0 || - ((int)extraParams[0] == 1 && (int)extraParams[1] == MAX_DIMENSION)) { - doAll(dx, xShapeBuffer, result, zShapeBuffer, extraParams); - } - else if (shape::isVector(xShapeBuffer)) { - auto dimensionLength = (int)extraParams[0]; - auto dimension = new int[dimensionLength]; - auto length = shape::length(xShapeBuffer); - for (int i = 0; i < dimensionLength; i++) { - dimension[i] = (int)extraParams[i + 1]; - } - if (shape::shapeOf(xShapeBuffer)[dimension[0]] == 1) { - for (int i = 0; i < length; i++) { - result[i] = static_cast(1); - } - } - else { - auto eleStride = shape::elementWiseStride(xShapeBuffer); - if (eleStride == 1) { - int maxIdx = 0; - auto currMax = dx[0]; - if (length < ELEMENT_THRESHOLD) { - - for (int i = 0; i < length; i++) { - if (currMax < dx[i]) { - currMax = dx[i]; - maxIdx = i; - } - - result[i] = static_cast(0); - - } - } - else { -PRAGMA_OMP_PARALLEL -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i]) { - currMaxLocal = dx[i]; - maxIdxLocal = i; - } - - result[i] = static_cast(0); - - } - - PRAGMA_OMP_CRITICAL - { - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } - } -} - } - - result[maxIdx] = static_cast(1); - - } - - - else { - int maxIdx = 0; - auto currMax = dx[0]; - if (length < ELEMENT_THRESHOLD) { - - for (int i = 0; i < length; i++) { - if (currMax < dx[i * eleStride]) { - currMax = dx[i * eleStride]; - maxIdx = i; - } - - result[i] = static_cast(0); - } - } - else { - -{ - int maxIdxLocal = maxIdx; - auto currMaxLocal = currMax; - - for (int i = 0; i < length; i++) { - if (currMaxLocal < dx[i * eleStride]) { - currMaxLocal = dx[i * eleStride]; - maxIdxLocal = i; - } - - result[i] = static_cast(0); - } - 
-PRAGMA_OMP_CRITICAL -{ - if (currMax < currMaxLocal) { - currMax = currMaxLocal; - maxIdx = maxIdxLocal; - } -} -} - } - - result[maxIdx] = static_cast(1); - } - } - - - } - else { - auto dimensionLength = (int) extraParams[0]; - auto dimension = new int[dimensionLength]; - - PRAGMA_OMP_SIMD - for (int i = 0; i < dimensionLength; i++) { - dimension[i] = (int) extraParams[i + 1]; - } - //decompose in to several sub tads after - //moving all dimensions (in sorted order) - //to the back. - //permuted version of the x shape info for setting up the tad problem - auto tadShapeShapeInfo = tadShapeInfo; - if(tadShapeInfo==nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeBuffer, dimension, dimensionLength); - - tadShapeShapeInfo = tadPack.primaryShapeInfo(); - tadOffsets = tadPack.primaryOffsets(); - tadShapeInfo = tadShapeShapeInfo; - } - - auto tadLength = shape::length(tadShapeInfo);//shape::tadLength(xShapeBuffer, dimension, dimensionLength); - auto tads = shape::length(xShapeBuffer) / tadLength; - - int tadsPerThread = tads / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - auto tadEWS = shape::elementWiseStride(tadShapeShapeInfo); - auto zEWS = tadEWS; - - int span = (tads / num_threads) + 8; - - PRAGMA_OMP_PARALLEL_THREADS(num_threads) - { - int tid = omp_get_thread_num(); - int start = span * tid; - int end = span * (tid + 1); - if (end > tads) end = tads; - - for (int r = start; r < end; r++) { - if (tadEWS > 0 && zEWS > 0 && dimensionLength == 1) { - auto rX = dx + tadOffsets[r]; - auto rZ = result + tadOffsets[r]; - - auto maxValue = rX[0]; - int maxIdx = 0; - if (tadEWS == 1 && zEWS == 1) { - - for (int i = 0; i < tadLength; i++) { - if (rX[i] > maxValue) { - maxIdx = i; - maxValue = rX[i]; - } - } - - - for (int i = 0; i < tadLength; i++) { - rZ[i] = static_cast(maxIdx == i); - } - - } else { - - for (int i = 0; i < 
tadLength; i++) { - if (rX[i * tadEWS] > maxValue) { - maxIdx = i; - maxValue = rX[i * tadEWS]; - } - } - - for (int i = 0; i < tadLength; i++) { - rZ[i * zEWS] = static_cast(maxIdx == i); - } - } - } else { - int tadsPerThread = tads / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); - - auto offset = tadOffsets[r]; - Nd4jLong shapeIter[MAX_RANK]; - Nd4jLong coord[MAX_RANK]; - int dim; - Nd4jLong xStridesIter[MAX_RANK]; - Nd4jLong resultStridesIter[MAX_RANK]; - auto xShape = shape::shapeOf(tadShapeShapeInfo); - auto xStride = shape::stride(tadShapeShapeInfo); - auto resultStride = shape::stride(tadShapeShapeInfo); - int rank = shape::rank(tadShapeShapeInfo); - auto xPointer = dx + offset; - auto resultPointer = result + offset; - auto maxValue = xPointer[0]; - - auto maxCursor = resultPointer; - Nd4jPointer maxCursorLong = reinterpret_cast(maxCursor); - if (PrepareTwoRawArrayIter(rank, - xShape, - xPointer, - xStride, - resultPointer, - resultStride, - &rank, - shapeIter, - &xPointer, - xStridesIter, - &resultPointer, - resultStridesIter) >= 0) { - ND4J_RAW_ITER_START(dim, rank, coord, shapeIter); { - if (maxValue < xPointer[0]) { - maxCursor = resultPointer; - maxCursorLong = reinterpret_cast(resultPointer); - maxValue = xPointer[0]; - } - - resultPointer[0] = static_cast(0); - } - ND4J_RAW_ITER_TWO_NEXT(dim, - rank, - coord, - shapeIter, - xPointer, - xStridesIter, - resultPointer, - resultStridesIter); - maxCursor = reinterpret_cast(maxCursorLong); - maxCursor[0] = static_cast(1);; - } - } - } - } - - delete[] dimension; - } - } - - op_def static Z op(X d1, X *params) { - return nd4j::math::softplus(d1); - } - }; -} diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index 1ae310ad4..a25aa36ec 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -25,6 +25,7 @@ #include 
#include #include +#include namespace randomOps { @@ -152,9 +153,9 @@ namespace randomOps { // TODO: we probably might want to skip this sum, and state that probabilities array should be real probabilities, i.e. should sum to 1.0 //T probSum = extraArguments[0]; - Nd4jLong xLength = shape::length(xShapeBuffer); - Nd4jLong yLength = shape::length(yShapeBuffer); - Nd4jLong zLength = shape::length(zShapeBuffer); + auto xLength = shape::length(xShapeBuffer); + auto yLength = shape::length(yShapeBuffer); + auto zLength = shape::length(zShapeBuffer); auto xEWS = shape::elementWiseStride(xShapeBuffer); auto yEWS = shape::elementWiseStride(yShapeBuffer); @@ -162,47 +163,53 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < zLength; e++) { - T prob = rng->relativeT(e); - T cumProb = (T) 0.0f; - for (Nd4jLong f = 0; f < yLength; f++) { - T relProb = y[f * yEWS]; - cumProb += relProb; + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + T prob = rng->relativeT(e); + T cumProb = (T) 0.0f; + for (Nd4jLong f = 0; f < yLength; f++) { + T relProb = y[f * yEWS]; + cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { - z[e * zEWS] = x[f * xEWS]; - break; + if (prob <= cumProb || f == yLength - 1) { + z[e * zEWS] = x[f * xEWS]; + break; + } } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } else { - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong i = 0; i < zLength; i++) { + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong i = 0; i < zLength; i++) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); - T prob = rng->relativeT(i); - T 
cumProb = (T) 0.0f; + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); + T prob = rng->relativeT(i); + T cumProb = (T) 0.0f; - for (Nd4jLong f = 0; f < yLength; f++) { + for (Nd4jLong f = 0; f < yLength; f++) { - auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); - T relProb = y[yOffset2]; - cumProb += relProb; + auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); + T relProb = y[yOffset2]; + cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { + if (prob <= cumProb || f == yLength - 1) { - auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); - z[zOffset2] = x[xOffset2]; - break; + auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); + z[zOffset2] = x[xOffset2]; + break; + } } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } } }; @@ -308,7 +315,7 @@ namespace randomOps { int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); int span = (middle / _threads) + 8; @@ -322,25 +329,30 @@ namespace randomOps { const T epsilon = static_cast(1e-5); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < middle; e++) { - auto epm = e + middle; + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + auto epm = e + middle; - // we need to get random values - T r0 = rng->relativeT(e, epsilon, static_cast(1.0f)); - T r1 = rng->relativeT(epm, epsilon, static_cast(1.0f)); + // we need to get random values + T r0 = rng->relativeT(e, epsilon, static_cast(1.0f)); + T r1 = rng->relativeT(epm, epsilon, static_cast(1.0f)); - T realMean0 = y == z ? mean : y[e * yEWS]; + T realMean0 = y == z ? 
mean : y[e * yEWS]; - auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; - z[e * zEWS] = z0; + auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * + nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; + z[e * zEWS] = z0; - if (epm < zLength) { - T realMean1 = y == z ? mean : y[epm * yEWS]; - auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; - z[epm * zEWS] = z1; + if (epm < zLength) { + T realMean1 = y == z ? mean : y[epm * yEWS]; + auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * + nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; + z[epm * zEWS] = z1; + } } - } + }; + + samediff::Threads::parallel_for(func, 0, middle, 1, _threads); } }; @@ -422,21 +434,13 @@ namespace randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - auto span = (zLength / _threads) + 8; + T prob = extraArguments[1]; nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - auto start = span * tid; - auto end = span * (tid + 1); - if (end > zLength) end = zLength; - - T prob = extraArguments[1]; - - for (Nd4jLong e = start; e < end; e++) { + auto func = PRAGMA_THREADS_FOR { + for (Nd4jLong e = start; e < stop; e += increment) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -453,7 +457,9 @@ namespace randomOps { // if trials is set to 0, effectively we just have successful memset z[e * zEWS] = static_cast(success); } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -536,22 +542,14 @@ namespace 
randomOps { int elementsPerThread = zLength / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); - auto span = (zLength / _threads) + 8; + T prob = extraArguments[1]; //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - Nd4jLong start = span * tid; - Nd4jLong end = span * (tid + 1); - if (end > zLength) end = zLength; - - T prob = extraArguments[1]; - - for (Nd4jLong e = start; e < end; e++) { + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -568,7 +566,9 @@ namespace randomOps { // if trials is set to 0, effectively we just have successful memset z[e * zEWS] = static_cast(success); } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -685,19 +685,22 @@ namespace randomOps { Nd4jLong middle = zLength / 2 + (zLength % 2); int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); const T epsilon = static_cast(1e-5); - PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) - for (Nd4jLong e = 0; e < zLength; ++e) { - if (z[e] > mean + ds || z[e] < mean - ds) { - z[e] = step(rng, mean, stddev, e, middle, z[e]); + auto func = PRAGMA_THREADS_FOR { + for (uint64_t e = start; e < stop; e += increment) { + if (z[e] > mean + ds || z[e] < mean - ds) { + z[e] = step(rng, mean, stddev, e, middle, z[e]); - if (z[e] > mean + ds || z[e] < mean - ds) - z[e] = mean + nd4j::DataTypeUtils::min(); + if (z[e] > mean + ds || z[e] < 
mean - ds) + z[e] = mean + nd4j::DataTypeUtils::min(); + } } - } + }; + + samediff::Threads::parallel_for(func, 0, zLength, 1, _threads); } }; @@ -799,7 +802,7 @@ namespace randomOps { int elementsPerThread = middle / TAD_THRESHOLD; int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, omp_get_max_threads()); + _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); int span = (zLength / _threads) + 8; @@ -813,16 +816,9 @@ namespace randomOps { const T stddev = extraArguments[1]; const T epsilon = static_cast(1e-5); - PRAGMA_OMP_PARALLEL_THREADS(_threads) - { - int tid = omp_get_thread_num(); - Nd4jLong start = span * tid; - Nd4jLong end = span * (tid + 1); - if (end > middle) - end = middle; - + auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (Nd4jLong e = start; e < end; e++) { + for (uint64_t e = start; e < stop; e += increment) { auto epm = e + middle; // we need to get random values @@ -838,7 +834,9 @@ namespace randomOps { z[epm * zEWS] = nd4j::math::nd4j_exp((nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean); } } - } + }; + + samediff::Threads::parallel_for(func, 0, middle, 1, _threads); } }; diff --git a/libnd4j/include/ops/specials.h b/libnd4j/include/ops/specials.h index 6919aa38d..d8030db0b 100644 --- a/libnd4j/include/ops/specials.h +++ b/libnd4j/include/ops/specials.h @@ -18,8 +18,8 @@ // Created by raver119 on 24.04.17. 
// -#ifndef LIBND4J_CONCAT_H -#define LIBND4J_CONCAT_H +#ifndef LIBND4J_SPECIALS_H +#define LIBND4J_SPECIALS_H #ifdef __CUDACC__ @@ -28,6 +28,7 @@ #endif #include +#include namespace nd4j { class NDArray; @@ -81,4 +82,4 @@ namespace nd4j { } -#endif //LIBND4J_CONCAT_H +#endif //LIBND4J_SPECIALS_H diff --git a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp index d35346e2b..22bb87103 100644 --- a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp @@ -21,8 +21,9 @@ #include #include #include +#include -#ifdef _RELEASE +#ifdef RELEASE_BUILD int wIterations = 4; int rIterations = 20; int gemmRegularUpperPow = 11; diff --git a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp index caad37867..9e179db7f 100644 --- a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp @@ -21,14 +21,14 @@ #include #include "performance/benchmarking/LightBenchmarkSuit.h" -#ifdef _RELEASE -#define WARMUP 3 -#define NUM_ITER 10 +#ifdef RELEASE_BUILD +#define WARMUP 5 +#define NUM_ITER 100 #else -#define WARMUP 0 -#define NUM_ITER 1 +#define WARMUP 5 +#define NUM_ITER 100 #endif @@ -592,7 +592,7 @@ namespace nd4j { } std::string LightBenchmarkSuit::runSuit() { -#ifdef _RELEASE +#ifdef RELEASE_BUILD std::vector dtypes({nd4j::DataType::FLOAT32, nd4j::DataType::HALF}); #else std::vector dtypes({nd4j::DataType::FLOAT32}); @@ -609,7 +609,7 @@ namespace nd4j { nd4j_printf("Running LightBenchmarkSuite.pairwiseBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += pairwiseBenchmark, (), LIBND4J_TYPES); - +/* nd4j_printf("Running LightBenchmarkSuite.reduceFullBenchmark [%s]\n", 
DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += reduceFullBenchmark, (), LIBND4J_TYPES); @@ -627,12 +627,13 @@ namespace nd4j { nd4j_printf("Running LightBenchmarkSuite.lstmBenchmark [%s]\n", DataTypeUtils::asString(t).c_str()); BUILD_SINGLE_SELECTOR(t, result += lstmBenchmark, (), LIBND4J_TYPES); + */ } nd4j_printf("Running LightBenchmarkSuite.broadcast2d\n", ""); - result += broadcast2d(); + //result += broadcast2d(); nd4j_printf("Running LightBenchmarkSuite.mismatchedOrderAssign\n", ""); - result += mismatchedOrderAssign(); + //result += mismatchedOrderAssign(); return result; } diff --git a/libnd4j/include/pointercast.h b/libnd4j/include/pointercast.h index c6161782a..e080b33b6 100644 --- a/libnd4j/include/pointercast.h +++ b/libnd4j/include/pointercast.h @@ -21,6 +21,7 @@ #ifndef NATIVEOPERATIONS_POINTERCAST_H #define NATIVEOPERATIONS_POINTERCAST_H +#include #include typedef void* Nd4jPointer; diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index 96f97f762..23f6b342d 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -44,7 +44,6 @@ #define M_PI 3.14159265358979323846 #endif - namespace nd4j { #ifdef __CUDACC__ @@ -1651,4 +1650,46 @@ inline __device__ bfloat16 nd4j_atomicDiv(bfloat16* address, bfloat16 } +#ifdef _OPENMP + +#ifndef MAX_FLOAT +#define MAX_FLOAT 1e37 +#endif + +#pragma omp declare reduction(maxTF : float,double,float16,bfloat16 : \ + omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + initializer (omp_priv=-MAX_FLOAT) + +#pragma omp declare reduction(minTF : float,double,float16,bfloat16 : \ + omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + initializer (omp_priv=MAX_FLOAT) + +#pragma omp declare reduction(maxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + initializer (omp_priv=0) + +#pragma omp declare reduction(minT : 
float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + initializer (omp_priv=0) + +#pragma omp declare reduction(amaxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_max(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + +#pragma omp declare reduction(aminT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_min(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + +#pragma omp declare reduction(asumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = nd4j::math::nd4j_abs(omp_in) + nd4j::math::nd4j_abs(omp_out))\ + initializer (omp_priv=0) + +#pragma omp declare reduction(sumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = omp_in + omp_out)\ + initializer (omp_priv=0) + +#pragma omp declare reduction(prodT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ + omp_out = omp_in * omp_out)\ + initializer (omp_priv=1) + +#endif + #endif /* TEMPLATEMATH_H_ */ diff --git a/libnd4j/pom.xml b/libnd4j/pom.xml index f33f8577f..3e766b944 100644 --- a/libnd4j/pom.xml +++ b/libnd4j/pom.xml @@ -185,6 +185,8 @@ bash run_tests.sh + --chip + ${libnd4j.chip} diff --git a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp index 2cbc8513e..20469ed2d 100644 --- a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp @@ -141,7 +141,7 @@ TEST_F(BooleanOpsTests, test_where_1) { auto z = result->at(0); - z->printIndexedBuffer("z"); + //z->printIndexedBuffer("z"); ASSERT_EQ(e, *z); diff --git 
a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp index c6b834a33..33a8fa10a 100644 --- a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp @@ -41,6 +41,8 @@ TEST_F(BroadcastableOpsTests, Test_Add_1) { y.linspace(1); exp.linspace(1); + //exp.printIndexedBuffer("E B"); + exp.applyBroadcast(broadcast::Add, {1}, &y); nd4j::ops::add op; @@ -50,8 +52,8 @@ TEST_F(BroadcastableOpsTests, Test_Add_1) { auto z = result->at(0); - // exp.printIndexedBuffer("E"); - // z->printIndexedBuffer("Z"); + //exp.printIndexedBuffer("E A"); + //z->printIndexedBuffer("Z"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -717,7 +719,7 @@ TEST_F(BroadcastableOpsTests, broadcast_bool_empty_2) { auto z = result->at(0); - z->printShapeInfo("z"); + // z->printShapeInfo("z"); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(e.isSameShape(z)); diff --git a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp index 0fa4d687d..9a8f09b87 100644 --- a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp @@ -54,7 +54,7 @@ TEST_F(BroadcastMultiDimTest,MultimDimTest) { tad->tadOnlyShapeInfo, //tadShapeInfo tad->tadOffsets, //tadOffset tad->tadOnlyShapeInfo, //tadShapeInfoZ - tad->tadOffsets); //tadOffsetZ + tad->tadOffsets, 0, tad->numTads); //tadOffsetZ for(int i = 0; i < 30; i++) { ASSERT_EQ(dataAssertion[i],result[i]); } diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index 6f964d0ac..8a58fe3a5 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -34,7 +34,7 @@ if (CUDA_BLAS) endif() if ("${COMPUTE}" STREQUAL "all") - list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode 
arch=compute_35,code=sm_35 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) + list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w --cudart=static -O3 --expt-extended-lambda -gencode arch=compute_30,code=sm_30 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70) else() list(APPEND CUDA_NVCC_FLAGS -DCUDA_10 ${EXPM} -w -G -g --cudart=static --expt-extended-lambda -arch=compute_${COMPUTE} -code=sm_${COMPUTE}) endif() @@ -43,18 +43,19 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2 -D__APPLE_OS__=true") -elseif(WIN32) - if (CPU_BLAS) - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2") + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2 -D__APPLE_OS__=true") +elseif(WIN32) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=native -mtune=native -O3") + if (CPU_BLAS AND LINUX) + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") endif() else() - - set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fassociative-math -funsafe-math-optimizations -fmax-errors=2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + set(CMAKE_CXX_FLAGS " -fPIC -std=c++11 -fmax-errors=2") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native") endif() if (CPU_BLAS) @@ -130,6 +131,10 @@ foreach (TMP_PATH ${TEST_SOURCES}) endforeach(TMP_PATH) if (CPU_BLAS) + if (NOT BLAS_LIBRARIES) + set(BLAS_LIBRARIES "") + endif() + add_executable(runtests ${TEST_SOURCES}) target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main) elseif(CUDA_BLAS) 
diff --git a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp index 2d4f9205f..60ba4733c 100644 --- a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp @@ -160,7 +160,6 @@ TEST_F(ConditionalTests, Flat_Test_2) { auto exp = NDArrayFactory::create('c', {2, 2}, {1, 1, 1, 1}); - z->printIndexedBuffer("z"); ASSERT_TRUE(exp.equalsTo(z)); delete graph; } diff --git a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp index 383815417..9134ef0a4 100644 --- a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp @@ -140,8 +140,8 @@ TEST_F(ConstantShapeHelperTests, basic_test_5) { auto arrayA = NDArrayFactory::create(1); auto arrayB = NDArrayFactory::create_('c', {128, 256}); - arrayA.printShapeInfo("A"); - arrayB->printShapeInfo("B"); + //arrayA.printShapeInfo("A"); + //arrayB->printShapeInfo("B"); ASSERT_EQ(0, arrayA.rankOf()); ASSERT_EQ(2, arrayB->rankOf()); ASSERT_NE(arrayA.dataType(), arrayB->dataType()); diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index 853f82cda..353e51ad3 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -614,182 +614,6 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_conv2d_1) { delete result2D; } - - -TEST_F(ConvolutionTests1, Test_im2col_col2im_1) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - x.syncToDevice(); - //ASSERT_TRUE(x.isActualOnDeviceSide()); - ASSERT_TRUE(x.isActualOnHostSide()); - 
//x.printBuffer("x", 64); - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, kY, kX, oY, oX}); - - ExtraArguments args({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args); - - nd4j::ops::im2col op; - auto result2col = op.execute({&x}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}); - - auto im2col1 = result2col->at(0); - - //im2col0.printBuffer("transformed"); - //im2col1->printBuffer("customized", 64); - - ASSERT_TRUE(im2col1->isSameShape(&im2col0)); - ASSERT_TRUE(im2col1->equalsTo(&im2col0)); - - - ExtraArguments args2({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 
1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2col; - delete result2im; -} - - -TEST_F(ConvolutionTests1, Test_im2col_col2im_2) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col0.permutei({0, 1, 4, 5, 2, 3}); - - ExtraArguments args2col({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args2col); - - nd4j::ops::im2col op; - auto result2col = op.execute({&x}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}); - - auto im2col1 = result2col->at(0); - - ASSERT_TRUE(im2col1->isSameShape(&im2col0)); - ASSERT_TRUE(im2col1->equalsTo(&im2col0)); - - - ExtraArguments args2im({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2im); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 
1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2col; - delete result2im; -} - -TEST_F(ConvolutionTests1, Test_im2col_col2im_3) { - int kY = 5; - int kX = 5; - int sY = 1; - int sX = 1; - int pY = 0; - int pX = 0; - int dY = 1; - int dX = 1; - int inY = 28; - int inX = 28; - int channels = 3; - - bool isSameMode = true; - - auto x = NDArrayFactory::create('c', {2, channels, inY, inX}); - x.linspace(1); - - int oY, oX; - - nd4j::ops::ConvolutionUtils::calcOutSizePool2D(oY, oX, kY, kX, sY, sX, pY, pX, dY, dX, inY, inX, isSameMode); - - if (isSameMode) - nd4j::ops::ConvolutionUtils::calcPadding2D(pY, pX, oY, oX, inY, inX, kY, kX, sY, sX, dY, dX); - - auto im2col0 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col0.permutei({0, 1, 4, 5, 2, 3}); - - auto im2col1 = NDArrayFactory::create('c', {2, channels, oY, oX, kY, kX}); - im2col1.permutei({0, 1, 4, 5, 2, 3}); - - ExtraArguments args2col({(double) kY, (double) kX, (double) sY, (double) sX, (double) pY, (double) pX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0, (double)0.0, (double) 0.}); - x.applyTransform(transform::Im2col, &im2col0, &args2col); - - nd4j::ops::im2col op; - auto status = op.execute({&x}, {&im2col1}, {}, {kY, kX, sY, sX, pY, pX, dY, dX, isSameMode ? 1 : 0}, {}); - ASSERT_EQ(Status::OK(), status); - - ASSERT_TRUE(im2col1.isSameShape(&im2col0)); - ASSERT_TRUE(im2col1.equalsTo(&im2col0)); - - - ExtraArguments args2im({ (double) sY, (double) sX, (double) pY, (double) pX, (double) inY, (double) inX, (double) dY, (double) dX, isSameMode ? (double) 1 : (double) 0}); - auto col2im0 = NDArrayFactory::create('c', {2, channels, inY, inX}); - im2col0.applyTransform(transform::Col2Im, &col2im0, &args2im); - - nd4j::ops::col2im op2im; - auto result2im = op2im.execute({&im2col1}, {}, {sY, sX, pY, pX, inY, inX, dY, dX, isSameMode ? 
1 : 0}); - auto col2im1 = result2im->at(0); - - ASSERT_TRUE(col2im1->isSameShape(&col2im0)); - ASSERT_TRUE(col2im1->equalsTo(&col2im0)); - - delete result2im; -} - - TEST_F(ConvolutionTests1, TestDeconv_bp_1) { int bS=3, iH=4,iW=4, iC=3,oC=2, kH=1,kW=1, sH=1,sW=1, pH=0,pW=0, dH=1,dW=1; @@ -1212,8 +1036,8 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_bp_test1) { nd4j::ops::conv3dnew_bp op; auto results = op.execute({&input, &weights, &bias, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); - auto* gradI = results->at(0); - auto* gradW = results->at(1); + auto gradI = results->at(0); + auto gradW = results->at(1); ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expGradI.isSameShape(gradI)); diff --git a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp index c018e58d0..45b35eb4e 100644 --- a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp @@ -110,7 +110,7 @@ TEST_F(DataTypesValidationTests, test_bfloat16_rand_1) { RandomGenerator gen(119, 120); RandomLauncher::fillUniform(LaunchContext::defaultContext(), gen, &x, 1, 6); - ASSERT_TRUE(x.sumNumber().e(0) > 0); + ASSERT_TRUE(x.sumNumber().e(0) != 0.f); } TEST_F(DataTypesValidationTests, test_bfloat16_rand_2) { @@ -118,7 +118,7 @@ TEST_F(DataTypesValidationTests, test_bfloat16_rand_2) { RandomGenerator gen(119, 120); RandomLauncher::fillGaussian(LaunchContext::defaultContext(), gen, &x, 0, 1); - ASSERT_TRUE(x.sumNumber().e(0) > 0); + ASSERT_TRUE(x.sumNumber().e(0) != 0.f); } TEST_F(DataTypesValidationTests, cast_1) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index 458858c57..8dd2e7a40 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -164,9 +164,7 @@ 
TEST_F(DeclarableOpsTests1, ApplyGradientDescent_1) { auto result = op.execute({&x, &y}, {1.}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -180,9 +178,7 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_1) { auto result = op.execute({&x, &y}, {}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -199,11 +195,6 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_2) { ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z1 = result->at(0); auto z2 = result->at(1); -// z1->printIndexedBuffer("OUTPUT"); -// z2->printIndexedBuffer("OUTPUT"); -// -// exp1.printIndexedBuffer("EXPECT"); -// exp2.printIndexedBuffer("EXPECT"); ASSERT_TRUE(z1->equalsTo(exp1)); ASSERT_TRUE(z2->equalsTo(exp2)); @@ -220,9 +211,7 @@ TEST_F(DeclarableOpsTests1, AXpY_Test_1) { auto result = op.execute({&x, &y}, {2.}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); -// result->at(0)->printIndexedBuffer("OUTPUT"); -// result->at(0)->printShapeInfo("OUTPUT Shape"); -// exp.printIndexedBuffer("EXPECT"); + ASSERT_TRUE(z->equalsTo(exp)); delete result; } @@ -265,14 +254,6 @@ TEST_F(DeclarableOpsTests1, TestTensorMmul1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // exp.printShapeInfo(); - // out->printShapeInfo(); - // exp.printBuffer(); - // out->printBuffer(); - - // PointersManager manager(x.getContext(), "scatter"); - // manager.printDevContentOnHost(out->getSpecialBuffer(), out->lengthOf()); - // 
manager.printDevContentOnHost(exp.getSpecialBuffer(), exp.lengthOf()); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -293,8 +274,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -315,8 +294,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -337,8 +314,6 @@ TEST_F(DeclarableOpsTests1, TestTensorDot4) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *out = results->at(0); - // out->printBuffer(); - // out->printShapeInfo(); ASSERT_TRUE(exp.isSameShape(out)); ASSERT_TRUE(exp.equalsTo(out)); @@ -631,8 +606,6 @@ TEST_F(DeclarableOpsTests1, ClipByValue1) { clip.execute(block); - // x->printIndexedBuffer("Result"); - // exp.printIndexedBuffer("Expect"); ASSERT_TRUE(x->equalsTo(&exp)); @@ -775,7 +748,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractMatrices1) { nd4j::ops::reversesubtract subOp; subOp.execute(block); - // x->printIndexedBuffer("Output Subtract"); + ASSERT_TRUE(x->equalsTo(&exp)); delete variableSpace; @@ -814,7 +787,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_2) { y.assign(1.f); exp.assign(-2.f); x.applyTrueBroadcast(BROADCAST(ReverseSubtract), &y, &z, true); -// x.printIndexedBuffer("ReverseSubtract Legacy"); + ASSERT_TRUE(exp.equalsTo(&z)); nd4j::ops::reversesubtract subOp; @@ -822,7 +795,6 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_2) { auto res = subOp.execute({&x, &y}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); - //res->at(0)->printIndexedBuffer("OUtput REVERSED SUB"); ASSERT_TRUE(res->at(0)->equalsTo(&exp)); delete res; @@ -862,8 +834,8 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_1) { y.assign(9.f); 
exp.assign(1.f); y.applyTrueBroadcast(BROADCAST(Mod), &x, &z, true); - // z.printIndexedBuffer("MOD1"); ASSERT_TRUE(exp.equalsTo(&z)); + x.applyTrueBroadcast(BROADCAST(ReverseMod), &y, &exp, true); ASSERT_TRUE(exp.equalsTo(&z)); @@ -899,7 +871,6 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_2) { auto res = subOp.execute({&x, &y}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); -// res->at(0)->printIndexedBuffer("OUtput REVERSED MOD2"); ASSERT_TRUE(res->at(0)->equalsTo(&exp)); delete res; @@ -1355,7 +1326,6 @@ TEST_F(DeclarableOpsTests1, DivideScalarScalar1) { div.execute(block); - //x->printBuffer("x"); ASSERT_TRUE(x->equalsTo(&exp)); delete variableSpace; @@ -1503,10 +1473,6 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - // z->printIndexedBuffer("OUtput"); - // yExp.printIndexedBuffer("Expect"); - // z->printShapeInfo("OUt shape"); - // yExp.printShapeInfo("Exp shape"); ASSERT_TRUE(yExp.equalsTo(z)); delete result; @@ -1515,8 +1481,6 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestRegistrator1) { auto res = nd4j::ops::OpRegistrator::getInstance()->getAllCustomOperations(); - - // nd4j_printf("Ops: %s\n", res) } // ////////////////////////////////////////////////////////////////////// @@ -1555,7 +1519,6 @@ TEST_F(DeclarableOpsTests1, TestRegistrator1) { // //auto status = execCustomOp(nullptr, hash, inputBuffers, inputShapes, 2, outputBuffers, outputShapes, 1, nullptr, 0, nullptr, 0, false); // auto status = execCustomOp(nullptr, hash, inputBuffers, inputShapes, 2, outputBuffers, outputShapes, 1, nullptr, 0, nullptr, 0, nullptr, 0, false); // ASSERT_EQ(ND4J_STATUS_OK, status); -// // z->printIndexedBuffer("Output add"); // ASSERT_NEAR(2.0f, y->meanNumber().e(0), 1e-5); // ASSERT_NEAR(1.0f, x->meanNumber().e(0), 1e-5); // ASSERT_NEAR(3.0f, z->meanNumber().e(0), 1e-5); @@ -1636,8 +1599,6 @@ 
TEST_F(DeclarableOpsTests1, TestGemv1) { nd4j::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); - //z->printBuffer(); - ASSERT_TRUE(z->equalsTo(exp)); delete []xBuffer; delete []xShape; delete x; delete []yBuffer; delete []yShape; delete y; delete z; delete []expBuffer; delete exp; @@ -2020,8 +1981,6 @@ TEST_F(DeclarableOpsTests1, TestCustomShape1) { auto inshapes = new ShapeList(input->getShapeInfo()); auto shapes = test.calculateOutputShape(inshapes, *block); - //input.printShapeInfo("input"); - //shape::printShapeInfoLinear(shape); ASSERT_EQ(input->getShapeInfo()[0] , shapes->at(0)[0]); ASSERT_EQ(input->getShapeInfo()[1] * 2, shapes->at(0)[1]); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 3fd9d26c6..f0ae83168 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -130,7 +130,7 @@ TEST_F(DeclarableOpsTests10, Test_Not_1) { auto result = op.execute({&x, &y}, {}, {}, {}, false, nd4j::DataType::BOOL); ASSERT_EQ(Status::OK(), result->status()); auto res = result->at(0); - res->printBuffer("OUtput NOT"); + ASSERT_TRUE(e.equalsTo(res)); delete result; @@ -163,7 +163,7 @@ TEST_F(DeclarableOpsTests10, MirrorPad_SGO_Test_1) { auto res = op.execute({&in, &pad}, {10.0}, {0}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(res->status(), ND4J_STATUS_OK); - res->at(0)->printIndexedBuffer("Mirror pad:"); + ASSERT_TRUE(exp.equalsTo(res->at(0))); delete res; } @@ -180,9 +180,6 @@ TEST_F(DeclarableOpsTests10, Unique_SGO_Test_1) { auto res1 = res->at(0); auto res2 = res->at(1); - res1->printIndexedBuffer("Unique values"); - res2->printIndexedBuffer("Unique idxs"); - ASSERT_TRUE(exp.equalsTo(res1)); ASSERT_TRUE(expIdx.equalsTo(res2)); delete res; @@ -215,8 +212,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_02) { auto res = op.execute({&input}, 
{}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); - resA->printIndexedBuffer("Where02"); - resA->printBuffer("Where02lINEAR"); + ASSERT_TRUE(exp.equalsTo(resA)); ASSERT_TRUE(exp.isSameShape(resA)); // ASSERT_TRUE(expIdx.equalsTo(res->at(1))); @@ -329,8 +325,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_5) { ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); //ASSERT_TRUE(resA->isEmpty()); - resA->printIndexedBuffer("Result A"); - //resA->printShapeInfo("ShapeA"); + ASSERT_TRUE(exp.equalsTo(resA)); ASSERT_TRUE(exp.isSameShape(resA)); // ASSERT_TRUE(expIdx.equalsTo(res->at(1))); @@ -658,8 +653,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test1) { auto z = result->at(0); auto zI = result->at(1); - z->printIndexedBuffer("TopK(5)"); - zI->printIndexedBuffer("TopKI(5)"); + ASSERT_TRUE(expUnsorted.isSameShape(z)); ASSERT_TRUE(expUnsorted.equalsTo(z)); @@ -669,8 +663,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test1) { z = result2->at(0); zI = result2->at(1); - z->printIndexedBuffer("sorted TopK(5)"); - zI->printIndexedBuffer("sorted TopKI(5)"); + ASSERT_TRUE(expSorted.isSameShape(z)); ASSERT_TRUE(expSorted.equalsTo(z)); @@ -693,8 +686,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test2) { auto z = result->at(0); auto zI = result->at(1); - z->printIndexedBuffer("TopK(5)"); - zI->printIndexedBuffer("TopKI(5)"); + ASSERT_TRUE(expUnsorted.isSameShape(z)); ASSERT_TRUE(expUnsorted.equalsTo(z)); @@ -704,8 +696,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test2) { z = result2->at(0); zI = result2->at(1); - z->printIndexedBuffer("sorted TopK(5)"); - zI->printIndexedBuffer("sorted TopKI(5)"); + ASSERT_TRUE(expSorted.isSameShape(z)); ASSERT_TRUE(expSorted.equalsTo(z)); @@ -1022,8 +1013,6 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("Output 2"); - exp.printIndexedBuffer("Expect 2"); 
ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1046,8 +1035,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("Output 3"); - exp.printIndexedBuffer("Expect 3"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1179,7 +1167,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("NTH rank3_n2"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1206,7 +1194,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* output = results->at(0); - output->printIndexedBuffer("NTH rank3_n2_reverse"); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1812,7 +1800,7 @@ TEST_F(DeclarableOpsTests10, LinSpace_Test1) { auto result = op.execute({&start, &finish, &num}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto res = result->at(0); - res->printIndexedBuffer("from 1 to 24"); + ASSERT_TRUE(expect.equalsTo(res)); delete result; } @@ -2084,7 +2072,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printIndexedBuffer("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); @@ -2108,7 +2096,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - result->printIndexedBuffer("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); @@ -2156,7 +2144,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_5) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - 
result->printShapeInfo("Cropped and Resized"); + ASSERT_TRUE(expected.isSameShapeStrict(result)); //ASSERT_TRUE(expected.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp index 988e5d583..d077f886d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp @@ -916,7 +916,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test1) { auto result = op.execute({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.equalsTo(result->at(0))); - result->at(0)->printBuffer("Output"); delete result; } @@ -928,7 +927,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test2) { nd4j::ops::squaredsubtract op; auto result = op.execute({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - result->at(0)->printBuffer("Output"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; } @@ -941,7 +939,6 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test3) { nd4j::ops::squaredsubtract_bp op; auto result = op.execute({&x, &y, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - result->at(0)->printBuffer("Output"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; } @@ -1372,7 +1369,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1394,7 +1390,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1416,7 +1411,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_3) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ 
-1869,7 +1863,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_4) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1891,7 +1884,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_5) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 subtract:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -1913,7 +1905,6 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto res = results->at(0); - res->printIndexedBuffer("BFloat16 subtract2:"); ASSERT_TRUE(res->equalsTo(exp)); delete results; @@ -2189,7 +2180,6 @@ TEST_F(DeclarableOpsTests11, SafeDivideMixed_Test1) { NDArray numOfNonZero(sumDiff.getShapeInfo(), nd4j::DataType::INT64, false); numOfNonZero.assign(1); sumDiff.applyPairwiseTransform(pairwise::SafeDivide, &numOfNonZero, &sumDiff, nullptr); - sumDiff.printIndexedBuffer("Output as Is"); } ///////////////////////////////////////////////////////////////// @@ -2393,7 +2383,6 @@ TEST_F(DeclarableOpsTests11, Multiply_BP_Test1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdo = results->at(0); - dLdo->printBuffer("Output for multiply_bp op"); ASSERT_TRUE(dLdpExp.isSameShape(dLdo)); ASSERT_TRUE(dLdpExp.equalsTo(dLdo)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp index 3f868c45c..59da5edb4 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp @@ -402,8 +402,6 @@ TEST_F(DeclarableOpsTests12, TestDivideBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("DivideBP X out"); - output2.printIndexedBuffer("DivideBP Y out"); //ASSERT_TRUE(output.e(0) == 47.); } @@ -427,8 +425,6 @@ 
TEST_F(DeclarableOpsTests12, TestDivideBP_2) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("2DivideBP X out"); - output2.printIndexedBuffer("2DivideBP Y out"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -450,8 +446,6 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_1) { Nd4jStatus status = op.execute({&y, &x, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("RDivideBP X out"); - output2.printIndexedBuffer("RDivideBP Y out"); //ASSERT_TRUE(output.e(0) == 47.); } @@ -476,8 +470,6 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_2) { Nd4jStatus status = op.execute({&y, &x, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("2RDivideBP X out"); - output2.printIndexedBuffer("2RDivideBP Y out"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -501,7 +493,6 @@ TEST_F(DeclarableOpsTests12, TestSliceBP_1) { Nd4jStatus status = op.execute({&x, &eps}, {&output}, {}, {1,1,2,2}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output.printIndexedBuffer("SLICE_BP out"); ASSERT_TRUE(output.equalsTo(exp)); //ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -526,7 +517,6 @@ TEST_F(DeclarableOpsTests12, TestConfusionZero_1) { Nd4jStatus status = op.execute({&x, &i}, {&output}, {}, {4}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output.printIndexedBuffer("Confusion out"); ASSERT_TRUE(output.equalsTo(exp)); //ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -545,8 +535,6 @@ TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { output1.assign(119); x.linspace(1.); y.linspace(12., -1.); - x.printBuffer("X"); - y.printBuffer("Y"); eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); @@ -554,8 +542,6 @@ TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output1, &output2}, {}, {}, {}); 
ASSERT_EQ(ND4J_STATUS_OK, status); - output1.printIndexedBuffer("X max"); - output2.printIndexedBuffer("Y max"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } @@ -574,8 +560,6 @@ TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { output1.assign(119); x.linspace(1.); y.linspace(12., -1.); - x.printBuffer("X"); - y.printBuffer("Y"); eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); @@ -583,8 +567,6 @@ TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { Nd4jStatus status = op.execute({&x, &y, &eps}, {&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); - output2.printIndexedBuffer("X min"); - output1.printIndexedBuffer("Y min"); ASSERT_TRUE(output1.equalsTo(exp1)); ASSERT_TRUE(output2.equalsTo(exp2)); } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp index 9d460f152..71ee8a04e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp @@ -533,7 +533,6 @@ TEST_F(DeclarableOpsTests13, adjustSaturation_1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - // result->printIndexedBuffer(); ASSERT_TRUE(exp.isSameShape(result)); ASSERT_TRUE(exp.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 2d8311828..574da8993 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -58,12 +58,7 @@ TEST_F(DeclarableOpsTests14, Test_Reshape_CF_1) { auto x = NDArrayFactory::create('f', {2, 3}, {1.0, 4.0, 2.0, 5.0, 3.0, 6.0}); auto e = NDArrayFactory::create('f', {3, 2}, {1.0, 3.0, 5.0, 2.0, 4.0, 6.0}); - x.printShapeInfo("x shape"); - x.printBuffer("x buffr"); - x.printIndexedBuffer("x indxd"); - - auto r = x.reshape('c', {3, 2}); - r.printIndexedBuffer("r pre-s"); + auto r = x.reshape('c', 
{3, 2});; r.streamline('f'); nd4j::ops::reshape op; @@ -92,7 +87,7 @@ TEST_F(DeclarableOpsTests14, Test_Inf_Comparison_2) { TEST_F(DeclarableOpsTests14, Multiply_test) { for(int k=2;k<10;k++){ - nd4j_printf("k=%d\n", k); + //nd4j_printf("k=%d\n", k); NDArray x = NDArrayFactory::create('c', {k, 1}); NDArray y = NDArrayFactory::create('c', {k}); NDArray e = NDArrayFactory::create('c', {k, k}); @@ -122,7 +117,6 @@ TEST_F(DeclarableOpsTests14, Test_EvalReductionShape_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Reduced shape"); ASSERT_EQ(e, *z); delete result; @@ -416,8 +410,6 @@ TEST_F(DeclarableOpsTests14, test_empty_argmax_1) { auto z = result->at(0); - z->printShapeInfo("Z"); - ASSERT_EQ(e, *z); delete result; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index 6eabc964a..97e7d2d91 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -250,7 +250,6 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_2) { auto result = op.execute({&x}, {}, {nd4j::DataType::HALF}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); - out->printIndexedBuffer("Casted result"); ASSERT_TRUE(e.equalsTo(out)); delete result; } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index d95e86b1c..1a459a012 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -149,5 +149,16 @@ TEST_F(DeclarableOpsTests16, test_knn_mindistance_1) { nd4j::ops::knn_mindistance op; auto result = op.execute({&input, &low, &high}, {&output}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); +} +TEST_F(DeclarableOpsTests16, test_empty_cast_1) { + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto e = NDArrayFactory::create('c', {1, 0, 
2}); + + nd4j::ops::cast op; + auto result = op.execute({&x}, {}, {10}); + ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(e, *result->at(0)); + + delete result; } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp index 62172dbf2..4941e7459 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp @@ -3589,8 +3589,6 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test1) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *result = results->at(0); - result->printIndexedBuffer("SCEL Output"); - expected.printIndexedBuffer("SCEL Expect"); ASSERT_TRUE(expected.isSameShape(result)); ASSERT_TRUE(expected.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp index 2f56eaf2a..478a31d4a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp @@ -479,7 +479,6 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_4) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer(); ASSERT_TRUE(exp.equalsTo(z)); @@ -1045,7 +1044,6 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_3) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Emply shape expected"); ASSERT_TRUE(z->isEmpty()); delete result; @@ -1065,9 +1063,6 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_4) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printBuffer("Strided Slice"); - z->printShapeInfo("Vector size 1 shape expected"); - exp.printShapeInfo("Expected shape"); ASSERT_TRUE(z->lengthOf() == 1); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -1482,9 +1477,6 @@ TEST_F(DeclarableOpsTests4, WeightedCrossEntropyWithLogits_2) { auto results = 
op.execute({&targets, &input, &weights}, {}, {}, {}, false, nd4j::DataType::DOUBLE); auto output = results->at(0); - output->printIndexedBuffer("Result is "); - expected.printIndexedBuffer("Expected is "); - ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp index 86acca29c..2e8d96f3c 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp @@ -304,7 +304,6 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Hadrdsigmoid 2x2"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -321,7 +320,6 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Hadrdsigmoid 2x2"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -384,7 +382,6 @@ TEST_F(DeclarableOpsTests5, histogram_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Histogram4"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -400,7 +397,6 @@ TEST_F(DeclarableOpsTests5, Identity_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Histogram3"); ASSERT_TRUE(matrix.equalsTo(z)); delete result; @@ -416,7 +412,6 @@ TEST_F(DeclarableOpsTests5, Identity_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Identity_BP"); ASSERT_TRUE(z->equalsTo(eps)); delete result; @@ -433,7 +428,6 @@ TEST_F(DeclarableOpsTests5, Log1p_test1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Log1p"); ASSERT_TRUE(z->equalsTo(y)); delete result; @@ -450,7 +444,6 @@ TEST_F(DeclarableOpsTests5, 
Test_SpaceToBatch_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - // z->printIndexedBuffer(); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -846,9 +839,6 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test1) { auto output = results->at(0); - exp.printIndexedBuffer("E"); - output->printIndexedBuffer("O"); - ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1314,17 +1304,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_3_unsorted) { auto v = result->at(0); auto i = result->at(1); -// v->printShapeInfo("shape v"); -// expV.printShapeInfo("shape expV"); - -// i->printShapeInfo("shape I"); -// expI.printShapeInfo("shape expI"); - - v->printIndexedBuffer("v"); -// expV.printIndexedBuffer("expV"); - i->printIndexedBuffer("i"); -// expI.printIndexedBuffer("expI"); - ASSERT_TRUE(expV.isSameShape(v)); ASSERT_TRUE(expV.equalsTo(v)); @@ -1349,17 +1328,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_4) { auto v = result->at(0); auto i = result->at(1); -// v->printShapeInfo("shape v"); -// expV.printShapeInfo("shape expV"); - -// i->printShapeInfo("shape I"); -// expI.printShapeInfo("shape expI"); - -// v->printIndexedBuffer("v"); -// expV.printIndexedBuffer("expV"); -// i->printIndexedBuffer("i"); -// expI.printIndexedBuffer("expI"); - ASSERT_TRUE(expV.isSameShape(v)); ASSERT_TRUE(expV.equalsTo(v)); @@ -1377,11 +1345,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_5) { nd4j::ops::top_k op; auto result = op.execute({&x}, {}, {2, 1}); - for (Nd4jLong r = 0; r < 2; r++) { - for (Nd4jLong c = 0; c < 3; c++) - nd4j_printf("%f, ", x.e(r,c)); - nd4j_printf("\n", ""); - } ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_EQ(2, result->size()); @@ -1389,18 +1352,6 @@ TEST_F(DeclarableOpsTests5, Test_TopK_5) { auto v = result->at(0); auto i = result->at(1); -// x.printShapeInfo("shape of the source X"); -// v->printShapeInfo("shape v"); -// expV.printShapeInfo("shape expV"); - -// i->printShapeInfo("shape I"); -// 
expI.printShapeInfo("shape expI"); - - v->printIndexedBuffer("v"); - expV.printIndexedBuffer("expV"); - i->printIndexedBuffer("i"); - expI.printIndexedBuffer("expI"); - ASSERT_TRUE(expV.isSameShape(v)); ASSERT_TRUE(expV.equalsTo(v)); @@ -2025,10 +1976,6 @@ TEST_F(DeclarableOpsTests5, DynamicPartition_2) { for (int e = 0; e < result->size(); e++) { auto output = result->at(e); - nd4j_printf("%i: ", e); - output->printShapeInfo("Output shape> "); - exp[e].printShapeInfo("Expected shape> "); - output->printIndexedBuffer("Output data> "); ASSERT_TRUE(exp[e].isSameShape(output)); ASSERT_TRUE(exp[e].equalsTo(output)); @@ -2126,10 +2073,6 @@ TEST_F(DeclarableOpsTests5, DynamicStitch_1) { auto output = result->at(0); - // output->printShapeInfo("Output shape> "); - // exp.printShapeInfo("Expected shape> "); - output->printIndexedBuffer("O data"); - exp.printIndexedBuffer("E data"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2334,8 +2277,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test1) { ASSERT_EQ(Status::OK(), results->status()); auto output = results->at(0); - output->printIndexedBuffer("CM output"); - expected.printIndexedBuffer("CM expected"); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2355,9 +2296,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test2) { ASSERT_EQ(Status::OK(), results->status()); auto output = results->at(0); - output->printIndexedBuffer("CM2 output"); - expected.printIndexedBuffer("CM2 expected"); - ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2376,8 +2314,6 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test3) { nd4j::ops::confusion_matrix op; auto results = op.execute({&labels, &predictions, &weights}, {}, {3}); auto output = results->at(0); - output->printIndexedBuffer("CM3"); - ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expected.isSameShape(output)); @@ -2397,7 +2333,6 @@ TEST_F(DeclarableOpsTests5, 
confusion_matrix_test4) { nd4j::ops::confusion_matrix op; auto results = op.execute({&labels, &predictions, &weights}, {}, {3, nd4j::DataType::DOUBLE}); auto output = results->at(0); - output->printIndexedBuffer("CM4"); ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(expected.isSameShape(output)); @@ -2470,11 +2405,6 @@ TEST_F(DeclarableOpsTests5, XWPlusB_1) { auto output = result->at(0); - output->printShapeInfo("Output shape> "); - exp.printShapeInfo("Expected shape> "); - output->printIndexedBuffer("Output data> "); - exp.printIndexedBuffer("Expected res>"); - ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2778,7 +2708,7 @@ TEST_F(DeclarableOpsTests5, L2_Loss_1) { ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(output->isScalar()); - output->printIndexedBuffer("L2_Loss output"); + ASSERT_EQ(output->e(0), exp); delete results; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp index 34b66c61a..79a569e0f 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp @@ -118,8 +118,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_4) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("SS OS shape"); - z->printIndexedBuffer("SS OS out"); + ASSERT_TRUE(z->equalsTo(exp)); //ASSERT_EQ(exp, *z); @@ -127,9 +126,10 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_4) { } TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) { + int z = 0; auto matrix = NDArrayFactory::create('c', {1}, {10}); auto b = NDArrayFactory::create_('c', {1}, {1}); - auto e = NDArrayFactory::create_('c', {1}, {(int)0}); + auto e = NDArrayFactory::create_('c', {1}, {z}); auto s = NDArrayFactory::create_('c', {1}, {1}); nd4j::ops::ones_as opOnes; //auto exp = NDArrayFactory::create('c', {2}, {1.0f, 2.0f}); @@ -138,7 +138,6 @@ 
TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) { ASSERT_EQ(onesRes->status(), Status::OK()); auto ones = onesRes->at(0); - ones->printShapeInfo("Shape ones"); *ones *= 10; auto onesD = ones->dup(); @@ -161,9 +160,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) { nd4j::ops::strided_slice op; auto result = op.calculateOutputShape(inputShapes, *block); //execute({ones, &b, &e, &s}, {}, {0, 1, 0, 0, 0}); ASSERT_EQ(result->size(), 1); - shape::printShapeInfoLinear(result->at(0)); - //auto z = result->at(0); -// z->printShapeInfo("SS OS shape"); ASSERT_TRUE(shape::isEmpty(result->at(0))); //ASSERT_EQ(exp, *z); delete block; @@ -189,8 +185,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_5) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Output shape"); - z->printIndexedBuffer("Output"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -211,8 +205,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_6) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Output shape"); - z->printIndexedBuffer("Output"); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -234,8 +226,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_7) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Output shape"); - z->printIndexedBuffer("Output"); //ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -258,8 +248,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Output shape"); - z->printIndexedBuffer("Output"); //ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -282,8 +270,6 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_2) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Output shape"); - z->printIndexedBuffer("Output"); //ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -306,8 +292,6 @@ 
TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_3) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printShapeInfo("Output shape"); - z->printIndexedBuffer("Output"); //ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -362,8 +346,6 @@ TEST_F(DeclarableOpsTests6, Test_Order_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("O Output"); - exp.printIndexedBuffer("O Expect"); ASSERT_TRUE(exp.equalsTo(z)); ASSERT_NE(x.ordering(), z->ordering()); @@ -379,7 +361,6 @@ TEST_F(DeclarableOpsTests6, cumSum_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - // z->printIndexedBuffer("CumSum1"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -910,9 +891,7 @@ TEST_F(DeclarableOpsTests6, TestRank_1) { auto ress = op.execute({&x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, ress->status()); - ress->at(0)->printIndexedBuffer("RANK Result is "); - // x.printIndexedBuffer("Input is"); ASSERT_TRUE(ress->at(0)->equalsTo(exp)); delete ress; } @@ -926,8 +905,6 @@ TEST_F(DeclarableOpsTests6, TestDropout_2) { auto ress = op.execute({&x}, {0.4f}, {113}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(ND4J_STATUS_OK, ress->status()); - //x.printIndexedBuffer("Input is"); - //ress->at(0)->printIndexedBuffer("Result is "); delete ress; } @@ -943,8 +920,6 @@ TEST_F(DeclarableOpsTests6, TestDropout_3) { auto ress = op.execute({&x, &shape}, {0.4f}, {113}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(ND4J_STATUS_OK, ress->status()); - //x.printIndexedBuffer("Input is"); - //ress->at(0)->printIndexedBuffer("Result is "); delete ress; } @@ -1556,8 +1531,6 @@ TEST_F(DeclarableOpsTests6, LogMatrixDeterminant_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Log ABS Output "); - exp.printIndexedBuffer("Log ABS Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1578,8 +1551,6 @@ TEST_F(DeclarableOpsTests6, LogDet_1) { 
ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("LogDet Output1 "); -// exp.printIndexedBuffer("LogDet Expected1 "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1593,16 +1564,12 @@ TEST_F(DeclarableOpsTests6, LogDet_2) { auto x = NDArrayFactory::create('c', {1, 3, 3}, {4,12,-16,12,37,-43,-16,-43,98}); auto exp = NDArrayFactory::create('c', {1}, { 3.5835189}); - //x.printIndexedBuffer("Input"); nd4j::ops::logdet op; auto result = op.execute({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("LogDet Output2 "); -// z->printShapeInfo("Shape"); -// exp.printIndexedBuffer("LogDet Expected2 "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1616,16 +1583,12 @@ TEST_F(DeclarableOpsTests6, LogDet_3) { auto x = NDArrayFactory::create('c', {3, 3}, {4,12,-16,12,37,-43,-16,-43,98}); auto exp = NDArrayFactory::create( 3.5835189); - //x.printIndexedBuffer("Input"); nd4j::ops::logdet op; auto result = op.execute({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("LogDet Output3 "); -// z->printShapeInfo("Shape"); -// exp.printIndexedBuffer("LogDet Expected3 "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1670,8 +1633,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1710,8 +1671,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_01) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1731,8 +1690,6 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_02) { ASSERT_EQ(ND4J_STATUS_OK, 
result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index c80d75372..e9fe7264e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -66,7 +66,6 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_SCALAR_LARGE) { auto z = result->at(1); - z->printIndexedBuffer("CHOOSE test"); ASSERT_EQ(148,z->e(0)); //ASSERT_TRUE(exp.isSameShape(z)); @@ -572,8 +571,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_1) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Stitch"); - z->printShapeInfo("Stitch Shape"); + ASSERT_TRUE(z->isSameShape(exp)); ASSERT_TRUE(z->equalsTo(exp)); @@ -664,8 +662,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_2) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Stitch"); - z->printShapeInfo("Stitch Shape"); + ASSERT_TRUE(z->isSameShape(exp)); ASSERT_TRUE(z->equalsTo(exp)); @@ -683,11 +680,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Partition_119) { ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(4, result->size()); auto z = result->at(0); -// z->printShapeInfo("Output shape info"); -// z->printIndexedBuffer("Output1"); -// result->at(1)->printIndexedBuffer("Output2"); -// result->at(2)->printIndexedBuffer("Output3"); -// result->at(3)->printIndexedBuffer("Output4"); + ASSERT_TRUE(e.isSameShape(z)); delete result; @@ -1080,7 +1073,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_1) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Segment mIN1"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1097,7 +1090,7 @@ 
TEST_F(DeclarableOpsTests7, TestSegmentMin_01) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Segment mIN01"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1113,7 +1106,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_02) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Segment mIN02"); + ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1130,8 +1123,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMinBP_1) { auto result = op.execute({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - //result->at(0)->printIndexedBuffer("Output1"); - //exp.printIndexedBuffer("Expecte"); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -1433,9 +1424,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_02) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); - exp.printIndexedBuffer("Expect Mean"); - result->at(0)->printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1451,9 +1439,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_021) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); - exp.printIndexedBuffer("Expect Mean"); - result->at(0)->printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1470,9 +1455,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_022) { auto result = op.execute({&x, &idx}, {&z}, {}, {}, {}, false, nd4j::DataType::FLOAT32); ASSERT_EQ(result, Status::OK()); - exp.printIndexedBuffer("Expect Mean"); - z.printIndexedBuffer("Output Mean"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(z)); // delete result; @@ -1491,9 +1473,6 @@ 
TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) { auto result = op.execute({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 2); -// exp.printIndexedBuffer("Expect"); -// result->at(0)->printIndexedBuffer("Output"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -1842,8 +1821,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentSum_1) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output Sum"); - exp.printIndexedBuffer("Expect Sum"); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -2001,8 +1978,6 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_1) { auto result = op.execute({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("UnsortedSum1"); - exp.printIndexedBuffer("Unsorted Sum1 Exp"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2019,8 +1994,6 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_2) { auto result = op.execute({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_EQ(result->size(), 1); -// exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2241,10 +2214,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_04) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2262,10 +2231,6 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_05) { auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// 
exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2279,15 +2244,10 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_06) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - x.printIndexedBuffer("INPUT INT8"); nd4j::ops::segment_prod op; auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2301,15 +2261,10 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_07) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - x.printIndexedBuffer("INPUT INT8"); nd4j::ops::segment_prod op; auto result = op.execute({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); - result->at(0)->printIndexedBuffer("Output"); -// result->at(0)->printShapeInfo("Out Shape"); - exp.printIndexedBuffer("Expect"); -// exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -2577,12 +2532,6 @@ auto exp = NDArrayFactory::create('c', {3, 1, 2, 6}, { auto result = op.execute({&x}, {}, {2,1,3,2,2,2,0}); ASSERT_EQ(result->status(), Status::OK()); -// x.printIndexedBuffer("images"); -// nd4j_printf("input params: ksize = [1, 2, 1, 1], strides = [1, 3, 2, 1], rates = [1, 2, 2, 1]\n", ""); - result->at(0)->printBuffer("Output"); - //result->at(0)->printShapeInfo("Out Shape"); - exp.printBuffer("Expect"); - //exp.printShapeInfo("Exp Shape"); ASSERT_TRUE(exp.isSameShape(result->at(0))); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -3142,8 +3091,6 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { auto result = op.execute({&x}, {}, {6}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), Status::OK()); - 
result->at(0)->printIndexedBuffer("z"); - ASSERT_TRUE(exp.equalsTo(result->at(0))); delete result; @@ -3358,9 +3305,6 @@ auto exp = NDArrayFactory::create('c', {2, 3, 3}, { auto result = op.execute({&x}, {y}, {}, {1, 1}, {}, true, nd4j::DataType::DOUBLE); ASSERT_EQ(result, Status::OK()); - x.printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); - ASSERT_TRUE(exp.equalsTo(&x)); // delete result; @@ -3431,8 +3375,6 @@ TEST_F(DeclarableOpsTests7, TestRoll_12) { auto result = op.execute({&x, &shift, &axis}, {}, {}, {}, false, nd4j::DataType::DOUBLE); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); - out->printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); ASSERT_TRUE(exp.equalsTo(out)); @@ -3457,9 +3399,6 @@ TEST_F(DeclarableOpsTests7, TestRoll_13) { ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); -// out->printIndexedBuffer("Output"); - //exp.printIndexedBuffer("Expect"); - ASSERT_TRUE(exp.equalsTo(out)); delete result; @@ -4274,11 +4213,8 @@ TEST_F(DeclarableOpsTests7, TypesConversion_test4) { ASSERT_EQ(ND4J_STATUS_OK, result32->status()); ASSERT_EQ(ND4J_STATUS_OK, result64->status()); auto out1 = result32->at(0); - out1->printIndexedBuffer("OUT_F"); auto out2 = result64->at(0); - out2->printIndexedBuffer("OUT_D"); -// output->printIndexedBuffer("Toggled"); ASSERT_TRUE(exp32.equalsTo(out1)); ASSERT_TRUE(exp64.equalsTo(out2)); @@ -4369,8 +4305,6 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test5) { nd4j::ops::mirror_pad op; auto result = op.execute({&input, &paddings}, {}, {0}); auto output = result->at(0); - output->printBuffer("Output"); - exp.printBuffer("Expected"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -6204,8 +6138,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_1) { nd4j::ops::reduce_max_bp op; auto result = op.execute({&x, &eps}, {}, {0, 1}); auto output = result->at(0); - exp.printIndexedBuffer("E"); - output->printIndexedBuffer("O"); 
ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_TRUE(exp.isSameShape(output)); @@ -6379,8 +6311,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_02) { auto result = op.execute({&x, &eps, &axes}, {}, {}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); - output->printIndexedBuffer("Result is"); - exp.printIndexedBuffer("Expect is"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -6397,7 +6327,6 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_3) { nd4j::ops::reduce_norm1_bp op; auto result = op.execute({&x, &eps}, {1.f}, {0,1}); auto output = result->at(0); -// output->printIndexedBuffer("Result is"); ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_TRUE(exp.isSameShape(output)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp index 82b3d2db7..9f98ab3a1 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp @@ -55,12 +55,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {4}, {602.2222f, 727.13885f, 993.5555f, 755.8889f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -73,12 +73,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {1,1,4}, {602.2222f, 727.13885f, 993.5555f, 
755.8889f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -91,12 +91,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {3}, {900.9375f, 969.8594f, 424.1875f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -108,13 +108,13 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test3) { TEST_F(DeclarableOpsTests8, reduceVariance_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); - auto exp = NDArrayFactory::create('c', {1,3,1}, {900.9375f, 969.8594f, 424.1875f}); - + auto exp = NDArrayFactory::create('c', {1,3,1}, {900.9375f, 969.8594f, 424.1875f}); + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -127,12 +127,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create(788.6927f); - + 
nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -145,12 +145,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(788.6927f); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -163,12 +163,12 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {788.6927f}); - + nd4j::ops::reduce_variance op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -199,12 +199,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); 
+ ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -217,12 +217,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -235,12 +235,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {3}, {30.01562f, 31.14257f, 20.59581f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -252,13 +252,13 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test3) { TEST_F(DeclarableOpsTests8, reduceStDev_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); - auto exp = NDArrayFactory::create('c', {1,3,1}, {30.01562f, 31.14257f, 20.59581f}); - + auto exp = NDArrayFactory::create('c', {1,3,1}, {30.01562f, 31.14257f, 20.59581f}); + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), 
result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -271,12 +271,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -289,12 +289,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -307,12 +307,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {28.08367f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {1.f}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -325,12 +325,12 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test8) { auto x = 
NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {26.88246f, 29.53924f, 34.52921f, 30.11755f}); - + nd4j::ops::reduce_stdev op; auto result = op.execute({&x}, {0.f,1.f}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); // output->printBuffer("Reduced STDDEV"); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -366,36 +366,36 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test1) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.45833334f, -0.375f, -0.29166666f, -0.20833333f, -0.125f, -0.041666668f, 0.041666668f, 0.125f, 0.20833333f, 0.29166666f, 0.375f, 0.45833334f}); x.linspace(1); - + nd4j::ops::reduce_variance_bp op; auto result = op.execute({&x, &gradO2}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + delete result; result = op.execute({&x, &gradO2}, {0,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; result = op.execute({&x, &gradO1}, {1,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; } 
@@ -409,36 +409,36 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test2) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-4.000000f, -8.000000f, -12.000000f, -16.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 4.000000f, 8.000000f, 12.000000f, 16.000000f}); x.linspace(1); - + nd4j::ops::reduce_variance_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {0}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } @@ -537,15 +537,15 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test1) { auto x = NDArrayFactory::create('c', {3,4}); auto gradO1 = NDArrayFactory::create('c', {1,1}, {0.5f}); auto gradO2 = NDArrayFactory::create(0.5f); - auto exp12 = NDArrayFactory::create('c', {3,4}, {-0.069337524f, -0.056730703f, -0.04412388f, 
-0.031517055f, -0.018910235f, -0.0063034114f, 0.0063034114f, 0.018910235f, 0.031517055f, 0.04412388f, 0.056730703f, 0.069337524f}); + auto exp12 = NDArrayFactory::create('c', {3,4}, {-0.069337524f, -0.056730703f, -0.04412388f, -0.031517055f, -0.018910235f, -0.0063034114f, 0.0063034114f, 0.018910235f, 0.031517055f, 0.04412388f, 0.056730703f, 0.069337524f}); auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.06638563f, -0.05431551f, -0.0422454f, -0.030175284f, -0.01810517f, -0.006035057f, 0.006035057f, 0.01810517f, 0.030175284f, 0.0422454f, 0.05431551f, 0.06638563f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); // output->printIndexedBuffer(); ASSERT_TRUE(exp12.isSameShape(output)); @@ -553,21 +553,21 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test1) { delete result; result = op.execute({&x, &gradO1}, {1,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {0,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); @@ -584,36 +584,36 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test2) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.5f, -1.0f, -1.5f, -2.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 2.0f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, 
{0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); delete result; result = op.execute({&x, &gradO1}, {1,0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {0}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } //////////////////////////////////////////////////////////////////////////////// @@ -669,44 +669,44 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test3) { auto exp34 = NDArrayFactory::create('c', {3,4}, {-0.38729835f, -0.12909944f, 0.12909944f, 0.38729835f, -0.7745967f, -0.2581989f, 0.2581989f, 0.7745967f, -1.161895f, -0.38729835f, 0.38729835f, 1.161895f}); x.linspace(1); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO2}, {0,0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); ASSERT_TRUE(exp12.equalsTo(output)); 
delete result; result = op.execute({&x, &gradO1}, {1,0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp12.isSameShape(output)); - ASSERT_TRUE(exp12.equalsTo(output)); - delete result; + ASSERT_TRUE(exp12.equalsTo(output)); + delete result; result = op.execute({&x, &gradO2}, {0,1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); - ASSERT_TRUE(exp34.isSameShape(output)); - ASSERT_TRUE(exp34.equalsTo(output)); - delete result; - - result = op.execute({&x, &gradO1}, {1,1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp34.isSameShape(output)); ASSERT_TRUE(exp34.equalsTo(output)); - delete result; + delete result; + + result = op.execute({&x, &gradO1}, {1,1}, {1}); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); + ASSERT_TRUE(exp34.isSameShape(output)); + ASSERT_TRUE(exp34.equalsTo(output)); + delete result; } //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create(120.f); //************************************// @@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { auto result = op.execute({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); //z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -722,8 +722,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { //////////////////////////////////////////////////////////////////////////////// 
TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_2) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create({15.f, 40.f, 65.f}); //************************************// @@ -731,7 +731,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_2) { auto result = op.execute({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -757,8 +757,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_03) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = NDArrayFactory::create(1307674368000.f); //************************************// @@ -766,7 +766,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { auto result = op.execute({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); //z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -774,8 +774,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_2) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.}); auto exp = 
NDArrayFactory::create({120.f, 30240.f, 360360.f}); //************************************// @@ -783,7 +783,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_2) { auto result = op.execute({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); ASSERT_TRUE(exp.equalsTo(z)); delete result; @@ -798,9 +798,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_01) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -817,10 +817,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_02) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -837,10 +837,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_3) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -857,10 +857,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_4) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); 
ASSERT_TRUE(exp.equalsTo(output)); @@ -877,10 +877,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_5) { nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -894,13 +894,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_6) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -914,13 +914,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_7) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,1}, {300.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_sum op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -937,9 +937,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_01) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); 
ASSERT_TRUE(exp.equalsTo(output)); @@ -956,10 +956,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_02) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -976,10 +976,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_3) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -996,10 +996,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_4) { nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1034,13 +1034,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_5) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - + nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1054,13 +1054,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_6) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - + 
nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1074,13 +1074,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_7) { auto x = NDArrayFactory::create('c', {2,3,2}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {479001600.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_prod op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1097,9 +1097,9 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_1) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1116,10 +1116,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_2) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1136,10 +1136,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_3) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = 
result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1156,10 +1156,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_4) { nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1194,13 +1194,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(1.f); x.linspace(1); - + nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1214,13 +1214,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(1.f); x.linspace(1); - + nd4j::ops::reduce_min op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1234,13 +1234,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {1.f}); x.linspace(1); - // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); + // x.printIndexedBuffer("Input with 
shape (2, 3, 4) is"); nd4j::ops::reduce_min op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1257,10 +1257,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_1) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); // output->printShapeInfo("Output shape"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1277,10 +1277,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_2) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1297,10 +1297,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_3) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1317,10 +1317,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_4) { nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); 
ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1355,13 +1355,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1375,13 +1375,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_max op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1395,13 +1395,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_max op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1419,7 +1419,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), 
result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1436,10 +1436,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_2) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1456,10 +1456,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_3) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1476,10 +1476,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_4) { nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1514,13 +1514,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1534,13 +1534,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_6) { auto x = 
NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(300.f); x.linspace(1); - + nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1554,13 +1554,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {300.f}); x.linspace(1); -// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_norm1 op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1578,7 +1578,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1595,10 +1595,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_2) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0, 1}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1615,10 +1615,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_3) { nd4j::ops::reduce_norm2 op; auto result = 
op.execute({&x}, {}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1635,10 +1635,10 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_4) { nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1673,13 +1673,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(70.f); x.linspace(1); - + nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1693,13 +1693,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(70.f); x.linspace(1); - + nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1713,13 +1713,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {70.f}); x.linspace(1); -// x.printIndexedBuffer("Input with 
shape (2, 3, 4) is"); +// x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::reduce_norm2 op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1738,7 +1738,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1757,7 +1757,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_2) { auto result = op.execute({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1776,7 +1776,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_3) { auto result = op.execute({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1795,7 +1795,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_4) { auto result = op.execute({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1829,13 +1829,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = 
NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1849,13 +1849,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(24.f); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {}, {0, 1, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1869,13 +1869,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); - + nd4j::ops::reduce_norm_max op; auto result = op.execute({&x}, {1.f}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1894,7 +1894,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_1) { auto result = op.execute({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1913,7 +1913,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_2) { auto result = op.execute({&x}, {1.f}, {0,1}); auto output = result->at(0); // 
output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1932,7 +1932,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_3) { auto result = op.execute({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1951,7 +1951,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_4) { auto result = op.execute({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -1985,13 +1985,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_5) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2005,13 +2005,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {}, {0, 1, 2}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ 
-2025,13 +2025,13 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_7) { auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 1, 1}, {4900.f}); x.linspace(1); - + nd4j::ops::reduce_sqnorm op; auto result = op.execute({&x}, {1.f}, {}); - auto output = result->at(0); + auto output = result->at(0); // output->printIndexedBuffer("Result is"); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2041,8 +2041,8 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_7) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create(0.5f); auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// @@ -2051,7 +2051,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { auto result = op.execute({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2060,11 +2060,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {1, 1}, {0.5f}); - auto exp = 
NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, - 0.5f, 0.5f, 0.5f, 0.5f, + auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, + 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// @@ -2072,7 +2072,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { auto result = op.execute({&input, &eps}, {1.f}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2081,11 +2081,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {4}, {1.f, 2.f, 3.f, 4.f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, - 1.f, 2.f, 3.f, 4.f, + auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, + 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); //************************************// @@ -2093,7 +2093,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { auto result = op.execute({&input, &eps}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2102,11 +2102,11 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_4) { - - auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); + + auto input = NDArrayFactory::create('c', {3, 4}, {1., 2., 
3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}); auto eps = NDArrayFactory::create('c', {1, 4}, {1.f, 2.f, 3.f, 4.f}); - auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, - 1.f, 2.f, 3.f, 4.f, + auto exp = NDArrayFactory::create('c', {3, 4}, {1.f, 2.f, 3.f, 4.f, + 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); //************************************// @@ -2114,7 +2114,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_4) { auto result = op.execute({&input, &eps}, {1.f}, {0}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2146,23 +2146,23 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_04) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_BP_1) { - - auto input = NDArrayFactory::create('c', {3, 5}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); + + auto input = NDArrayFactory::create('c', {3, 5}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); auto eps = NDArrayFactory::create(1307674368000.f); //************************************// // auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// - auto exp = NDArrayFactory::create('c', {3, 5}, {1710012166826558903812096.f, 855006083413279451906048.f, 570004067618451974258688.f, - 427503041706639725953024.f, 342002454982589992140800.f, 285002033809225987129344.f, - 244287457550765131825152.f, 213751520853319862976512.f, 190001355872817324752896.f, - 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, - 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1710012166826558903812096.f, 
855006083413279451906048.f, 570004067618451974258688.f, + 427503041706639725953024.f, 342002454982589992140800.f, 285002033809225987129344.f, + 244287457550765131825152.f, 213751520853319862976512.f, 190001355872817324752896.f, + 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, + 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); nd4j::ops::reduce_prod_bp op; auto result = op.execute({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); - auto z = result->at(0); + auto z = result->at(0); // z->printIndexedBuffer("Result is "); // z->printShapeInfo(); ASSERT_TRUE(exp.equalsTo(z)); @@ -2175,13 +2175,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test1) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {4}, {11.f, 12.f, 13.f, 14.f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2195,13 +2195,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test2) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,4}, {11.f, 12.f, 13.f, 14.f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.}, {0,1}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2215,13 +2215,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test3) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {3}, {8.5f, 12.5f, 16.5f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,2}); - auto output = 
result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2235,13 +2235,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test4) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,3,1}, {8.5f, 12.5f, 16.5f}); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.f}, {0,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2255,13 +2255,13 @@ TEST_F(DeclarableOpsTests8, reduceMean_test5) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(12.5f); x.linspace(1); - - + + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2275,12 +2275,12 @@ TEST_F(DeclarableOpsTests8, reduceMean_test6) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create(12.5f); x.linspace(1); - + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {}, {0,1,2}); - auto output = result->at(0); + auto output = result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2294,12 +2294,12 @@ TEST_F(DeclarableOpsTests8, reduceMean_test7) { auto x = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {1,1,1}, {12.5f}); x.linspace(1); - + nd4j::ops::reduce_mean op; auto result = op.execute({&x}, {1.}, {0,1,2}); - auto output = result->at(0); + auto output = 
result->at(0); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2336,11 +2336,11 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test1) { auto exp = NDArrayFactory::create('c', {3,4}, {1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24, 1./24}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); // output->printShapeInfo("o"); @@ -2350,7 +2350,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test1) { delete result; result = op.execute({&x, &gradO2}, {1}, {}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2367,18 +2367,18 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test2) { auto exp = NDArrayFactory::create('c', {3,4}, {1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f, 1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f, 1.f/3.f, 2.f/3.f, 1.f, 4.f/3.f}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2422,18 +2422,18 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test3) { auto exp = NDArrayFactory::create('c', {3,4}, {0.25f, 0.25f, 0.25f, 0.25f, 0.5f, 0.5f, 0.5f, 0.5f, 0.75f, 0.75f, 0.75f, 0.75f}); x.linspace(1); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, 
&gradO1}, {0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); - ASSERT_TRUE(exp.equalsTo(output)); + ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2444,14 +2444,14 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test3) { TEST_F(DeclarableOpsTests8, reduceStDevBP_test4) { auto x = NDArrayFactory::create('c', {3}, {2.f, 3.f, 4.f}); - auto gradO = NDArrayFactory::create(0.5f); - auto exp = NDArrayFactory::create('c', {3}, {-0.25f, 0.f, 0.25f}); - + auto gradO = NDArrayFactory::create(0.5f); + auto exp = NDArrayFactory::create('c', {3}, {-0.25f, 0.f, 0.25f}); + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO}, {0,1}, {}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2481,7 +2481,7 @@ TEST_F(DeclarableOpsTests8, avgpool2d_test13) { nd4j::ops::avgpool2d op; auto results = op.execute({&input}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 0, dataFormat}); - auto output = results->at(0); + auto output = results->at(0); ASSERT_EQ(Status::OK(), results->status()); @@ -2489,19 +2489,19 @@ TEST_F(DeclarableOpsTests8, avgpool2d_test13) { //expected.printIndexedBuffer("expected"); ASSERT_TRUE(expected.isSameShape(output)); - ASSERT_TRUE(expected.equalsTo(output)); - + ASSERT_TRUE(expected.equalsTo(output)); + delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { - + auto labels = NDArrayFactory::create('c', 
{2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {2,3}, {2.78507, 1.34254, 4.12761, 2.88507, 2.78507, 2.88507}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2509,7 +2509,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2519,11 +2519,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {3,4}, {0.26328, 1.46328, 1.72656, 0. , 0.26328, 0. , 1.46328, 0.26328, 1.72656, 0. 
, 1.72656, 1.46328}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2531,7 +2531,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2541,11 +2541,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { - + auto labels = NDArrayFactory::create('c', {2,3,4},{0,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,1,1,0,1,0}); auto logits = NDArrayFactory::create('c', {2,3,4}); auto expected = NDArrayFactory::create('c', {2,4}, {0.75125, 1.55125, 3.45375, 0.75125, 3.45375, 0. , 2.3025 , 1.15125}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2553,7 +2553,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2563,11 +2563,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test4) { - + auto labels = NDArrayFactory::create('c', {2,3},{0,1,1,0,0,1}); auto logits = NDArrayFactory::create('c', {2,3}); auto expected = NDArrayFactory::create('c', {2}, {2.10389, 1.00194}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2585,11 +2585,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test4) { /////////////////////////////////////////////////////////////////// 
TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test5) { - + auto labels = NDArrayFactory::create('c', {2,3},{0,1,1,0,0,1}); auto logits = NDArrayFactory::create('c', {2,3}); auto expected = NDArrayFactory::create('c', {3}, {0., 0.85436, 1.40871}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2607,11 +2607,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test6) { - + auto labels = NDArrayFactory::create('c', {2,1}, {0,1}); auto logits = NDArrayFactory::create('c', {2,1}); auto expected = NDArrayFactory::create('c', {1}, {0.6444}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2629,11 +2629,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test7) { - + auto labels = NDArrayFactory::create('c', {2,1}, {0,1}); auto logits = NDArrayFactory::create('c', {2,1}); auto expected = NDArrayFactory::create('c', {2}, {0., 0.}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2651,11 +2651,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test7) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { - + auto labels = NDArrayFactory::create('c', {2}, {0,1}); auto logits = NDArrayFactory::create('c', {2}); auto expected = NDArrayFactory::create(0.6444); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2663,7 +2663,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { ASSERT_EQ(Status::OK(), results->status()); - auto *output = 
results->at(0); + auto *output = results->at(0); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); @@ -2673,11 +2673,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test9) { - + auto labels = NDArrayFactory::create('c', {1}, {0.}); auto logits = NDArrayFactory::create('c', {1}, {0.2}); auto expected = NDArrayFactory::create(0.); - + nd4j::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.execute({&logits, &labels}, {}, {}); @@ -2693,11 +2693,11 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) { - + auto labels = NDArrayFactory::create('c', {1,2}, {0,1}); auto logits = NDArrayFactory::create('c', {1,2}); auto expected = NDArrayFactory::create('c', {2}, {0., 0.}); - + logits.linspace(0.1, 0.1); nd4j::ops::softmax_cross_entropy_loss_with_logits op; @@ -2715,14 +2715,14 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test4) { - + auto x = NDArrayFactory::create('c', {3, 5}, {0.7044955, 0.55606544, 0.15833677, 0.001874401, 0.61595726, 0.3924779, 0.7414847, 0.4127324, 0.24026828, 0.26093036, 0.46741188, 0.01863421, 0.08528871, 0.529365, 0.5510694}); - auto exp = NDArrayFactory::create('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105}); + auto exp = NDArrayFactory::create('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 
0.317105}); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {1.f}, {}, {}, false, nd4j::DataType::DOUBLE); auto output = result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2731,16 +2731,18 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test4) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test5) { - + + // auto x = NDArrayFactory::create('c', {3, 5}, {1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5}); auto x = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 2.89271, 3.50524, 4.00892, 6., 7., 7.71389, 7.88678, 8.01784, 11., 12., 12.53507, 12.26833, 12.02676}); + // auto exp = NDArrayFactory::create('c', {3, 5}, {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}); x.linspace(1); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {15.f}, {0}, {}, false, nd4j::DataType::DOUBLE); auto output = result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -2749,25 +2751,25 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test5) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test6) { - + auto x = NDArrayFactory::create('c', {3, 5}); - auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484}); + auto exp = NDArrayFactory::create('c', {3, 5}, {1., 2., 3., 4., 5., 4.95434, 5.78006, 6.60578, 7.43151, 8.25723, 5.64288, 6.15587, 6.66886, 7.18185, 7.69484}); x.linspace(1); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {15.f}, {1}, {}, false, nd4j::DataType::DOUBLE); auto output = result->at(0); - + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); 
delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test7) { - + auto x = NDArrayFactory::create('c', {3, 5}); auto exp = NDArrayFactory::create('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957}); @@ -2782,10 +2784,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test7) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test8) { - + auto x = NDArrayFactory::create('c', {3, 5}); auto exp = NDArrayFactory::create('c', {3, 5}, {0.42597, 0.85194, 1.27791, 1.70389, 2.12986, 2.55583, 2.9818 , 3.40777, 3.83374, 4.25971, 4.68569, 5.11166, 5.53763, 5.9636 , 6.38957}); @@ -2800,12 +2802,12 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test8) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test9) { - + auto x = NDArrayFactory::create('c', {2}, {3., 4.}); - auto exp = NDArrayFactory::create('c', {2}, {2.4, 3.2}); + auto exp = NDArrayFactory::create('c', {2}, {2.4, 3.2}); nd4j::ops::clipbynorm op; auto result = op.execute({&x}, {4.}, {}, {}, false, nd4j::DataType::DOUBLE); @@ -2816,10 +2818,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test9) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test10) { - + auto x = NDArrayFactory::create(6.); auto exp = NDArrayFactory::create(5.); @@ -2832,10 +2834,10 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test10) { delete result; } - + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests8, clipbynorm_test11) { - + auto x = NDArrayFactory::create('c', {2, 3, 4}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {1., 2., 3., 4., 4.44787, 
5.33745, 6.22702, 7.1166 , 6.33046, 7.03384, 7.73723, 8.44061, 13., 14., 15., 16., 15.12277, 16.01235, 16.90192, 17.7915 ,14.77107, 15.47446, 16.17784, 16.88123}); @@ -2872,19 +2874,19 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test4) { auto gradO1 = NDArrayFactory::create('c', {4}, {1., 2., 3., 4.}); auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333}); - + nd4j::ops::reduce_mean_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2898,19 +2900,19 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test5) { auto gradO1 = NDArrayFactory::create('c', {3}, {1., 2., 3.}); auto gradO2 = NDArrayFactory::create('c', {3, 1}, {1., 2., 3.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.2500,0.2500,0.2500,0.2500, 0.5000,0.5000,0.5000,0.5000, 0.7500,0.7500,0.7500,0.7500}); - + nd4j::ops::reduce_mean_bp op; - + auto result = op.execute({&x, &gradO1}, {0}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {1}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); 
ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2924,19 +2926,19 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test5) { auto gradO1 = NDArrayFactory::create('c', {4}, {1., 2., 3., 4.}); auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {-0.408248, -0.816497, -1.224745, -1.632993, 0.000000, 0.000000, 0.000000, 0.000000, 0.408248, 0.816497, 1.224745, 1.632993}); - + nd4j::ops::reduce_stdev_bp op; auto result = op.execute({&x, &gradO1}, {0}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - auto output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + auto output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; result = op.execute({&x, &gradO2}, {1}, {0}); - ASSERT_EQ(Status::OK(), result->status()); - output = result->at(0); + ASSERT_EQ(Status::OK(), result->status()); + output = result->at(0); ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); delete result; @@ -2948,12 +2950,12 @@ TEST_F(DeclarableOpsTests8, zeros_as_test1) { auto x = NDArrayFactory::create(10.f); auto y = NDArrayFactory::create(100.f); auto exp = NDArrayFactory::create(0.f); - + nd4j::ops::zeros_as op; Nd4jStatus status = op.execute({&x}, {&y}, {}, {}, {}); - ASSERT_EQ(Status::OK(), status); - + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(y.isSameShape(exp)); ASSERT_TRUE(y.equalsTo(exp)); @@ -2987,11 +2989,11 @@ TEST_F(DeclarableOpsTests8, ones_as_test1) { nd4j::ops::ones_as op; Nd4jStatus status = op.execute({&x}, {&y}, {}, {}, {}, false, nd4j::DataType::DOUBLE); - ASSERT_EQ(Status::OK(), status); - + ASSERT_EQ(Status::OK(), status); + ASSERT_TRUE(y.isSameShape(exp)); ASSERT_TRUE(y.equalsTo(exp)); - + } //////////////////////////////////////////////////////////////////////////////// @@ -3017,7 +3019,7 @@ TEST_F(DeclarableOpsTests8, NormalizeMoments_SGO_1) { auto data = 
NDArrayFactory::create('c', {10, 10}); data.linspace(1); - + auto means = data.reduceAlongDimension(reduce::Sum, {0}); auto deviance = NDArrayFactory::create('c', {10}, {825., 825. , 825., 825., 825., 825., 825., 825., 825., 825. }); // data.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); // = NDArrayFactory::create('c', {10, 10}); @@ -3040,24 +3042,24 @@ TEST_F(DeclarableOpsTests8, NormalizeMoments_SGO_1) { ASSERT_EQ(Status::OK(), results->status()); ASSERT_EQ(results->size(), 2); - auto outputMeans = results->at(0); - auto outputDeviance = results->at(1); + auto outputMeans = results->at(0); + auto outputDeviance = results->at(1); // outputMeans->printIndexedBuffer("Means"); // outputDeviance->printIndexedBuffer("Variance"); // deviance.printIndexedBuffer("Expected"); // means->printIndexedBuffer("Expected means"); ASSERT_TRUE(means->isSameShape(outputMeans)); - ASSERT_TRUE(means->equalsTo(outputMeans)); + ASSERT_TRUE(means->equalsTo(outputMeans)); ASSERT_TRUE(deviance.isSameShape(outputDeviance)); ASSERT_TRUE(deviance.equalsTo(outputDeviance)); delete means; //delete deviance; delete ssSquared; // ASSERT_TRUE(expMeans.isSameShape(outputMeans)); -// ASSERT_TRUE(expMeans.equalsTo(outputMeans)); +// ASSERT_TRUE(expMeans.equalsTo(outputMeans)); // ASSERT_TRUE(expMeans.isSameShape(outputDeviance)); -// ASSERT_TRUE(expDeviance.equalsTo(outputDeviance)); +// ASSERT_TRUE(expDeviance.equalsTo(outputDeviance)); delete results; } @@ -3073,10 +3075,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_1) { nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0, 1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3103,10 +3105,10 @@ TEST_F(DeclarableOpsTests8, 
Test_Moments_2) { nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0, 1}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3132,10 +3134,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_3) { nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0, 2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3161,10 +3163,10 @@ TEST_F(DeclarableOpsTests8, Test_Moments_4) { nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0, 2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3187,13 +3189,13 @@ TEST_F(DeclarableOpsTests8, Test_Moments_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); x.linspace(1); - + nd4j::ops::moments op; auto result = op.execute({&x}, {}, {0,1,2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3216,13 +3218,13 @@ TEST_F(DeclarableOpsTests8, Test_Moments_7) { auto expVariance 
= NDArrayFactory::create('c', {1,1,1}, {47.916668f}); x.linspace(1); - // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); + // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); nd4j::ops::moments op; auto result = op.execute({&x}, {1.}, {0,1,2}); - ASSERT_EQ(Status::OK(), result->status()); + ASSERT_EQ(Status::OK(), result->status()); - auto outputMeans = result->at(0); - auto outputVariance = result->at(1); + auto outputMeans = result->at(0); + auto outputVariance = result->at(1); // outputMeans->printIndexedBuffer("Means"); // outputVariance->printIndexedBuffer("Variance"); @@ -3319,13 +3321,13 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_1) { nd4j::ops::lrn op; auto results = op.execute({&x}, {1.0, 1.0, 0.5}, {2}, {}, false, nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(exp.isSameShape(out)); // out->printIndexedBuffer("LRN out"); // exp.printIndexedBuffer("LRN exp"); - ASSERT_TRUE(exp.equalsTo(out)); - + ASSERT_TRUE(exp.equalsTo(out)); + delete results; } @@ -3334,75 +3336,75 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_2) { auto x = NDArrayFactory::create('c', {3, 3, 5, 5}); x.linspace(1); - + auto exp = NDArrayFactory::create('c', {3, 3, 5, 5}, { - 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, - 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, - 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, - 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, - 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, + 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, + 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, + 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, + 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, + 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, - 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, - 0.55903524f, 0.4919585f, 0.44676256f, 
0.5071239f, 0.59407425f, - 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, - 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, - 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, + 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, + 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, + 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, + 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, + 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, - 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, + 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, 0.567134f, 0.49553978f, 0.4470674f, 0.504163f, 0.5870515f, - 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, - 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, - 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, + 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, + 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, + 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, - 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, - 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, - 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, + 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, + 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, + 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, 0.571041f, 0.4972537f, 0.44715673f, 0.50263065f, 0.58345926f, - 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, + 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, - 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, - 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, - 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, - 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, - 0.57259864f, 0.49793428f, 
0.44718108f, 0.5019997f, 0.5819874f, + 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, + 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, + 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, + 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, + 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, - 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, + 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, 0.57296f, 0.49809194f, 0.44718578f, 0.5018515f, 0.5816426f, - 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, - 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, + 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, + 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, 0.57340944f, 0.49828786f, 0.44719115f, 0.5016664f, 0.581212f, - 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, - 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, + 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, + 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, 0.5737754f, 0.4984474f, 0.44719502f, 0.501515f, 0.58085984f, - 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, - 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, + 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, + 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, - 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, - 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, - 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, - 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, - 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, + 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, + 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, + 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, + 0.5743354f, 0.4986912f, 0.44720036f, 
0.5012819f, 0.5803186f, + 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, - 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, - 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, - 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, - 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, + 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, + 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, + 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, + 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // nd4j::ops::lrn op; auto results = op.execute({&x}, {1.0, 1.0, 0.5}, {2}, {}, false, nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); // out->printIndexedBuffer("LRN out"); // exp.printIndexedBuffer("LRN exp"); - ASSERT_TRUE(exp.equalsTo(out)); - + ASSERT_TRUE(exp.equalsTo(out)); + delete results; } @@ -3413,60 +3415,60 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_3) { x.linspace(1); auto exp = NDArrayFactory::create('c', {3, 3, 5, 5}, { - 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, - 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, - 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, - 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, - 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, + 0.2581989f, 0.3592106f, 0.40089184f, 0.53935987f, 0.70014f, + 0.4898979f, 0.46056613f, 0.43971977f, 0.5240002f, 0.6375767f, + 0.5274096f, 0.47771242f, 0.4443308f, 0.5163977f, 0.61701745f, + 0.5424508f, 0.48452914f, 0.44570294f, 0.5123918f, 0.6068971f, + 0.5505386f, 0.4881662f, 0.4462865f, 0.5099462f, 0.60088515f, - 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, - 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, - 0.5615412f, 
0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, - 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, - 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, + 0.5555859f, 0.49042296f, 0.44658744f, 0.5083028f, 0.59690416f, + 0.55903524f, 0.4919585f, 0.44676256f, 0.5071239f, 0.59407425f, + 0.5615412f, 0.49307042f, 0.44687328f, 0.50623745f, 0.5919596f, + 0.56344414f, 0.49391258f, 0.4469477f, 0.5055468f, 0.59031945f, + 0.56493837f, 0.49457246f, 0.4470002f, 0.5049936f, 0.5890103f, - 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, + 0.56614274f, 0.49510333f, 0.44703856f, 0.50454074f, 0.5879411f, 0.567134f, 0.49553978f, 0.4470674f, 0.504163f, 0.5870515f, - 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, - 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, - 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, + 0.5679643f, 0.4959048f, 0.44708967f, 0.5038433f, 0.5862998f, + 0.56866974f, 0.4962146f, 0.44710726f, 0.5035692f, 0.58565617f, + 0.56927663f, 0.49648085f, 0.4471213f, 0.5033315f, 0.5850988f, - 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, - 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, - 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, + 0.56980413f, 0.49671215f, 0.44713274f, 0.50312346f, 0.58461165f, + 0.57026696f, 0.49691492f, 0.4471422f, 0.50293994f, 0.58418214f, + 0.5706764f, 0.49709415f, 0.44715008f, 0.5027767f, 0.5838005f, 0.571041f, 0.4972537f, 0.44715673f, 0.50263065f, 0.58345926f, - 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, + 0.57136786f, 0.49739665f, 0.44716236f, 0.5024992f, 0.58315235f, - 0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, - 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, - 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, - 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, - 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, + 
0.5716625f, 0.49752548f, 0.4471672f, 0.5023803f, 0.5828747f, + 0.5719295f, 0.49764213f, 0.44717142f, 0.5022721f, 0.5826225f, + 0.57217246f, 0.49774826f, 0.44717506f, 0.5021734f, 0.58239233f, + 0.5723947f, 0.4978453f, 0.44717824f, 0.5020829f, 0.58218133f, + 0.57259864f, 0.49793428f, 0.44718108f, 0.5019997f, 0.5819874f, - 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, + 0.5727864f, 0.49801624f, 0.44718358f, 0.5019227f, 0.5818083f, 0.57296f, 0.49809194f, 0.44718578f, 0.5018515f, 0.5816426f, - 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, - 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, + 0.5731208f, 0.49816203f, 0.44718775f, 0.5017854f, 0.58148885f, + 0.57327026f, 0.49822718f, 0.4471895f, 0.5017239f, 0.5813457f, 0.57340944f, 0.49828786f, 0.44719115f, 0.5016664f, 0.581212f, - 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, - 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, + 0.57353944f, 0.4983446f, 0.44719255f, 0.50161266f, 0.58108705f, + 0.5736612f, 0.49839762f, 0.4471939f, 0.50156236f, 0.5809699f, 0.5737754f, 0.4984474f, 0.44719502f, 0.501515f, 0.58085984f, - 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, - 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, + 0.5738828f, 0.49849418f, 0.4471962f, 0.50147045f, 0.5807564f, + 0.5739839f, 0.49853817f, 0.44719717f, 0.5014284f, 0.5806588f, - 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, - 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, - 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, - 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, - 0.57441217f, 0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, + 0.5740793f, 0.49857965f, 0.4471981f, 0.5013887f, 0.5805666f, + 0.5741694f, 0.49861887f, 0.44719887f, 0.50135124f, 0.58047944f, + 0.57425463f, 0.49865603f, 0.44719967f, 0.5013157f, 0.5803969f, + 0.5743354f, 0.4986912f, 0.44720036f, 0.5012819f, 0.5803186f, + 0.57441217f, 
0.49872455f, 0.44720104f, 0.5012499f, 0.58024424f, - 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, - 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, - 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, - 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, + 0.57448506f, 0.4987563f, 0.44720164f, 0.5012194f, 0.58017343f, + 0.57455444f, 0.4987865f, 0.4472022f, 0.5011904f, 0.5801061f, + 0.57462054f, 0.49881527f, 0.44720277f, 0.5011627f, 0.5800419f, + 0.57468355f, 0.49884263f, 0.44720328f, 0.50113624f, 0.5799805f, 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // @@ -3526,13 +3528,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_4_119) { auto ttlTime = std::chrono::duration_cast ((timeEnd - timeStart)).count(); - //ASSERT_EQ(Status::OK(), results); - - nd4j_printf("avg time: %lld ms\n", spanTime); - // ASSERT_TRUE(exp.isSameShape(out)); -// out->printIndexedBuffer("LRN out"); -// exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); } @@ -3548,8 +3544,6 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_5) { ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); -// out->printIndexedBuffer("LRN out"); -// exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); delete results; @@ -3626,13 +3620,13 @@ auto exp = NDArrayFactory::create('c', {3,3,5,5}, { nd4j::ops::lrn_bp op; auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, false, typeid(TypeParam) == typeid(float) ? 
nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); // ASSERT_TRUE(exp.isSameShape(out)); // out->printBuffer("LRN BP out"); // exp.printBuffer("LRN BP exp"); //ASSERT_TRUE(exp.equalsTo(out)); - + delete results; } @@ -3641,7 +3635,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_2) { auto x = NDArrayFactory::create( 'c', {3, 3, 5, 5}); x.linspace(1); - + auto eps = NDArrayFactory::create('c', {3, 3, 5, 5}, { 0.2581989 ,0.3592106 , 0.40089184, 0.53935987, 0.70014, 0.4898979 ,0.46056613, 0.43971977, 0.5240002 , 0.6375767, 0.5274096 ,0.47771242, 0.4443308 , 0.5163977 , 0.61701745, @@ -3706,13 +3700,13 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_2) { nd4j::ops::lrn_bp op; auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, false, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); auto out = results->at(0); - + ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(exp.isSameShape(out)); //out->printBuffer("LRN BP out"); // exp.printIndexedBuffer("LRN exp"); // ASSERT_TRUE(exp.equalsTo(out)); - + delete results; } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 654d4bf2c..f88d6e930 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -924,8 +924,6 @@ TEST_F(DeclarableOpsTests9, tile_test1) { auto reps = NDArrayFactory::create('c', {1, 2}, {2, 1}); auto expOut = NDArrayFactory::create('c', {2, 6,}, {1.,2.,3.,4.,5.,6., 1.,2.,3.,4.,5.,6.}); - expOut.printIndexedBuffer("expOut"); - nd4j::ops::tile op; auto results = op.execute({&input, &reps}, {}, {}); auto out = results->at(0); @@ -1660,8 +1658,6 @@ TEST_F(DeclarableOpsTests9, test_range_int_1) { auto z = result->at(0); - z->printIndexedBuffer("z"); - delete result; } @@ -3343,8 +3339,8 @@ TEST_F(DeclarableOpsTests9, 
Cholesky_Test_3) { auto result = op.execute({&x}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto res = result->at(0); -// res->printIndexedBuffer("Output for Cholesky 3"); - ASSERT_TRUE(exp.equalsTo(res)); + // res->printIndexedBuffer("Output for Cholesky 3"); + ASSERT_TRUE(exp.equalsTo(res, 1e-4)); delete result; } diff --git a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp index baba901bf..8ae123260 100644 --- a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp @@ -121,7 +121,6 @@ TEST_F(EmptyTests, Test_Concat_3) { auto z = result->at(0); - z->printIndexedBuffer("z"); ASSERT_EQ(exp, *z); delete result; @@ -141,7 +140,6 @@ TEST_F(EmptyTests, Test_Concat_4) { auto z = result->at(0); - z->printIndexedBuffer("z"); ASSERT_EQ(exp, *z); delete result; @@ -282,7 +280,6 @@ TEST_F(EmptyTests, test_shaped_empty_3) { TEST_F(EmptyTests, test_shaped_empty_4) { auto shape = ConstantShapeHelper::getInstance()->vectorShapeInfo(0, nd4j::DataType::FLOAT32); - shape::printShapeInfoLinear("shape", shape); NDArray array(shape, true, nd4j::LaunchContext::defaultContext()); std::vector shapeOf({0}); diff --git a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp index 2ed43d08a..1dc2c8e48 100644 --- a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -46,14 +47,14 @@ public: #ifndef __CUDABLAS__ TEST_F(HelpersTests1, test_binary_search_1) { - std::array array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; auto idx = nd4j::ops::helpers::binarySearch(array.data(), 2, 10); ASSERT_EQ(2, idx); } TEST_F(HelpersTests1, test_binary_search_2) { - std::array array({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; auto idx = 
nd4j::ops::helpers::binarySearch(array.data(), 18, 10); ASSERT_EQ(-1, idx); diff --git a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp index 8097aab33..96c480fd9 100644 --- a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp @@ -58,7 +58,6 @@ TEST_F(IndexingTests, StridedSlice_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - z->printIndexedBuffer("Output"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -379,8 +378,6 @@ TEST_F(IndexingTests, Test_StridedSlice_1) { auto z = result->at(0); - z->printIndexedBuffer("Z"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -424,8 +421,6 @@ TEST_F(IndexingTests, Test_StridedSlice_3) { auto z = result->at(0); - z->printIndexedBuffer("Z"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu index 294e03c12..f442c0bb9 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu @@ -50,7 +50,6 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_1) { context.setOutputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); - nd4j_printf("Starting execution...\n",""); PointersManager pm(LaunchContext::defaultContext(), "test_DeclarableOp_execution_1"); execCustomOp2(nullptr, op.getOpHash(), &context); @@ -78,7 +77,6 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_2) { context.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j_printf("Starting execution...\n",""); PointersManager pm(LaunchContext::defaultContext(), "test_DeclarableOp_execution_2"); execCustomOp2(nullptr, op.getOpHash(), &context); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp 
b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp index 21af8e380..aa75ea1ab 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp @@ -426,6 +426,24 @@ TEST_F(JavaInteropTests, Test_FastPath_Validation_2) { ASSERT_NE(Status::OK(), status); } +TEST_F(JavaInteropTests, Test_empty_cast_1) { + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto z = NDArrayFactory::create('c', {1, 0, 2}); + auto e = NDArrayFactory::create('c', {1, 0, 2}); + + Nd4jLong iArgs[] = {10}; + + Context ctx(1); + ctx.setInputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); + ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); + ctx.setIArguments(iArgs, 1); + + nd4j::ops::cast op; + auto result = op.execute(&ctx); + ASSERT_EQ(Status::OK(), result); + ASSERT_EQ(e, z); +} + /* TEST_F(JavaInteropTests, test_avgpooling_edge_1) { int inOutH = 35; @@ -1183,7 +1201,9 @@ TEST_F(JavaInteropTests, test_bfloat16_rng) { RandomGenerator rng(119, 323841120L); bfloat16 args[2] = {(bfloat16) 0.0f, (bfloat16) 1.0f}; execRandom(nullptr, nd4j::random::Ops::UniformDistribution, &rng, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), args); - z.printIndexedBuffer("z"); + + //z.printIndexedBuffer("z"); + ASSERT_TRUE(z.sumNumber().e(0) > 0); } @@ -1192,7 +1212,7 @@ TEST_F(JavaInteropTests, test_ismax_view) { auto v = original.subarray({NDIndex::all(), NDIndex::all(), NDIndex::interval(0, 40, 2)}); v->assign(1.0); - auto e = v->ulike(); + auto e = v->like(); auto t = e.tensorAlongDimension(0, {0, 1}); t->assign(1.0); @@ -1208,7 +1228,6 @@ TEST_F(JavaInteropTests, test_ismax_view) { nd4j::ops::ismax op; op.execute(&ctx); - z.printIndexedBuffer("z"); ASSERT_EQ(e, z); delete v; diff --git a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu index c1dc1acfe..30244b7dc 100644 --- 
a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu @@ -68,8 +68,6 @@ TEST_F(LambdaTests, test_basic_1) { ASSERT_EQ(0, res); ASSERT_EQ(e, x); - - x.printIndexedBuffer("x"); } void test(NDArray &x) { @@ -127,7 +125,6 @@ TEST_F(LambdaTests, test_basic_2) { test(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -137,7 +134,6 @@ TEST_F(LambdaTests, test_basic_3) { test(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -147,7 +143,6 @@ TEST_F(LambdaTests, test_basic_4) { test2(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -158,7 +153,6 @@ TEST_F(LambdaTests, test_basic_5) { testPairwise(x, y); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -168,7 +162,6 @@ TEST_F(LambdaTests, test_basic_6) { testIndexed(x); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } @@ -180,7 +173,6 @@ TEST_F(LambdaTests, test_basic_7) { testTriplewise(w, x, y); - w.printIndexedBuffer("w"); ASSERT_EQ(e, w); } @@ -191,7 +183,6 @@ TEST_F(LambdaTests, test_basic_8) { testIndexedPairwise(x, y); - x.printIndexedBuffer("x"); ASSERT_EQ(e, x); } diff --git a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp index 5308ee99d..f48ee54f6 100644 --- a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp @@ -442,11 +442,11 @@ TEST_F(LegacyOpsTests, reduce3_1) { //int *tadShapeBuffer = shape::computeResultShape(shapeBuffer,dimension,dimensionLength); auto tadShapeBuffer = nd4j::ShapeUtils::evalReduceShapeInfo('c', dim, shapeBuffer, false, true, nullptr); - functions::reduce3::Reduce3::exec(opNum, x, xShapeBuffer, extraVals, y, shapeBuffer, result, tadShapeBuffer, dimension, dimensionLength); + functions::reduce3::Reduce3::exec(opNum, x, xShapeBuffer, extraVals, y, shapeBuffer, result, tadShapeBuffer, dimension, dimensionLength, 0, 4); float distancesAssertion[4] = {0.0,8.0,16.0,24.0}; for(int i = 0; i < 4; i++) - 
ASSERT_EQ(distancesAssertion[i],result[i]); + ASSERT_NEAR(distancesAssertion[i],result[i], 1e-5); delete[] shapeBuffer; delete[] xShapeBuffer; @@ -726,6 +726,26 @@ TEST_F(LegacyOpsTests, test_legacy_reduce_empty_3) { ASSERT_EQ(e, z); } +TEST_F(LegacyOpsTests, test_legacy_reduce_empty_4) { + if (!Environment::getInstance()->isCPU()) + return; + int a = 0; + + auto x = NDArrayFactory::create('c', {1, 0, 2}); + auto d = NDArrayFactory::create('c', {1}, {a}); + auto z = NDArrayFactory::create('c', {0, 2}); + auto e = NDArrayFactory::create('c', {0, 2}); + + + + ::execReduceSame2(nullptr, reduce::SameOps::Sum, + x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo(), + nullptr, + z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), + d.buffer(), d.shapeInfo(), d.specialBuffer(), d.specialShapeInfo()); + +} + TEST_F(LegacyOpsTests, test_legacy_transform_float_1) { auto x = NDArrayFactory::create('c', {1, 0, 4}); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu index 4ab884d28..71ad6929b 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu +++ b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu @@ -152,7 +152,6 @@ TEST_F(NDArrayCudaBasicsTests, Test_Cosine_1) { //ASSERT_TRUE(y->isActualOnDeviceSide()); //ASSERT_TRUE(y->isActualOnHostSide()); //y->syncToHost(); - y->printBuffer("Cosine"); delete x; delete y; } @@ -251,9 +250,6 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_3) { cudaMemcpy(z.buffer(), z.specialBuffer(), z.lengthOf() * z.sizeOfT(), cudaMemcpyDeviceToHost); res = cudaStreamSynchronize(*stream); ASSERT_EQ(0, res); - x.printBuffer("3X = "); - y.printBuffer("3Y = "); - z.printBuffer("3Result out"); // // cudaFree(devBufferPtrX); @@ -347,11 +343,7 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_6) { x += y; //x.applyPairwiseTransform(pairwise::Add, &y, &z, nullptr); x.syncToHost(); - x.printBuffer("6X = "); - //y.printBuffer("3Y = "); - 
//z.printBuffer("3Result out"); - // // cudaFree(devBufferPtrX); //cudaFree(devBufferPtrZ); //cudaFree(devShapePtrX); @@ -381,11 +373,7 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_7) { x += 2.; //x.applyPairwiseTransform(pairwise::Add, &y, &z, nullptr); x.syncToHost(); - x.printBuffer("7X = "); - //y.printBuffer("3Y = "); - //z.printBuffer("3Result out"); - // // cudaFree(devBufferPtrX); //cudaFree(devBufferPtrZ); //cudaFree(devShapePtrX); @@ -445,9 +433,6 @@ TEST_F(NDArrayCudaBasicsTests, TestMultiply_2) { //res = cudaMalloc(reinterpret_cast(&devShapePtrX), shape::shapeInfoByteLength(x.shapeInfo())); //ASSERT_EQ(0, res); x.applyPairwiseTransform(pairwise::Multiply, &y, &z, nullptr); - x.printBuffer("3X = "); - y.printBuffer("3Y = "); - z.printBuffer("3Result out"); // // cudaFree(devBufferPtrX); @@ -744,8 +729,7 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_2) { cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); z.tickWriteDevice(); - z.printBuffer("Result with Broadcast2 (multiply)"); - exp.printBuffer("Expect with Broadcast2 (multiply)"); + // verify results for (int e = 0; e < z.lengthOf(); e++) ASSERT_NEAR(exp.e(e), z.e(e), 1e-5); @@ -811,7 +795,6 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_3) { //cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); //z.syncToHost(); - z.printBuffer("Result with Broadcast3 (multiply)"); // verify results for (int e = 0; e < z.lengthOf(); e++) ASSERT_NEAR(exp.e(e), z.e(e), 1e-5); @@ -842,11 +825,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_1) { //res = cudaMalloc(reinterpret_cast(&devShapePtrX), shape::shapeInfoByteLength(x.shapeInfo())); //ASSERT_EQ(0, res); //x.applyPairwiseTransform(pairwise::Multiply, &y, &z, nullptr); - //x.printBuffer("23X = "); - //y.printBuffer("23Y = "); x *= y; //x.syncToHost(); - x.printBuffer("54Result out"); // // cudaFree(devBufferPtrX); @@ -995,7 +975,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastRaw_1) { // allocate required amount of 
global device memory and copy host data to it //cudaResult = allocateDeviceMem(*pLc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); for(size_t i = 0; i < devicePtrs.size(); ++i) { - nd4j_printf("Allocation of %i bytes with device\n", hostData[i].second) cudaResult = cudaMalloc(&devicePtrs[i], hostData[i].second); //if(cudaResult != 0) return cudaResult; ASSERT_EQ(cudaResult, 0); cudaMemcpy(devicePtrs[i], hostData[i].first, hostData[i].second, cudaMemcpyHostToDevice); @@ -1047,7 +1026,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply) { //x.printBuffer("23X = "); //y.printBuffer("23Y = "); x *= y; - x.printBuffer("55Result out"); // // cudaFree(devBufferPtrX); @@ -1082,7 +1060,6 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_2) { //y.printBuffer("23Y = "); //void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray* other, NDArray* target, const bool checkTargetShape, ExtraArguments *extraArgs) x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), &y, &exp); - exp.printBuffer("56Result out"); // // cudaFree(devBufferPtrX); @@ -1111,8 +1088,6 @@ TEST_F(NDArrayCudaBasicsTests, TestReduceSum_1) { ASSERT_EQ(0, res); y.syncToHost(); - x.printBuffer("X = "); - y.printBuffer("Y = "); ASSERT_NEAR(y.e(0), 15, 1e-5); } @@ -1120,7 +1095,6 @@ TEST_F(NDArrayCudaBasicsTests, TestReduceSum_1) { TEST_F(NDArrayCudaBasicsTests, TestDup1) { NDArray array('c', {2,3}, {1,2,3,4,5,6}); - array.printBuffer("Array at start"); auto arrC = array.dup('c'); auto arrF = array.dup('f'); // arrC->printBuffer("arrC"); @@ -1498,22 +1472,18 @@ TEST_F(NDArrayCudaBasicsTests, EqualityTest1) { arrayA->p(i, k, (float) i); } } - arrayA->printBuffer("arrayA is "); + for (int i = 0; i < arrayB->rows(); i++) { for (int k = 0; k < arrayB->columns(); k++) { arrayB->p(i, k, (float) i); } } - arrayB->printBuffer("arrayB is "); for (int i = 0; i < arrayC->rows(); i++) { for (int k = 0; k < arrayC->columns(); k++) { arrayC->p(i, k, (float) i+1); } } - arrayC->printBuffer("arrayC 
is "); - - ASSERT_TRUE(arrayA->equalsTo(arrayB, 1e-5)); @@ -1920,8 +1890,6 @@ TEST_F(NDArrayCudaBasicsTests, Tile_Test_2_2) auto y = x.tile({1,2,1}); auto exp = NDArrayFactory::create('f', {2, 2, 2}); exp = 10.; - y.printShapeInfo("Output SHAPE"); - y.printBuffer("Output TILE"); ASSERT_TRUE(exp.equalsTo(y)); } @@ -1945,17 +1913,13 @@ TEST_F(NDArrayCudaBasicsTests, Operator_Plus_Test_2) { double expBuff[] = {2., 3, 3., 4., 4., 5, 5., 6., 6., 7, 7., 8.}; NDArray a('c', {4,4}, {1.,2,3,4,5,6,7,8,9,2,3,2,1,0,4,7.}, nd4j::DataType::FLOAT32); - a.printBuffer(); auto x = NDArrayFactory::create('c', {3, 2, 1}); auto y = NDArrayFactory::create('c', {1, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {3, 2, 2}); x.linspace(1); y.linspace(1); - x.printBuffer("X="); - y.printBuffer("Y="); auto result = x + y; - result.printIndexedBuffer("Result"); ASSERT_TRUE(expected.isSameShape(&result)); ASSERT_TRUE(expected.equalsTo(&result)); @@ -2133,7 +2097,7 @@ TEST_F(NDArrayCudaBasicsTests, Test_diagonal_1) { for (Nd4jLong e = 0; e < exp.lengthOf(); ++e) { printf("VAL[%ld] = %f\n", e, diag->e(e)); //, exp.e(e), 1.e-5); } - diag->printIndexedBuffer("DIAGONAL"); + for (Nd4jLong e = 0; e < exp.lengthOf(); ++e) { ASSERT_NEAR(diag->e(e), exp.e(e), 1.e-5); } diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp index 75608f2bc..747ecc183 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp @@ -174,8 +174,6 @@ TEST_F(NDArrayTest, EqualityTest1) { arrayC->p(i, k, (float) i+1); } } - arrayB->printBuffer("B ="); - arrayC->printBuffer("C ="); //nd4j_printf("A B\n",""); ASSERT_TRUE(arrayA->equalsTo(arrayB, 1e-5)); @@ -1699,7 +1697,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension2) { NDArray exp(expBuff, expShapeInfo); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {1}); - result->printIndexedBuffer("VARIANCE2"); 
ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); @@ -1714,7 +1711,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension3) { x.linspace(1); // 1, 2, 3, ..., 100 exp.assign(825.f); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); - result->printIndexedBuffer("VARIANCE3"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); @@ -1729,7 +1725,6 @@ TEST_F(NDArrayTest, TestVarianceAlongDimension4) { x.linspace(1); // 1, 2, 3, ..., 100 exp.assign(1716.); auto result = x.varianceAlongDimension(variance::SummaryStatsVariance, false, {0}); - result->printIndexedBuffer("VARIANCE4"); ASSERT_TRUE(exp.isSameShapeStrict(result)); ASSERT_TRUE(exp.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index 9f9937368..a497cd9e6 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -184,7 +184,6 @@ TEST_F(NDArrayTest2, SetIdentity_test_8) { auto x = NDArrayFactory::create('c', {3, 3, 3}); auto xExp = NDArrayFactory::create('c', {3, 3, 3}, {1.,0.,0. ,0.,0.,0., 0.,0.,0., 0.,0.,0. ,0.,1.,0., 0.,0.,0., 0.,0.,0. 
,0.,0.,0., 0.,0.,1.}); - xExp.printIndexedBuffer("Identity8"); x.setIdentity(); ASSERT_TRUE(x.equalsTo(&xExp)); @@ -921,8 +920,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_1) { NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(5, subArr1->ews()); delete subArr1; } @@ -933,8 +930,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_2) { NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(1, subArr1->ews()); delete subArr1; } @@ -945,8 +940,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_3) { NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::point(2), NDIndex::all()}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(1, subArr1->ews()); delete subArr1; } @@ -957,8 +950,6 @@ TEST_F(NDArrayTest2, test_subarray_ews_4) { NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::point(2), NDIndex::all()}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->ews()); delete subArr1; } @@ -1074,8 +1065,6 @@ TEST_F(NDArrayTest2, test_subarray_interval_1) { NDArray x('f', {10, 10}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->sizeAt(0)); ASSERT_EQ(9, subArr1->sizeAt(1)); delete subArr1; @@ -1086,8 +1075,6 @@ TEST_F(NDArrayTest2, test_subarray_interval_2) { NDArray x('c', {10, 10}, nd4j::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); - subArr1->printShapeInfo("subArr1"); - ASSERT_EQ(10, subArr1->sizeAt(0)); ASSERT_EQ(9, subArr1->sizeAt(1)); delete subArr1; @@ -1098,10 +1085,8 @@ TEST_F(NDArrayTest2, test_subarray_3d_cf) { NDArray c('c', {10, 20, 30}, nd4j::DataType::FLOAT32); auto subarrayF = f({0,0, 0,0, 2,3}, true); - 
subarrayF.printShapeInfo("F subarray shapeInfo"); auto subarrayC = c({2,3, 0,0, 0,0}, true); - subarrayC.printShapeInfo("C subarray shapeInfo"); } TEST_F(NDArrayTest2, test_broadcast_row_1) { @@ -1133,8 +1118,6 @@ TEST_F(NDArrayTest2, test_broadcast_column_2) { e.assign(1.0f); x.applyTrueBroadcast(BroadcastOpsTuple::Add(), &y, &x, false); - x.printShapeInfo(); - x.printIndexedBuffer(); ASSERT_EQ(e, x); } @@ -1189,8 +1172,6 @@ TEST_F(NDArrayTest2, test_long_sum_1) { auto x = NDArrayFactory::create('c', {2, 2}, {1, 2, 3, 4}); auto z = x.reduceAlongDims(reduce::Sum, {0}); - - z.printIndexedBuffer("z long"); } ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp index 9aac42ddf..95b3027cc 100644 --- a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp @@ -191,7 +191,8 @@ TEST_F(NativeOpsTests, ExecBroadcast_2) { #ifdef __CUDABLAS__ printf("Unsupported for cuda now.\n"); #else - auto dimension = NDArrayFactory::create('c', {1}, {(int)0}); + int dimd = 0; + auto dimension = NDArrayFactory::create('c', {1}, {dimd}); ::execBroadcastBool(nullptr, broadcast::EqualTo, @@ -525,8 +526,8 @@ TEST_F(NativeOpsTests, Reduce3Test_1) { y.specialBuffer(), y.specialShapeInfo(), exp.buffer(), exp.shapeInfo(), exp.specialBuffer(), exp.specialShapeInfo()); -// x.printIndexedBuffer("Input"); -// exp.printIndexedBuffer("Reduce3 Dot"); + //z.printIndexedBuffer("Z"); + //exp.printIndexedBuffer("Reduce3 Dot"); ASSERT_TRUE(exp.equalsTo(z)); } diff --git a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp index d8174f000..0d879748d 100644 --- a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp @@ -81,34 +81,6 @@ TEST_F(OmpLaunchHelperTests, Test_BetterThreads_3) { ASSERT_EQ(1, n); } 
-////////////////////////////////////////////////////////////////////// -TEST_F(OmpLaunchHelperTests, loop_test1) { - - const Nd4jLong N = 20010; - Nd4jLong desiredNumThreads = 2; - int x[N] = {0}; - - OmpLaunchHelper info(N, desiredNumThreads); - PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto xi = x + info.getThreadOffset(threadNum); - - auto ulen = static_cast(info.getItersPerThread(threadNum)); - - PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < ulen; i++) - xi[i] = xi[i] + 1; - } - - #ifdef _OPENMP - ASSERT_EQ(desiredNumThreads, info._numThreads); - #else - ASSERT_EQ(1, info._numThreads); - #endif - -} - TEST_F(OmpLaunchHelperTests, test_tad_threads_1) { Nd4jLong numTads = 16; Nd4jLong tadLength = 16; diff --git a/libnd4j/tests_cpu/layers_tests/OpsArena.cpp b/libnd4j/tests_cpu/layers_tests/OpsArena.cpp deleted file mode 100644 index b09a4e043..000000000 --- a/libnd4j/tests_cpu/layers_tests/OpsArena.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by raver119 on 11.10.2017. -// -// This "set of tests" is special one - we don't check ops results here. 
we just check for memory equality BEFORE op launch and AFTER op launch -// -// -#include "testlayers.h" -#include -#include -#include -#include -#include -#include -#include - -using namespace nd4j; -using namespace nd4j::ops; - -class OpsArena : public testing::Test { -public: - const int numIterations = 0; - std::vector tuples; - - - OpsArena() { - // nd4j_printf("\nStarting memory tests...\n",""); - - - // conv2d_bp - tuples.push_back((new OpTuple("conv2d_bp")) - ->addInput(NDArrayFactory::create_('c', {2, 1, 4, 4})) - ->addInput(NDArrayFactory::create_('c', {3, 3, 1, 2})) - //->addInput(new NDArray('c', {2, 1})) - ->addInput(NDArrayFactory::create_('c', {2, 2, 4, 4})) - ->setIArgs({3, 3, 1, 1, 0, 0, 1, 1, 1})); - - - // mergeavg - tuples.emplace_back((new OpTuple("mergeavg")) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100})) - ->addInput(NDArrayFactory::create_('c', {100, 100}))); - - // mergemax - auto mergeMax_X0 = NDArrayFactory::create_('c', {100, 100}); - auto mergeMax_X1 = NDArrayFactory::create_('c', {100, 100}); - auto mergeMax_X2 = NDArrayFactory::create_('c', {100, 100}); - tuples.push_back(new OpTuple("mergemax", {mergeMax_X0, mergeMax_X1, mergeMax_X2}, {}, {})); - - // conv2d - auto conv2d_Input = NDArrayFactory::create_('c', {1, 2, 5, 4}); - auto conv2d_Weights = NDArrayFactory::create_('c', {2, 2, 2, 3}); - auto conv2d_Bias = NDArrayFactory::create_('c', {3, 1}); - tuples.push_back(new OpTuple("conv2d", {conv2d_Input, conv2d_Weights, conv2d_Bias}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1, 0})); - - // test custom op - tuples.emplace_back((new OpTuple("testcustom")) - ->setIArgs({1, 2}) - ->addInput(NDArrayFactory::create_('c', {100, 100}))); - - - // deconv2d - tuples.emplace_back((new OpTuple("deconv2d")) - ->addInput(NDArrayFactory::create_('c', {2, 3, 4, 4})) - ->addInput(NDArrayFactory::create_('c', {5, 5, 3, 3})) - ->setIArgs({5, 5, 1, 1, 
0, 0, 1, 1, 0, 0})); - - // maxpool2d - tuples.emplace_back((new OpTuple("maxpool2d")) - ->addInput(NDArrayFactory::create_('c', {2, 1, 28, 28})) - ->setIArgs({5, 5, 1, 1, 0, 0, 2, 2, 0})); - } - - - ~OpsArena() { - for (auto v: tuples) - delete v; - } - -}; - - -TEST_F(OpsArena, TestFeedForward) { - nd4j::ops::mergeavg op0; - nd4j::ops::mergemax op1; - -#ifdef _WIN32 - if (1 > 0) - return; -#endif - - for (auto tuple: tuples) { - auto op = OpRegistrator::getInstance()->getOperation(tuple->_opName); - if (op == nullptr) { - // nd4j_printf("Can't find Op by name: [%s]\n", tuple->_opName); - ASSERT_TRUE(false); - } - - // nd4j_printf("Testing op [%s]\n", tuple->_opName); - nd4j::memory::MemoryReport before, after; - - // warmup - auto tmp1 = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - auto tmp2 = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - delete tmp1; - delete tmp2; - - auto b = nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - if (!b) - ASSERT_TRUE(false); - - for (int e = 0; e < numIterations; e++) { - auto result = op->execute(tuple->_inputs, tuple->_tArgs, tuple->_iArgs); - - // we just want to be sure op was executed successfully - ASSERT_TRUE(result->size() > 0); - - delete result; - } - - - auto a = nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (!a) - ASSERT_TRUE(false); - - - // this is our main assertion. memory footprint after op run should NOT be higher then before - if (after > before) { - // nd4j_printf("WARNING!!! 
OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", tuple->_opName, before.getRSS(), after.getRSS()) - // ASSERT_TRUE(after <= before); - } - } -} - - - -TEST_F(OpsArena, TestMmulHelper1) { - auto a = NDArrayFactory::create('c', {100, 100}); - auto b = NDArrayFactory::create('c', {100, 100}); - auto c = NDArrayFactory::create('c', {100, 100}); - - nd4j::MmulHelper::mmul(&a, &b, &c); - - nd4j::memory::MemoryReport before, after; - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - for (int e = 0; e < numIterations; e++) { - nd4j::MmulHelper::mmul(&a, &b, &c); - } - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (after > before) { - // nd4j_printf("WARNING!!! OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", "mmulHelper", before.getRSS(), after.getRSS()) - //ASSERT_TRUE(after <= before); - } -} - - -TEST_F(OpsArena, TestMmulHelper2) { - auto a = NDArrayFactory::create('c', {100, 100}); - auto b = NDArrayFactory::create('c', {100, 100}); - - auto c = nd4j::MmulHelper::mmul(&a, &b); - delete c; - - nd4j::memory::MemoryReport before, after; - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(before); - - for (int e = 0; e < numIterations; e++) { - c = nd4j::MmulHelper::mmul(&a, &b); - delete c; - } - - nd4j::memory::MemoryUtils::retrieveMemoryStatistics(after); - if (after > before) { - // nd4j_printf("WARNING!!! 
OpName: [%s]; RSS before: [%lld]; RSS after: [%lld]\n", "mmulHelper", before.getRSS(), after.getRSS()) - ASSERT_TRUE(after <= before); - } -} - diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp index 0254d1877..d5880d689 100644 --- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp @@ -419,9 +419,6 @@ TEST_F(ParityOpsTests, Test_Shape_1) { auto z = result->at(0); - z->printShapeInfo("z shape"); - z->printIndexedBuffer(" z buffr"); - ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1362,7 +1359,8 @@ TEST_F(ParityOpsTests, scatterND_sub_test2) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - // z->printIndexedBuffer(); + //exp.printIndexedBuffer("e"); + //z->printIndexedBuffer("z"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp index 6ea2ba081..998b8164b 100644 --- a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp @@ -43,6 +43,7 @@ #include #include +#include using namespace nd4j; using namespace nd4j::graph; @@ -52,7 +53,7 @@ public: int numIterations = 100; PerformanceTests() { - // + samediff::ThreadPool::getInstance(); } }; @@ -65,6 +66,7 @@ TEST_F(PerformanceTests, test_maxpooling2d_1) { x.linspace(1.0f); Nd4jLong k = 5; + Nd4jLong iArgs[] {k,k, 1,1, 0,0, 1,1, 1}; Context ctx(1); ctx.setInputArray(0, &x); @@ -81,6 +83,9 @@ TEST_F(PerformanceTests, test_maxpooling2d_1) { auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); valuesX.emplace_back(outerTime); + + if ((i + 1) % 1000 == 0) + nd4j_printf("Iteration %i finished...\n", i + 1); } std::sort(valuesX.begin(), valuesX.end()); diff --git 
a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index e95c6eca6..dfb685e22 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -57,12 +57,201 @@ public: fflush(stdout); } }; - +/* TEST_F(PlaygroundTests, test_s_1) { auto t = ::runLightBenchmarkSuit(true); delete[] t; } +TEST_F(PlaygroundTests, test_s_2) { + std::atomic s; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + std::vector values; + + for (int e = 0; e < 100000; e++) { + s = 0; + + auto timeStart = std::chrono::system_clock::now(); + //samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + PRAGMA_OMP_PARALLEL_THREADS(4) { + s++; + } + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); + values.emplace_back(outerTime); + }; + std::sort(values.begin(), values.end()); + + nd4j_printf("Time: %lld;\n", values[values.size() / 2]); +} + */ +/* +TEST_F(PlaygroundTests, test_s_4) { + std::atomic f; + std::atomic s; + std::vector valuesX, valuesY; + int iterations = 1000; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + + //////// + + auto x = NDArrayFactory::create('c', {32, 3, 256, 256}); + auto z = NDArrayFactory::create('c', {32, 3, 256, 256}); + x.linspace(1.0); + + auto xs0 = x.sizeAt(0); + auto xs1 = x.sizeAt(1); + auto xs2 = x.sizeAt(2); + auto xs3 = x.sizeAt(3); + + auto buffer = x.bufferAsT(); + auto zbuffer = z.bufferAsT(); + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) + for (int i = 0; i < xs0; i++) { + for (int j = 0; j < xs1; j++) { + auto thread_id = omp_get_thread_num(); + for (int k = 0; k < xs2; k++) { + for (int l = 0; l < xs3; l++) { + zbuffer[thread_id] += buffer[i * j + 
(k*l)] * 2.5f; + } + } + } + } + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesX.emplace_back(outerTime); + } + + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + auto f2d = PRAGMA_THREADS_FOR_2D { + for (auto i = start_x; i < stop_x; i++) { + for (auto j = start_y; j < stop_y; j++) { + + for (auto k = 0; k < xs2; k++) { + for (auto l = 0; l < xs3; l++) { + zbuffer[thread_id] += buffer[i * j + (k * l)] * 2.5f; + } + } + } + } + }; + samediff::Threads::parallel_for(f2d, 0, xs0, 1, 0, xs1, 1); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + valuesY.emplace_back(outerTime); + } + + if (valuesX.size() > 0) { + std::sort(valuesX.begin(), valuesX.end()); + nd4j_printf("OpenMP time: %lld; Min: %lld; Max: %lld;\n", valuesX[valuesX.size() / 2], valuesX[0], valuesX[valuesX.size() - 1]); + } + + if (valuesY.size() > 0) { + std::sort(valuesY.begin(), valuesY.end()); + nd4j_printf("Threads time: %lld; Min: %lld; Max: %lld;\n", valuesY[valuesY.size() / 2], valuesY[0], valuesY[valuesY.size() - 1]); + } + + nd4j_printf("Sum: %f\n", z.sumNumber().e(0)); +} + + +TEST_F(PlaygroundTests, test_s_5) { + auto x = NDArrayFactory::create('c', {32, 1, 28, 28}); + + std::vector values; + auto iterations = 100; + + auto startX = 0; + auto stopX = x.sizeAt(0); + auto incX = 1; + auto startY = 0; + auto stopY = x.sizeAt(1); + auto incY = 1; + auto numThreads = 4; + + // number of elements per loop + auto delta_x = (stopX - startX); + auto delta_y = (stopY - startY); + + // number of iterations per loop + auto itersX = delta_x / incX; + auto itersY = delta_y / incY; + + for (int e = 0; e < iterations; e++) { + auto timeStart = std::chrono::system_clock::now(); + + // picking best fit here + auto splitLoop = samediff::ThreadsHelper::pickLoop2d(numThreads, itersX, itersY); + auto span 
= samediff::Span2::build(splitLoop, 0, numThreads, startX, stopX, incX, startY, stopY, incY); + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); +} + + +TEST_F(PlaygroundTests, test_s_6) { + auto x = NDArrayFactory::create('c', {1024 * 1024 * 64}); + auto buffer = x.bufferAsT(); + auto len = x.lengthOf(); + std::vector values; + auto iterations = 1000; + + for (int i = 0; i < iterations; i++) { + auto timeStart = std::chrono::system_clock::now(); + + // picking best fit here + for (int e = 0; e < len; e++) { + buffer[e] = (buffer[e] + 1.72f) * 3.17f - 0.0012f; + } + + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); + values.emplace_back(outerTime); + } + + std::sort(values.begin(), values.end()); + + nd4j_printf("Calculations time: [Median: %lld; Min: %lld; Max: %lld;]\n", values[values.size() / 2], values[0], values[values.size()-1]); +} + + +TEST_F(PlaygroundTests, test_s_3) { + std::atomic s; + s = 0; + auto func = PRAGMA_THREADS_FOR { + s++; + }; + + for (int e = 0; e < 10000; e++) { + + samediff::Threads::parallel_for(func, 0, 8192, 1, 4); + } +} + */ + /* TEST_F(PlaygroundTests, test_relubp_1) { auto x = NDArrayFactory::create('c', {128, 64, 224, 224}); diff --git a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp index bc4db6e63..5c3ca340b 100644 --- a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp @@ -868,7 +868,6 @@ TEST_F(RNGTests, Test_UniformDistribution_04) { ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); - z->printIndexedBuffer("Uniform int distribution"); 
ASSERT_TRUE(exp0.isSameShape(z)); ASSERT_FALSE(exp0.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp index 4df0f3dc8..8bf12f58b 100644 --- a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp @@ -77,7 +77,7 @@ TEST_F(EuclideanDistanceTest,Test1) { result, tadShapeBuffer, dimension, - dimensionLength); + dimensionLength, 0, 2); ASSERT_EQ(result[1],result[0]); } @@ -107,7 +107,7 @@ TEST_F(StdTest,MultiDimTest) { dimensionsForStd, dimensionLength, tad->tadOnlyShapeInfo, - tad->tadOffsets); + tad->tadOffsets, 0, shape::length(resultShapeInfo)); // for(int i = 0; i < shape::length(resultShapeInfo); i++) // printf("%f\n",result[i]); @@ -145,7 +145,7 @@ TEST_F(ReduceTest,MatrixTest) { dimension, dimensionLength, tad->tadOnlyShapeInfo, - tad->tadOffsets); + tad->tadOffsets, 0, tad->numTads); // for(int i = 0; i < shape::length(resultShapeInfo); i++) // printf("%f\n",result[i]); diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index ecc91779e..a8f430fe3 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -234,7 +234,6 @@ TEST_F(NormalThreeFourFive,DimensionTest) { tad->init(inputShapeBuffer,dimension,dimensionLength); tad->createTadOnlyShapeInfo(); tad->createOffsets(); - shape::printShapeInfoLinear(tad->tadOnlyShapeInfo); ASSERT_TRUE(arrsEquals(8,assertionBuffer,tad->tadOnlyShapeInfo)); delete tad; diff --git a/libnd4j/tests_cpu/layers_tests/TadTests.cpp b/libnd4j/tests_cpu/layers_tests/TadTests.cpp index aabef927f..b4a631a8c 100644 --- a/libnd4j/tests_cpu/layers_tests/TadTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/TadTests.cpp @@ -206,8 +206,6 @@ TEST_F(TadTests, test_TAD_empty_dims_1) { xTad.init(xShape, reinterpret_cast(112L), 0); xTad.createTadOnlyShapeInfo(); xTad.createOffsets(); - nd4j_printf("numTads: %i\n", (int) 
xTad.numTads); - shape::printShapeInfoLinear("TAD shape", xTad.tadOnlyShapeInfo); } TEST_F(TadTests, test_tad_order_1) { @@ -218,7 +216,6 @@ TEST_F(TadTests, test_tad_order_1) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -230,7 +227,6 @@ TEST_F(TadTests, test_tad_order_2) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -243,7 +239,6 @@ TEST_F(TadTests, test_tad_order_3) { xTad.init(xShape, &dim, 1); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -256,7 +251,6 @@ TEST_F(TadTests, test_tad_order_4) { xTad.init(xShape, dim, 2); xTad.createTadOnlyShapeInfo(); - shape::printShapeInfoLinear("tad shape", xTad.tadOnlyShapeInfo); ASSERT_TRUE(shape::equalsStrict(tShape, xTad.tadOnlyShapeInfo)); } @@ -264,7 +258,6 @@ TEST_F(TadTests, test_column_1) { auto x = NDArrayFactory::create('c', {5, 2}); auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), 0); - shape::printShapeInfoLinear("column view", tadPack.primaryShapeInfo()); ASSERT_EQ(1, shape::rank(tadPack.primaryShapeInfo())); ASSERT_EQ(5, shape::length(tadPack.primaryShapeInfo())); ASSERT_TRUE(shape::isVector(tadPack.primaryShapeInfo())); diff --git a/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp new file mode 100644 index 000000000..1139d6076 --- /dev/null +++ b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp @@ -0,0 +1,233 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. 
+ * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#include "testlayers.h" +#include +#include +#include +#include +#include + +using namespace samediff; +using namespace nd4j; +using namespace nd4j::ops; +using namespace nd4j::graph; + +class ThreadsTests : public testing::Test { +public: + +}; + +TEST_F(ThreadsTests, th_test_1) { + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1023)); + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1024)); + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 1026)); + + ASSERT_EQ(1, ThreadsHelper::numberOfThreads(6, 2043)); + ASSERT_EQ(2, ThreadsHelper::numberOfThreads(6, 2048)); +} + + +TEST_F(ThreadsTests, th_test_2) { + // in this case we'll get better split over second loop - exactly 32 elements per thread + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(32, 48, 1024)); + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(6, 4, 16384)); + + // in this case we'll get better split over first loop - 2 loops/2048 elements per thread + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(32, 64, 1024)); + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 6, 16384)); + + // in this case none of loops are good enough, but second loop is too small for split + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(6, 64, 32)); + + // all loops are good enough, but we go with bigger one, since small + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(2, 64, 32)); + + // 
obviously split goes into second loop, to give 1024 elements per thread + ASSERT_EQ(2, ThreadsHelper::pickLoop2d(2, 1, 2048)); +} + +TEST_F(ThreadsTests, th_test_3) { + // typical conv cases + ASSERT_EQ(1, ThreadsHelper::pickLoop3d(4, 32, 3, 128)); + ASSERT_EQ(2, ThreadsHelper::pickLoop3d(4, 1, 128, 64)); + ASSERT_EQ(3, ThreadsHelper::pickLoop3d(4, 1, 3, 128)); + + // checking for optimal threads for conv inference + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 1, 3, 128)); + ASSERT_EQ(4, ThreadsHelper::numberOfThreads3d(4, 1, 3, 128)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads3d(8, 1, 3, 128)); + + // checking for optimal threads for conv training + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 16, 3, 128)); + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 128)); + + + ASSERT_EQ(6, ThreadsHelper::numberOfThreads3d(6, 8, 3, 64)); + ASSERT_EQ(1, ThreadsHelper::pickLoop3d(6, 8, 3, 64)); +} + +TEST_F(ThreadsTests, th_test_4) { + // typical conv cases + ASSERT_EQ(2, ThreadsHelper::numberOfThreads2d(2, 32, 3)); + ASSERT_EQ(4, ThreadsHelper::numberOfThreads2d(4, 32, 3)); + ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 32, 1)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 16, 64)); + + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(4, 32, 1)); + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17)); + + // primes edge cases + ASSERT_EQ(6, ThreadsHelper::numberOfThreads2d(6, 19, 17)); + ASSERT_EQ(8, ThreadsHelper::numberOfThreads2d(8, 19, 17)); + + ASSERT_EQ(1, ThreadsHelper::pickLoop2d(8, 19, 17)); + + for (auto e = 0; e < 6; e++) { + auto span = Span2::build(1, e, 6, 0, 19, 1, 0, 17, 1); + + nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX()); + } + + nd4j_printf("-----------------------\n",""); + for (auto e = 0; e < 6; e++) { + auto span = Span2::build(1, e, 6, 0, 32, 1, 0, 3, 1); + + nd4j_printf("Span start: %lld; stop: %lld\n", span.startX(), span.stopX()); + } +} + + +TEST_F(ThreadsTests, test_span_converage_1) { + for (int b 
= 1; b <= 128; b++) { + for (int c = 1; c <= 64; c++) { + for (int t = 1; t <= 64; t++) { + + auto threads = ThreadsHelper::numberOfThreads2d(t, b, c); + auto loop = ThreadsHelper::pickLoop2d(threads, b, c); + + if (t > 1 && threads == 1 && (b > 1 && c > 1)) { + nd4j_printf("Got 1 thread for [%i, %i] loop; initial max threads: %i\n", b, c, t) + } + + auto sum = 0; + for (auto a = 0; a < threads; a++) { + auto span = Span2::build(loop, a,threads, 0, b, 1, 0, c, 1); + + if (loop == 1) + sum += span.stopX() - span.startX(); + else if (loop == 2) + sum += span.stopY() - span.startY(); + else + throw std::runtime_error("Bad loop!"); + } + + if (loop == 1) + ASSERT_EQ(b, sum); + else + ASSERT_EQ(c, sum); + } + } + } +} + +TEST_F(ThreadsTests, validation_test_2d_1) { + if (1 > 0) + return; + + std::vector threads({1, 2, 4, 6, 8, 12, 16, 20, 32, 48, 64}); + + for (int e = 1; e < 1024; e++) { + for (int i = 1; i <= 1024; i++ ) { + for (auto t:threads) { + std::atomic sum; + sum.store(0); + + auto func = PRAGMA_THREADS_FOR_2D { + for (auto x = start_x; x < stop_x; x += inc_x) { + for (auto y = start_y; y < stop_y; y += inc_y) { + sum++; + } + } + }; + + samediff::Threads::parallel_for(func, 0, e, 1, 0, i, 1, t, true); + + ASSERT_EQ(e * i, sum.load()); + } + } + + nd4j_printf("Finished iteration %i\n", e); + } +} + +TEST_F(ThreadsTests, reduction_test_1) { + + auto func = PRAGMA_REDUCE_LONG { + int64_t sum = 0; + + for (auto e = start; e < stop; e++) { + sum++; + }; + + return sum; + }; + + auto sum = samediff::Threads::parallel_long(func, LAMBDA_AL {return _old + _new;}, 0, 8192, 1, 4); + ASSERT_EQ(8192, sum); +} + +/* +TEST_F(ThreadsTests, basic_test_1) { + if (!Environment::getInstance()->isCPU()) + return; + + auto instance = samediff::ThreadPool::getInstance(); + + auto array = NDArrayFactory::create('c', {512, 768}); + auto like = array.like(); + auto buffer = array.bufferAsT(); + auto lbuffer = like.bufferAsT(); + + auto func = PRAGMA_THREADS_FOR { + PRAGMA_OMP_SIMD + 
for (uint64_t e = start; e < stop; e += increment) { + buffer[e] += 1.0f; + } + }; + + auto timeStartThreads = std::chrono::system_clock::now(); + samediff::Threads::parallel_for(func, 0, array.lengthOf()); + auto timeEndThreads = std::chrono::system_clock::now(); + auto outerTimeThreads = std::chrono::duration_cast (timeEndThreads - timeStartThreads).count(); + + auto timeStartOmp = std::chrono::system_clock::now(); + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (uint64_t e = 0; e < array.lengthOf(); e ++) { + lbuffer[e] += 1.0f; + } + auto timeEndOmp = std::chrono::system_clock::now(); + auto outerTimeOmp = std::chrono::duration_cast (timeEndOmp - timeStartOmp).count(); + + ASSERT_NEAR((float) array.lengthOf(), array.sumNumber().e(0), 1e-5f); + + nd4j_printf("Threads time: %lld us; OMP time: %lld us; %p\n", outerTimeThreads, outerTimeOmp, instance) +} + */ \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp index 72ca854f8..fd277b971 100644 --- a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp @@ -55,7 +55,6 @@ TEST_F(WorkspaceTests, BasicInitialization2) { auto v = array.reduceNumber(reduce::Sum); auto f = v.e(0); - v.printShapeInfo("v shape"); ASSERT_NEAR(2.0f, f, 1e-5); @@ -77,7 +76,6 @@ TEST_F(WorkspaceTests, BasicInitialization3) { auto v = array.reduceNumber(reduce::Sum); auto f = v.e(0); - v.printShapeInfo("v shape"); ASSERT_NEAR(2.0f, array.reduceNumber(reduce::Sum).e(0), 1e-5); diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index 218035421..315839dba 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -109,15 +109,17 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 -D__APPLE_OS__=true") + set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 
-D__APPLE_OS__=true -DAPPLE_BUILD=true") elseif(WIN32) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(CMAKE_CXX_FLAGS " -g -fPIC -std=c++11 -Wa,-mbig-obj") endif() else() + set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -DLINUX_BUILD=true") + if ("${_RELEASE}" OR CMAKE_BUILD_TYPE STREQUAL "Release") message("Release build for tests") - set(CMAKE_CXX_FLAGS "-O3 -fPIC -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -std=c++11 -D_RELEASE=true") if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -mcpu=native") else() diff --git a/libnd4j/tests_cpu/run_tests.sh b/libnd4j/tests_cpu/run_tests.sh index e5cbd4106..2932827d4 100755 --- a/libnd4j/tests_cpu/run_tests.sh +++ b/libnd4j/tests_cpu/run_tests.sh @@ -16,9 +16,30 @@ # SPDX-License-Identifier: Apache-2.0 ################################################################################ - set -exo pipefail +while [[ $# > 0 ]] +do + key="$1" + value="${2:-}" + + case $key in + -c|--chip) + CHIP="${value}" + shift # past argument + ;; + *) + # unknown option + ;; + esac + + if [[ $# > 0 ]]; then + shift # past argument or value + fi +done + +CHIP="${CHIP:-cpu}" + # On Mac, make sure it can find libraries for GCC export DYLD_LIBRARY_PATH=/usr/local/lib/gcc/8/:/usr/local/lib/gcc/7/:/usr/local/lib/gcc/6/:/usr/local/lib/gcc/5/ @@ -30,4 +51,4 @@ if [ -n "$BUILD_PATH" ]; then export PATH="$PATH:$BUILD_PATH" fi -../blasbuild/cpu/tests_cpu/layers_tests/runtests --gtest_output="xml:../target/surefire-reports/TEST-results.xml" +../blasbuild/${CHIP}/tests_cpu/layers_tests/runtests --gtest_output="xml:../target/surefire-reports/TEST-${CHIP}-results.xml" diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java index 3bf3105f8..0f8f48d86 100644 --- 
a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/reduce3/EqualsWithEps.java @@ -49,7 +49,7 @@ public class EqualsWithEps extends BaseReduce3Op { public EqualsWithEps(INDArray x, INDArray y, INDArray z, double eps, int... dimensions) { super(x, y, z, false, dimensions); - this.extraArgs = new Object[] {eps}; + this.extraArgs = new Object[] {0.0, 0.0, eps}; } public EqualsWithEps(INDArray x, INDArray y, double eps, int... dimensions) { diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index efa70d691..fecb64012 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -731,7 +731,6 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define ND4J_EXPORT // #endif // #include -// #include /* int tad_threshold = 1; @@ -3604,6 +3603,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #include // #include // #include diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java index 8af56286d..58ad965a6 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuMemoryManager.java @@ -67,7 +67,7 @@ public class CpuMemoryManager extends BasicMemoryManager { */ @Override 
public void release(@NonNull Pointer pointer, MemoryKind kind) { - Pointer.free(pointer); + NativeOpsHolder.getInstance().getDeviceNativeOps().freeHost(pointer); pointer.setNull(); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index f915c8152..06c061fad 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -1,4 +1,4 @@ -// Targeted by JavaCPP version 1.5.1-1: DO NOT EDIT THIS FILE +// Targeted by JavaCPP version 1.5.2: DO NOT EDIT THIS FILE package org.nd4j.nativeblas; @@ -731,7 +731,6 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define ND4J_EXPORT // #endif // #include -// #include /* int tad_threshold = 1; @@ -3604,6 +3603,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include +// #include // #include // #include // #include @@ -5454,6 +5454,10 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { + + + + @@ -21232,6 +21236,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019 Konduit K.K. 
* * This program and the accompanying materials are made available under the * terms of the Apache License, Version 2.0 which is available at @@ -21290,6 +21295,18 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif + /* + * random_uniform distribution for types int32,int64, float16, float and double + * by default dtype is float32 + * + * input: + * 0 - shape of output (1D int tensor) + * 1 - min val (0D of output type) - optional (0 as default) + * 2 - max val (0D of output type) - optional (inf as default) + * + * output: + * 0 - uniformly distributed values of given type (between min and max) + */ // #if NOT_EXCLUDED(OP_randomuniform) @Namespace("nd4j::ops") public static class randomuniform extends DeclarableCustomOp { static { Loader.load(); } @@ -21362,6 +21379,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif +// #if NOT_EXCLUDED(OP_random_crop) @Namespace("nd4j::ops") public static class random_crop extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -21377,6 +21395,50 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } +// #endif + + /** + * random_gamma op. + */ +// #if NOT_EXCLUDED(OP_random_gamma) + @Namespace("nd4j::ops") public static class random_gamma extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public random_gamma(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public random_gamma(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public random_gamma position(long position) { + return (random_gamma)super.position(position); + } + + public random_gamma() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + + /** + * random_poisson op. + */ +// #if NOT_EXCLUDED(OP_random_poisson) + @Namespace("nd4j::ops") public static class random_poisson extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public random_poisson(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public random_poisson(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public random_poisson position(long position) { + return (random_poisson)super.position(position); + } + + public random_poisson() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + From 47d19908f46239f44c381059fa5608eb0d6218ac Mon Sep 17 00:00:00 2001 From: Alex Black Date: Thu, 14 Nov 2019 19:38:20 +1100 Subject: [PATCH 08/15] Various fixes (#43) * #8172 Enable DL4J MKLDNN batch norm backward pass Signed-off-by: AlexDBlack * #8382 INDArray.toString() rank 1 brackets / ambiguity fix Signed-off-by: AlexDBlack * #8308 Fix handful of broken links (inc. 
some in errors) Signed-off-by: AlexDBlack * Unused dependencies, round 1 Signed-off-by: AlexDBlack * Unused dependencies, round 2 Signed-off-by: AlexDBlack * Unused dependencies, round 3 Signed-off-by: AlexDBlack * Small fix Signed-off-by: AlexDBlack * Uniform distribution TF import fix Signed-off-by: AlexDBlack --- CONTRIBUTING.md | 2 +- arbiter/arbiter-core/src/assembly/bin.xml | 2 +- datavec/datavec-api/pom.xml | 5 - datavec/datavec-arrow/pom.xml | 10 - datavec/datavec-data/datavec-data-nlp/pom.xml | 20 -- datavec/datavec-geo/pom.xml | 30 --- datavec/datavec-hadoop/pom.xml | 30 --- datavec/datavec-local/pom.xml | 37 +--- datavec/datavec-perf/pom.xml | 5 - .../datavec-spark-inference-client/pom.xml | 20 -- .../datavec-spark-inference-server/pom.xml | 6 - datavec/datavec-spark/pom.xml | 5 - .../common/resources/DL4JResources.java | 10 +- .../nn/mkldnn/ValidateMKLDNN.java | 83 +++++---- .../graph/models/deepwalk/DeepWalk.java | 2 +- .../InvalidKerasConfigurationException.java | 2 +- ...nsupportedKerasConfigurationException.java | 4 +- .../layers/embeddings/KerasEmbedding.java | 2 +- .../pom.xml | 65 ------- .../pom.xml | 15 -- .../clustering/quadtree/QuadTree.java | 2 +- .../clustering/util/MathUtils.java | 2 +- .../deeplearning4j-nlp-chinese/pom.xml | 6 - .../deeplearning4j-nlp-uima/pom.xml | 20 -- .../deeplearning4j-nlp/pom.xml | 15 -- .../graph/walkers/impl/PopularityWalker.java | 2 +- .../graph/walkers/impl/RandomWalker.java | 2 +- .../nn/conf/GradientNormalization.java | 2 +- .../distribution/OrthogonalDistribution.java | 2 +- .../nn/conf/layers/BatchNormalization.java | 4 +- .../conf/serde/BaseNetConfigDeserializer.java | 4 +- .../layers/mkldnn/MKLDNNBatchNormHelper.java | 6 +- .../normalization/BatchNormalization.java | 10 +- .../recurrent/GravesBidirectionalLSTM.java | 2 +- .../nn/layers/recurrent/GravesLSTM.java | 2 +- .../nn/layers/recurrent/LSTM.java | 2 +- .../nn/layers/recurrent/LSTMHelpers.java | 2 +- .../deeplearning4j-aws/pom.xml | 176 
++---------------- .../aws/emr/SparkEMRClient.java | 8 +- .../deeplearning4j-scaleout/spark/pom.xml | 79 -------- .../deeplearning4j-play/pom.xml | 26 --- docs/deeplearning4j-nlp/templates/word2vec.md | 4 +- .../templates/computationgraph.md | 6 +- .../templates/model-persistence.md | 2 +- docs/deeplearning4j-nn/templates/recurrent.md | 2 +- .../templates/tsne-visualization.md | 2 +- .../templates/howto.md | 4 +- docs/deeplearning4j/templates/beginners.md | 2 +- docs/deeplearning4j/templates/cheat-sheet.md | 4 +- .../templates/config-performance-debugging.md | 2 +- .../deeplearning4j/templates/examples-tour.md | 52 +++--- docs/deeplearning4j/templates/quickstart.md | 2 +- libnd4j/include/helpers/files.h | 2 +- libnd4j/include/ops/declarable/headers/nn.h | 2 +- .../ops/declarable/helpers/cpu/gru.cpp | 2 +- .../ops/declarable/helpers/cuda/gru.cu | 2 +- libnd4j/tests_cpu/layers_tests/testinclude.h | 2 +- nd4j/README.md | 4 +- .../nd4j-api-parent/nd4j-api/pom.xml | 22 --- .../org/nd4j/autodiff/samediff/ops/SDNN.java | 6 +- .../activations/impl/ActivationRReLU.java | 2 +- .../linalg/api/ops/impl/scalar/LeakyReLU.java | 2 +- .../api/ops/impl/transforms/MaxOut.java | 2 +- .../api/ops/impl/transforms/strict/ELU.java | 2 +- .../random/custom/DistributionUniform.java | 1 + .../nd4j/linalg/learning/AdaMaxUpdater.java | 2 +- .../org/nd4j/linalg/learning/AdamUpdater.java | 2 +- .../nd4j/linalg/learning/config/AdaMax.java | 2 +- .../org/nd4j/linalg/learning/config/Adam.java | 2 +- .../nd4j/linalg/string/NDArrayStrings.java | 24 ++- .../nd4j-backend-impls/nd4j-native/pom.xml | 70 ------- .../java/org/nd4j/nativeblas/Nd4jCpu.java | 2 +- nd4j/nd4j-backends/nd4j-tests/pom.xml | 15 +- .../java/org/nd4j/linalg/BaseNd4jTest.java | 4 - .../java/org/nd4j/linalg/ToStringTest.java | 25 +++ .../java/org/nd4j/linalg/util/ArrayUtil.java | 2 +- .../java/org/nd4j/linalg/util/MathUtils.java | 2 +- .../nd4j-parameter-server-client/pom.xml | 34 ---- .../nd4j/parameterserver/BaseNd4jTest.java | 1 - 
.../background/BackgroundDaemonStarter.java | 2 +- .../nd4j-parameter-server-status/pom.xml | 137 ++++---------- .../nd4j-parameter-server/pom.xml | 45 +---- .../parameterserver/util/CheckSocket.java | 2 +- nd4j/nd4j-remote/nd4j-grpc-client/pom.xml | 2 + nd4j/nd4j-serde/nd4j-arrow/pom.xml | 10 - nd4j/nd4j-serde/nd4j-kryo/pom.xml | 21 +-- .../qlearning/discrete/QLearningDiscrete.java | 2 +- 87 files changed, 259 insertions(+), 1011 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4e75d7bfe..0a25d9775 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ Deeplearning4j's [open issues are here](https://github.com/eclipse/deeplearning4 Note that you will need to [build dl4j from source](https://deeplearning4j.org/docs/latest/deeplearning4j-build-from-source) -For some tips on contributing to open source, this [post is helpful](http://blog.smartbear.com/programming/14-ways-to-contribute-to-open-source-without-being-a-programming-genius-or-a-rock-star/). +For some tips on contributing to open source, this [post is helpful](https://smartbear.com/blog/test-and-monitor/14-ways-to-contribute-to-open-source-without-being/). 
## Contributions diff --git a/arbiter/arbiter-core/src/assembly/bin.xml b/arbiter/arbiter-core/src/assembly/bin.xml index cc6920b24..c99d6b144 100644 --- a/arbiter/arbiter-core/src/assembly/bin.xml +++ b/arbiter/arbiter-core/src/assembly/bin.xml @@ -61,7 +61,7 @@ examples diff --git a/datavec/datavec-api/pom.xml b/datavec/datavec-api/pom.xml index 022f2e38b..b3401b431 100644 --- a/datavec/datavec-api/pom.xml +++ b/datavec/datavec-api/pom.xml @@ -52,11 +52,6 @@ joda-time ${jodatime.version} - - org.yaml - snakeyaml - ${snakeyaml.version} - org.nd4j diff --git a/datavec/datavec-arrow/pom.xml b/datavec/datavec-arrow/pom.xml index 645971a45..6134bbf27 100644 --- a/datavec/datavec-arrow/pom.xml +++ b/datavec/datavec-arrow/pom.xml @@ -29,21 +29,11 @@ datavec-arrow - - org.nd4j - nd4j-arrow - ${project.version} - org.datavec datavec-api ${project.version} - - com.carrotsearch - hppc - ${hppc.version} - org.apache.arrow arrow-vector diff --git a/datavec/datavec-data/datavec-data-nlp/pom.xml b/datavec/datavec-data/datavec-data-nlp/pom.xml index 17ad11211..12df0fb08 100644 --- a/datavec/datavec-data/datavec-data-nlp/pom.xml +++ b/datavec/datavec-data/datavec-data-nlp/pom.xml @@ -44,26 +44,6 @@ datavec-api ${project.version} - - commons-logging - commons-logging - ${commons-logging.version} - - - org.springframework - spring-core - ${spring.version} - - - org.springframework - spring-context - ${spring.version} - - - org.springframework - spring-beans - ${spring.version} - org.cleartk cleartk-snowball diff --git a/datavec/datavec-geo/pom.xml b/datavec/datavec-geo/pom.xml index 15c22ba3b..50e843555 100644 --- a/datavec/datavec-geo/pom.xml +++ b/datavec/datavec-geo/pom.xml @@ -31,36 +31,6 @@ datavec-api ${project.version} - - com.fasterxml.jackson.core - jackson-core - ${geo.jackson.version} - - - com.fasterxml.jackson.core - jackson-databind - ${geo.jackson.version} - - - com.fasterxml.jackson.core - jackson-annotations - ${geo.jackson.version} - - - 
com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - ${geo.jackson.version} - - - com.fasterxml.jackson.dataformat - jackson-dataformat-xml - ${geo.jackson.version} - - - com.fasterxml.jackson.datatype - jackson-datatype-joda - ${geo.jackson.version} - com.maxmind.geoip2 geoip2 diff --git a/datavec/datavec-hadoop/pom.xml b/datavec/datavec-hadoop/pom.xml index c95e6d3bc..5ec6d4c3f 100644 --- a/datavec/datavec-hadoop/pom.xml +++ b/datavec/datavec-hadoop/pom.xml @@ -35,41 +35,11 @@ ${project.version} - - com.sun.xml.bind - jaxb-core - ${jaxb.version} - - - com.sun.xml.bind - jaxb-impl - ${jaxb.version} - io.netty netty ${netty.version} - - org.apache.commons - commons-compress - ${commons-compress.version} - - - org.apache.zookeeper - zookeeper - ${zookeeper.version} - - - log4j - log4j - - - org.slf4j - slf4j-log4j12 - - - org.apache.hadoop hadoop-common diff --git a/datavec/datavec-local/pom.xml b/datavec/datavec-local/pom.xml index f286eeb95..d2b15ffed 100644 --- a/datavec/datavec-local/pom.xml +++ b/datavec/datavec-local/pom.xml @@ -73,42 +73,7 @@ - - com.fasterxml.jackson.core - jackson-core - ${geo.jackson.version} - test - - - com.fasterxml.jackson.core - jackson-databind - ${geo.jackson.version} - test - - - com.fasterxml.jackson.core - jackson-annotations - ${geo.jackson.version} - test - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - ${geo.jackson.version} - test - - - com.fasterxml.jackson.dataformat - jackson-dataformat-xml - ${geo.jackson.version} - test - - - com.fasterxml.jackson.datatype - jackson-datatype-joda - ${geo.jackson.version} - test - + org.datavec datavec-python diff --git a/datavec/datavec-perf/pom.xml b/datavec/datavec-perf/pom.xml index fb4eaaa89..95f3135e5 100644 --- a/datavec/datavec-perf/pom.xml +++ b/datavec/datavec-perf/pom.xml @@ -41,11 +41,6 @@ slf4j-api ${slf4j.version} - - com.github.oshi - oshi-core - ${oshi.version} - org.datavec datavec-data-image diff --git 
a/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml b/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml index 076c22ab9..95f13081f 100644 --- a/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml +++ b/datavec/datavec-spark-inference-parent/datavec-spark-inference-client/pom.xml @@ -41,26 +41,6 @@ 1.0.0-SNAPSHOT test - - commons-codec - commons-codec - ${commons-codec.version} - - - org.apache.httpcomponents - httpclient - ${httpclient.version} - - - org.apache.httpcomponents - httpcore - ${httpcore.version} - - - org.apache.httpcomponents - httpmime - ${httpmime.version} - com.mashape.unirest unirest-java diff --git a/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml b/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml index 8bef216a7..77eff8758 100644 --- a/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml +++ b/datavec/datavec-spark-inference-parent/datavec-spark-inference-server/pom.xml @@ -94,12 +94,6 @@ ${scala.version} - - org.yaml - snakeyaml - ${snakeyaml.version} - - com.typesafe.play play-java_2.11 diff --git a/datavec/datavec-spark/pom.xml b/datavec/datavec-spark/pom.xml index f7143c6ea..72f0b105f 100644 --- a/datavec/datavec-spark/pom.xml +++ b/datavec/datavec-spark/pom.xml @@ -39,11 +39,6 @@ scala-library ${scala.version} - - org.scala-lang - scala-reflect - ${scala.version} - org.apache.spark diff --git a/deeplearning4j/deeplearning4j-common/src/main/java/org/deeplearning4j/common/resources/DL4JResources.java b/deeplearning4j/deeplearning4j-common/src/main/java/org/deeplearning4j/common/resources/DL4JResources.java index a28ad375d..fab713e8e 100644 --- a/deeplearning4j/deeplearning4j-common/src/main/java/org/deeplearning4j/common/resources/DL4JResources.java +++ b/deeplearning4j/deeplearning4j-common/src/main/java/org/deeplearning4j/common/resources/DL4JResources.java @@ -64,7 +64,7 @@ 
public class DL4JResources { /** * Set the base download URL for (most) DL4J datasets and models.
* This usually doesn't need to be set manually unless there is some issue with the default location - * @param baseDownloadURL Base download URL to set. For example, http://blob.deeplearning4j.org/ + * @param baseDownloadURL Base download URL to set. For example, https://dl4jdata.blob.core.windows.net/ */ public static void setBaseDownloadURL(@NonNull String baseDownloadURL){ baseURL = baseDownloadURL; @@ -79,8 +79,8 @@ public class DL4JResources { /** * Get the URL relative to the base URL.
- * For example, if baseURL is "http://blob.deeplearning4j.org/", and relativeToBase is "/datasets/iris.dat" - * this simply returns "http://blob.deeplearning4j.org/datasets/iris.dat" + * For example, if baseURL is "https://dl4jdata.blob.core.windows.net/", and relativeToBase is "/datasets/iris.dat" + * this simply returns "https://dl4jdata.blob.core.windows.net/datasets/iris.dat" * * @param relativeToBase Relative URL * @return URL @@ -92,8 +92,8 @@ public class DL4JResources { /** * Get the URL relative to the base URL as a String.
- * For example, if baseURL is "http://blob.deeplearning4j.org/", and relativeToBase is "/datasets/iris.dat" - * this simply returns "http://blob.deeplearning4j.org/datasets/iris.dat" + * For example, if baseURL is "https://dl4jdata.blob.core.windows.net/", and relativeToBase is "/datasets/iris.dat" + * this simply returns "https://dl4jdata.blob.core.windows.net/datasets/iris.dat" * * @param relativeToBase Relative URL * @return URL diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java index 7e3ae6720..f65e48f44 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java @@ -138,52 +138,55 @@ public class ValidateMKLDNN extends BaseDL4JTest { ConvolutionMode cm = ConvolutionMode.Truncate; for (int minibatch : new int[]{1, 3}) { + for (boolean b : new boolean[]{true, false}) { - inputSize[0] = minibatch; - INDArray f = Nd4j.rand(Nd4j.defaultFloatingPointType(), inputSize); - INDArray l = TestUtils.randomOneHot(minibatch, 10); + inputSize[0] = minibatch; + INDArray f = Nd4j.rand(Nd4j.defaultFloatingPointType(), inputSize); + INDArray l = TestUtils.randomOneHot(minibatch, 10); - MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() - .updater(new Adam(0.01)) - .convolutionMode(cm) - .seed(12345) - .list() - .layer(new ConvolutionLayer.Builder().activation(Activation.TANH) - .kernelSize(kernel) - .stride(stride) - .padding(0, 0) - .nOut(3) - .build()) - .layer(new BatchNormalization.Builder().helperAllowFallback(false)/*.eps(0)*/.build()) - .layer(new ConvolutionLayer.Builder().activation(Activation.TANH) - .kernelSize(kernel) - .stride(stride) - .padding(0, 0) - .nOut(3) - .build()) - .layer(new 
OutputLayer.Builder().nOut(10).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build()) - .setInputType(InputType.convolutional(inputSize[2], inputSize[3], inputSize[1])) - .build(); + MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() + .dataType(DataType.FLOAT) + .updater(new Adam(0.01)) + .convolutionMode(cm) + .seed(12345) + .list() + .layer(new ConvolutionLayer.Builder().activation(Activation.TANH) + .kernelSize(kernel) + .stride(stride) + .padding(0, 0) + .nOut(3) + .build()) + .layer(new BatchNormalization.Builder().useLogStd(b).helperAllowFallback(false)/*.eps(0)*/.build()) + .layer(new ConvolutionLayer.Builder().activation(Activation.TANH) + .kernelSize(kernel) + .stride(stride) + .padding(0, 0) + .nOut(3) + .build()) + .layer(new OutputLayer.Builder().nOut(10).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build()) + .setInputType(InputType.convolutional(inputSize[2], inputSize[3], inputSize[1])) + .build(); - MultiLayerNetwork netWith = new MultiLayerNetwork(conf.clone()); - netWith.init(); + MultiLayerNetwork netWith = new MultiLayerNetwork(conf.clone()); + netWith.init(); - MultiLayerNetwork netWithout = new MultiLayerNetwork(conf.clone()); - netWithout.init(); + MultiLayerNetwork netWithout = new MultiLayerNetwork(conf.clone()); + netWithout.init(); - LayerHelperValidationUtil.TestCase tc = LayerHelperValidationUtil.TestCase.builder() - .allowHelpersForClasses(Collections.>singletonList(org.deeplearning4j.nn.layers.normalization.BatchNormalization.class)) - .testForward(true) - .testScore(true) - .testBackward(true) - .testTraining(true) - .features(f) - .labels(l) - .data(new SingletonDataSetIterator(new DataSet(f, l))) - .maxRelError(1e-4) - .build(); + LayerHelperValidationUtil.TestCase tc = LayerHelperValidationUtil.TestCase.builder() + .allowHelpersForClasses(Collections.>singletonList(org.deeplearning4j.nn.layers.normalization.BatchNormalization.class)) + 
.testForward(true) + .testScore(true) + .testBackward(true) + .testTraining(true) + .features(f) + .labels(l) + .data(new SingletonDataSetIterator(new DataSet(f, l))) + .maxRelError(1e-4) + .build(); - LayerHelperValidationUtil.validateMLN(netWith, tc); + LayerHelperValidationUtil.validateMLN(netWith, tc); + } } } diff --git a/deeplearning4j/deeplearning4j-graph/src/main/java/org/deeplearning4j/graph/models/deepwalk/DeepWalk.java b/deeplearning4j/deeplearning4j-graph/src/main/java/org/deeplearning4j/graph/models/deepwalk/DeepWalk.java index 0bc633895..0ba9217ec 100644 --- a/deeplearning4j/deeplearning4j-graph/src/main/java/org/deeplearning4j/graph/models/deepwalk/DeepWalk.java +++ b/deeplearning4j/deeplearning4j-graph/src/main/java/org/deeplearning4j/graph/models/deepwalk/DeepWalk.java @@ -38,7 +38,7 @@ import java.util.concurrent.atomic.AtomicLong; /**Implementation of the DeepWalk graph vectorization model, based on the paper * DeepWalk: Online Learning of Social Representations by Perozzi, Al-Rfou & Skiena (2014), - * http://arxiv.org/abs/1403.6652
+ * https://arxiv.org/abs/1403.6652
* Similar to word2vec in nature, DeepWalk is an unsupervised learning algorithm that learns a vector representation * of each vertex in a graph. Vector representations are learned using walks (usually random walks) on the vertices in * the graph.
diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java index bea7fa2ad..db51cb499 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/InvalidKerasConfigurationException.java @@ -40,6 +40,6 @@ public class InvalidKerasConfigurationException extends Exception { } private static String appendDocumentationURL(String message) { - return message + ". For more information, see http://deeplearning4j.org/model-import-keras."; + return message + ". For more information, see http://deeplearning4j.org/docs/latest/keras-import-overview"; } } diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java index c540bcd64..6244cf1e8 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/exceptions/UnsupportedKerasConfigurationException.java @@ -22,7 +22,7 @@ package org.deeplearning4j.nn.modelimport.keras.exceptions; * is not currently supported. * * See https://deeplearning4j.org/docs/latest/keras-import-overview - * for more information and file an issue at http://github.com/deeplearning4j/deeplearning4j/issues. 
+ * for more information and file an issue at https://github.com/eclipse/deeplearning4j/issues. * * @author dave@skymind.io */ @@ -41,6 +41,6 @@ public class UnsupportedKerasConfigurationException extends Exception { } private static String appendDocumentationURL(String message) { - return message + ". Please file an issue at http://github.com/deeplearning4j/deeplearning4j/issues."; + return message + ". Please file an issue at https://github.com/eclipse/deeplearning4j/issues."; } } diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java index 6bc1c4129..2a34f707c 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java @@ -104,7 +104,7 @@ public class KerasEmbedding extends KerasLayer { "on Embedding layers. Zero Masking for the Embedding layer only works with unidirectional LSTM for now." + " If you want to have this behaviour for your imported model " + "in DL4J, apply masking as a pre-processing step to your input." 
+ - "See https://deeplearning4j.org/usingrnns#masking for more on this."); + "See http://deeplearning4j.org/docs/latest/deeplearning4j-nn-recurrent#masking for more on this."); Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_EMBEDDING_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml index 7477c7794..38ee4204c 100644 --- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml +++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbor-server/pom.xml @@ -77,71 +77,6 @@ ${project.version}
- - com.google.protobuf - protobuf-java - ${google.protobuf.version} - - - joda-time - joda-time - ${jodatime.version} - - - org.apache.commons - commons-lang3 - ${commons-lang3.version} - - - org.hibernate - hibernate-validator - ${hibernate.version} - - - org.scala-lang - scala-library - ${scala.version} - - - org.scala-lang - scala-reflect - ${scala.version} - - - org.yaml - snakeyaml - ${snakeyaml.version} - - - com.fasterxml.jackson.core - jackson-core - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-annotations - ${jackson.version} - - - com.fasterxml.jackson.datatype - jackson-datatype-jdk8 - ${jackson.version} - - - com.fasterxml.jackson.datatype - jackson-datatype-jsr310 - ${jackson.version} - - - com.typesafe - config - ${typesafe.config.version} - com.typesafe.play play-java_2.11 diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml index 57248c559..d6b64b025 100644 --- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml +++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/deeplearning4j-nearestneighbors-client/pom.xml @@ -31,21 +31,6 @@ - - org.apache.httpcomponents - httpclient - ${httpclient.version} - - - org.apache.httpcomponents - httpcore - ${httpcore.version} - - - org.apache.httpcomponents - httpmime - ${httpmime.version} - com.mashape.unirest unirest-java diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/quadtree/QuadTree.java b/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/quadtree/QuadTree.java index f1cc2e304..0fbf8afec 100644 --- 
a/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/quadtree/QuadTree.java +++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/quadtree/QuadTree.java @@ -29,7 +29,7 @@ import static java.lang.Math.max; * QuadTree: http://en.wikipedia.org/wiki/Quadtree * * Reference impl based on the paper by: - * http://arxiv.org/pdf/1301.3342v2.pdf + * https://arxiv.org/pdf/1301.3342v2.pdf * * Primarily focused on 2 dimensions, may expand later if there's a reason. * diff --git a/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/util/MathUtils.java b/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/util/MathUtils.java index ce6ddcff7..792231c7e 100755 --- a/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/util/MathUtils.java +++ b/deeplearning4j/deeplearning4j-nearestneighbors-parent/nearestneighbor-core/src/main/java/org/deeplearning4j/clustering/util/MathUtils.java @@ -86,7 +86,7 @@ public class MathUtils { /** - * See: http://stackoverflow.com/questions/466204/rounding-off-to-nearest-power-of-2 + * See: https://stackoverflow.com/questions/466204/rounding-off-to-nearest-power-of-2 * @param v the number to getFromOrigin the next power of 2 for * @return the next power of 2 for the passed in value */ diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml index b72cb721d..35eb2903d 100644 --- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml +++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-chinese/pom.xml @@ -52,12 +52,6 @@ deeplearning4j-nlp ${project.version} - - org.nutz - nutz - 
1.r.58 - provided - org.nlpcn nlp-lang diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml index d27625e9f..44fbbcf9d 100644 --- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml +++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp-uima/pom.xml @@ -33,26 +33,6 @@ - - commons-logging - commons-logging - ${commons-logging.version} - - - org.springframework - spring-core - ${spring.version} - - - org.springframework - spring-context - ${spring.version} - - - org.springframework - spring-beans - ${spring.version} - org.cleartk cleartk-snowball diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml index 62c0c73f9..3f367689c 100644 --- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml +++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/pom.xml @@ -54,11 +54,6 @@ test - - org.objenesis - objenesis - ${objenesis.version} - org.mockito mockito-core @@ -66,16 +61,6 @@ test - - - - - - org.nd4j - nd4j-jackson - ${nd4j.version} - - ch.qos.logback logback-classic diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/PopularityWalker.java b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/PopularityWalker.java index b48ddb2ea..05d69e94c 100644 --- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/PopularityWalker.java +++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/PopularityWalker.java @@ -42,7 +42,7 @@ import java.util.*; * Instead of rand walks, this 
walker produces walks based on number of edges coming into each node. * This allows you to build walks filtering too rare nodes, or too popular nodes, depending on your demands. * - * Original DeepWalk paper: http://arxiv.org/pdf/1403.6652v2 + * Original DeepWalk paper: https://arxiv.org/pdf/1403.6652v2 * @author raver119@gmail.com */ public class PopularityWalker extends RandomWalker implements GraphWalker { diff --git a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/RandomWalker.java b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/RandomWalker.java index 922dbbe27..b422a52d1 100644 --- a/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/RandomWalker.java +++ b/deeplearning4j/deeplearning4j-nlp-parent/deeplearning4j-nlp/src/main/java/org/deeplearning4j/models/sequencevectors/graph/walkers/impl/RandomWalker.java @@ -37,7 +37,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** * This is Random-based walker for SequenceVectors-based DeepWalk implementation * - * Original DeepWalk paper: http://arxiv.org/pdf/1403.6652v2 + * Original DeepWalk paper: https://arxiv.org/pdf/1403.6652v2 * * @author AlexDBlack * @author raver119@gmail.com diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/GradientNormalization.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/GradientNormalization.java index 01bd9cf3d..05b1c6638 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/GradientNormalization.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/GradientNormalization.java @@ -52,7 +52,7 @@ package org.deeplearning4j.nn.conf; * * Thus, the l2 norm of the scaled gradients will not exceed 
the specified threshold, though may be smaller than it
* See: Pascanu, Mikolov, Bengio (2012), On the difficulty of training Recurrent Neural Networks, - * http://arxiv.org/abs/1211.5063
+ * https://arxiv.org/abs/1211.5063
* Threshold for clipping can be set in Layer configuration, using gradientNormalizationThreshold(double threshold) *

* diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/distribution/OrthogonalDistribution.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/distribution/OrthogonalDistribution.java index 8959c2349..dbe7143d4 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/distribution/OrthogonalDistribution.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/distribution/OrthogonalDistribution.java @@ -23,7 +23,7 @@ import org.nd4j.shade.jackson.annotation.JsonProperty; /** * Orthogonal distribution, with gain parameter.
- * See http://arxiv.org/abs/1312.6120 for details + * See https://arxiv.org/abs/1312.6120 for details * */ @EqualsAndHashCode(callSuper = false) diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java index 4c470fec5..f95421585 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/BatchNormalization.java @@ -236,7 +236,7 @@ public class BatchNormalization extends FeedForwardLayer { /** * Epsilon value for batch normalization; small floating point value added to variance (algorithm 1 in http://arxiv.org/pdf/1502.03167v3.pdf) to reduce/avoid + * href="https://arxiv.org/pdf/1502.03167v3.pdf">https://arxiv.org/pdf/1502.03167v3.pdf) to reduce/avoid * underflow issues.
Default: 1e-5 */ protected double eps = 1e-5; @@ -365,7 +365,7 @@ public class BatchNormalization extends FeedForwardLayer { /** * Epsilon value for batch normalization; small floating point value added to variance (algorithm 1 in http://arxiv.org/pdf/1502.03167v3.pdf) to reduce/avoid + * href="https://arxiv.org/pdf/1502.03167v3.pdf">https://arxiv.org/pdf/1502.03167v3.pdf) to reduce/avoid * underflow issues.
Default: 1e-5 * * @param eps Epsilon values to use diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/serde/BaseNetConfigDeserializer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/serde/BaseNetConfigDeserializer.java index d32488363..a90218946 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/serde/BaseNetConfigDeserializer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/serde/BaseNetConfigDeserializer.java @@ -53,8 +53,8 @@ import java.util.Map; * We deserialize the config using the default deserializer, then handle the new IUpdater (which will be null for * 0.8.0 and earlier configs) if necessary * - * Overall design: - * http://stackoverflow.com/questions/18313323/how-do-i-call-the-default-deserializer-from-a-custom-deserializer-in-jackson + * Overall design: + * https://stackoverflow.com/questions/18313323/how-do-i-call-the-default-deserializer-from-a-custom-deserializer-in-jackson * * @author Alex Black */ diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java index 0d9ae18e7..2e8c04aa3 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/mkldnn/MKLDNNBatchNormHelper.java @@ -67,17 +67,17 @@ public class MKLDNNBatchNormHelper implements BatchNormalizationHelper { INDArray beta, INDArray dGammaView, INDArray dBetaView, double eps, LayerWorkspaceMgr workspaceMgr) { if(input.dataType() != DataType.FLOAT) return null; //MKL-DNN only supports float - /* + //TODO FIXME - AB 2019/11/01 - https://github.com/eclipse/deeplearning4j/issues/8335 List args = new ArrayList<>(); args.add(input); 
args.add(meanCache); args.add(varCache); - args.add(epsilon); if(gamma != null) args.add(gamma.reshape(gamma.length())); if(beta != null) args.add(beta.reshape(beta.length())); + args.add(epsilon); DynamicCustomOp op = DynamicCustomOp.builder("batchnorm_bp") @@ -110,8 +110,6 @@ public class MKLDNNBatchNormHelper implements BatchNormalizationHelper { g.setGradientFor(BatchNormalizationParamInitializer.BETA, dBetaView); return new Pair<>(g, epsAtInput); - */ - return null; } @Override diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/normalization/BatchNormalization.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/normalization/BatchNormalization.java index 8c8f329ea..cd070185c 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/normalization/BatchNormalization.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/normalization/BatchNormalization.java @@ -49,8 +49,8 @@ import java.util.*; /** * Batch normalization layer.
 * References:
- * http://arxiv.org/pdf/1502.03167v3.pdf
- * http://arxiv.org/pdf/1410.7455v8.pdf
+ * https://arxiv.org/pdf/1502.03167v3.pdf
+ * https://arxiv.org/pdf/1410.7455v8.pdf
* * https://kratzert.github.io/2016/02/12/understanding-the-gradient-flow-through-the-batch-normalization-layer.html * @@ -327,7 +327,7 @@ public class BatchNormalization extends BaseLayerhttp://www.cs.toronto.edu/~graves/phd.pdf * See also for full/vectorized equations (and a comparison to other LSTM variants): * Greff et al. 2015, "LSTM: A Search Space Odyssey", pg11. This is the "vanilla" variant in said paper - * http://arxiv.org/pdf/1503.04069.pdf + * https://arxiv.org/pdf/1503.04069.pdf * * A high level description of bidirectional LSTM can be found from * "Hybrid Speech Recognition with Deep Bidirectional LSTM" diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java index 13f30b8bb..b112672f9 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/GravesLSTM.java @@ -34,7 +34,7 @@ import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; * http://www.cs.toronto.edu/~graves/phd.pdf * See also for full/vectorized equations (and a comparison to other LSTM variants): * Greff et al. 2015, "LSTM: A Search Space Odyssey", pg11. 
This is the "vanilla" variant in said paper - * http://arxiv.org/pdf/1503.04069.pdf + * https://arxiv.org/pdf/1503.04069.pdf * * @author Alex Black * @see LSTM LSTM class, for the version without peephole connections diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTM.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTM.java index 692713f6e..a55a19e46 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTM.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTM.java @@ -38,7 +38,7 @@ import org.nd4j.util.OneTimeLogger; * * See also for full/vectorized equations (and a comparison to other LSTM variants): * Greff et al. 2015, "LSTM: A Search Space Odyssey", pg11. This is the "no peephole" variant in said paper - * http://arxiv.org/pdf/1503.04069.pdf + * https://arxiv.org/pdf/1503.04069.pdf * * @author Alex Black * @see GravesLSTM GravesLSTM class, for the version with peephole connections diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java index 86079aead..c733ef6c2 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/layers/recurrent/LSTMHelpers.java @@ -68,7 +68,7 @@ import static org.nd4j.linalg.indexing.NDArrayIndex.*; *

* When 'hasPeepholeConnections' is true, this is the "vanilla" variant in said paper
* When 'hasPeepholeConnections' is false, this is the "no peephole" variant
- * http://arxiv.org/pdf/1503.04069.pdf + * https://arxiv.org/pdf/1503.04069.pdf * * * @author Alex Black (LSTM implementations) diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/pom.xml b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/pom.xml index 7c9967ef8..94f66b405 100644 --- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/pom.xml +++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/pom.xml @@ -44,184 +44,48 @@ - - org.scala-lang - scala-library - ${scala.version} - - - org.scala-lang - scala-reflect - ${scala.version} - - - - commons-logging - commons-logging - ${commons-logging.version} - - - joda-time - joda-time - ${jodatime.version} - - - org.apache.httpcomponents - httpclient - ${httpclient.version} - - - org.apache.httpcomponents - httpcore - ${httpcore.version} - com.amazonaws aws-java-sdk 1.11.24 - - org.deeplearning4j - deeplearning4j-core - ${project.parent.version} - args4j args4j 2.32 + + org.slf4j + slf4j-api + + + org.nd4j + nd4j-api + ${nd4j.version} + + + org.deeplearning4j + deeplearning4j-util + ${project.version} + + com.jcraft jsch ${jsch.version} + - com.google.inject - guice - ${guice.version} - - - com.google.protobuf - protobuf-java - ${google.protobuf.version} - - - commons-codec - commons-codec - ${commons-codec.version} - - - commons-collections - commons-collections - ${commons-collections.version} - - - commons-io - commons-io - ${commons-io.version} - - - commons-lang - commons-lang - ${commons-lang.version} - - - commons-net - commons-net - ${commons-net.version} - - - com.sun.xml.bind - jaxb-core - ${jaxb.version} - - - com.sun.xml.bind - jaxb-impl - ${jaxb.version} - - - io.netty - netty - ${netty.version} - - - com.fasterxml.jackson.core - jackson-core - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-annotations - ${jackson.version} - - - javax.servlet - javax.servlet-api - 
${servlet.version} - - - org.apache.commons - commons-compress - ${commons-compress.version} + org.threadly + threadly + ${threadly.version} + org.apache.commons commons-lang3 ${commons-lang3.version} - - org.apache.commons - commons-math3 - ${commons-math3.version} - - - org.apache.curator - curator-recipes - ${curator.version} - - - com.typesafe - config - ${typesafe.config.version} - - - org.apache.spark - spark-core_2.11 - ${spark.version} - - - com.google.code.findbugs - jsr305 - - - org.slf4j - jul-to-slf4j - - - org.slf4j - jcl-over-slf4j - - - org.slf4j - slf4j-log4j12 - - - log4j - log4j - - - - - org.threadly - threadly - ${threadly.version} - diff --git a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/src/main/java/org/deeplearning4j/aws/emr/SparkEMRClient.java b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/src/main/java/org/deeplearning4j/aws/emr/SparkEMRClient.java index b1476fa3b..d179cca09 100644 --- a/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/src/main/java/org/deeplearning4j/aws/emr/SparkEMRClient.java +++ b/deeplearning4j/deeplearning4j-scaleout/deeplearning4j-aws/src/main/java/org/deeplearning4j/aws/emr/SparkEMRClient.java @@ -27,8 +27,8 @@ import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.lang.RandomStringUtils; -import org.apache.spark.api.java.function.Function; +import org.apache.commons.lang3.RandomStringUtils; +import org.nd4j.linalg.function.Function; import java.io.File; import java.util.*; @@ -157,7 +157,7 @@ public class SparkEMRClient { private void submitJob(AmazonElasticMapReduce emr, String mainClass, List args, Map sparkConfs, File uberJar) throws Exception { AmazonS3URI s3Jar = new AmazonS3URI(sparkS3JarFolder + "/" + uberJar.getName()); log.info(String.format("Placing uberJar %s to %s", uberJar.getPath(), s3Jar.toString())); - PutObjectRequest putRequest = sparkS3PutObjectDecorator.call( + 
PutObjectRequest putRequest = sparkS3PutObjectDecorator.apply( new PutObjectRequest(s3Jar.getBucket(), s3Jar.getKey(), uberJar) ); sparkS3ClientBuilder.build().putObject(putRequest); @@ -289,7 +289,7 @@ public class SparkEMRClient { // This should allow the user to decorate the put call to add metadata to the jar put command, such as security groups, protected Function sparkS3PutObjectDecorator = new Function() { @Override - public PutObjectRequest call(PutObjectRequest putObjectRequest) throws Exception { + public PutObjectRequest apply(PutObjectRequest putObjectRequest) { return putObjectRequest; } }; diff --git a/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml b/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml index 579e042ab..a24676022 100644 --- a/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml +++ b/deeplearning4j/deeplearning4j-scaleout/spark/pom.xml @@ -116,7 +116,6 @@ - org.nd4j @@ -139,82 +138,6 @@ scala-reflect ${scala.version} - - - com.google.inject - guice - ${guice.version} - - - com.google.protobuf - protobuf-java - ${google.protobuf.version} - - - commons-codec - commons-codec - ${commons-codec.version} - - - commons-collections - commons-collections - ${commons-collections.version} - - - commons-io - commons-io - ${commons-io.version} - - - commons-lang - commons-lang - ${commons-lang.version} - - - commons-net - commons-net - ${commons-net.version} - - - com.sun.xml.bind - jaxb-core - ${jaxb.version} - - - com.sun.xml.bind - jaxb-impl - ${jaxb.version} - - - io.netty - netty - ${netty.version} - - - javax.servlet - javax.servlet-api - ${servlet.version} - - - org.apache.commons - commons-compress - ${commons-compress.version} - - - org.apache.commons - commons-lang3 - ${commons-lang3.version} - - - org.apache.commons - commons-math3 - ${commons-math3.version} - - - org.apache.curator - curator-recipes - ${curator.version} - com.typesafe config @@ -250,9 +173,7 @@ log4j - - diff --git 
a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-play/pom.xml b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-play/pom.xml index 1b4f33c1e..fa18ad91d 100644 --- a/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-play/pom.xml +++ b/deeplearning4j/deeplearning4j-ui-parent/deeplearning4j-play/pom.xml @@ -129,32 +129,11 @@ deeplearning4j-ui-model ${project.version} - - - com.google.protobuf - protobuf-java - ${google.protobuf.version} - javax.ws.rs javax.ws.rs-api ${ws.rs.version} - - joda-time - joda-time - ${jodatime.version} - - - org.apache.commons - commons-lang3 - ${commons-lang3.version} - - - org.hibernate - hibernate-validator - ${hibernate.version} - org.scala-lang scala-library @@ -165,11 +144,6 @@ scala-reflect ${scala.version} - - org.yaml - snakeyaml - ${snakeyaml.version} - com.typesafe.play play-java_2.11 diff --git a/docs/deeplearning4j-nlp/templates/word2vec.md b/docs/deeplearning4j-nlp/templates/word2vec.md index e941060f2..df188dc2f 100644 --- a/docs/deeplearning4j-nlp/templates/word2vec.md +++ b/docs/deeplearning4j-nlp/templates/word2vec.md @@ -447,7 +447,7 @@ Marketers might seek to establish relationships among products to build a recomm ### Google's Word2vec Patent -Word2vec is [a method of computing vector representations of words](http://arxiv.org/pdf/1301.3781.pdf) introduced by a team of researchers at Google led by Tomas Mikolov. Google [hosts an open-source version of Word2vec](https://code.google.com/p/word2vec/) released under an Apache 2.0 license. In 2014, Mikolov left Google for Facebook, and in May 2015, [Google was granted a patent for the method](http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&co1=AND&d=PTXT&s1=9037464&OS=9037464&RS=9037464), which does not abrogate the Apache license under which it has been released. 
+Word2vec is [a method of computing vector representations of words](https://arxiv.org/pdf/1301.3781.pdf) introduced by a team of researchers at Google led by Tomas Mikolov. Google [hosts an open-source version of Word2vec](https://code.google.com/p/word2vec/) released under an Apache 2.0 license. In 2014, Mikolov left Google for Facebook, and in May 2015, [Google was granted a patent for the method](http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&co1=AND&d=PTXT&s1=9037464&OS=9037464&RS=9037464), which does not abrogate the Apache license under which it has been released. ### Foreign Languages @@ -485,7 +485,7 @@ Deeplearning4j has a class called [SequenceVectors](https://github.com/eclipse/d * [Quora: What Are Some Interesting Word2Vec Results?](http://www.quora.com/Word2vec/What-are-some-interesting-Word2Vec-results/answer/Omer-Levy) * [Word2Vec: an introduction](http://www.folgertkarsdorp.nl/word2vec-an-introduction/); Folgert Karsdorp * [Mikolov's Original Word2vec Code @Google](https://code.google.com/p/word2vec/) -* [word2vec Explained: Deriving Mikolov et al.’s Negative-Sampling Word-Embedding Method](http://arxiv.org/pdf/1402.3722v1.pdf); Yoav Goldberg and Omer Levy +* [word2vec Explained: Deriving Mikolov et al.’s Negative-Sampling Word-Embedding Method](https://arxiv.org/pdf/1402.3722v1.pdf); Yoav Goldberg and Omer Levy * [Advances in Pre-Training Distributed Word Representations - by Mikolov et al](https://arxiv.org/abs/1712.09405) diff --git a/docs/deeplearning4j-nn/templates/computationgraph.md b/docs/deeplearning4j-nn/templates/computationgraph.md index a5ced0ceb..f4ff7f03d 100644 --- a/docs/deeplearning4j-nn/templates/computationgraph.md +++ b/docs/deeplearning4j-nn/templates/computationgraph.md @@ -51,10 +51,10 @@ Examples of some architectures that can be built using ComputationGraph include: - Multi-task learning architectures - Recurrent neural networks with skip 
connections -- [GoogLeNet](http://arxiv.org/abs/1409.4842), a complex type of convolutional netural network for image classification -- [Image caption generation](http://arxiv.org/abs/1411.4555) +- [GoogLeNet](https://arxiv.org/abs/1409.4842), a complex type of convolutional netural network for image classification +- [Image caption generation](https://arxiv.org/abs/1411.4555) - [Convolutional networks for sentence classification](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/convolution/sentenceclassification/CnnSentenceClassificationExample.java) -- [Residual learning convolutional neural networks](http://arxiv.org/abs/1512.03385) +- [Residual learning convolutional neural networks](https://arxiv.org/abs/1512.03385) ## Configuring a Computation Graph diff --git a/docs/deeplearning4j-nn/templates/model-persistence.md b/docs/deeplearning4j-nn/templates/model-persistence.md index ef4d593e9..82f87f1ff 100644 --- a/docs/deeplearning4j-nn/templates/model-persistence.md +++ b/docs/deeplearning4j-nn/templates/model-persistence.md @@ -8,7 +8,7 @@ weight: 10 ## Saving and Loading a Neural Network -The `ModelSerializer` is a class which handles loading and saving models. There are two methods for saving models shown in the examples through the link. The first example saves a normal multilayer network, the second one saves a [computation graph](https://deeplearning4j.org/compgraph). +The `ModelSerializer` is a class which handles loading and saving models. There are two methods for saving models shown in the examples through the link. The first example saves a normal multilayer network, the second one saves a [computation graph](https://deeplearning4j.org/docs/latest/deeplearning4j-nn-computationgraph). 
Here is a [basic example](https://github.com/eclipse/deeplearning4j-examples/tree/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/modelsaving) with code to save a computation graph using the `ModelSerializer` class, as well as an example of using ModelSerializer to save a neural net built using MultiLayer configuration. diff --git a/docs/deeplearning4j-nn/templates/recurrent.md b/docs/deeplearning4j-nn/templates/recurrent.md index 0b33981e7..fe07ebddb 100644 --- a/docs/deeplearning4j-nn/templates/recurrent.md +++ b/docs/deeplearning4j-nn/templates/recurrent.md @@ -29,7 +29,7 @@ DL4J currently supports the following types of recurrent neural network * BaseRecurrent Java documentation for each is available, [GravesLSTM](https://deeplearning4j.org/api/{{page.version}}/org/deeplearning4j/nn/conf/layers/GravesLSTM.html), - [BidirectionalGravesLSTM](https://deeplearning4j.org/api/{{page.version}}/org/deeplearning4j/nn/conf/layers/GravesBidirectionalLSTM.html), [BaseRecurrent](https://deeplearning4j.org/doc/org/deeplearning4j/nn/conf/layers/BaseRecurrentLayer.html) + [BidirectionalGravesLSTM](https://deeplearning4j.org/api/{{page.version}}/org/deeplearning4j/nn/conf/layers/GravesBidirectionalLSTM.html), [BaseRecurrent](https://deeplearning4j.org/api/latest/org/deeplearning4j/nn/conf/layers/BaseRecurrentLayer.html) #### Data for RNNs Consider for the moment a standard feed-forward network (a multi-layer perceptron or 'DenseLayer' in DL4J). These networks expect input and output data that is two-dimensional: that is, data with "shape" [numExamples,inputSize]. This means that the data into a feed-forward network has ‘numExamples’ rows/examples, where each row consists of ‘inputSize’ columns. A single example would have shape [1,inputSize], though in practice we generally use multiple examples for computational and optimization efficiency. 
Similarly, output data for a standard feed-forward network is also two dimensional, with shape [numExamples,outputSize]. diff --git a/docs/deeplearning4j-nn/templates/tsne-visualization.md b/docs/deeplearning4j-nn/templates/tsne-visualization.md index 83ab3a3ce..9a55b1a74 100644 --- a/docs/deeplearning4j-nn/templates/tsne-visualization.md +++ b/docs/deeplearning4j-nn/templates/tsne-visualization.md @@ -8,7 +8,7 @@ weight: 10 ## t-SNE's Data Visualization -[t-Distributed Stochastic Neighbor Embedding](http://homepage.tudelft.nl/19j49/t-SNE.html) (t-SNE) is a data-visualization tool created by Laurens van der Maaten at Delft University of Technology. +[t-Distributed Stochastic Neighbor Embedding](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) (t-SNE) is a data-visualization tool created by Laurens van der Maaten at Delft University of Technology. While it can be used for any data, t-SNE (pronounced Tee-Snee) is only really meaningful with labeled data, which clarify how the input is clustering. Below, you can see the kind of graphic you can generate in DL4J with t-SNE working on MNIST data. diff --git a/docs/deeplearning4j-scaleout/templates/howto.md b/docs/deeplearning4j-scaleout/templates/howto.md index 500b1a241..af55969c6 100644 --- a/docs/deeplearning4j-scaleout/templates/howto.md +++ b/docs/deeplearning4j-scaleout/templates/howto.md @@ -627,7 +627,7 @@ To use the system clock time source, add the following to Spark submit: ## Failed training on Ubuntu 16.04 (Ubuntu bug that may affect DL4J users) -When running a Spark on YARN cluster on Ubuntu 16.04 machines, chances are that after finishing a job, all processes owned by the user running Hadoop/YARN are killed. This is related to a bug in Ubuntu, which is documented at https://bugs.launchpad.net/ubuntu/+source/procps/+bug/1610499. There's also a Stackoverflow discussion about it at http://stackoverflow.com/questions/38419078/logouts-while-running-hadoop-under-ubuntu-16-04. 
+When running a Spark on YARN cluster on Ubuntu 16.04 machines, chances are that after finishing a job, all processes owned by the user running Hadoop/YARN are killed. This is related to a bug in Ubuntu, which is documented at https://bugs.launchpad.net/ubuntu/+source/procps/+bug/1610499. There's also a Stackoverflow discussion about it at https://stackoverflow.com/questions/38419078/logouts-while-running-hadoop-under-ubuntu-16-04. Some workarounds are suggested. @@ -695,7 +695,7 @@ To use the system clock time source, add the following to Spark submit: ## Failed training on Ubuntu 16.04 (Ubuntu bug that may affect DL4J users) -When running a Spark on YARN cluster on Ubuntu 16.04 machines, chances are that after finishing a job, all processes owned by the user running Hadoop/YARN are killed. This is related to a bug in Ubuntu, which is documented at https://bugs.launchpad.net/ubuntu/+source/procps/+bug/1610499. There's also a Stackoverflow discussion about it at http://stackoverflow.com/questions/38419078/logouts-while-running-hadoop-under-ubuntu-16-04. +When running a Spark on YARN cluster on Ubuntu 16.04 machines, chances are that after finishing a job, all processes owned by the user running Hadoop/YARN are killed. This is related to a bug in Ubuntu, which is documented at https://bugs.launchpad.net/ubuntu/+source/procps/+bug/1610499. There's also a Stackoverflow discussion about it at https://stackoverflow.com/questions/38419078/logouts-while-running-hadoop-under-ubuntu-16-04. Some workarounds are suggested. diff --git a/docs/deeplearning4j/templates/beginners.md b/docs/deeplearning4j/templates/beginners.md index f7740516d..3ca4d82f1 100644 --- a/docs/deeplearning4j/templates/beginners.md +++ b/docs/deeplearning4j/templates/beginners.md @@ -99,4 +99,4 @@ You can also download a [free version of the Skymind Intelligence Layer](https:/ Most of what we know about deep learning is contained in academic papers. 
You can find some of the major research groups [here](https://skymind.ai/wiki/machine-learning-research-groups-labs). -While individual courses have limits on what they can teach, the Internet does not. Most math and programming questions can be answered by Googling and searching sites like [Stackoverflow](http://stackoverflow.com) and [Math Stackexchange](https://math.stackexchange.com/). +While individual courses have limits on what they can teach, the Internet does not. Most math and programming questions can be answered by Googling and searching sites like [Stackoverflow](https://stackoverflow.com) and [Math Stackexchange](https://math.stackexchange.com/). diff --git a/docs/deeplearning4j/templates/cheat-sheet.md b/docs/deeplearning4j/templates/cheat-sheet.md index 3437ffa0f..f4b4157af 100644 --- a/docs/deeplearning4j/templates/cheat-sheet.md +++ b/docs/deeplearning4j/templates/cheat-sheet.md @@ -220,7 +220,7 @@ List of supported activation functions: * **LEAKYRELU** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationLReLU.java)) - leaky rectified linear unit. ```f(x) = max(0, x) + alpha * min(0, x)``` with ```alpha=0.01``` by default. * **RATIONALTANH** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRationalTanh.java)) - ```tanh(y) ~ sgn(y) * { 1 - 1/(1+|y|+y^2+1.41645*y^4)}``` which approximates ```f(x) = 1.7159 * tanh(2x/3)```, but should be faster to execute. 
([Reference](https://arxiv.org/abs/1508.01292)) * **RELU** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationReLU.java)) - standard rectified linear unit: ```f(x) = x``` if ```x>0``` or ```f(x) = 0``` otherwise -* **RRELU** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRReLU.java)) - randomized rectified linear unit. Deterministic during test time. ([Reference](http://arxiv.org/abs/1505.00853)) +* **RRELU** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRReLU.java)) - randomized rectified linear unit. Deterministic during test time. ([Reference](https://arxiv.org/abs/1505.00853)) * **SIGMOID** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSigmoid.java)) - standard sigmoid activation function, ```f(x) = 1 / (1 + exp(-x))``` * **SOFTMAX** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSoftmax.java)) - standard softmax activation function * **SOFTPLUS** - ([Source](https://github.com/eclipse/deeplearning4j/blob/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationSoftPlus.java)) - ```f(x) = log(1+e^x)``` - shape is similar to a smooth version of the RELU activation function @@ -269,7 +269,7 @@ The [CS231n course notes](http://cs231n.github.io/neural-networks-3/#ada) have a Supported updaters in Deeplearning4j: * **AdaDelta** - 
([Source](https://github.com/eclipse/deeplearning4j/tree/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaDelta.java)) - [Reference](https://arxiv.org/abs/1212.5701) * **AdaGrad** - ([Source](https://github.com/eclipse/deeplearning4j/tree/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaGrad.java)) - [Reference](http://jmlr.org/papers/v12/duchi11a.html) -* **AdaMax** - ([Source](https://github.com/eclipse/deeplearning4j/tree/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaMax.java)) - A variant of the Adam updater - [Reference](http://arxiv.org/abs/1412.6980) +* **AdaMax** - ([Source](https://github.com/eclipse/deeplearning4j/tree/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaMax.java)) - A variant of the Adam updater - [Reference](https://arxiv.org/abs/1412.6980) * **Adam** - ([Source](https://github.com/eclipse/deeplearning4j/tree/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Adam.java)) * **Nadam** - ([Source](https://github.com/eclipse/deeplearning4j/tree/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Nadam.java)) - A variant of the Adam updater, using the Nesterov mementum update rule - [Reference](https://arxiv.org/abs/1609.04747) * **Nesterovs** - ([Source](https://github.com/eclipse/deeplearning4j/tree/master/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Nesterovs.java)) - Nesterov momentum updater diff --git a/docs/deeplearning4j/templates/config-performance-debugging.md b/docs/deeplearning4j/templates/config-performance-debugging.md index 6dafd13b7..04b92ba23 100644 --- a/docs/deeplearning4j/templates/config-performance-debugging.md +++ b/docs/deeplearning4j/templates/config-performance-debugging.md @@ -84,7 
+84,7 @@ Not all DL4J layer types are supported in cuDNN. DL4J layers with cuDNN support To check if cuDNN is being used, the simplest approach is to look at the log output when running inference or training: If cuDNN is NOT available when you are using a layer that supports it, you will see a message such as: ``` -o.d.n.l.c.ConvolutionLayer - cuDNN not found: use cuDNN for better GPU performance by including the deeplearning4j-cuda module. For more information, please refer to: https://deeplearning4j.org/cudnn +o.d.n.l.c.ConvolutionLayer - cuDNN not found: use cuDNN for better GPU performance by including the deeplearning4j-cuda module. For more information, please refer to: https://deeplearning4j.org/docs/latest/deeplearning4j-config-cudnn java.lang.ClassNotFoundException: org.deeplearning4j.nn.layers.convolution.CudnnConvolutionHelper at java.net.URLClassLoader.findClass(URLClassLoader.java:381) at java.lang.ClassLoader.loadClass(ClassLoader.java:424) diff --git a/docs/deeplearning4j/templates/examples-tour.md b/docs/deeplearning4j/templates/examples-tour.md index 2aa5dd29b..ee6c049ab 100644 --- a/docs/deeplearning4j/templates/examples-tour.md +++ b/docs/deeplearning4j/templates/examples-tour.md @@ -18,31 +18,31 @@ Most of the examples make use of DataVec, a toolkit for preprocessing and clearn This example takes the canonical Iris dataset of the flower species of the same name, whose relevant measurements are sepal length, sepal width, petal length and petal width. It builds a Spark RDD from the relatively small dataset and runs an analysis against it. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/analysis/IrisAnalysis.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/analysis/IrisAnalysis.java) ### BasicDataVecExample.java This example loads data into a Spark RDD. 
All DataVec transform operations use Spark RDDs. Here, we use DataVec to filter data, apply time transformations and remove columns. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/basic/BasicDataVecExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/basic/BasicDataVecExample.java) ### PrintSchemasAtEachStep.java This example shows the print Schema tools that are useful to visualize and to ensure that the code for the transform is behaving as expected. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/debugging/PrintSchemasAtEachStep.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/debugging/PrintSchemasAtEachStep.java) ### JoinExample.java You may need to join datasets before passing to a neural network. You can do that in DataVec, and this example shows you how. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/join/JoinExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/join/JoinExample.java) ### LogDataExample.java This is an example of parsing log data using DataVec. The obvious use cases are cybersecurity and customer relationship management. 
-[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/logdata/LogDataExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/datavec-examples/src/main/java/org/datavec/transform/logdata/LogDataExample.java) ### MnistImagePipelineExample.java @@ -50,7 +50,7 @@ This example is from the video below, which demonstrates the ParentPathLabelGene -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/dataExamples/MnistImagePipelineExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/dataExamples/MnistImagePipelineExample.java) ### PreprocessNormalizerExample.java @@ -78,13 +78,13 @@ MNIST is the "Hello World" of deep learning. Simple, straightforward, and focuss This is a Single Layer Perceptron for recognizing digits. Note that this pulls the images from a binary package containing the dataset, a rather special case for data ingestion. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/feedforward/mnist/MLPMnistSingleLayerExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/feedforward/mnist/MLPMnistSingleLayerExample.java) ### MLPMnistTwoLayerExample.java A two-layer perceptron for MNIST, showing there is more than one useful network for a given dataset. 
-[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/feedforward/mnist/MLPMnistTwoLayerExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/feedforward/mnist/MLPMnistTwoLayerExample.java) ### Feedforward Examples @@ -92,7 +92,7 @@ Data flows through feed-forward neural networks in a single pass from input via These networks can be used for a wide range of tasks depending on they are configured. Along with image classification over MNIST data, this directory has examples demonstrating regression, classification, and anomoly detection. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/tree/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/feedforward) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/tree/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/feedforward) ### Convolutional Neural Networks @@ -102,7 +102,7 @@ Convolutional Neural Networks are mainly used for image recognition, although th This example can be run using either LeNet or AlexNet. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/convolution/AnimalsClassification.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/convolution/AnimalsClassification.java) --- @@ -115,7 +115,7 @@ load the model for later training or inference. This demonstrates saving and loading a network build using the class ComputationGraph. 
-[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/modelsaving/SaveLoadComputationGraph.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/modelsaving/SaveLoadComputationGraph.java) ### SaveLoadMultiLayerNetwork.java @@ -135,11 +135,11 @@ Do you need to add a Loss Function that is not available or prebuilt yet? Check ### CustomLossExample.java -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/lossfunctions/CustomLossExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/lossfunctions/CustomLossExample.java) ### CustomLossL1L2.java -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/lossfunctions/CustomLossL1L2.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/lossfunctions/CustomLossL1L2.java) ### Custom Layer @@ -147,7 +147,7 @@ Do you need to add a layer with features that aren't available in DeepLearning4J ### CustomLayerExample.java -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/customlayers/CustomLayerExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/misc/customlayers/CustomLayerExample.java) --- @@ -159,25 +159,25 @@ Neural Networks for NLP? We have those, too. Global Vectors for Word Representation are useful for detecting relationships between words. 
-[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/glove/GloVeExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/glove/GloVeExample.java) ### Paragraph Vectors A vectorized representation of words. Described [here](https://cs.stanford.edu/~quocle/paragraph_vector.pdf) -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/paragraphvectors/ParagraphVectorsClassifierExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/paragraphvectors/ParagraphVectorsClassifierExample.java) ### Sequence Vectors One way to represent sentences is as a sequence of words. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/sequencevectors/SequenceVectorsTextExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/sequencevectors/SequenceVectorsTextExample.java) ### Word2Vec Described [here](https://deeplearning4j.org/word2vec.html) -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/word2vec/Word2VecRawTextExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/word2vec/Word2VecRawTextExample.java) --- @@ -185,7 +185,7 @@ Described [here](https://deeplearning4j.org/word2vec.html) t-Distributed Stochastic Neighbor Embedding (t-SNE) is useful for data visualization. 
We include an example in the NLP section since word similarity visualization is a common use. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/tsne/TSNEStandardExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/nlp/tsne/TSNEStandardExample.java) --- @@ -199,19 +199,19 @@ The examples folder for Recurrent Neural Networks has the following: An RNN learns a string of characters. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/basic/BasicRNNExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/basic/BasicRNNExample.java) ### GravesLSTMCharModellingExample.java Takes the complete works of Shakespeare as a sequence of characters and Trains a Neural Net to generate "Shakespeare" one character at a time. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/character/GravesLSTMCharModellingExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/character/GravesLSTMCharModellingExample.java) ### SingleTimestepRegressionExample.java Regression with an LSTM (Long Short Term Memory) Recurrent Neural Network. 
-[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/regression/SingleTimestepRegressionExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/recurrent/regression/SingleTimestepRegressionExample.java) ### AdditionRNN.java @@ -254,13 +254,13 @@ DeepLearning4j supports using a Spark Cluster for network training. Here are the ### MnistMLPExample.java This is an example of a Multi-Layer Perceptron training on the Mnist data set of handwritten digits. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-spark-examples/dl4j-spark/src/main/java/org/deeplearning4j/mlp/MnistMLPExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-spark-examples/dl4j-spark/src/main/java/org/deeplearning4j/mlp/MnistMLPExample.java) ### SparkLSTMCharacterExample.java An LSTM recurrent Network in Spark. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-spark-examples/dl4j-spark/src/main/java/org/deeplearning4j/rnn/SparkLSTMCharacterExample.java) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-spark-examples/dl4j-spark/src/main/java/org/deeplearning4j/rnn/SparkLSTMCharacterExample.java) --- @@ -274,7 +274,7 @@ The learning algorithms and loss functions are executed as ND4J operations. This is a directory with examples for creating and manipulating NDArrays. -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/tree/master/nd4j-examples/src/main/java/org/nd4j/examples) +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/tree/master/nd4j-examples/src/main/java/org/nd4j/examples) --- @@ -282,4 +282,4 @@ This is a directory with examples for creating and manipulating NDArrays. 
Deep learning algorithms have learned to play Space Invaders and Doom using reinforcement learning. DeepLearning4J/RL4J examples of Reinforcement Learning are available here: -[Show me the code](http://github.com/eclipse/deeplearning4j-examples/tree/master/rl4j-examples) \ No newline at end of file +[Show me the code](https://github.com/eclipse/deeplearning4j-examples/tree/master/rl4j-examples) \ No newline at end of file diff --git a/docs/deeplearning4j/templates/quickstart.md b/docs/deeplearning4j/templates/quickstart.md index bcc042f07..25f4216ff 100644 --- a/docs/deeplearning4j/templates/quickstart.md +++ b/docs/deeplearning4j/templates/quickstart.md @@ -179,7 +179,7 @@ Congratulations! You just trained your first neural network with Deeplearning4j. **Q:** **SPARK ISSUES** I am running the examples and having issues with the Spark based examples such as distributed training or datavec transform options. -**A:** You may be missing some dependencies that Spark requires. See this [Stack Overflow discussion](http://stackoverflow.com/a/38735202/3892515) for a discussion of potential dependency issues. Windows users may need the winutils.exe from Hadoop. +**A:** You may be missing some dependencies that Spark requires. See this [Stack Overflow discussion](https://stackoverflow.com/a/38735202/3892515) for a discussion of potential dependency issues. Windows users may need the winutils.exe from Hadoop. 
Download winutils.exe from https://github.com/steveloughran/winutils and put it into the null/bin/winutils.exe (or create a hadoop folder and add that to HADOOP_HOME) diff --git a/libnd4j/include/helpers/files.h b/libnd4j/include/helpers/files.h index fa87d4e3e..c49cedbb7 100644 --- a/libnd4j/include/helpers/files.h +++ b/libnd4j/include/helpers/files.h @@ -16,7 +16,7 @@ // // Methods to lookup files in $PATH -// adopted from http://stackoverflow.com/questions/2718915/check-if-file-exists-including-on-path +// adopted from https://stackoverflow.com/questions/2718915/check-if-file-exists-including-on-path // #ifndef LIBND4J_FILES_H diff --git a/libnd4j/include/ops/declarable/headers/nn.h b/libnd4j/include/ops/declarable/headers/nn.h index 9f9b0e40a..810733680 100644 --- a/libnd4j/include/ops/declarable/headers/nn.h +++ b/libnd4j/include/ops/declarable/headers/nn.h @@ -137,7 +137,7 @@ namespace nd4j { #endif /** - * This operation performs batch normalization of layer, it is based on following article http://arxiv.org/abs/1502.03167. + * This operation performs batch normalization of layer, it is based on following article https://arxiv.org/abs/1502.03167. * Expected arguments: * x: input 4D array of shape [bS,iH,iW,iD] (data format = NHWC) or [bS,iD,iH,iW] (data format = NCHW), where * bS - batch size diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp index 9799e609d..579ab2612 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp @@ -19,7 +19,7 @@ // // implementation of gated Recurrent Unit cell -// (cf. http://arxiv.org/abs/1406.1078). +// (cf. https://arxiv.org/abs/1406.1078). 
// Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, Yoshua Bengio // "Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation" diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gru.cu b/libnd4j/include/ops/declarable/helpers/cuda/gru.cu index 8e7b62a91..cbbdf1439 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gru.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gru.cu @@ -19,7 +19,7 @@ // // implementation of gated Recurrent Unit cell -// (cf. http://arxiv.org/abs/1406.1078). +// (cf. https://arxiv.org/abs/1406.1078). // Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, Yoshua Bengio // "Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation" diff --git a/libnd4j/tests_cpu/layers_tests/testinclude.h b/libnd4j/tests_cpu/layers_tests/testinclude.h index f27623cff..79607cdc9 100644 --- a/libnd4j/tests_cpu/layers_tests/testinclude.h +++ b/libnd4j/tests_cpu/layers_tests/testinclude.h @@ -24,7 +24,7 @@ #include #include -//http://stackoverflow.com/questions/228005/alternative-to-itoa-for-converting-integer-to-string-c +//https://stackoverflow.com/questions/228005/alternative-to-itoa-for-converting-integer-to-string-c FORCEINLINE std::string int_array_to_string(Nd4jLong int_array[], Nd4jLong size_of_array) { std::string returnstring = "["; for (int temp = 0; temp < size_of_array; temp++) { diff --git a/nd4j/README.md b/nd4j/README.md index f26adea75..1d41d4403 100644 --- a/nd4j/README.md +++ b/nd4j/README.md @@ -41,12 +41,12 @@ To install ND4J, there are a couple of approaches, and more information can be f #### Install from Maven Central -1. Search for nd4j in the [Maven Central Repository](http://mvnrepository.com/search?q=nd4j) to find the available nd4j jars. +1. 
Search for nd4j in the [Maven Central Repository](https://search.maven.org/search?q=nd4j) to find the available nd4j jars. 2. Include the appropriate dependency in your pom.xml. #### Clone from the GitHub Repo -https://deeplearning4j.org/buildinglocally +https://deeplearning4j.org/docs/latest/deeplearning4j-build-from-source ## Contribute 1. Check for open issues, or open a new issue to start a discussion around a feature idea or a bug. diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/pom.xml b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/pom.xml index 21924f80a..b4a374baf 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/pom.xml +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/pom.xml @@ -192,12 +192,6 @@ - - org.objenesis - objenesis - ${objenesis.version} - - @@ -206,22 +200,6 @@ ${oshi.version} - - junit - junit - test - - - ch.qos.logback - logback-classic - test - - - ch.qos.logback - logback-core - test - - org.slf4j slf4j-api diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/ops/SDNN.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/ops/SDNN.java index 668a7a4a9..7b1cc5768 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/ops/SDNN.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/autodiff/samediff/ops/SDNN.java @@ -69,7 +69,7 @@ public class SDNN extends SDOps { /** * Neural network batch normalization operation.
- * For details, see http://arxiv.org/abs/1502.03167 + * For details, see https://arxiv.org/abs/1502.03167 * * @param name Name of the output variable * @param input Input variable. @@ -139,7 +139,7 @@ public class SDNN extends SDOps { * out = a * (exp(x) - 1) if x <= 0
* with constant a = 1.0 *

- * See: http://arxiv.org/abs/1511.07289 + * See: https://arxiv.org/abs/1511.07289 * * @param x Input variable * @return Output variable @@ -154,7 +154,7 @@ public class SDNN extends SDOps { * out = a * (exp(x) - 1) if x <= 0
* with constant a = 1.0 *

- * See: http://arxiv.org/abs/1511.07289 + * See: https://arxiv.org/abs/1511.07289 * * @param name Output variable name * @param x Input variable diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRReLU.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRReLU.java index 478305e76..9221d601c 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRReLU.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/activations/impl/ActivationRReLU.java @@ -34,7 +34,7 @@ import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties; * alpha is drawn from uniform(l,u) during training and is set to l+u/2 during test * l and u default to 1/8 and 1/3 respectively * - * + * * Empirical Evaluation of Rectified Activations in Convolutional Network */ @EqualsAndHashCode(callSuper = false) diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/LeakyReLU.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/LeakyReLU.java index fe70de288..b9a98dc6e 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/LeakyReLU.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/scalar/LeakyReLU.java @@ -34,7 +34,7 @@ import org.tensorflow.framework.NodeDef; * Out(x) = x if x >= 0
* Leaky ReLU may avoid zero gradient "dying ReLU" problem by having non-zero * gradient below 0.
- * See for example http://arxiv.org/abs/1505.00853 for a comparison of + * See for example https://arxiv.org/abs/1505.00853 for a comparison of * ReLU variants. * * @author Alex Black diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/MaxOut.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/MaxOut.java index 05993cd7f..939ed854b 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/MaxOut.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/MaxOut.java @@ -33,7 +33,7 @@ import java.util.List; /** * Max out activation: - * http://arxiv.org/pdf/1302.4389.pdf + * https://arxiv.org/pdf/1302.4389.pdf * * @author Adam Gibson */ diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/strict/ELU.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/strict/ELU.java index 6923639fd..c4fc245b7 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/strict/ELU.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/transforms/strict/ELU.java @@ -32,7 +32,7 @@ import java.util.List; * Introduced in paper:
* Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
* Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter (2015)
- * http://arxiv.org/abs/1511.07289 + * https://arxiv.org/abs/1511.07289 * * @author Alex Black */ diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/random/custom/DistributionUniform.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/random/custom/DistributionUniform.java index 0744533ba..ecc76a1b2 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/random/custom/DistributionUniform.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/random/custom/DistributionUniform.java @@ -74,6 +74,7 @@ public class DistributionUniform extends DynamicCustomOp { AttrValue v = attributesForNode.get("dtype"); dataType = TFGraphMapper.convertType(v.getType()); addIArgument(dataType.toInt()); + addTArgument(0.0, 1.0); //TF version is hardcoded 0 to 1 } protected void addArgs() { diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdaMaxUpdater.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdaMaxUpdater.java index c398dad72..20a908f1e 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdaMaxUpdater.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdaMaxUpdater.java @@ -32,7 +32,7 @@ import java.util.Map; /** * The AdaMax updater, a variant of Adam. 
- * http://arxiv.org/abs/1412.6980 + * https://arxiv.org/abs/1412.6980 * * @author Justin Long */ diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdamUpdater.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdamUpdater.java index 8d7709873..e68af09f7 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdamUpdater.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/AdamUpdater.java @@ -30,7 +30,7 @@ import java.util.Map; /** * The Adam updater. - * http://arxiv.org/abs/1412.6980 + * https://arxiv.org/abs/1412.6980 * * @author Adam Gibson */ diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaMax.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaMax.java index 00956589a..848bb3408 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaMax.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/AdaMax.java @@ -28,7 +28,7 @@ import java.util.Map; /** * The AdaMax updater, a variant of Adam. - * http://arxiv.org/abs/1412.6980 + * https://arxiv.org/abs/1412.6980 * * @author Justin Long */ diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Adam.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Adam.java index 22ebe06f3..6901af59c 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Adam.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/learning/config/Adam.java @@ -29,7 +29,7 @@ import java.util.Map; /** * The Adam updater. 
- * http://arxiv.org/abs/1412.6980 + * https://arxiv.org/abs/1412.6980 * * @author Adam Gibson */ diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/string/NDArrayStrings.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/string/NDArrayStrings.java index c28f35151..f5c0d5c9e 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/string/NDArrayStrings.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/string/NDArrayStrings.java @@ -46,6 +46,9 @@ public class NDArrayStrings { public static final String EMPTY_ARRAY_STR = "[]"; + private static final String[] OPEN_BRACKETS = new String[]{"", "[", "[[", "[[[", "[[[[", "[[[[[", "[[[[[[", "[[[[[[[", "[[[[[[[["}; + private static final String[] CLOSE_BRACKETS = new String[]{"", "]", "]]", "]]]", "]]]]", "]]]]]", "]]]]]]", "]]]]]]]", "]]]]]]]]"}; + /** * The default number of elements for printing INDArrays (via NDArrayStrings or INDArray.toString) */ @@ -190,29 +193,29 @@ public class NDArrayStrings { private String format(INDArray arr, int offset, boolean summarize) { int rank = arr.rank(); - if (arr.isScalar()) { + if (arr.isScalar() || arr.length() == 1) { + int fRank = Math.min(rank, OPEN_BRACKETS.length-1); if (arr.isR()) { - //true scalar i.e shape = [] not legacy which is [1,1] double arrElement = arr.getDouble(0); if (!dontOverrideFormat && ((Math.abs(arrElement) < this.minToPrintWithoutSwitching && arrElement != 0) || (Math.abs(arrElement) >= this.maxToPrintWithoutSwitching))) { //switch to scientific notation String asString = localeIndifferentDecimalFormat(scientificFormat).format(arrElement); //from E to small e asString = asString.replace('E', 'e'); - return asString; + return OPEN_BRACKETS[fRank] + asString + CLOSE_BRACKETS[fRank]; } else { - if (arr.getDouble(0) == 0) return "0"; - return decimalFormat.format(arr.getDouble(0)); + if (arr.getDouble(0) == 0) return 
OPEN_BRACKETS[fRank] + "0" + CLOSE_BRACKETS[fRank]; + return OPEN_BRACKETS[fRank] + decimalFormat.format(arr.getDouble(0)) + CLOSE_BRACKETS[fRank]; } } else if (arr.isZ()) { long arrElement = arr.getLong(0); - return String.valueOf(arrElement); + return OPEN_BRACKETS[fRank] + arrElement + CLOSE_BRACKETS[fRank]; } else if (arr.isB()) { long arrElement = arr.getLong(0); - return arrElement == 0 ? "false" : "true"; + return OPEN_BRACKETS[fRank] + (arrElement == 0 ? "false" : "true") + CLOSE_BRACKETS[fRank]; } else if (arr.dataType() == DataType.UTF8){ String s = arr.getString(0); - return "\"" + s.replaceAll("\n","\\n") + "\""; + return OPEN_BRACKETS[fRank] + "\"" + s.replaceAll("\n","\\n") + "\"" + CLOSE_BRACKETS[fRank]; } else throw new ND4JIllegalStateException(); } else if (rank == 1) { @@ -246,9 +249,10 @@ public class NDArrayStrings { //hack fix for slice issue with 'f' order if (arr.ordering() == 'f' && arr.rank() > 2 && arr.size(arr.rank() - 1) == 1) { sb.append(format(arr.dup('c').slice(i), offset, summarize)); - } else if(arr.rank() <= 1 || arr.length() == 1) { - sb.append(format(Nd4j.scalar(arr.getDouble(0)),offset,summarize)); } +// else if(arr.rank() <= 1 || arr.length() == 1) { +// sb.append(format(Nd4j.scalar(arr.getDouble(0)),offset,summarize)); +// } else { sb.append(format(arr.slice(i), offset, summarize)); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml index c6017e3a7..33e54bd4a 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml @@ -34,13 +34,6 @@ ${dependency.classifier} - - org.springframework - spring-core - 5.0.2.RELEASE - test - - org.bytedeco javacpp @@ -87,73 +80,10 @@ nd4j-api ${project.version} - - junit - junit - - - org.nd4j - nd4j-jackson - ${project.version} - test - - - ch.qos.logback - logback-classic - ${logback.version} - test - - - - org.nd4j - nd4j-tensorflow - 
${project.version} - test - - - - - org.reflections - reflections - ${reflections.version} - test - - - com.google.code.findbugs - * - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - ${env.LD_LIBRARY_PATH}:${user.dir}:${libnd4jhome}/blasbuild/cpu/blas/ - - - *.java - **/*.java - **/Test*.java - **/*Test.java - **/*TestCase.java - - junit:junit - - org.nd4j.linalg.cpu.nativecpu.CpuBackend - org.nd4j.linalg.cpu.nativecpu.CpuBackend - - - -Ddtype=float -Xmx8g - - maven-compiler-plugin diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 06c061fad..0441cd3b3 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -21680,7 +21680,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif /** - * This operation performs batch normalization of layer, it is based on following article http://arxiv.org/abs/1502.03167. + * This operation performs batch normalization of layer, it is based on following article https://arxiv.org/abs/1502.03167. 
* Expected arguments: * x: input 4D array of shape [bS,iH,iW,iD] (data format = NHWC) or [bS,iD,iH,iW] (data format = NCHW), where * bS - batch size diff --git a/nd4j/nd4j-backends/nd4j-tests/pom.xml b/nd4j/nd4j-backends/nd4j-tests/pom.xml index 5f1d372ff..50fa24bf9 100644 --- a/nd4j/nd4j-backends/nd4j-tests/pom.xml +++ b/nd4j/nd4j-backends/nd4j-tests/pom.xml @@ -57,12 +57,7 @@ - - org.springframework - spring-core - 5.0.2.RELEASE - test - + junit @@ -105,10 +100,12 @@ logback-core ${logback.version} + - org.nd4j - nd4j-kafka_${scala.binary.version} - ${project.version} + org.springframework + spring-core + 5.0.2.RELEASE + test diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/BaseNd4jTest.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/BaseNd4jTest.java index a9582b6ad..c3c94e1ed 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/BaseNd4jTest.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/BaseNd4jTest.java @@ -20,16 +20,13 @@ package org.nd4j.linalg; import lombok.val; import org.bytedeco.javacpp.Pointer; import org.junit.After; -import org.junit.AfterClass; import org.junit.Before; import org.junit.Rule; import org.junit.rules.TestName; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import org.nd4j.config.ND4JEnvironmentVars; import org.nd4j.config.ND4JSystemProperties; import org.nd4j.linalg.api.buffer.DataType; -import org.nd4j.linalg.api.environment.Nd4jEnvironment; import org.nd4j.linalg.api.memory.MemoryWorkspace; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.factory.Nd4jBackend; @@ -38,7 +35,6 @@ import org.nd4j.linalg.util.ArrayUtil; import org.nd4j.nativeblas.NativeOpsHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import scala.collection.mutable.StringBuilder; import java.lang.management.ManagementFactory; import java.util.*; diff --git 
a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/ToStringTest.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/ToStringTest.java index 97d952fea..42b895f76 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/ToStringTest.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/ToStringTest.java @@ -31,6 +31,7 @@ import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.factory.Nd4jBackend; +import org.nd4j.linalg.util.ArrayUtil; @RunWith(Parameterized.class) @Slf4j @@ -57,6 +58,30 @@ public class ToStringTest extends BaseNd4jTest { Nd4j.createFromArray(1, 2, 3, 4, 5, 6, 7, 8).toString(6, true, 1)); } + @Test + public void testToStringScalars(){ + DataType[] dataTypes = new DataType[]{DataType.FLOAT, DataType.DOUBLE, DataType.BOOL, DataType.INT, DataType.UINT32}; + String[] strs = new String[]{"1.0000", "1.0000", "true", "1", "1"}; + + for(int dt=0; dt<5; dt++ ) { + for (int i = 0; i < 5; i++) { + long[] shape = ArrayUtil.nTimes(i, 1L); + INDArray scalar = Nd4j.scalar(1.0f).castTo(dataTypes[dt]).reshape(shape); + String str = scalar.toString(); + StringBuilder sb = new StringBuilder(); + for (int j = 0; j < i; j++) { + sb.append("["); + } + sb.append(strs[dt]); + for (int j = 0; j < i; j++) { + sb.append("]"); + } + String exp = sb.toString(); + assertEquals("Rank: " + i + ", DT: " + dataTypes[dt], exp, str); + } + } + } + @Override public char ordering() { return 'c'; diff --git a/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/ArrayUtil.java b/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/ArrayUtil.java index caeb0d47b..e1408e298 100644 --- a/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/ArrayUtil.java +++ b/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/ArrayUtil.java @@ -1495,7 +1495,7 @@ public class ArrayUtil { } - //Credit: 
http://stackoverflow.com/questions/15533854/converting-byte-array-to-double-array + //Credit: https://stackoverflow.com/questions/15533854/converting-byte-array-to-double-array /** * diff --git a/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/MathUtils.java b/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/MathUtils.java index a46238a7a..c32b43669 100644 --- a/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/MathUtils.java +++ b/nd4j/nd4j-common/src/main/java/org/nd4j/linalg/util/MathUtils.java @@ -107,7 +107,7 @@ public class MathUtils { } /** - * See: http://stackoverflow.com/questions/466204/rounding-off-to-nearest-power-of-2 + * See: https://stackoverflow.com/questions/466204/rounding-off-to-nearest-power-of-2 * * @param v the number to getFromOrigin the next power of 2 for * @return the next power of 2 for the passed in value diff --git a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/pom.xml b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/pom.xml index 734b1b738..21b3f6b65 100644 --- a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/pom.xml +++ b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/pom.xml @@ -29,29 +29,6 @@ nd4j-parameter-server-client - - - commons-codec - commons-codec - ${commons-codec.version} - - - org.apache.httpcomponents - httpclient - ${httpclient.version} - - - org.apache.httpcomponents - httpcore - ${httpcore.version} - - - org.apache.httpcomponents - httpmime - ${httpmime.version} - - - com.mashape.unirest unirest-java @@ -72,11 +49,6 @@ nd4j-aeron ${project.version} - - commons-io - commons-io - ${commons-io.version} - org.zeroturnaround zt-exec @@ -89,12 +61,6 @@ ${project.version} test - - org.nd4j - nd4j-parameter-server-status_2.11 - ${project.version} - test - ch.qos.logback diff --git a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/BaseNd4jTest.java 
b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/BaseNd4jTest.java index 8688671bc..36958198d 100644 --- a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/BaseNd4jTest.java +++ b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/BaseNd4jTest.java @@ -29,7 +29,6 @@ import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.memory.MemoryWorkspace; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.profiler.ProfilerConfig; -import scala.collection.mutable.StringBuilder; import java.lang.management.ManagementFactory; import java.util.List; diff --git a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/background/BackgroundDaemonStarter.java b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/background/BackgroundDaemonStarter.java index 894b20189..aa32ba514 100644 --- a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/background/BackgroundDaemonStarter.java +++ b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-client/src/test/java/org/nd4j/parameterserver/background/BackgroundDaemonStarter.java @@ -31,7 +31,7 @@ import java.util.concurrent.TimeoutException; /** * Start background daemons for tests * Credit to: - * http://stackoverflow.com/questions/636367/executing-a-java-application-in-a-separate-process + * https://stackoverflow.com/questions/636367/executing-a-java-application-in-a-separate-process * @author Adam Gibson */ @Slf4j diff --git a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-status/pom.xml b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-status/pom.xml index dd50f938e..62bb98c1c 100644 --- a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-status/pom.xml +++ 
b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server-status/pom.xml @@ -47,106 +47,11 @@ nd4j-parameter-server ${project.version} - - joda-time - joda-time - ${jodatime.version} - - - org.apache.commons - commons-lang3 - ${commons-lang3.version} - - - org.hibernate - hibernate-validator - ${hibernate.version} - - - org.scala-lang - scala-library - ${scala.version} - - - org.scala-lang - scala-reflect - ${scala.version} - - - org.yaml - snakeyaml - ${snakeyaml.version} - - - com.fasterxml.jackson.core - jackson-core - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-annotations - ${jackson.version} - - - com.fasterxml.jackson.datatype - jackson-datatype-jdk8 - ${jackson.version} - - - com.fasterxml.jackson.datatype - jackson-datatype-jsr310 - ${jackson.version} - - - com.typesafe - config - ${typesafe.config.version} - - - com.typesafe.play - play-java_2.11 - ${playframework.version} - - - ch.qos.logback - logback-core - - - ch.qos.logback - logback-classic - - - com.google.code.findbugs - jsr305 - - - org.slf4j - jul-to-slf4j - - - org.slf4j - jcl-over-slf4j - - - org.apache.tomcat - tomcat-servlet-api - - - net.jodah - typetools - - - - net.jodah - typetools - ${jodah.typetools.version} + junit + junit + test @@ -156,9 +61,39 @@ - junit - junit - test + com.typesafe.play + play-java_2.11 + ${playframework.version} + + + ch.qos.logback + logback-core + + + ch.qos.logback + logback-classic + + + com.google.code.findbugs + jsr305 + + + org.slf4j + jul-to-slf4j + + + org.slf4j + jcl-over-slf4j + + + org.apache.tomcat + tomcat-servlet-api + + + net.jodah + typetools + + diff --git a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/pom.xml b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/pom.xml index 1122f90d7..af7316a37 100644 --- a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/pom.xml +++ 
b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/pom.xml @@ -34,39 +34,6 @@ nd4j-parameter-server-model ${project.version} - - commons-codec - commons-codec - ${commons-codec.version} - - - org.apache.httpcomponents - httpclient - ${httpclient.version} - - - org.apache.httpcomponents - httpcore - ${httpcore.version} - - - org.apache.httpcomponents - httpmime - ${httpmime.version} - - - - com.mashape.unirest - unirest-java - ${unirest.version} - - - - org.nd4j - nd4j-jackson - ${project.version} - - org.slf4j slf4j-log4j12 @@ -76,16 +43,20 @@ junit junit + + org.nd4j + nd4j-aeron + ${project.version} + com.beust jcommander ${jcommander.version} - - org.nd4j - nd4j-aeron - ${project.version} + com.mashape.unirest + unirest-java + ${unirest.version} diff --git a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/src/main/java/org/nd4j/parameterserver/util/CheckSocket.java b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/src/main/java/org/nd4j/parameterserver/util/CheckSocket.java index c550176c8..de88ff27a 100644 --- a/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/src/main/java/org/nd4j/parameterserver/util/CheckSocket.java +++ b/nd4j/nd4j-parameter-server-parent/nd4j-parameter-server/src/main/java/org/nd4j/parameterserver/util/CheckSocket.java @@ -20,7 +20,7 @@ import java.io.IOException; import java.net.*; /** - * Credit: http://stackoverflow.com/questions/5226905/test-if-remote-port-is-in-use + * Credit: https://stackoverflow.com/questions/5226905/test-if-remote-port-is-in-use * * */ diff --git a/nd4j/nd4j-remote/nd4j-grpc-client/pom.xml b/nd4j/nd4j-remote/nd4j-grpc-client/pom.xml index aa60e9586..9dbdcbf24 100644 --- a/nd4j/nd4j-remote/nd4j-grpc-client/pom.xml +++ b/nd4j/nd4j-remote/nd4j-grpc-client/pom.xml @@ -74,12 +74,14 @@ ch.qos.logback logback-classic ${logback.version} + test ch.qos.logback logback-core ${logback.version} + test diff --git a/nd4j/nd4j-serde/nd4j-arrow/pom.xml b/nd4j/nd4j-serde/nd4j-arrow/pom.xml index 
4e4ba462e..f16583745 100644 --- a/nd4j/nd4j-serde/nd4j-arrow/pom.xml +++ b/nd4j/nd4j-serde/nd4j-arrow/pom.xml @@ -39,16 +39,6 @@ nd4j-api ${project.version} - - com.carrotsearch - hppc - ${hppc.version} - - - joda-time - joda-time - ${jodatime.version} - org.apache.arrow arrow-vector diff --git a/nd4j/nd4j-serde/nd4j-kryo/pom.xml b/nd4j/nd4j-serde/nd4j-kryo/pom.xml index 8d046ecf4..850413b1d 100644 --- a/nd4j/nd4j-serde/nd4j-kryo/pom.xml +++ b/nd4j/nd4j-serde/nd4j-kryo/pom.xml @@ -94,26 +94,7 @@ ${jkserializers.version} - - commons-codec - commons-codec - ${commons-codec.version} - - - commons-io - commons-io - ${commons-io.version} - - - org.apache.commons - commons-lang3 - ${commons-lang3.version} - - - org.ow2.asm - asm - ${asm.version} - + org.apache.spark spark-core_2.11 diff --git a/rl4j/rl4j-core/src/main/java/org/deeplearning4j/rl4j/learning/sync/qlearning/discrete/QLearningDiscrete.java b/rl4j/rl4j-core/src/main/java/org/deeplearning4j/rl4j/learning/sync/qlearning/discrete/QLearningDiscrete.java index ca5ddf0f2..796780fb9 100644 --- a/rl4j/rl4j-core/src/main/java/org/deeplearning4j/rl4j/learning/sync/qlearning/discrete/QLearningDiscrete.java +++ b/rl4j/rl4j-core/src/main/java/org/deeplearning4j/rl4j/learning/sync/qlearning/discrete/QLearningDiscrete.java @@ -44,7 +44,7 @@ import java.util.ArrayList; * * DQN or Deep Q-Learning in the Discrete domain * - * http://arxiv.org/abs/1312.5602 + * https://arxiv.org/abs/1312.5602 * */ public abstract class QLearningDiscrete extends QLearning { From 1eb3de90d70490ca244bd637e6c143abcf54704b Mon Sep 17 00:00:00 2001 From: raver119 Date: Thu, 14 Nov 2019 14:35:02 +0300 Subject: [PATCH 09/15] [WIP] Platform helpers switches (#44) * - platform helpers can be disabled on per-op basis now via Context::allowHelpers - java has access to it as well Signed-off-by: raver119 * global platform-helpers trigger Signed-off-by: raver119 * few signatures renamed Signed-off-by: raver119 * - few new env variables to follow - 
maxThreads/masterThreads differentiation Signed-off-by: raver119 * Javadoc update Signed-off-by: raver119 --- libnd4j/blas/Environment.cpp | 121 +++++++++++++++++- libnd4j/blas/Environment.h | 19 ++- libnd4j/blas/NativeOps.h | 1 + libnd4j/blas/cpu/NativeOps.cpp | 3 + libnd4j/blas/cuda/NativeOps.cu | 4 + libnd4j/include/execution/Threads.h | 28 ++-- libnd4j/include/graph/Context.h | 7 + libnd4j/include/graph/impl/Context.cpp | 8 ++ .../ops/declarable/impl/DeclarableOp.cpp | 15 ++- .../org/nd4j/linalg/api/ops/OpContext.java | 8 ++ .../java/org/nd4j/nativeblas/NativeOps.java | 1 + .../ops/executioner/CudaOpContext.java | 5 + .../java/org/nd4j/nativeblas/Nd4jCuda.java | 20 ++- .../cpu/nativecpu/ops/CpuOpContext.java | 5 + .../java/org/nd4j/nativeblas/Nd4jCpu.java | 15 +++ .../org/nd4j/config/ND4JEnvironmentVars.java | 33 +++++ 16 files changed, 275 insertions(+), 18 deletions(-) diff --git a/libnd4j/blas/Environment.cpp b/libnd4j/blas/Environment.cpp index 90c391cf1..de0ac925b 100644 --- a/libnd4j/blas/Environment.cpp +++ b/libnd4j/blas/Environment.cpp @@ -43,7 +43,7 @@ namespace nd4j { nd4j::Environment::Environment() { - _tadThreshold.store(8); + _tadThreshold.store(1); _elementThreshold.store(1024); _verbose.store(false); _debug.store(false); @@ -52,6 +52,7 @@ namespace nd4j { _leaks.store(false); _dataType.store(nd4j::DataType::FLOAT32); _maxThreads = std::thread::hardware_concurrency(); + _maxMasterThreads = _maxThreads.load(); #ifndef ANDROID const char* omp_threads = std::getenv("OMP_NUM_THREADS"); @@ -66,6 +67,94 @@ namespace nd4j { // still do nothing } } + + /** + * Defines size of thread pool used for parallelism + */ + const char* max_threads = std::getenv("SD_MAX_THREADS"); + if (max_threads != nullptr) { + try { + std::string t(max_threads); + int val = std::stoi(t); + _maxThreads.store(val); + } catch (std::invalid_argument &e) { + // just do nothing + } catch (std::out_of_range &e) { + // still do nothing + } + } + + /** + * Defines max number of 
threads usable at once + */ + const char* max_master_threads = std::getenv("SD_MASTER_THREADS"); + if (max_master_threads != nullptr) { + try { + std::string t(max_master_threads); + int val = std::stoi(t); + _maxMasterThreads.store(val); + } catch (std::invalid_argument &e) { + // just do nothing + } catch (std::out_of_range &e) { + // still do nothing + } + } + + /** + * If this env var is defined - we'll disallow use of platform-specific helpers (mkldnn, cudnn, etc) + */ + const char* forbid_helpers = std::getenv("SD_FORBID_HELPERS"); + if (max_master_threads != nullptr) { + _allowHelpers = false; + } + + /** + * This var defines max amount of host memory library can allocate + */ + const char* max_primary_memory = std::getenv("SD_MAX_PRIMARY_BYTES"); + if (max_primary_memory != nullptr) { + try { + std::string t(max_primary_memory); + auto val = std::stol(t); + _maxTotalPrimaryMemory.store(val); + } catch (std::invalid_argument &e) { + // just do nothing + } catch (std::out_of_range &e) { + // still do nothing + } + } + + /** + * This var defines max amount of special (i.e. device) memory library can allocate on all devices combined + */ + const char* max_special_memory = std::getenv("SD_MAX_SPECIAL_BYTES"); + if (max_special_memory != nullptr) { + try { + std::string t(max_special_memory); + auto val = std::stol(t); + _maxTotalSpecialMemory.store(val); + } catch (std::invalid_argument &e) { + // just do nothing + } catch (std::out_of_range &e) { + // still do nothing + } + } + + /** + * This var defines max amount of special (i.e. 
device) memory library can allocate on all devices combined + */ + const char* max_device_memory = std::getenv("SD_MAX_DEVICE_BYTES"); + if (max_device_memory != nullptr) { + try { + std::string t(max_device_memory); + auto val = std::stol(t); + _maxDeviceMemory.store(val); + } catch (std::invalid_argument &e) { + // just do nothing + } catch (std::out_of_range &e) { + // still do nothing + } + } #endif #ifdef __CUDABLAS__ @@ -97,6 +186,18 @@ namespace nd4j { // } + void Environment::setMaxPrimaryMemory(uint64_t maxBytes) { + _maxTotalPrimaryMemory = maxBytes; + } + + void Environment::setMaxSpecialyMemory(uint64_t maxBytes) { + _maxTotalSpecialMemory; + } + + void Environment::setMaxDeviceMemory(uint64_t maxBytes) { + _maxDeviceMemory = maxBytes; + } + Environment *Environment::getInstance() { if (_instance == 0) _instance = new Environment(); @@ -179,8 +280,16 @@ namespace nd4j { return _maxThreads.load(); } + int Environment::maxMasterThreads() { + return _maxMasterThreads.load(); + } + void Environment::setMaxThreads(int max) { - _maxThreads.store(max); + //_maxThreads.store(max); + } + + void Environment::setMaxMasterThreads(int max) { + //_maxMasterThreads = max; } bool Environment::precisionBoostAllowed() { @@ -211,6 +320,14 @@ namespace nd4j { return _blasPatchVersion; } + bool Environment::helpersAllowed() { + return _allowHelpers.load(); + } + + void Environment::allowHelpers(bool reallyAllow) { + _allowHelpers.store(reallyAllow); + } + nd4j::Environment *nd4j::Environment::_instance = 0; } diff --git a/libnd4j/blas/Environment.h b/libnd4j/blas/Environment.h index a303d27d0..54982471f 100644 --- a/libnd4j/blas/Environment.h +++ b/libnd4j/blas/Environment.h @@ -37,10 +37,18 @@ namespace nd4j{ std::atomic _debug; std::atomic _leaks; std::atomic _profile; - std::atomic _maxThreads; std::atomic _dataType; std::atomic _precBoost; std::atomic _useMKLDNN{true}; + std::atomic _allowHelpers{true}; + + std::atomic _maxThreads; + std::atomic _maxMasterThreads; + + 
// these fields hold defaults + std::atomic _maxTotalPrimaryMemory{-1}; + std::atomic _maxTotalSpecialMemory{-1}; + std::atomic _maxDeviceMemory{-1}; #ifdef __ND4J_EXPERIMENTAL__ const bool _experimental = true; @@ -74,6 +82,8 @@ namespace nd4j{ void setDebug(bool reallyDebug); void setProfiling(bool reallyProfile); void setLeaksDetector(bool reallyDetect); + bool helpersAllowed(); + void allowHelpers(bool reallyAllow); int tadThreshold(); void setTadThreshold(int threshold); @@ -84,6 +94,13 @@ namespace nd4j{ int maxThreads(); void setMaxThreads(int max); + int maxMasterThreads(); + void setMaxMasterThreads(int max); + + void setMaxPrimaryMemory(uint64_t maxBytes); + void setMaxSpecialyMemory(uint64_t maxBytes); + void setMaxDeviceMemory(uint64_t maxBytes); + bool isUseMKLDNN() { return _useMKLDNN.load(); } void setUseMKLDNN(bool useMKLDNN) { _useMKLDNN.store(useMKLDNN); } diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/blas/NativeOps.h index b10b3807a..ff368d7c8 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/blas/NativeOps.h @@ -1732,6 +1732,7 @@ typedef nd4j::graph::RandomGenerator OpaqueRandomGenerator; ND4J_EXPORT OpaqueContext* createGraphContext(int nodeId); ND4J_EXPORT OpaqueRandomGenerator* getGraphContextRandomGenerator(OpaqueContext* ptr); +ND4J_EXPORT void ctxAllowHelpers(OpaqueContext* ptr, bool reallyAllow); ND4J_EXPORT void markGraphContextInplace(OpaqueContext* ptr, bool reallyInplace); ND4J_EXPORT void setGraphContextCudaContext(OpaqueContext* ptr, void *stream, void *reductionPointer, void *allocationPointer); ND4J_EXPORT void setGraphContextInputArray(OpaqueContext* ptr, int index, void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo); diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp index 151f5c883..df6ccc240 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/blas/cpu/NativeOps.cpp @@ -2874,6 +2874,9 @@ void deleteGraphContext(nd4j::graph::Context* ptr) { delete ptr; } +void 
ctxAllowHelpers(OpaqueContext* ptr, bool reallyAllow) { + ptr->allowHelpers(reallyAllow); +} nd4j::graph::RandomGenerator* createRandomGenerator(Nd4jLong rootSeed, Nd4jLong nodeSeed) { return new nd4j::graph::RandomGenerator(rootSeed, nodeSeed); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 2af0e3783..cda6acbad 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -3558,4 +3558,8 @@ bool isMinimalRequirementsMet() { bool isOptimalRequirementsMet() { return true; +} + +void ctxAllowHelpers(OpaqueContext* ptr, bool reallyAllow) { + ptr->allowHelpers(reallyAllow); } \ No newline at end of file diff --git a/libnd4j/include/execution/Threads.h b/libnd4j/include/execution/Threads.h index 683220b61..be12a311a 100644 --- a/libnd4j/include/execution/Threads.h +++ b/libnd4j/include/execution/Threads.h @@ -107,11 +107,22 @@ namespace samediff { * @param increment * @return */ - static int parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); - - static int parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + static int parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); /** + * This function executes 1 dimensional loop for a given number of threads + * + * @param function + * @param start + * @param stop + * @param increment + * @param numThreads + * @return + */ + static int parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + + /** + * This method will execute function splitting 2 nested loops space with multiple threads * * @param function * @param numThreads @@ -123,9 +134,10 @@ namespace 
samediff { * @param inc_y * @return */ - static int parallel_for(FUNC_2D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads(), bool debug = false); + static int parallel_for(FUNC_2D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads(), bool debug = false); /** + * This method will execute function splitting 3 nested loops space with multiple threads * * @param function * @param numThreads @@ -140,7 +152,7 @@ namespace samediff { * @param inc_z * @return */ - static int parallel_for(FUNC_3D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + static int parallel_for(FUNC_3D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); /** * @@ -148,11 +160,11 @@ namespace samediff { * @param numThreads * @return */ - static int parallel_do(FUNC_DO function, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + static int parallel_do(FUNC_DO function, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); - static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); - static double parallel_double(FUNC_RD function, FUNC_AD 
aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxThreads()); + static double parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); }; } diff --git a/libnd4j/include/graph/Context.h b/libnd4j/include/graph/Context.h index f397d46f3..96079e5a2 100644 --- a/libnd4j/include/graph/Context.h +++ b/libnd4j/include/graph/Context.h @@ -58,9 +58,12 @@ namespace nd4j { std::vector _dataTypes; + // fields for fast execution (out-of-graph ops use) std::vector _fastpath_in; std::vector _fastpath_out; std::vector _handles; + + bool _helpersAllowed = true; public: Context(ContextPrototype* prototype, VariableSpace* variableSpace); @@ -188,6 +191,10 @@ namespace nd4j { void setBArguments(bool *arguments, int numberOfArguments); void setCudaContext(Nd4jPointer cudaStream, Nd4jPointer reductionPointer, Nd4jPointer allocationPointer); + + + void allowHelpers(bool reallyAllow); + bool helpersAllowed(); }; } } diff --git a/libnd4j/include/graph/impl/Context.cpp b/libnd4j/include/graph/impl/Context.cpp index 085fa969e..b18d3f347 100644 --- a/libnd4j/include/graph/impl/Context.cpp +++ b/libnd4j/include/graph/impl/Context.cpp @@ -461,6 +461,14 @@ namespace nd4j { v->setContext(_context); #endif } + + void Context::allowHelpers(bool reallyAllow) { + _helpersAllowed = reallyAllow; + } + + bool Context::helpersAllowed() { + return _helpersAllowed; + } } } diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index fe1574ea1..5ee19b007 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -506,12 +506,15 @@ namespace nd4j { Nd4jStatus status; bool hasHelper = false; - // if we have platform-specific helper for this op - invoke it - if 
(OpRegistrator::getInstance()->hasHelper(this->getOpHash())) { - auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash()); - if (helper->isUsable(*block)) { - status = helper->invokeHelper(*block); - hasHelper = true; + // platform helpers use might be forbidden for various reasons, so we'll check it out first + if (block->helpersAllowed() && nd4j::Environment::getInstance()->helpersAllowed()) { + // if we have platform-specific helper for this op - invoke it + if (OpRegistrator::getInstance()->hasHelper(this->getOpHash())) { + auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash()); + if (helper->isUsable(*block)) { + status = helper->invokeHelper(*block); + hasHelper = true; + } } } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java index cd74a60a0..e66d52f91 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/OpContext.java @@ -128,4 +128,12 @@ public interface OpContext extends AutoCloseable { * @param reallyInplace */ void markInplace(boolean reallyInplace); + + /** + * This method allows to enable/disable use of platform helpers within ops. I.e. mkldnn or cuDNN. 
+ * PLEASE NOTE: default value is True + * + * @param reallyAllow + */ + void allowHelpers(boolean reallyAllow); } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java index 8f621668b..d4a7b8f8b 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-native-api/src/main/java/org/nd4j/nativeblas/NativeOps.java @@ -1123,6 +1123,7 @@ public interface NativeOps { void setGraphContextTArguments(OpaqueContext ptr, DoublePointer arguments, int numberOfArguments); void setGraphContextIArguments(OpaqueContext ptr, LongPointer arguments, int numberOfArguments); void setGraphContextBArguments(OpaqueContext ptr, BooleanPointer arguments, int numberOfArguments); + void ctxAllowHelpers(OpaqueContext ptr, boolean reallyAllow); void deleteGraphContext(OpaqueContext ptr); OpaqueRandomGenerator createRandomGenerator(long rootSeed, long nodeSeed); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java index 32f1b0a10..b75f688fe 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaOpContext.java @@ -136,4 +136,9 @@ public class CudaOpContext extends BaseOpContext implements OpContext { public void markInplace(boolean reallyInplace) { nativeOps.markGraphContextInplace(context, reallyInplace); } + + @Override + public void allowHelpers(boolean reallyAllow) { + nativeOps.ctxAllowHelpers(context, reallyAllow); + } } diff --git 
a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index fecb64012..22b2068d4 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -1,4 +1,4 @@ -// Targeted by JavaCPP version 1.5.1-1: DO NOT EDIT THIS FILE +// Targeted by JavaCPP version 1.5.2: DO NOT EDIT THIS FILE package org.nd4j.nativeblas; @@ -575,6 +575,8 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { public native void setDebug(@Cast("bool") boolean reallyDebug); public native void setProfiling(@Cast("bool") boolean reallyProfile); public native void setLeaksDetector(@Cast("bool") boolean reallyDetect); + public native @Cast("bool") boolean helpersAllowed(); + public native void allowHelpers(@Cast("bool") boolean reallyAllow); public native int tadThreshold(); public native void setTadThreshold(int threshold); @@ -585,6 +587,13 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { public native int maxThreads(); public native void setMaxThreads(int max); + public native int maxMasterThreads(); + public native void setMaxMasterThreads(int max); + + public native void setMaxPrimaryMemory(@Cast("uint64_t") long maxBytes); + public native void setMaxSpecialyMemory(@Cast("uint64_t") long maxBytes); + public native void setMaxDeviceMemory(@Cast("uint64_t") long maxBytes); + public native @Cast("bool") boolean isUseMKLDNN(); public native void setUseMKLDNN(@Cast("bool") boolean useMKLDNN); @@ -3087,6 +3096,7 @@ public native void deleteShapeBuffer(OpaqueConstantDataBuffer ptr); public native OpaqueContext createGraphContext(int nodeId); public native OpaqueRandomGenerator getGraphContextRandomGenerator(OpaqueContext ptr); +public native void ctxAllowHelpers(OpaqueContext ptr, 
@Cast("bool") boolean reallyAllow); public native void markGraphContextInplace(OpaqueContext ptr, @Cast("bool") boolean reallyInplace); public native void setGraphContextCudaContext(OpaqueContext ptr, Pointer stream, Pointer reductionPointer, Pointer allocationPointer); public native void setGraphContextInputArray(OpaqueContext ptr, int index, Pointer buffer, Pointer shapeInfo, Pointer specialBuffer, Pointer specialShapeInfo); @@ -5454,6 +5464,10 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { + + + + @@ -6741,6 +6755,10 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { public native void setBArguments(@Cast("bool*") boolean[] arguments, int numberOfArguments); public native void setCudaContext(@Cast("Nd4jPointer") Pointer cudaStream, @Cast("Nd4jPointer") Pointer reductionPointer, @Cast("Nd4jPointer") Pointer allocationPointer); + + + public native void allowHelpers(@Cast("bool") boolean reallyAllow); + public native @Cast("bool") boolean helpersAllowed(); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java index 9431a3453..6700f9019 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/ops/CpuOpContext.java @@ -105,4 +105,9 @@ public class CpuOpContext extends BaseOpContext implements OpContext { public void markInplace(boolean reallyInplace) { nativeOps.markGraphContextInplace(context, reallyInplace); } + + @Override + public void allowHelpers(boolean reallyAllow) { + nativeOps.ctxAllowHelpers(context, reallyAllow); + } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java 
b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 0441cd3b3..d99a8240a 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -575,6 +575,8 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { public native void setDebug(@Cast("bool") boolean reallyDebug); public native void setProfiling(@Cast("bool") boolean reallyProfile); public native void setLeaksDetector(@Cast("bool") boolean reallyDetect); + public native @Cast("bool") boolean helpersAllowed(); + public native void allowHelpers(@Cast("bool") boolean reallyAllow); public native int tadThreshold(); public native void setTadThreshold(int threshold); @@ -585,6 +587,13 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { public native int maxThreads(); public native void setMaxThreads(int max); + public native int maxMasterThreads(); + public native void setMaxMasterThreads(int max); + + public native void setMaxPrimaryMemory(@Cast("uint64_t") long maxBytes); + public native void setMaxSpecialyMemory(@Cast("uint64_t") long maxBytes); + public native void setMaxDeviceMemory(@Cast("uint64_t") long maxBytes); + public native @Cast("bool") boolean isUseMKLDNN(); public native void setUseMKLDNN(@Cast("bool") boolean useMKLDNN); @@ -3087,6 +3096,7 @@ public native void deleteShapeBuffer(OpaqueConstantDataBuffer ptr); public native OpaqueContext createGraphContext(int nodeId); public native OpaqueRandomGenerator getGraphContextRandomGenerator(OpaqueContext ptr); +public native void ctxAllowHelpers(OpaqueContext ptr, @Cast("bool") boolean reallyAllow); public native void markGraphContextInplace(OpaqueContext ptr, @Cast("bool") boolean reallyInplace); public native void setGraphContextCudaContext(OpaqueContext ptr, Pointer stream, Pointer reductionPointer, Pointer 
allocationPointer); public native void setGraphContextInputArray(OpaqueContext ptr, int index, Pointer buffer, Pointer shapeInfo, Pointer specialBuffer, Pointer specialShapeInfo); @@ -6745,6 +6755,10 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { public native void setBArguments(@Cast("bool*") boolean[] arguments, int numberOfArguments); public native void setCudaContext(@Cast("Nd4jPointer") Pointer cudaStream, @Cast("Nd4jPointer") Pointer reductionPointer, @Cast("Nd4jPointer") Pointer allocationPointer); + + + public native void allowHelpers(@Cast("bool") boolean reallyAllow); + public native @Cast("bool") boolean helpersAllowed(); } @@ -11383,6 +11397,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #elif _MSC_VER // #define FORCEINLINE __forceinline // #elif __GNUC__ +// #define INLINE_LOOPS // #define FORCEINLINE __attribute__((always_inline)) inline // #elif __CUDACC__ // #else diff --git a/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java b/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java index 3bcff03f0..c77f945d0 100644 --- a/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java +++ b/nd4j/nd4j-common/src/main/java/org/nd4j/config/ND4JEnvironmentVars.java @@ -137,6 +137,39 @@ public class ND4JEnvironmentVars { */ public static final String ND4J_IGNORE_AVX = "ND4J_IGNORE_AVX"; + /** + * This variable defines how many threads will be used in ThreadPool for parallel execution of linear algebra. + * Default value: number of threads supported by this system. + */ + public static final String SD_MAX_THREADS = "SD_MAX_THREADS"; + + /** + * This variable defines how many threads will be used for any 1 linear algebra operation. + * Default value: number of threads supported by this system. + */ + public static final String SD_MASTER_THREADS = "SD_MASTER_THREADS"; + + /** + * If set, this variable disables use of optimized platform helpers (i.e. 
mkldnn or cuDNN) + */ + public static final String SD_FORBID_HELPERS = "SD_FORBID_HELPERS"; + + /** + * If set, this variable defines how much memory application is allowed to use off-heap. + * PLEASE NOTE: this option is separate from JVM XMS/XMX options + */ + public static final String SD_MAX_PRIMARY_BYTES = "SD_MAX_PRIMARY_BYTES"; + + /** + * If set, this variable defines how much memory application is allowed to use ON ALL computational devices COMBINED. + */ + public static final String SD_MAX_SPECIAL_BYTES = "SD_MAX_SPECIAL_BYTES"; + + /** + * If set, this variable defines how much memory application is allowed to use on any one computational device + */ + public static final String SD_MAX_DEVICE_BYTES = "SD_MAX_DEVICE_BYTES"; + private ND4JEnvironmentVars() { } } From c5b912bddf6b8d1c452e3bc075f5f81fb9187944 Mon Sep 17 00:00:00 2001 From: raver119 Date: Thu, 14 Nov 2019 19:50:24 +0300 Subject: [PATCH 10/15] few changes for openblas and jcpp preloads (on macos) (#46) Signed-off-by: raver119 --- .../linalg/cpu/nativecpu/CpuNDArrayFactory.java | 7 ------- .../java/org/nd4j/nativeblas/Nd4jCpuPresets.java | 14 ++++---------- 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java index a1746134c..03904125d 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/linalg/cpu/nativecpu/CpuNDArrayFactory.java @@ -84,13 +84,6 @@ public class CpuNDArrayFactory extends BaseNativeNDArrayFactory { @Override public void createBlas() { - String lib = System.getProperty(ND4JSystemProperties.ND4J_CPU_LOAD_OPENBLAS, -
System.getProperty(ND4JSystemProperties.ND4J_CPU_LOAD_OPENBLAS_NOLAPACK, "")).toLowerCase(); - if (lib.trim().length() == 0) { - // try to load by default the LAPACK-less version of MKL bundled with MKL-DNN - System.setProperty(ND4JSystemProperties.ND4J_CPU_LOAD_OPENBLAS_NOLAPACK, "mklml"); - } - // we'll check hardware support first if (!nativeOps.isMinimalRequirementsMet()) { // this means cpu binary was built for some arch support, we don't have on this box diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java index 4a99bbe3e..7f3fc8256 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java @@ -130,18 +130,12 @@ import java.util.Scanner; compiler = {"cpp11", "nowarnings"}, library = "jnind4jcpu", link = "nd4jcpu", preloadresource = {"org/bytedeco/openblas/"}, preload = {"openblas", "openblas_nolapack", "libnd4jcpu"}), - @Platform(value = "linux", preload = {"gomp@.1"}, - preloadpath = {"/lib64/", "/lib/", "/usr/lib64/", "/usr/lib/"}), - @Platform(value = {"linux-arm", "linux-ppc"}, - preload = {"gomp@.1", "gcc_s@.1", "quadmath@.0", "gfortran@.5", "gfortran@.4", "gfortran@.3", "openblas@.0", "libnd4jcpu"}), + @Platform(value = "linux", preload = {"gomp@.1"}, preloadpath = {"/lib64/", "/lib/", "/usr/lib64/", "/usr/lib/"}), + @Platform(value = {"linux-arm", "linux-ppc"}, preload = {"gomp@.1", "gcc_s@.1", "quadmath@.0", "gfortran@.5", "gfortran@.4", "gfortran@.3", "openblas@.0", "libnd4jcpu"}), @Platform(value = "linux-armhf", preloadpath = {"/usr/arm-linux-gnueabihf/lib/", "/usr/lib/arm-linux-gnueabihf/"}), @Platform(value = "linux-arm64", preloadpath = {"/usr/aarch64-linux-gnu/lib/", "/usr/lib/aarch64-linux-gnu/"}), - 
@Platform(value = "linux-ppc64", preloadpath = {"/usr/powerpc64-linux-gnu/lib/", "/usr/powerpc64le-linux-gnu/lib/", - "/usr/lib/powerpc64-linux-gnu/", "/usr/lib/powerpc64le-linux-gnu/"}), - @Platform(value = "macosx", preload = {"gcc_s@.1", "gomp@.1", "stdc++@.6"}, - preloadpath = {"/usr/local/lib/gcc/8/", "/usr/local/lib/gcc/7/", "/usr/local/lib/gcc/6/", "/usr/local/lib/gcc/5/"}), - @Platform(value = "windows", preload = {"libwinpthread-1", "libgcc_s_seh-1", "libgomp-1", "libstdc++-6", - "msvcr120", "libnd4jcpu"}), + @Platform(value = "linux-ppc64", preloadpath = {"/usr/powerpc64-linux-gnu/lib/", "/usr/powerpc64le-linux-gnu/lib/", "/usr/lib/powerpc64-linux-gnu/", "/usr/lib/powerpc64le-linux-gnu/"}), + @Platform(value = "windows", preload = {"libwinpthread-1", "libgcc_s_seh-1", "libgomp-1", "libstdc++-6", "msvcr120", "libnd4jcpu"}), @Platform(extension = {"-avx512", "-avx2"}) }) public class Nd4jCpuPresets implements InfoMapper, BuildEnabled { From 62d8e0d40981b945041ee7fc6145971f446febb1 Mon Sep 17 00:00:00 2001 From: Yurii Shyrma Date: Thu, 14 Nov 2019 19:21:22 +0200 Subject: [PATCH 11/15] - make agreement between our and mkl api dilation/padding formulas (#47) Signed-off-by: Yurii --- .../ops/declarable/helpers/convolutions.h | 48 +++++++++++++++++++ .../declarable/platform/mkldnn/deconv2d.cpp | 18 ++++--- .../declarable/platform/mkldnn/deconv3d.cpp | 15 ++++-- .../platform/mkldnn/mkldnnUtils.cpp | 18 +++++-- .../layers_tests/ConvolutionTests1.cpp | 45 ++++++++++++++--- .../layers_tests/ConvolutionTests2.cpp | 46 ++++++++++++++++++ 6 files changed, 168 insertions(+), 22 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/convolutions.h b/libnd4j/include/ops/declarable/helpers/convolutions.h index fc7c41034..68cfc8d05 100644 --- a/libnd4j/include/ops/declarable/helpers/convolutions.h +++ b/libnd4j/include/ops/declarable/helpers/convolutions.h @@ -194,6 +194,54 @@ namespace nd4j { } + static inline void calcPaddingAndDilationForConv2DMKL(const int iH, 
const int iW, const int oH, const int oW, const int kH, const int kW, const int sH, const int sW, const int isSameMode, int& pH, int& pW, int& dH, int& dW) { + + if(kH != 1) { + if(isSameMode) { + pH = (oH - 1) * sH - iH + kH - pH; + dH = dH - 1; + } + else + dH = (iH + 2*pH - (oH - 1) * sH - kH) / (kH - 1); + } + if(kW != 1) { + if(isSameMode) { + pW = (oW - 1) * sW - iW + kW - pW; + dW = dW - 1; + } + else + dW = (iW + 2*pW - (oW - 1) * sW - kW) / (kW - 1); + } + } + + static inline void calcPaddingAndDilationForConv3DMKL(const int iD, const int iH, const int iW, const int oD, const int oH, const int oW, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int isSameMode, int& pD, int& pH, int& pW, int& dD, int& dH, int& dW) { + + if(kD != 1) { + if(isSameMode) { + pD = (oD - 1) * sD - iD + kD - pD; + dD = dD - 1; + } + else + dD = (iD + 2*pD - (oD - 1) * sD - kD) / (kD - 1); + } + if(kH != 1) { + if(isSameMode) { + pH = (oH - 1) * sH - iH + kH - pH; + dH = dH - 1; + } + else + dH = (iH + 2*pH - (oH - 1) * sH - kH) / (kH - 1); + } + if(kW != 1) { + if(isSameMode) { + pW = (oW - 1) * sW - iW + kW - pW; + dW = dW - 1; + } + else + dW = (iW + 2*pW - (oW - 1) * sW - kW) / (kW - 1); + } + } + static void conv2d(nd4j::graph::Context &context, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW); // static void conv2d(nd4j::graph::Context & block, const std::vector& inArrs, NDArray* output, const std::vector& intArgs); diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp index cfd1620b0..239e243ca 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp @@ -46,10 +46,13 @@ static void deconv2dMKLDNN(const 
NDArray* input, const NDArray* weights, const N int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes ConvolutionUtils::getSizesAndIndexesConv2d(true, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH); + int dHmkl(dH), dWmkl(dW), pHmkl(pH), pWmkl(pW); + ConvolutionUtils::calcPaddingAndDilationForConv2DMKL(oH, oW, iH, iW, kH, kW, sH, sW, isSameMode, pHmkl, pWmkl, dHmkl, dWmkl); + mkldnn::memory::dims strides = { sH, sW }; - mkldnn::memory::dims dilation = { dH - 1, dW - 1}; mkldnn::memory::dims padding = { pH, pW }; - mkldnn::memory::dims padding_r = { (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW }; + mkldnn::memory::dims padding_r = { pHmkl, pWmkl }; + mkldnn::memory::dims dilation = { dHmkl, dWmkl }; // input type mkldnn::memory::data_type xType; @@ -190,11 +193,13 @@ static void deconv2dBackPropMKLDNN(const NDArray* input, const NDArray* weights, int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH; // corresponding indexes ConvolutionUtils::getSizesAndIndexesConv2d(true, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH); - mkldnn::memory::dims strides = { sH, sW }; - mkldnn::memory::dims dilation = { dH - 1, dW - 1 }; - mkldnn::memory::dims padding = { pH, pW }; - mkldnn::memory::dims padding_r = { (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW }; + int dHmkl(dH), dWmkl(dW), pHmkl(pH), pWmkl(pW); + ConvolutionUtils::calcPaddingAndDilationForConv2DMKL(oH, oW, iH, iW, kH, kW, sH, sW, isSameMode, pHmkl, pWmkl, dHmkl, dWmkl); + mkldnn::memory::dims strides = { sH, sW }; + mkldnn::memory::dims padding = { pH, pW }; + mkldnn::memory::dims padding_r = { pHmkl, pWmkl }; + mkldnn::memory::dims dilation = { dHmkl, dWmkl }; // input type mkldnn::memory::data_type xType = input->dataType() == DataType::FLOAT32 ? 
mkldnn::memory::data_type::f32 : mkldnn::memory::data_type::bf16; // weights type @@ -425,7 +430,6 @@ PLATFORM_CHECK(deconv2d) { return block.isUseMKLDNN() && ( (xType==DataType::FLOAT32 && wType==DataType::FLOAT32 && bType==DataType::FLOAT32 && zType==DataType::FLOAT32) || - (xType==DataType::HALF && wType==DataType::HALF && bType==DataType::HALF && zType==DataType::HALF ) || ((xType==DataType::UINT8 || xType==DataType::INT8) && wType==DataType::INT8 && (zType==DataType::UINT8 || zType==DataType::INT8 || zType==DataType::INT32 || zType==DataType::FLOAT32) && bType == zType) ); } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp index aab4a723a..d1d7ca87f 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp @@ -47,10 +47,13 @@ static void deconv3dMKLDNN(const NDArray* input, const NDArray* weights, const N int indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes ConvolutionUtils::getSizesAndIndexesConv3d(true, *input, *output, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWoC, indWiC, indWkD); + int dDmkl(dD), dHmkl(dH), dWmkl(dW), pDmkl(pD), pHmkl(pH), pWmkl(pW); + ConvolutionUtils::calcPaddingAndDilationForConv3DMKL(oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, isSameMode, pDmkl, pHmkl, pWmkl, dDmkl, dHmkl, dWmkl); + mkldnn::memory::dims strides = { sD, sH, sW }; - mkldnn::memory::dims dilation = { dD - 1, dH - 1, dW - 1}; mkldnn::memory::dims padding = { pD, pH, pW }; - mkldnn::memory::dims padding_r = {(iD - 1) * sD - oD + kD - pD, (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW }; + mkldnn::memory::dims padding_r = { pDmkl, pHmkl, pWmkl }; + mkldnn::memory::dims dilation = { dDmkl, dHmkl, dWmkl }; // input type mkldnn::memory::data_type xType; @@ -194,10 +197,13 @@ static void deconv3dBackPropMKLDNN(const NDArray* input, const NDArray* weights, int 
indIOioC, indIOioD, indWoC, indWiC, indWkD; // corresponding indexes ConvolutionUtils::getSizesAndIndexesConv3d(true, *input, *gradO, bS, iC, iD, iH, iW, oC, oD, oH, oW, indIOioC, indIOioD, indWoC, indWiC, indWkD); + int dDmkl(dD), dHmkl(dH), dWmkl(dW), pDmkl(pD), pHmkl(pH), pWmkl(pW); + ConvolutionUtils::calcPaddingAndDilationForConv3DMKL(oD, oH, oW, iD, iH, iW, kD, kH, kW, sD, sH, sW, isSameMode, pDmkl, pHmkl, pWmkl, dDmkl, dHmkl, dWmkl); + mkldnn::memory::dims strides = { sD, sH, sW }; - mkldnn::memory::dims dilation = { dD - 1, dH - 1, dW - 1 }; mkldnn::memory::dims padding = { pD, pH, pW }; - mkldnn::memory::dims padding_r = {(iD - 1) * sD - oD + kD - pD, (iH - 1) * sH - oH + kH - pH, (iW - 1) * sW - oW + kW - pW }; + mkldnn::memory::dims padding_r = { pDmkl, pHmkl, pWmkl }; + mkldnn::memory::dims dilation = { dDmkl, dHmkl, dWmkl }; // input type mkldnn::memory::data_type xType = input->dataType() == DataType::FLOAT32 ? mkldnn::memory::data_type::f32 : mkldnn::memory::data_type::bf16; @@ -438,7 +444,6 @@ PLATFORM_CHECK(deconv3d) { return block.isUseMKLDNN() && ( (xType==DataType::FLOAT32 && wType==DataType::FLOAT32 && bType==DataType::FLOAT32 && zType==DataType::FLOAT32) || - (xType==DataType::HALF && wType==DataType::HALF && bType==DataType::HALF && zType==DataType::HALF ) || ((xType==DataType::UINT8 || xType==DataType::INT8) && wType==DataType::INT8 && (zType==DataType::UINT8 || zType==DataType::INT8 || zType==DataType::INT32 || zType==DataType::FLOAT32) && bType == zType) ); } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp index 8a1afdedb..084fb760b 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp @@ -20,6 +20,7 @@ #include #include "mkldnnUtils.h" +#include using namespace mkldnn; @@ -154,6 +155,14 @@ namespace nd4j { mkldnn::memory::dims conv_bias_tz = { oC }; 
mkldnn::memory::dims conv_dst_tz = { bS, oC, oH, oW }; + int dHmkl(dH), dWmkl(dW), pHmkl(pH), pWmkl(pW); + nd4j::ops::ConvolutionUtils::calcPaddingAndDilationForConv2DMKL(iH, iW, oH, oW, kH, kW, sH, sW, isSameMode, pHmkl, pWmkl, dHmkl, dWmkl); + + conv_strides = { sH, sW }; + conv_padding = { pH, pW }; + conv_padding_r = { pHmkl, pWmkl }; + conv_dilation = { dHmkl, dWmkl }; + conv_strides = { sH, sW }; conv_padding = { pH, pW }; conv_dilation = { dH-1, dW-1}; @@ -234,12 +243,13 @@ namespace nd4j { mkldnn::memory::dims conv_bias_tz = { oC }; mkldnn::memory::dims conv_dst_tz = { bS, oC, oD, oH, oW }; + int dDmkl(dD), dHmkl(dH), dWmkl(dW), pDmkl(pD), pHmkl(pH), pWmkl(pW); + nd4j::ops::ConvolutionUtils::calcPaddingAndDilationForConv3DMKL(iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, isSameMode, pDmkl, pHmkl, pWmkl, dDmkl, dHmkl, dWmkl); + conv_strides = { sD, sH, sW }; - conv_dilation = { dD-1, dH-1, dW-1}; conv_padding = { pD, pH, pW }; - conv_padding_r = { (oD - 1) * sD - iD + kD - pD, - (oH - 1) * sH - iH + kH - pH, - (oW - 1) * sW - iW + kW - pW }; + conv_padding_r = { pDmkl, pHmkl, pWmkl }; + conv_dilation = { dDmkl, dHmkl, dWmkl }; auto type = mkldnn::memory::data_type::f32; auto format = isNCDHW ? 
mkldnn::memory::format_tag::ncdhw : mkldnn::memory::format_tag::ndhwc; diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index 353e51ad3..23208ce1f 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -2137,9 +2137,9 @@ TEST_F(ConvolutionTests1, deconv2d_test1) { int paddingMode = 0; // 1-SAME, 0-VALID; int dataFormat = 1; // 1-NHWC, 0-NCHW - auto input = NDArrayFactory::create('c', {bS, iH, iW, iC}); - auto weights = NDArrayFactory::create('c', {kH, kW, oC, iC}); - auto exp = NDArrayFactory::create('c', {bS, oH, oW, oC}, { 2.75, 7.75, 12.75, 17.75, 22.75, 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 27.75, 32.75, 37.75, 42.75, 47.75, + auto input = NDArrayFactory::create('c', {bS, iH, iW, iC}); + auto weights = NDArrayFactory::create('c', {kH, kW, oC, iC}); + auto exp = NDArrayFactory::create('c', {bS, oH, oW, oC}, { 2.75, 7.75, 12.75, 17.75, 22.75, 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 27.75, 32.75, 37.75, 42.75, 47.75, 55.5 , 65.5 , 75.5 , 85.5 , 95.5 ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. ,105.5 , 115.5 , 125.5 , 135.5 , 145.5 , 55.5 , 65.5 , 75.5 , 85.5 , 95.5 ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. 
,105.5 , 115.5 , 125.5 , 135.5 , 145.5 , 52.75, 57.75, 62.75, 67.75, 72.75,130.5 , 140.5 , 150.5 , 160.5 , 170.5 ,130.5 , 140.5 , 150.5 , 160.5 , 170.5 , 77.75, 82.75, 87.75, 92.75, 97.75, @@ -2170,9 +2170,9 @@ TEST_F(ConvolutionTests1, deconv2d_test2) { int paddingMode = 1; // 1-SAME, 0-VALID; int dataFormat = 1; // 1-NHWC, 0-NCHW - auto input = NDArrayFactory::create('c', {bS, oH, oW, oC}); - auto weights = NDArrayFactory::create('c', {kH, kW, iC, oC}); - auto exp = NDArrayFactory::create('c', {bS, iH, iW, iC}, {2.75, 7.75, 12.75, 17.75, 22.75, 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , + auto input = NDArrayFactory::create('c', {bS, oH, oW, oC}); + auto weights = NDArrayFactory::create('c', {kH, kW, iC, oC}); + auto exp = NDArrayFactory::create('c', {bS, iH, iW, iC}, {2.75, 7.75, 12.75, 17.75, 22.75, 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 30.5 , 40.5 , 50.5 , 60.5 , 70.5 , 55.5 , 65.5 , 75.5 , 85.5 , 95.5 ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. , 55.5 , 65.5 , 75.5 , 85.5 , 95.5 ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. , 55.5 , 65.5 , 75.5 , 85.5 , 95.5 ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. ,161. , 181. , 201. , 221. , 241. 
, @@ -2194,6 +2194,39 @@ TEST_F(ConvolutionTests1, deconv2d_test2) { delete results; } +////////////////////////////////////////////////////////////////////// +TEST_F(ConvolutionTests1, deconv2d_test3) { + + int bS=1, oH=5,oW=5, oC=3,iC=2, kH=2,kW=2, sH=1,sW=1, pH=0,pW=0, dH=2,dW=2; + int iH=3,iW=3; + int paddingMode = 0; // 1-SAME, 0-VALID; + int dataFormat = 1; // 1-NHWC, 0-NCHW + + auto input = NDArrayFactory::create('c', {bS, iH, iW, iC}); + auto weights = NDArrayFactory::create('c', {kH, kW, oC, iC}); + auto bias = NDArrayFactory::create('c', {oC}); + + auto exp = NDArrayFactory::create('c', {bS, oH, oW, oC}, {-2.9, -6.8, -10.7, -2.6, -6.1, -9.6, -16.9, -23.9, -30.9, -13.1, -16.6, -20.1, -11.6, -14.7, -17.8, -2.0, -4.7, -7.4, -1.7, -4.0, -6.3, -11.5, -16.1, + -20.7, -8.6, -10.9, -13.2, -7.1, -9.0, -10.9, -27.4, -32.8, -38.2, -24.4, -29.0, -33.6, -65.0, -74.2, -83.4, -38.2, -42.8, -47.4, + -32.8, -36.6, -40.4, -18.2, -20.9, -23.6, -15.5, -17.8, -20.1, -39.1, -43.7, -48.3, -22.4, -24.7, -27.0, -18.5, -20.4, -22.3, -10.1, -11.6, -13.1, + -7.4, -8.5, -9.6, -19.3, -21.5, -23.7, -10.7, -11.8, -12.9, -6.8, -7.5, -8.2}); + + input.linspace(-10, 0.5); + weights.linspace(0.1, 0.1); + bias = 0.2; + + nd4j::ops::deconv2d op; + auto results = op.execute({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); + ASSERT_EQ(Status::OK(), results->status()); + + auto output = results->at(0); + + ASSERT_TRUE(exp.isSameShape(output)); + ASSERT_TRUE(exp.equalsTo(output)); + + delete results; +} + ////////////////////////////////////////////////////////////////////// TYPED_TEST(TypedConvolutionTests1, deconv2d_tf_test1) { diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp index c20271dd0..836ad123b 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp @@ -567,6 +567,52 @@ TEST_F(ConvolutionTests2, deconv3d_test4) { 
delete results; } +////////////////////////////////////////////////////////////////////// +TEST_F(ConvolutionTests2, deconv3d_test5) { + + int bS=1, oD=5,oH=5,oW=5, oC=3,iC=2, kD=2,kH=2,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=2,dH=2,dW=2; + int iD=3,iH=3,iW=3; + int paddingMode = 0; // 1-SAME, 0-VALID; + int dataFormat = 1; // 1-NHWC, 0-NCHW + + auto input = NDArrayFactory::create('c', {bS, iD, iH, iW, iC}); + auto weights = NDArrayFactory::create('c', {kD, kH, kW, oC, iC}); + auto bias = NDArrayFactory::create('c', {oC}); + + auto exp = NDArrayFactory::create('c', {bS, oD, oH, oW, oC}, {-2.9, -6.8, -10.7, -2.6, -6.1, -9.6, -16.9, -23.9, -30.9, -13.1, -16.6, -20.1, -11.6, -14.7, -17.8, -2.0, -4.7, -7.4, -1.7, -4.0, -6.3, -11.5, + -16.1, -20.7, -8.6, -10.9, -13.2, -7.1, -9.0, -10.9, -27.4, -32.8, -38.2, -24.4, -29.0, -33.6, -65.0, -74.2, -83.4, -38.2, -42.8, -47.4, -32.8, + -36.6, -40.4, -18.2, -20.9, -23.6, -15.5, -17.8, -20.1, -39.1, -43.7, -48.3, -22.4, -24.7, -27.0, -18.5, -20.4, -22.3, -10.1, -11.6, -13.1, -7.4, + -8.5, -9.6, -19.3, -21.5, -23.7, -10.7, -11.8, -12.9, -6.8, -7.5, -8.2, -0.2, -0.5, -0.8, 0.1, 0.2, 0.3, -0.7, -0.5, -0.3, 0.4, 0.5, 0.6, 1.9, 2.4, + 2.9, 0.7, 1.6, 2.5, 1.0, 2.3, 3.6, 4.7, 7.3, 9.9, 4.9, 6.2, 7.5, 6.4, 8.1, 9.8, -0.4, 1.4, 3.2, 2.6, 5.2, 7.8, 10.6, 15.8, 21.0, 10.4, 13.0, 15.6, + 15.8, 19.2, 22.6, 6.1, 7.0, 7.9, 8.8, 10.1, 11.4, 20.3, 22.9, 25.5, 12.7, 14.0, 15.3, 16.6, 18.3, 20.0, 14.2, 16.3, 18.4, 16.9, 19.4, 21.9, 40.1, + 45.1, 50.1, 24.4, 26.9, 29.4, 28.3, 31.2, 34.1, -47.2, -47.8, -48.4, -41.8, -41.6, -41.4, -85.4, -85., -84.6, -41.2, -41.0, -40.8, -33.4, -32.4, -31.4, + -31., -29.2, -27.4, -25.6, -23.0, -20.4, -45.8, -40.6, -35.4, -17.8, -15.2, -12.6, -10.0, -6.6, -3.2, -65.6, -62.0, -58.4, -50.0, -44.8, -39.6, -89.2, + -78.8, -68.4, -34.4, -29.2, -24., -14.0, -7.2, -0.4, -20.2, -18.4, -16.6, -10., -7.4, -4.8, -14.6, -9.4, -4.2, -2.2, 0.4, 3.0, 10.4, 13.8, 17.2, 10.4, + 14.6, 18.8, 20.6, 25.6, 30.6, 53.8, 63.8, 73.8, 35.6, 
40.6, 45.6, 48.2, 54.0, 59.8, -3.8, -4.1, -4.4, 1.3, 1.4, 1.5, 1.7, 1.9, 2.1, 1.6, 1.7, 1.8, 7.9, + 8.4, 8.9, 11.5, 12.4, 13.3, 16.6, 17.9, 19.2, 35.9, 38.5, 41.1, 20.5, 21.8, 23.1, 26.8, 28.5, 30.2, 21.2, 23.0, 24.8, 33.8, 36.4, 39.0, 73.0, 78.2, + 83.4, 41.6, 44.2, 46.8, 56.6, 60.0, 63.4, 16.9, 17.8, 18.7, 24.4, 25.7, 27., 51.5, 54.1, 56.7, 28.3, 29.6, 30.9, 37.0, 38.7, 40.4, 39.4, 41.5, + 43.6, 46.9, 49.4, 51.9, 100.1, 105.1, 110.1, 54.4, 56.9, 59.4, 63.1, 66.0, 68.9, 42.1, 45.4, 48.7, 47.2, 50.9, 54.6, 104.3, 111.7, + 119.1, 58.3, 62.0, 65.7, 64.6, 68.7, 72.8, 57.4, 61.9, 66.4, 62.5, 67.4, 72.3, 138.5, 148.3, 158.1, 77.2, 82.1, 87.0, 83.5, 88.8, 94.1, + 134.6, 143.6, 152.6, 147.2, 157.0, 166.8, 321.4, 341.0, 360.6, 176.6, 186.4, 196.2, 191.6, 202.2, 212.8, 84.4, 88.9, + 93.4, 91.9, 96.8, 101.7, 197.3, 207.1, 216.9, 106.6, 111.5, 116.4, 115.3, 120.6, 125.9, 106.9, 112.6, 118.3, 114.4, 120.5, 126.6, 245.9, 258.1, 270.3, 132.7, 138.8, 144.9, 141.4, 147.9, 154.4}); + + input.linspace(-10, 0.5); + weights.linspace(0.1, 0.1); + bias = 0.2; + + nd4j::ops::deconv3d op; + auto results = op.execute({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); + ASSERT_EQ(Status::OK(), results->status()); + + auto output = results->at(0); + // output->printBuffer(); + + ASSERT_TRUE(exp.isSameShape(output)); + ASSERT_TRUE(exp.equalsTo(output)); + + delete results; +} + ////////////////////////////////////////////////////////////////////// TEST_F(ConvolutionTests2, deconv3d_bp_test1) { From d7718c28fb80dcf15f6843ca14f492ac11396cb3 Mon Sep 17 00:00:00 2001 From: Serhii Shepel <9946053+sshepel@users.noreply.github.com> Date: Fri, 15 Nov 2019 09:56:28 +0200 Subject: [PATCH 12/15] Drop unused profiles for artifacts distribution i8388 (#48) * Drop unused profiles for artifacts distribution * Drop skymindnexus-skil profile --- pom.xml | 127 -------------------------------------------------------- 1 file changed, 127 deletions(-) diff --git a/pom.xml 
b/pom.xml index 1f8e0dca9..f907a58ed 100644 --- a/pom.xml +++ b/pom.xml @@ -453,133 +453,6 @@ - - s3-repo - - - local.software.repository - s3-repo - - - - - s3-repo - s3-repo - s3://${s3.repo.url} - - - - - skymindnexus-skil - - - local.software.repository - skymindnexus-skil - - - - - skymindnexus - skymindnexus - https://nexus.skymind.io/repository/skil/ - - - - - local-nexus - - - local.software.repository - nexus - - - - - local-nexus - local-nexus - - http://master-jenkins.skymind.io:8088/repository/snapshots - - - - - - - maven-deploy-plugin - ${maven-deploy-plugin.version} - - true - - - - org.sonatype.plugins - nexus-staging-maven-plugin - 1.6.6 - - - default-deploy - deploy - - deploy - - - - true - - local-nexus - http://master-jenkins.skymind.io:8088/ - true - - - - - - - local-jfrog - - - local.software.repository - jfrog - - - - - local-jfrog - local-jfrog - http://master-jenkins.skymind.io:8081/artifactory/libs-snapshot-local - - - - local-jfrog - local-jfrog - http://master-jenkins.skymind.io:8081/artifactory/libs-release-local - - - - - - Bintray-artifactory - - - local.software.repository - bintray - - - - - bintray-deeplearning4j-maven - deeplearning4j-maven-snapshots - https://oss.jfrog.org/artifactory/oss-snapshot-local - - - bintray-deeplearning4j-maven - deeplearning4j-maven-releases - - https://api.bintray.com/maven/deeplearning4j/maven/${project.artifactId}/;publish=1 - - - - sonatype-nexus From 1780dcc8839c2ffed59fd325804ce42e77342dae Mon Sep 17 00:00:00 2001 From: raver119 Date: Fri, 15 Nov 2019 17:04:29 +0300 Subject: [PATCH 13/15] [WIP] Small fixes here and there (#50) * one range test Signed-off-by: raver119 * few Context convenience singatures Signed-off-by: raver119 * one more range test Signed-off-by: raver119 * "range" "fix" Signed-off-by: raver119 * adjuct_contrast_v2 now allows scale factor to be provided via input_variable Signed-off-by: raver119 * adjust_contrast now allows scale factor as variable too Signed-off-by: raver119 * 
bitcast shape tests Signed-off-by: raver119 * BitCast import dtype added Signed-off-by: raver119 * few more BitCast signatures Signed-off-by: raver119 --- libnd4j/include/graph/Context.h | 4 +++ libnd4j/include/graph/impl/Context.cpp | 15 +++++++++ .../generic/parity_ops/adjust_contrast.cpp | 24 ++++++++------ .../ops/declarable/headers/parity_ops.h | 4 +-- .../layers_tests/DeclarableOpsTests16.cpp | 25 +++++++++++++++ .../nd4j/linalg/api/ops/custom/BitCast.java | 29 +++++++++++++++++ .../ops/executioner/CudaExecutioner.java | 2 +- .../java/org/nd4j/nativeblas/Nd4jCuda.java | 9 ++++++ .../java/org/nd4j/nativeblas/Nd4jCpu.java | 9 ++++++ .../nd4j/linalg/custom/CustomOpsTests.java | 32 +++++++++++++++++++ 10 files changed, 140 insertions(+), 13 deletions(-) diff --git a/libnd4j/include/graph/Context.h b/libnd4j/include/graph/Context.h index 96079e5a2..d5b85b543 100644 --- a/libnd4j/include/graph/Context.h +++ b/libnd4j/include/graph/Context.h @@ -190,6 +190,10 @@ namespace nd4j { void setIArguments(Nd4jLong *arguments, int numberOfArguments); void setBArguments(bool *arguments, int numberOfArguments); + void setTArguments(const std::vector &tArgs); + void setIArguments(const std::vector &tArgs); + void setBArguments(const std::vector &tArgs); + void setCudaContext(Nd4jPointer cudaStream, Nd4jPointer reductionPointer, Nd4jPointer allocationPointer); diff --git a/libnd4j/include/graph/impl/Context.cpp b/libnd4j/include/graph/impl/Context.cpp index b18d3f347..146e66067 100644 --- a/libnd4j/include/graph/impl/Context.cpp +++ b/libnd4j/include/graph/impl/Context.cpp @@ -469,6 +469,21 @@ namespace nd4j { bool Context::helpersAllowed() { return _helpersAllowed; } + + void Context::setTArguments(const std::vector &tArgs) { + for (auto t:tArgs) + _tArgs.emplace_back(t); + } + + void Context::setIArguments(const std::vector &iArgs) { + for (auto i:iArgs) + _iArgs.emplace_back(i); + } + + void Context::setBArguments(const std::vector &bArgs) { + for (auto b:bArgs) + 
_bArgs.emplace_back(b); + } } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp index 538214b14..cc11eedca 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp @@ -27,12 +27,14 @@ namespace nd4j { namespace ops { -CONFIGURABLE_OP_IMPL(adjust_contrast, 1, 1, true, 1, 0) { +CONFIGURABLE_OP_IMPL(adjust_contrast, 1, 1, true, -2, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - const double factor = T_ARG(0); + REQUIRE_TRUE(block.numT() > 0 || block.width() > 1, 0, "ADJUST_CONTRAST: Scale factor required"); + + const double factor = block.width() > 1 ? INPUT_VARIABLE(1)->e(0) : T_ARG(0); REQUIRE_TRUE(input->rankOf() > 2, 0, "ADJUST_CONTRAST: op expects rank of input array to be >= 3, but got %i instead", input->rankOf()); REQUIRE_TRUE(input->sizeAt(-1) == 3, 0, "ADJUST_CONTRAST: operation expects image with 3 channels (R, G, B), but got %i instead", input->sizeAt(-1)); @@ -59,15 +61,17 @@ DECLARE_TYPES(adjust_contrast) { } - CONFIGURABLE_OP_IMPL(adjust_contrast_v2, 1, 1, true, 1, 0) { + CONFIGURABLE_OP_IMPL(adjust_contrast_v2, 1, 1, true, -2, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - const double factor = T_ARG(0); + REQUIRE_TRUE(block.numT() > 0 || block.width() > 1, 0, "ADJUST_CONTRAST_V2: Scale factor required"); - REQUIRE_TRUE(input->rankOf() > 2, 0, "ADJUST_CONTRAST: op expects rank of input array to be >= 3, but got %i instead", input->rankOf()); - REQUIRE_TRUE(input->sizeAt(-1) == 3, 0, "ADJUST_CONTRAST: operation expects image with 3 channels (R, G, B), but got %i instead", input->sizeAt(-1)); + const double factor = block.width() > 1 ? 
INPUT_VARIABLE(1)->e(0) : T_ARG(0); + + REQUIRE_TRUE(input->rankOf() > 2, 0, "ADJUST_CONTRAST_V2: op expects rank of input array to be >= 3, but got %i instead", input->rankOf()); + REQUIRE_TRUE(input->sizeAt(-1) == 3, 0, "ADJUST_CONTRAST_V2: operation expects image with 3 channels (R, G, B), but got %i instead", input->sizeAt(-1)); // compute mean before std::vector axes(input->rankOf() - 1); @@ -78,10 +82,10 @@ DECLARE_TYPES(adjust_contrast) { auto mean = input->reduceAlongDims(reduce::Mean, axes); // result as (x - mean) * factor + mean - std::unique_ptr temp(input->dup()); - input->applyTrueBroadcast(BroadcastOpsTuple::Subtract(), &mean, temp.get()); - temp->applyScalar(scalar::Multiply, factor); - temp->applyTrueBroadcast(BroadcastOpsTuple::Add(), &mean, output); + auto temp = input->ulike(); + input->applyTrueBroadcast(BroadcastOpsTuple::Subtract(), &mean, &temp); + temp.applyScalar(scalar::Multiply, factor); + temp.applyTrueBroadcast(BroadcastOpsTuple::Add(), &mean, output); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h index 3660ee229..590d99308 100644 --- a/libnd4j/include/ops/declarable/headers/parity_ops.h +++ b/libnd4j/include/ops/declarable/headers/parity_ops.h @@ -610,8 +610,8 @@ namespace nd4j { * */ #if NOT_EXCLUDED(OP_adjust_contrast) - DECLARE_CONFIGURABLE_OP(adjust_contrast, 1, 1, true, 1, 0); - DECLARE_CONFIGURABLE_OP(adjust_contrast_v2, 1, 1, true, 1, 0); + DECLARE_CONFIGURABLE_OP(adjust_contrast, 1, 1, true, -2, 0); + DECLARE_CONFIGURABLE_OP(adjust_contrast_v2, 1, 1, true, -2, 0); #endif diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index 1a459a012..d29d1f0e1 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -161,4 +161,29 @@ TEST_F(DeclarableOpsTests16, test_empty_cast_1) { ASSERT_EQ(e, 
*result->at(0)); delete result; +} + +TEST_F(DeclarableOpsTests16, test_range_1) { + nd4j::ops::range op; + auto z = NDArrayFactory::create('c', {200}); + + Context ctx(1); + ctx.setTArguments({-1.0, 1.0, 0.01}); + ctx.setOutputArray(0, &z); + + auto status = op.execute(&ctx); + ASSERT_EQ(Status::OK(), status); +} + +TEST_F(DeclarableOpsTests16, test_range_2) { + nd4j::ops::range op; + auto z = NDArrayFactory::create('c', {200}); + + double tArgs[] = {-1.0, 1.0, 0.01}; + + auto shapes = ::calculateOutputShapes2(nullptr, op.getOpHash(), nullptr, nullptr, 0, tArgs, 3, nullptr, 0, nullptr, 0); + shape::printShapeInfoLinear("Result", shapes->at(0)); + ASSERT_TRUE(shape::shapeEquals(z.shapeInfo(), shapes->at(0))); + + delete shapes; } \ No newline at end of file diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/custom/BitCast.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/custom/BitCast.java index ee0adfb94..43bff11e6 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/custom/BitCast.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/custom/BitCast.java @@ -1,25 +1,54 @@ package org.nd4j.linalg.api.ops.custom; +import lombok.val; import org.nd4j.autodiff.samediff.SDVariable; import org.nd4j.autodiff.samediff.SameDiff; +import org.nd4j.imports.graphmapper.tf.TFGraphMapper; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.ops.DynamicCustomOp; +import org.nd4j.linalg.api.shape.options.ArrayOptionsHelper; import org.nd4j.linalg.factory.Nd4j; +import org.tensorflow.framework.AttrValue; +import org.tensorflow.framework.GraphDef; +import org.tensorflow.framework.NodeDef; + +import java.util.Map; public class BitCast extends DynamicCustomOp { public BitCast() {} + public BitCast(INDArray in, DataType dataType, INDArray out) { + this(in, 
dataType.toInt(), out); + } + public BitCast(INDArray in, int dataType, INDArray out) { inputArguments.add(in); outputArguments.add(out); iArguments.add(Long.valueOf(dataType)); } + public BitCast(INDArray in, DataType dataType) { + this(in, dataType.toInt()); + } + + public BitCast(INDArray in, int dataType) { + inputArguments.add(in); + iArguments.add(Long.valueOf(dataType)); + } + public BitCast(SameDiff sameDiff, SDVariable in, SDVariable dataType) { super("", sameDiff, new SDVariable[]{in, dataType}); } + @Override + public void initFromTensorFlow(NodeDef nodeDef, SameDiff initWith, Map attributesForNode, GraphDef graph) { + TFGraphMapper.initFunctionFromProperties(nodeDef.getOp(), this, attributesForNode, nodeDef, graph); + val t = nodeDef.getAttrOrDefault("type", null); + val type = ArrayOptionsHelper.convertToDataType(t.getType()); + addIArgument(type.toInt()); + } + @Override public String opName() { return "bitcast"; diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java index 8fe744b38..20f2b5f22 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java @@ -2226,7 +2226,7 @@ public class CudaExecutioner extends DefaultOpExecutioner { cnt = 0; for (val t: op.tArgs()) - tArgs.put(cnt++, (float) t); + tArgs.put(cnt++, t); OpaqueShapeList ptrptr = nativeOps.calculateOutputShapes2(null, hash, inputBuffers, inputShapes, op.inputArguments().length, tArgs, op.tArgs().length, iArgs, op.iArgs().length, bArgs, op.numBArguments()); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java 
b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index 22b2068d4..e8b5e15c9 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -6754,6 +6754,15 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { public native void setBArguments(@Cast("bool*") BooleanPointer arguments, int numberOfArguments); public native void setBArguments(@Cast("bool*") boolean[] arguments, int numberOfArguments); + public native void setTArguments(@StdVector DoublePointer tArgs); + public native void setTArguments(@StdVector DoubleBuffer tArgs); + public native void setTArguments(@StdVector double[] tArgs); + public native void setIArguments(@Cast("Nd4jLong*") @StdVector LongPointer tArgs); + public native void setIArguments(@Cast("Nd4jLong*") @StdVector LongBuffer tArgs); + public native void setIArguments(@Cast("Nd4jLong*") @StdVector long[] tArgs); + public native void setBArguments(@Cast("bool*") @StdVector BooleanPointer tArgs); + public native void setBArguments(@Cast("bool*") @StdVector boolean[] tArgs); + public native void setCudaContext(@Cast("Nd4jPointer") Pointer cudaStream, @Cast("Nd4jPointer") Pointer reductionPointer, @Cast("Nd4jPointer") Pointer allocationPointer); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index d99a8240a..e2e9b0c2f 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -6754,6 +6754,15 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { public native void setBArguments(@Cast("bool*") BooleanPointer arguments, int 
numberOfArguments); public native void setBArguments(@Cast("bool*") boolean[] arguments, int numberOfArguments); + public native void setTArguments(@StdVector DoublePointer tArgs); + public native void setTArguments(@StdVector DoubleBuffer tArgs); + public native void setTArguments(@StdVector double[] tArgs); + public native void setIArguments(@Cast("Nd4jLong*") @StdVector LongPointer tArgs); + public native void setIArguments(@Cast("Nd4jLong*") @StdVector LongBuffer tArgs); + public native void setIArguments(@Cast("Nd4jLong*") @StdVector long[] tArgs); + public native void setBArguments(@Cast("bool*") @StdVector BooleanPointer tArgs); + public native void setBArguments(@Cast("bool*") @StdVector boolean[] tArgs); + public native void setCudaContext(@Cast("Nd4jPointer") Pointer cudaStream, @Cast("Nd4jPointer") Pointer reductionPointer, @Cast("Nd4jPointer") Pointer allocationPointer); diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java index ad38f39d7..556405c14 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/custom/CustomOpsTests.java @@ -931,4 +931,36 @@ public class CustomOpsTests extends BaseNd4jTest { Nd4j.exec(new KnnMinDistance(point, lowest, highest, distance)); System.out.println(distance); } + + + @Test + public void testRange(){ + DynamicCustomOp op = DynamicCustomOp.builder("range") + .addFloatingPointArguments(-1.0, 1.0, 0.01) + .build(); + + List lsd = op.calculateOutputShape(); + //System.out.println("Calculated output shape: " + Arrays.toString(lsd.get(0).getShape())); + op.setOutputArgument(0, Nd4j.create(lsd.get(0))); + + Nd4j.exec(op); + } + + @Test + public void testBitCastShape_1(){ + val out = Nd4j.createUninitialized(1,10); + BitCast op = new BitCast(Nd4j.zeros(DataType.FLOAT,1,10), 
DataType.INT.toInt(), out); + List lsd = op.calculateOutputShape(); + assertEquals(1, lsd.size()); + assertArrayEquals(new long[]{1,10}, lsd.get(0).getShape()); + } + + @Test + public void testBitCastShape_2(){ + val out = Nd4j.createUninitialized(1,10); + BitCast op = new BitCast(Nd4j.zeros(DataType.DOUBLE,1,10), DataType.INT.toInt(), out); + List lsd = op.calculateOutputShape(); + assertEquals(1, lsd.size()); + assertArrayEquals(new long[]{1,10, 2}, lsd.get(0).getShape()); + } } From 09a827fb6dcea3f810bd857483e84b4e28501784 Mon Sep 17 00:00:00 2001 From: Alex Black Date: Sat, 16 Nov 2019 17:04:29 +1100 Subject: [PATCH 14/15] Fixes and pre-release QA (#51) * #8395 Keras import - support scaled identity weight init Signed-off-by: AlexDBlack * More Keras scaled weight init fixes Signed-off-by: AlexDBlack * #8352 Deprecate duplicate SamplingDataSetIterator class Signed-off-by: AlexDBlack * Remove /O2 optimization for faster CUDA build Signed-off-by: AlexDBlack * Tweak regression test precision for CUDA Signed-off-by: AlexDBlack * Fix edge cases for buffer creation Signed-off-by: AlexDBlack * Update MKLDNN validation tests to new helper enable/disable settings Signed-off-by: AlexDBlack * Delete debugging class Signed-off-by: AlexDBlack * MKLDNN test - add proper skip for CUDA backend Signed-off-by: AlexDBlack * Align WeightInitUtil with weight init classes Signed-off-by: AlexDBlack * Fix for SameDiff test layers weight init when using IWeightInit classes Signed-off-by: AlexDBlack --- .../LayerHelperValidationUtil.java | 56 ++++++++- .../org/deeplearning4j/TestBatchNormBp.java | 107 ------------------ .../testlayers/MinimalSameDiffDense.java | 15 ++- .../samediff/testlayers/SameDiffConv.java | 14 ++- .../samediff/testlayers/SameDiffDense.java | 12 +- .../nn/mkldnn/ValidateMKLDNN.java | 2 + .../regressiontest/RegressionTest100b4.java | 8 +- .../iterator/SamplingDataSetIterator.java | 98 +--------------- .../nn/modelimport/keras/Hdf5Archive.java | 2 +- 
.../keras/KerasSequentialModel.java | 1 - .../advanced/activations/KerasPReLU.java | 10 +- .../KerasAtrousConvolution1D.java | 10 +- .../KerasAtrousConvolution2D.java | 9 +- .../convolutional/KerasConvolution.java | 2 - .../convolutional/KerasConvolution1D.java | 10 +- .../convolutional/KerasConvolution2D.java | 10 +- .../convolutional/KerasConvolution3D.java | 10 +- .../convolutional/KerasDeconvolution2D.java | 10 +- .../KerasDepthwiseConvolution2D.java | 10 +- .../KerasSeparableConvolution2D.java | 16 +-- .../convolutional/KerasUpsampling3D.java | 1 - .../convolutional/KerasZeroPadding3D.java | 1 - .../keras/layers/core/KerasDense.java | 10 +- .../keras/layers/core/KerasFlatten.java | 1 - .../keras/layers/core/KerasRepeatVector.java | 1 - .../keras/layers/core/KerasReshape.java | 2 - .../layers/embeddings/KerasEmbedding.java | 10 +- .../layers/local/KerasLocallyConnected1D.java | 11 +- .../layers/local/KerasLocallyConnected2D.java | 16 +-- .../KerasBatchNormalization.java | 1 - .../keras/layers/recurrent/KerasLSTM.java | 15 +-- .../layers/recurrent/KerasSimpleRnn.java | 15 +-- .../sequence/TimeSeriesGenerator.java | 3 - .../KerasFlattenRnnPreprocessor.java | 3 +- .../preprocessors/ReshapePreprocessor.java | 4 +- ...ensorFlowCnnToFeedForwardPreProcessor.java | 2 +- .../keras/utils/DL4JKerasModelValidator.java | 13 --- .../keras/utils/KerasActivationUtils.java | 1 - .../keras/utils/KerasInitilizationUtils.java | 91 +++++++-------- .../keras/utils/KerasModelUtils.java | 1 - .../nn/modelimport/keras/KerasTestUtils.java | 2 - .../nn/modelimport/keras/MiscTests.java | 2 - .../configurations/FullModelComparisons.java | 2 - .../Keras1ModelConfigurationTest.java | 1 - .../Keras2ModelConfigurationTest.java | 2 - .../KerasInitilizationTest.java | 6 +- .../configurations/KerasModelImportTest.java | 6 - .../keras/e2e/KerasLambdaTest.java | 1 - .../keras/e2e/KerasModelEndToEndTest.java | 23 ++-- .../keras/e2e/KerasYolo9000PredictTest.java | 4 - 
.../keras/e2e/KerasYolo9000Test.java | 1 - .../KerasAtrousConvolution1DTest.java | 5 - .../convolution/KerasConvolution3DTest.java | 4 - .../convolution/KerasCropping1DTest.java | 1 - .../convolution/KerasCropping3DTest.java | 2 - .../KerasDepthwiseConvolution2DTest.java | 4 - .../convolution/KerasUpsampling1DTest.java | 4 - .../convolution/KerasUpsampling2DTest.java | 2 - .../convolution/KerasZeroPadding3DTest.java | 2 - .../keras/layers/core/KerasDenseTest.java | 5 - .../keras/layers/core/KerasPermuteTest.java | 6 +- .../keras/layers/core/KerasReshapeTest.java | 2 +- .../layers/embeddings/KerasEmbeddingTest.java | 6 +- .../local/KerasLocallyConnected1DTest.java | 3 - .../local/KerasLocallyConnected2DTest.java | 9 +- .../layers/pooling/KerasPooling3DTest.java | 1 - .../keras/layers/recurrent/KerasLSTMTest.java | 9 +- .../keras/optimizers/OptimizerImport.java | 5 - .../TimeSeriesGeneratorImportTest.java | 2 - .../text/TokenizerImportTest.java | 6 +- .../preprocessing/text/TokenizerTest.java | 1 - .../weights/KerasWeightSettingTests.java | 1 - .../conf/layers/samediff/SameDiffLayer.java | 17 ++- .../nn/weights/WeightInitIdentity.java | 23 +++- .../nn/weights/WeightInitUtil.java | 8 +- .../WeightInitVarScalingNormalFanAvg.java | 23 +++- .../WeightInitVarScalingNormalFanIn.java | 27 ++++- .../WeightInitVarScalingNormalFanOut.java | 24 +++- .../WeightInitVarScalingUniformFanAvg.java | 14 ++- .../WeightInitVarScalingUniformFanIn.java | 17 ++- .../WeightInitVarScalingUniformFanOut.java | 16 ++- libnd4j/CMakeLists.txt | 2 +- .../java/org/nd4j/linalg/api/shape/Shape.java | 7 ++ .../api/iterator/SamplingDataSetIterator.java | 7 -- .../java/org/nd4j/linalg/factory/Nd4j.java | 23 +--- 85 files changed, 378 insertions(+), 574 deletions(-) delete mode 100644 deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/LayerHelperValidationUtil.java 
b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/LayerHelperValidationUtil.java index 59ef8c28e..e3923c4ff 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/LayerHelperValidationUtil.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/LayerHelperValidationUtil.java @@ -35,6 +35,7 @@ import org.nd4j.linalg.indexing.conditions.Conditions; import org.nd4j.linalg.ops.transforms.Transforms; import java.lang.reflect.Field; +import java.lang.reflect.Method; import java.util.*; import static org.junit.Assert.*; @@ -63,6 +64,30 @@ public class LayerHelperValidationUtil { private DataSetIterator data; } + public static void disableCppHelpers(){ + try { + Class c = Class.forName("org.nd4j.nativeblas.Nd4jCpu$Environment"); + Method m = c.getMethod("getInstance"); + Object instance = m.invoke(null); + Method m2 = c.getMethod("allowHelpers", boolean.class); + m2.invoke(instance, false); + } catch (Throwable t){ + throw new RuntimeException(t); + } + } + + public static void enableCppHelpers(){ + try{ + Class c = Class.forName("org.nd4j.nativeblas.Nd4jCpu$Environment"); + Method m = c.getMethod("getInstance"); + Object instance = m.invoke(null); + Method m2 = c.getMethod("allowHelpers", boolean.class); + m2.invoke(instance, true); + } catch (Throwable t){ + throw new RuntimeException(t); + } + } + public static void validateMLN(MultiLayerNetwork netOrig, TestCase t){ assertNotNull(t.getAllowHelpersForClasses()); assertFalse(t.getAllowHelpersForClasses().isEmpty()); @@ -95,7 +120,13 @@ public class LayerHelperValidationUtil { for (boolean train : new boolean[]{false, true}) { assertEquals(net1NoHelper.params(), net2With.params()); String s = "Feed forward test - " + t.getTestName() + " - " + (train ? 
"Train: " : "Test: "); - List ff1 = net1NoHelper.feedForward(t.getFeatures(), train); + List ff1; + try { + disableCppHelpers(); + ff1 = net1NoHelper.feedForward(t.getFeatures(), train); + } finally { + enableCppHelpers(); + } List ff2 = net2With.feedForward(t.getFeatures(), train); List paramKeys = new ArrayList<>(net1NoHelper.paramTable().keySet()); Collections.sort(paramKeys); @@ -131,7 +162,13 @@ public class LayerHelperValidationUtil { log.info("Forward pass, max relative error: " + layerName + " - " + maxRE); } - INDArray out1 = net1NoHelper.output(t.getFeatures(), train); + INDArray out1; + try { + disableCppHelpers(); + out1 = net1NoHelper.output(t.getFeatures(), train); + } finally { + enableCppHelpers(); + } INDArray out2 = net2With.output(t.getFeatures(), train); INDArray relError = relError(out1, out2, t.getMinAbsError()); double maxRE = relError.maxNumber().doubleValue(); @@ -148,7 +185,13 @@ public class LayerHelperValidationUtil { Preconditions.checkNotNull(t.getLabels(), "Labels are not set (null)"); log.info("Validation - checking scores"); - double s1 = net1NoHelper.score(new DataSet(t.getFeatures(), t.getLabels())); + double s1; + try { + disableCppHelpers(); + s1 = net1NoHelper.score(new DataSet(t.getFeatures(), t.getLabels())); + } finally { + enableCppHelpers(); + } double s2 = net2With.score(new DataSet(t.getFeatures(), t.getLabels())); double re = relError(s1, s2); @@ -168,7 +211,12 @@ public class LayerHelperValidationUtil { net2With.setInput(t.getFeatures()); net2With.setLabels(t.getLabels()); - net1NoHelper.computeGradientAndScore(); + try { + disableCppHelpers(); + net1NoHelper.computeGradientAndScore(); + } finally { + enableCppHelpers(); + } net2With.computeGradientAndScore(); List paramKeys = new ArrayList<>(net1NoHelper.paramTable().keySet()); diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java 
deleted file mode 100644 index f34ce65f0..000000000 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/TestBatchNormBp.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.deeplearning4j; - -import org.deeplearning4j.nn.conf.MultiLayerConfiguration; -import org.deeplearning4j.nn.conf.NeuralNetConfiguration; -import org.deeplearning4j.nn.conf.WorkspaceMode; -import org.deeplearning4j.nn.conf.layers.BatchNormalization; -import org.deeplearning4j.nn.gradient.Gradient; -import org.deeplearning4j.nn.layers.mkldnn.MKLDNNBatchNormHelper; -import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; -import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; -import org.junit.Test; -import org.nd4j.linalg.api.buffer.DataType; -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.api.ops.DynamicCustomOp; -import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.primitives.Pair; - -import java.lang.reflect.Field; - -import static junit.framework.TestCase.*; - -public class TestBatchNormBp { - - @Test - public void test(){ - Nd4j.getRandom().setSeed(12345); -// INDArray in = Nd4j.rand(DataType.FLOAT, 1, 3, 4, 4); - INDArray in = Nd4j.rand(DataType.FLOAT, 1, 3, 15, 15); - INDArray mean = in.mean(0, 2, 3); //Nd4j.rand(DataType.FLOAT, 3); - INDArray var = in.var(0, 2, 3); //Nd4j.rand(DataType.FLOAT, 3); - INDArray eps = Nd4j.rand(DataType.FLOAT, in.shape()); -// INDArray gamma = Nd4j.ones(DataType.FLOAT, 3); -// INDArray beta = Nd4j.zeros(DataType.FLOAT, 3); - INDArray gamma = Nd4j.rand(DataType.FLOAT, 3); - INDArray beta = Nd4j.rand(DataType.FLOAT, 3); - double e = 1e-5; - - INDArray dLdIn = in.ulike(); - INDArray dLdm = mean.ulike(); - INDArray dLdv = var.ulike(); - INDArray dLdg = gamma.ulike(); - INDArray dLdb = beta.ulike(); - - DynamicCustomOp op = DynamicCustomOp.builder("batchnorm_bp") - .addInputs(in, mean, var, eps, gamma, beta) - .addIntegerArguments( - 1, //Apply scale - 1, //Apply beta - 1) //Axis (NCHW) - 
.addFloatingPointArguments(e) - .addOutputs(dLdIn, dLdm, dLdv, dLdg, dLdb) - .build(); - - Nd4j.exec(op); - System.out.println(dLdIn); - } - - @Test - public void compareImpls() throws Exception { - - Nd4j.getRandom().setSeed(12345); - INDArray in = Nd4j.rand(DataType.FLOAT, 1, 3, 15, 15); - INDArray mean = in.mean(0, 2, 3).reshape(1,3); - INDArray var = in.var(0, 2, 3).reshape(1,3); - INDArray eps = Nd4j.rand(DataType.FLOAT, in.shape()); - INDArray gamma = Nd4j.rand(DataType.FLOAT, 1,3); - INDArray beta = Nd4j.rand(DataType.FLOAT, 1,3); - double e = 1e-3; - - INDArray dLdIn = in.ulike(); - INDArray dLdm = mean.ulike(); - INDArray dLdv = var.ulike(); - INDArray dLdg = gamma.ulike(); - INDArray dLdb = beta.ulike(); - - - MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() - .inferenceWorkspaceMode(WorkspaceMode.NONE) - .trainingWorkspaceMode(WorkspaceMode.NONE) - .list() - .layer(new BatchNormalization.Builder().nIn(3).nOut(3).build()) - .build(); - MultiLayerNetwork net = new MultiLayerNetwork(conf); - net.init(); - org.deeplearning4j.nn.layers.normalization.BatchNormalization bn = (org.deeplearning4j.nn.layers.normalization.BatchNormalization) net.getLayer(0); - assertNotNull(bn.getHelper()); - Field f = bn.getClass().getDeclaredField("helper"); - f.setAccessible(true); - f.set(bn, null); - assertNull(bn.getHelper()); - - - MKLDNNBatchNormHelper h = new MKLDNNBatchNormHelper(DataType.FLOAT); - - net.output(in, true); - bn.setInput(in, LayerWorkspaceMgr.noWorkspaces()); - Pair p = net.backpropGradient(eps, LayerWorkspaceMgr.noWorkspaces()); - - h.preOutput(in, true, new long[]{1,3}, gamma, beta, mean, var, 0.5, e, LayerWorkspaceMgr.noWorkspaces()); - Pair pmkl = h.backpropGradient(in, eps, new long[]{1,3}, gamma, beta, dLdg, dLdb, e, LayerWorkspaceMgr.noWorkspaces()); - - INDArray dldin_dl4j = p.getSecond(); - - System.out.println("dl4j == mkldnn: " + p.getSecond().equals(pmkl.getSecond())); - } - -} diff --git 
a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/MinimalSameDiffDense.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/MinimalSameDiffDense.java index 1b8e7ded9..9cbbccaa7 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/MinimalSameDiffDense.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/MinimalSameDiffDense.java @@ -70,8 +70,19 @@ public class MinimalSameDiffDense extends SameDiffLayer { @Override public void initializeParameters(Map params) { - params.get(DefaultParamInitializer.BIAS_KEY).assign(0); - initWeights(nIn, nOut, weightInit, params.get(DefaultParamInitializer.WEIGHT_KEY)); + String b = DefaultParamInitializer.BIAS_KEY; + if(paramWeightInit != null && paramWeightInit.containsKey(b)){ + paramWeightInit.get(b).init(nIn, nOut, params.get(b).shape(), 'c', params.get(b)); + } else { + params.get(DefaultParamInitializer.BIAS_KEY).assign(0); + } + + String w = DefaultParamInitializer.WEIGHT_KEY; + if(paramWeightInit != null && paramWeightInit.containsKey(w)){ + paramWeightInit.get(w).init(nIn, nOut, params.get(w).shape(), 'c', params.get(w)); + } else { + initWeights(nIn, nOut, weightInit, params.get(DefaultParamInitializer.WEIGHT_KEY)); + } } //OPTIONAL methods: diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffConv.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffConv.java index 778b95dc7..1be09182c 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffConv.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffConv.java @@ -109,13 +109,17 @@ public class SameDiffConv extends 
SameDiffLayer { @Override public void initializeParameters(Map params) { try(MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) { + double fanIn = nIn * kernel[0] * kernel[1]; + double fanOut = nOut * kernel[0] * kernel[1] / ((double) stride[0] * stride[1]); for (Map.Entry e : params.entrySet()) { - if (ConvolutionParamInitializer.BIAS_KEY.equals(e.getKey())) { - e.getValue().assign(0); + if(paramWeightInit != null && paramWeightInit.containsKey(e.getKey())){ + paramWeightInit.get(e.getKey()).init(fanIn, fanOut, e.getValue().shape(), 'c', e.getValue()); } else { - double fanIn = nIn * kernel[0] * kernel[1]; - double fanOut = nOut * kernel[0] * kernel[1] / ((double) stride[0] * stride[1]); - WeightInitUtil.initWeights(fanIn, fanOut, e.getValue().shape(), weightInit, null, 'c', e.getValue()); + if (ConvolutionParamInitializer.BIAS_KEY.equals(e.getKey())) { + e.getValue().assign(0); + } else { + WeightInitUtil.initWeights(fanIn, fanOut, e.getValue().shape(), weightInit, null, 'c', e.getValue()); + } } } } diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java index 3da6e8f1c..630b6059c 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/layers/samediff/testlayers/SameDiffDense.java @@ -88,11 +88,15 @@ public class SameDiffDense extends SameDiffLayer { @Override public void initializeParameters(Map params){ for(Map.Entry e : params.entrySet()){ - if(DefaultParamInitializer.BIAS_KEY.equals(e.getKey())){ - e.getValue().assign(0.0); + if(paramWeightInit != null && paramWeightInit.containsKey(e.getKey())){ + paramWeightInit.get(e.getKey()).init(nIn, nOut, e.getValue().shape(), 'c', e.getValue()); } else { - 
//Normally use 'c' order, but use 'f' for direct comparison to DL4J DenseLayer - WeightInitUtil.initWeights(nIn, nOut, new long[]{nIn, nOut}, weightInit, null, 'f', e.getValue()); + if(DefaultParamInitializer.BIAS_KEY.equals(e.getKey())){ + e.getValue().assign(0.0); + } else { + //Normally use 'c' order, but use 'f' for direct comparison to DL4J DenseLayer + WeightInitUtil.initWeights(nIn, nOut, new long[]{nIn, nOut}, weightInit, null, 'f', e.getValue()); + } } } } diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java index f65e48f44..7013311ba 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/mkldnn/ValidateMKLDNN.java @@ -50,6 +50,7 @@ import static org.junit.Assume.assumeTrue; public class ValidateMKLDNN extends BaseDL4JTest { + @Test public void validateConvSubsampling() throws Exception { //Only run test if using nd4j-native backend @@ -268,6 +269,7 @@ public class ValidateMKLDNN extends BaseDL4JTest { @Test public void compareBatchNormBackward() throws Exception { + assumeTrue(Nd4j.getBackend().getClass().getName().toLowerCase().contains("native")); Nd4j.getRandom().setSeed(12345); INDArray in = Nd4j.rand(DataType.FLOAT, 1, 3, 15, 15); diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java index d1112899f..a4883ea07 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/regressiontest/RegressionTest100b4.java @@ -339,7 +339,13 @@ public class 
RegressionTest100b4 extends BaseDL4JTest { INDArray outAct = net.output(in); - assertEquals(outExp, outAct); + //19 layers - CPU vs. GPU difference accumulates notably, but appears to be correct + if(Nd4j.getBackend().getClass().getName().toLowerCase().contains("native")){ + assertEquals(outExp, outAct); + } else { + boolean eq = outExp.equalsWithEps(outAct, 0.1); + assertTrue(eq); + } } @Test diff --git a/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/src/main/java/org/deeplearning4j/datasets/iterator/SamplingDataSetIterator.java b/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/src/main/java/org/deeplearning4j/datasets/iterator/SamplingDataSetIterator.java index 62ee85407..32e4c61d3 100755 --- a/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/src/main/java/org/deeplearning4j/datasets/iterator/SamplingDataSetIterator.java +++ b/deeplearning4j/deeplearning4j-data/deeplearning4j-utility-iterators/src/main/java/org/deeplearning4j/datasets/iterator/SamplingDataSetIterator.java @@ -24,101 +24,11 @@ import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; import java.util.List; /** - * A wrapper for a dataset to sample from. - * This will randomly sample from the given dataset. 
- * @author Adam GIbson + * @deprecated Use {@link org.nd4j.linalg.dataset.api.iterator.SamplingDataSetIterator} */ -public class SamplingDataSetIterator implements DataSetIterator { - - /** - * - */ - private static final long serialVersionUID = -2700563801361726914L; - private DataSet sampleFrom; - private int batchSize; - private int totalNumberSamples; - private int numTimesSampled; - @Getter - private DataSetPreProcessor preProcessor; - - /** - * - * @param sampleFrom the dataset to sample from - * @param batchSize the batch size to sample - * @param totalNumberSamples the sample size - */ +@Deprecated +public class SamplingDataSetIterator extends org.nd4j.linalg.dataset.api.iterator.SamplingDataSetIterator { public SamplingDataSetIterator(DataSet sampleFrom, int batchSize, int totalNumberSamples) { - super(); - this.sampleFrom = sampleFrom; - this.batchSize = batchSize; - this.totalNumberSamples = totalNumberSamples; + super(sampleFrom, batchSize, totalNumberSamples); } - - @Override - public boolean hasNext() { - return numTimesSampled < totalNumberSamples; - } - - @Override - public DataSet next() { - DataSet ret = sampleFrom.sample(batchSize); - numTimesSampled += batchSize; - return ret; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - @Override - public int inputColumns() { - return sampleFrom.numInputs(); - } - - @Override - public int totalOutcomes() { - return sampleFrom.numOutcomes(); - } - - @Override - public boolean resetSupported() { - return true; - } - - @Override - public boolean asyncSupported() { - return true; - } - - @Override - public void reset() { - numTimesSampled = 0; - } - - @Override - public int batch() { - return batchSize; - } - - @Override - public void setPreProcessor(DataSetPreProcessor preProcessor) { - this.preProcessor = preProcessor; - } - - @Override - public List getLabels() { - return null; - } - - - @Override - public DataSet next(int num) { - DataSet ret = 
sampleFrom.sample(num); - numTimesSampled++; - return ret; - } - - - } diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/Hdf5Archive.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/Hdf5Archive.java index 83d138d5c..a5ea8efca 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/Hdf5Archive.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/Hdf5Archive.java @@ -17,6 +17,7 @@ package org.deeplearning4j.nn.modelimport.keras; import lombok.extern.slf4j.Slf4j; +import org.bytedeco.hdf5.*; import org.bytedeco.javacpp.BytePointer; import org.bytedeco.javacpp.FloatPointer; import org.bytedeco.javacpp.Loader; @@ -32,7 +33,6 @@ import java.lang.Exception; import java.util.ArrayList; import java.util.List; -import org.bytedeco.hdf5.*; import static org.bytedeco.hdf5.global.hdf5.*; /** diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasSequentialModel.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasSequentialModel.java index 529cf729c..d163c0776 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasSequentialModel.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/KerasSequentialModel.java @@ -17,7 +17,6 @@ package org.deeplearning4j.nn.modelimport.keras; import lombok.extern.slf4j.Slf4j; -import org.deeplearning4j.nn.api.layers.IOutputLayer; import org.deeplearning4j.nn.conf.BackpropType; import org.deeplearning4j.nn.conf.InputPreProcessor; import org.deeplearning4j.nn.conf.MultiLayerConfiguration; diff --git 
a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/advanced/activations/KerasPReLU.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/advanced/activations/KerasPReLU.java index 15de6fc53..8877d8b5a 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/advanced/activations/KerasPReLU.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/advanced/activations/KerasPReLU.java @@ -18,7 +18,6 @@ package org.deeplearning4j.nn.modelimport.keras.layers.advanced.activations; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.PReLULayer; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; @@ -27,9 +26,8 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils; import org.deeplearning4j.nn.params.PReLUParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.primitives.Pair; import org.nd4j.linalg.util.ArrayUtil; import java.util.HashMap; @@ -79,14 +77,12 @@ public class KerasPReLU extends KerasLayer { LayerConstraint weightConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, ALPHA_CONSTRAINT, conf, kerasMajorVersion); - Pair init = getWeightInitFromConfig(layerConfig, ALPHA_INIT, + IWeightInit init = getWeightInitFromConfig(layerConfig, ALPHA_INIT, enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = 
init.getFirst(); - Distribution distribution = init.getSecond(); long[] axes = getSharedAxes(layerConfig); PReLULayer.Builder builder = new PReLULayer.Builder().sharedAxes(axes) - .weightInit(weightInit.getWeightInitFunction(distribution)).name(layerName); + .weightInit(init).name(layerName); if (weightConstraint != null){ builder.constrainWeights(weightConstraint); } diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution1D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution1D.java index b7fa269f7..d7a4ab699 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution1D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution1D.java @@ -17,14 +17,12 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolutional; import org.deeplearning4j.nn.api.layers.LayerConstraint; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.Convolution1DLayer; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; -import org.deeplearning4j.nn.weights.WeightInit; -import org.nd4j.linalg.primitives.Pair; +import org.deeplearning4j.nn.weights.IWeightInit; import java.util.Map; @@ -83,15 +81,13 @@ public class KerasAtrousConvolution1D extends KerasConvolution { LayerConstraint weightConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_W_CONSTRAINT(), conf, 
kerasMajorVersion); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); Convolution1DLayer.Builder builder = new Convolution1DLayer.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) + .weightInit(init) .dilation(getDilationRate(layerConfig, 1, conf, true)[0]) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution2D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution2D.java index aa602bb3c..dd374992a 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution2D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasAtrousConvolution2D.java @@ -17,14 +17,12 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolutional; import org.deeplearning4j.nn.api.layers.LayerConstraint; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.ConvolutionLayer; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import 
org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; -import org.deeplearning4j.nn.weights.WeightInit; -import org.nd4j.linalg.primitives.Pair; +import org.deeplearning4j.nn.weights.IWeightInit; import java.util.Map; @@ -84,14 +82,13 @@ public class KerasAtrousConvolution2D extends KerasConvolution { LayerConstraint weightConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_W_CONSTRAINT(), conf, kerasMajorVersion); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); ConvolutionLayer.Builder builder = new ConvolutionLayer.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction()) + .weightInit(init) .dilation(getDilationRate(layerConfig, 2, conf, true)) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution.java index c4e66f6ef..f1d2f0210 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution.java @@ -21,7 +21,6 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.ArrayUtils; import 
org.deeplearning4j.nn.modelimport.keras.KerasLayer; -import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.params.ConvolutionParamInitializer; @@ -30,7 +29,6 @@ import org.nd4j.linalg.factory.Nd4j; import java.util.HashMap; import java.util.Map; -import java.util.Set; import static org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils.removeDefaultWeights; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution1D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution1D.java index 33512eb33..3da88d3b1 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution1D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution1D.java @@ -22,7 +22,6 @@ import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.ArrayUtils; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.Convolution1DLayer; import org.deeplearning4j.nn.conf.layers.InputTypeUtil; @@ -30,10 +29,9 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurat import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import 
org.deeplearning4j.nn.params.ConvolutionParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.primitives.Pair; import java.util.HashMap; import java.util.Map; @@ -94,15 +92,13 @@ public class KerasConvolution1D extends KerasConvolution { LayerConstraint weightConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_W_CONSTRAINT(), conf, kerasMajorVersion); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); Convolution1DLayer.Builder builder = new Convolution1DLayer.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) + .weightInit(init) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) .kernelSize(getKernelSizeFromConfig(layerConfig, 1, conf, kerasMajorVersion)[0]) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution2D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution2D.java index 3c1d9f7d2..e9c74e78c 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution2D.java +++ 
b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution2D.java @@ -21,14 +21,12 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.ConvolutionLayer; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; -import org.deeplearning4j.nn.weights.WeightInit; -import org.nd4j.linalg.primitives.Pair; +import org.deeplearning4j.nn.weights.IWeightInit; import java.util.Map; @@ -87,10 +85,8 @@ public class KerasConvolution2D extends KerasConvolution { numTrainableParams = hasBias ? 
2 : 1; int[] dilationRate = getDilationRate(layerConfig, 2, conf, false); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); LayerConstraint biasConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_B_CONSTRAINT(), conf, kerasMajorVersion); @@ -100,7 +96,7 @@ public class KerasConvolution2D extends KerasConvolution { ConvolutionLayer.Builder builder = new ConvolutionLayer.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) + .weightInit(init) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) .kernelSize(getKernelSizeFromConfig(layerConfig, 2, conf, kerasMajorVersion)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution3D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution3D.java index 8da12a726..ccd776306 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution3D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasConvolution3D.java @@ -21,15 +21,13 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; 
import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.Convolution3D; import org.deeplearning4j.nn.conf.layers.ConvolutionLayer; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; -import org.deeplearning4j.nn.weights.WeightInit; -import org.nd4j.linalg.primitives.Pair; +import org.deeplearning4j.nn.weights.IWeightInit; import java.util.Map; @@ -88,10 +86,8 @@ public class KerasConvolution3D extends KerasConvolution { numTrainableParams = hasBias ? 2 : 1; int[] dilationRate = getDilationRate(layerConfig, 3, conf, false); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); LayerConstraint biasConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_B_CONSTRAINT(), conf, kerasMajorVersion); @@ -101,7 +97,7 @@ public class KerasConvolution3D extends KerasConvolution { Convolution3D.Builder builder = new Convolution3D.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) + .weightInit(init) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) .kernelSize(getKernelSizeFromConfig(layerConfig, 3, conf, kerasMajorVersion)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDeconvolution2D.java 
b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDeconvolution2D.java index 33e02ae6f..92d9f3af8 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDeconvolution2D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDeconvolution2D.java @@ -20,14 +20,12 @@ import lombok.Data; import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.Deconvolution2D; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; -import org.deeplearning4j.nn.weights.WeightInit; -import org.nd4j.linalg.primitives.Pair; +import org.deeplearning4j.nn.weights.IWeightInit; import java.util.Map; @@ -86,10 +84,8 @@ public class KerasDeconvolution2D extends KerasConvolution { numTrainableParams = hasBias ? 
2 : 1; int[] dilationRate = getDilationRate(layerConfig, 2, conf, false); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); LayerConstraint biasConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_B_CONSTRAINT(), conf, kerasMajorVersion); @@ -99,7 +95,7 @@ public class KerasDeconvolution2D extends KerasConvolution { Deconvolution2D.Builder builder = new Deconvolution2D.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) + .weightInit(init) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) .kernelSize(getKernelSizeFromConfig(layerConfig, 2, conf, kerasMajorVersion)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDepthwiseConvolution2D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDepthwiseConvolution2D.java index f27d3ff08..c72de75a6 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDepthwiseConvolution2D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasDepthwiseConvolution2D.java @@ -21,7 +21,6 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import lombok.val; import org.deeplearning4j.nn.api.layers.LayerConstraint; -import org.deeplearning4j.nn.conf.distribution.Distribution; 
import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.DepthwiseConvolution2D; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; @@ -30,9 +29,8 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.modelimport.keras.utils.KerasRegularizerUtils; import org.deeplearning4j.nn.params.SeparableConvolutionParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.primitives.Pair; import java.util.Collections; import java.util.HashMap; @@ -126,10 +124,8 @@ public class KerasDepthwiseConvolution2D extends KerasConvolution { numTrainableParams = hasBias ? 2 : 1; int[] dilationRate = getDilationRate(layerConfig, 2, conf, false); - Pair depthWiseInit = getWeightInitFromConfig(layerConfig, + IWeightInit depthWiseInit = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_DEPTH_WISE_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit depthWeightInit = depthWiseInit.getFirst(); - Distribution depthDistribution = depthWiseInit.getSecond(); val nIn = getNInFromConfig(previousLayers); @@ -152,7 +148,7 @@ public class KerasDepthwiseConvolution2D extends KerasConvolution { .nIn(nIn) .nOut(nIn * depthMultiplier) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(depthWeightInit.getWeightInitFunction(depthDistribution)) + .weightInit(depthWiseInit) .depthMultiplier(depthMultiplier) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasSeparableConvolution2D.java 
b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasSeparableConvolution2D.java index 67eba9bf1..cd052bbb7 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasSeparableConvolution2D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasSeparableConvolution2D.java @@ -20,7 +20,6 @@ import lombok.Data; import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.SeparableConvolution2D; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; @@ -28,9 +27,8 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.modelimport.keras.utils.KerasRegularizerUtils; import org.deeplearning4j.nn.params.SeparableConvolutionParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.primitives.Pair; import java.util.HashMap; import java.util.Map; @@ -93,17 +91,13 @@ public class KerasSeparableConvolution2D extends KerasConvolution { int depthMultiplier = getDepthMultiplier(layerConfig, conf); - Pair depthWiseInit = getWeightInitFromConfig(layerConfig, + IWeightInit depthWiseInit = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_DEPTH_WISE_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit depthWeightInit = depthWiseInit.getFirst(); - Distribution depthDistribution = depthWiseInit.getSecond(); - Pair pointWiseInit = 
getWeightInitFromConfig(layerConfig, + IWeightInit pointWiseInit = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_POINT_WISE_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit pointWeightInit = pointWiseInit.getFirst(); - Distribution pointDistribution = pointWiseInit.getSecond(); - if (depthWeightInit != pointWeightInit || depthDistribution != pointDistribution) + if ( !depthWiseInit.getClass().equals(pointWiseInit.getClass()) ) if (enforceTrainingConfig) throw new UnsupportedKerasConfigurationException( "Specifying different initialization for depth- and point-wise weights not supported."); @@ -126,7 +120,7 @@ public class KerasSeparableConvolution2D extends KerasConvolution { SeparableConvolution2D.Builder builder = new SeparableConvolution2D.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(depthWeightInit.getWeightInitFunction(depthDistribution)) + .weightInit(depthWiseInit) .depthMultiplier(depthMultiplier) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasUpsampling3D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasUpsampling3D.java index a9c1054f1..98aabb3ee 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasUpsampling3D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasUpsampling3D.java @@ -17,7 +17,6 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolutional; import org.deeplearning4j.nn.conf.inputs.InputType; -import 
org.deeplearning4j.nn.conf.layers.Upsampling2D; import org.deeplearning4j.nn.conf.layers.Upsampling3D; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasZeroPadding3D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasZeroPadding3D.java index 387b826f5..7c840d301 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasZeroPadding3D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/convolutional/KerasZeroPadding3D.java @@ -21,7 +21,6 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.ZeroPadding3DLayer; -import org.deeplearning4j.nn.conf.layers.ZeroPaddingLayer; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java index d840370d8..296b5dabf 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDense.java @@ -21,7 +21,6 @@ import lombok.EqualsAndHashCode; 
import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.DenseLayer; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; @@ -29,9 +28,8 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurat import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.params.DefaultParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.primitives.Pair; import java.util.HashMap; import java.util.Map; @@ -95,15 +93,13 @@ public class KerasDense extends KerasLayer { LayerConstraint weightConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_W_CONSTRAINT(), conf, kerasMajorVersion); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); DenseLayer.Builder builder = new DenseLayer.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)) .dropOut(this.dropout).activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) + .weightInit(init) .biasInit(0.0) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .hasBias(hasBias); diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasFlatten.java 
b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasFlatten.java index d2aeb75c3..e0a6628a2 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasFlatten.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasFlatten.java @@ -22,7 +22,6 @@ import org.deeplearning4j.nn.conf.InputPreProcessor; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.inputs.InputType.InputTypeConvolutional; import org.deeplearning4j.nn.conf.preprocessor.CnnToFeedForwardPreProcessor; -import org.deeplearning4j.nn.conf.preprocessor.RnnToFeedForwardPreProcessor; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasRepeatVector.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasRepeatVector.java index 45f9ddadd..41254e221 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasRepeatVector.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasRepeatVector.java @@ -18,7 +18,6 @@ package org.deeplearning4j.nn.modelimport.keras.layers.core; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.conf.inputs.InputType; -import org.deeplearning4j.nn.conf.layers.DropoutLayer; import org.deeplearning4j.nn.conf.layers.misc.RepeatVector; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; import 
org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshape.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshape.java index 1275cf5a9..6a5e1ff2a 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshape.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshape.java @@ -18,7 +18,6 @@ package org.deeplearning4j.nn.modelimport.keras.layers.core; import lombok.val; -import org.apache.commons.lang3.ArrayUtils; import org.deeplearning4j.nn.conf.InputPreProcessor; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; @@ -26,7 +25,6 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurat import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.preprocessors.ReshapePreprocessor; import org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils; -import org.nd4j.linalg.util.ArrayUtil; import java.util.List; import java.util.Map; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java index 2a34f707c..1ee13c0b0 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbedding.java @@ -21,7 +21,6 @@ import 
lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.EmbeddingSequenceLayer; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; @@ -30,11 +29,10 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils; import org.deeplearning4j.nn.params.DefaultParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.activations.Activation; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.primitives.Pair; import java.util.HashMap; import java.util.Map; @@ -106,10 +104,8 @@ public class KerasEmbedding extends KerasLayer { "in DL4J, apply masking as a pre-processing step to your input." 
+ "See http://deeplearning4j.org/docs/latest/deeplearning4j-nn-recurrent#masking for more on this."); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_EMBEDDING_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_EMBEDDING_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); LayerConstraint embeddingConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_EMBEDDINGS_CONSTRAINT(), conf, kerasMajorVersion); @@ -121,7 +117,7 @@ public class KerasEmbedding extends KerasLayer { .inferInputLength(inferInputLength) .nOut(getNOutFromConfig(layerConfig, conf)) .dropOut(this.dropout).activation(Activation.IDENTITY) - .weightInit(weightInit.getWeightInitFunction(distribution)) + .weightInit(init) .biasInit(0.0) .l1(this.weightL1Regularization) .l2(this.weightL2Regularization) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1D.java index f08e462ca..d6fed55fe 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1D.java @@ -21,7 +21,6 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.LocallyConnected1D; import 
org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; @@ -29,9 +28,8 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasConvolution; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.params.ConvolutionParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.primitives.Pair; import java.util.HashMap; import java.util.Map; @@ -90,11 +88,8 @@ public class KerasLocallyConnected1D extends KerasConvolution { numTrainableParams = hasBias ? 2 : 1; int[] dilationRate = getDilationRate(layerConfig, 1, conf, false); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - // TODO: take care of distribution and bias init - //Distribution distribution = init.getSecond(); LayerConstraint biasConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_B_CONSTRAINT(), conf, kerasMajorVersion); @@ -104,7 +99,7 @@ public class KerasLocallyConnected1D extends KerasConvolution { LocallyConnected1D.Builder builder = new LocallyConnected1D.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit) + .weightInit(conf.getKERAS_PARAM_NAME_W(), init) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) .kernelSize(getKernelSizeFromConfig(layerConfig, 1, conf, kerasMajorVersion)[0]) diff --git 
a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2D.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2D.java index 5c2ab641b..550c20d01 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2D.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2D.java @@ -21,7 +21,6 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.LocallyConnected2D; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; @@ -29,9 +28,8 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasConvolution; import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.params.ConvolutionParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.primitives.Pair; import java.util.HashMap; import java.util.Map; @@ -39,9 +37,7 @@ import java.util.Map; import static org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasConvolutionUtils.*; import static org.deeplearning4j.nn.modelimport.keras.utils.KerasActivationUtils.getActivationFromConfig; import static org.deeplearning4j.nn.modelimport.keras.utils.KerasInitilizationUtils.getWeightInitFromConfig; -import static 
org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils.getHasBiasFromConfig; -import static org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils.getNOutFromConfig; -import static org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils.removeDefaultWeights; +import static org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils.*; /** @@ -92,11 +88,9 @@ public class KerasLocallyConnected2D extends KerasConvolution { numTrainableParams = hasBias ? 2 : 1; int[] dilationRate = getDilationRate(layerConfig, 2, conf, false); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - // TODO: take care of distribution and bias init - //Distribution distribution = init.getSecond(); + // TODO: take care of bias init LayerConstraint biasConstraint = KerasConstraintUtils.getConstraintsFromConfig( layerConfig, conf.getLAYER_FIELD_B_CONSTRAINT(), conf, kerasMajorVersion); @@ -106,7 +100,7 @@ public class KerasLocallyConnected2D extends KerasConvolution { LocallyConnected2D.Builder builder = new LocallyConnected2D.Builder().name(this.layerName) .nOut(getNOutFromConfig(layerConfig, conf)).dropOut(this.dropout) .activation(getActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit) + .weightInit(conf.getKERAS_PARAM_NAME_W(), init) .l1(this.weightL1Regularization).l2(this.weightL2Regularization) .convolutionMode(getConvolutionModeFromConfig(layerConfig, conf)) .kernelSize(getKernelSizeFromConfig(layerConfig, 2, conf, kerasMajorVersion)) diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/normalization/KerasBatchNormalization.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/normalization/KerasBatchNormalization.java index 
ff8d4d91f..7f7d8dc4c 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/normalization/KerasBatchNormalization.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/normalization/KerasBatchNormalization.java @@ -31,7 +31,6 @@ import org.deeplearning4j.nn.params.BatchNormalizationParamInitializer; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Set; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java index f04752936..7d5603261 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTM.java @@ -22,7 +22,6 @@ import lombok.extern.slf4j.Slf4j; import lombok.val; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.InputTypeUtil; import org.deeplearning4j.nn.conf.layers.LSTM; @@ -35,7 +34,7 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils; import org.deeplearning4j.nn.params.LSTMParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import 
org.nd4j.linalg.activations.IActivation; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; @@ -151,15 +150,11 @@ public class KerasLSTM extends KerasLayer { throws InvalidKerasConfigurationException, UnsupportedKerasConfigurationException { super(layerConfig, enforceTrainingConfig); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); - Pair recurrentInit = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INNER_INIT(), + IWeightInit recurrentInit = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INNER_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit recurrentWeightInit = recurrentInit.getFirst(); - Distribution recurrentDistribution = recurrentInit.getSecond(); boolean hasBias = getHasBiasFromConfig(layerConfig, conf); @@ -186,8 +181,8 @@ public class KerasLSTM extends KerasLayer { .nOut(getNOutFromConfig(layerConfig, conf)) .dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) - .weightInitRecurrent(recurrentWeightInit.getWeightInitFunction(recurrentDistribution)) + .weightInit(init) + .weightInitRecurrent(recurrentInit) .biasInit(0.0) // TODO: this is incorrect .l1(this.weightL1Regularization) .l2(this.weightL2Regularization); diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java index 615405fae..6f5edf597 100644 --- 
a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasSimpleRnn.java @@ -21,7 +21,6 @@ import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import org.deeplearning4j.nn.api.layers.LayerConstraint; import org.deeplearning4j.nn.conf.InputPreProcessor; -import org.deeplearning4j.nn.conf.distribution.Distribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.InputTypeUtil; import org.deeplearning4j.nn.conf.layers.Layer; @@ -34,7 +33,7 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfig import org.deeplearning4j.nn.modelimport.keras.utils.KerasConstraintUtils; import org.deeplearning4j.nn.modelimport.keras.utils.KerasLayerUtils; import org.deeplearning4j.nn.params.SimpleRnnParamInitializer; -import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.IWeightInit; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.primitives.Pair; @@ -124,15 +123,11 @@ public class KerasSimpleRnn extends KerasLayer { throws InvalidKerasConfigurationException, UnsupportedKerasConfigurationException { super(layerConfig, enforceTrainingConfig); - Pair init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), + IWeightInit init = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit weightInit = init.getFirst(); - Distribution distribution = init.getSecond(); - Pair recurrentInit = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INNER_INIT(), + IWeightInit recurrentInit = getWeightInitFromConfig(layerConfig, conf.getLAYER_FIELD_INNER_INIT(), enforceTrainingConfig, conf, kerasMajorVersion); - WeightInit recurrentWeightInit = recurrentInit.getFirst(); - 
Distribution recurrentDistribution = recurrentInit.getSecond(); Map innerConfig = KerasLayerUtils.getInnerLayerConfigFromConfig(layerConfig, conf); this.returnSequences = (Boolean) innerConfig.get(conf.getLAYER_FIELD_RETURN_SEQUENCES()); @@ -154,8 +149,8 @@ public class KerasSimpleRnn extends KerasLayer { .nOut(getNOutFromConfig(layerConfig, conf)) .dropOut(this.dropout) .activation(getIActivationFromConfig(layerConfig, conf)) - .weightInit(weightInit.getWeightInitFunction(distribution)) - .weightInitRecurrent(recurrentWeightInit.getWeightInitFunction(recurrentDistribution)) + .weightInit(init) + .weightInitRecurrent(recurrentInit) .biasInit(0.0) .l1(this.weightL1Regularization) .l2(this.weightL2Regularization); diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGenerator.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGenerator.java index 94498b976..2a81886e0 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGenerator.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGenerator.java @@ -20,9 +20,7 @@ import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; import lombok.Data; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; -import org.deeplearning4j.nn.modelimport.keras.preprocessing.text.KerasTokenizer; import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.api.ops.DynamicCustomOp; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.indexing.INDArrayIndex; import org.nd4j.linalg.indexing.NDArrayIndex; @@ -31,7 +29,6 @@ import org.nd4j.linalg.primitives.Pair; import java.io.IOException; import java.nio.file.Files; import 
java.nio.file.Paths; -import java.util.HashMap; import java.util.List; import java.util.Map; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/KerasFlattenRnnPreprocessor.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/KerasFlattenRnnPreprocessor.java index 3e18ebe3e..25aa73a06 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/KerasFlattenRnnPreprocessor.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/KerasFlattenRnnPreprocessor.java @@ -22,9 +22,8 @@ import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.inputs.InvalidInputTypeException; import org.deeplearning4j.nn.conf.preprocessor.BaseInputPreProcessor; import org.deeplearning4j.nn.workspace.ArrayType; -import org.nd4j.linalg.api.ndarray.INDArray; import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; -import org.nd4j.linalg.api.shape.Shape; +import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.shade.jackson.annotation.JsonProperty; /** diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/ReshapePreprocessor.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/ReshapePreprocessor.java index f94adf713..77c6369c5 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/ReshapePreprocessor.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/ReshapePreprocessor.java @@ -19,17 +19,15 @@ package org.deeplearning4j.nn.modelimport.keras.preprocessors; import lombok.Data; import lombok.EqualsAndHashCode; import 
lombok.extern.slf4j.Slf4j; - import lombok.val; import org.apache.commons.lang3.ArrayUtils; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.inputs.InvalidInputTypeException; import org.deeplearning4j.nn.conf.preprocessor.BaseInputPreProcessor; import org.deeplearning4j.nn.workspace.ArrayType; -import org.nd4j.linalg.api.ndarray.INDArray; import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; +import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.shape.Shape; -import org.nd4j.linalg.util.ArrayUtil; import org.nd4j.shade.jackson.annotation.JsonIgnoreProperties; import org.nd4j.shade.jackson.annotation.JsonProperty; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/TensorFlowCnnToFeedForwardPreProcessor.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/TensorFlowCnnToFeedForwardPreProcessor.java index f80863a03..db7d2e990 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/TensorFlowCnnToFeedForwardPreProcessor.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/preprocessors/TensorFlowCnnToFeedForwardPreProcessor.java @@ -20,9 +20,9 @@ import lombok.extern.slf4j.Slf4j; import lombok.val; import org.deeplearning4j.nn.conf.preprocessor.CnnToFeedForwardPreProcessor; import org.deeplearning4j.nn.workspace.ArrayType; +import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.api.shape.Shape; -import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; import org.nd4j.shade.jackson.annotation.JsonCreator; import org.nd4j.shade.jackson.annotation.JsonProperty; diff --git 
a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/DL4JKerasModelValidator.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/DL4JKerasModelValidator.java index cd4461082..2ace14aa3 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/DL4JKerasModelValidator.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/DL4JKerasModelValidator.java @@ -1,28 +1,15 @@ package org.deeplearning4j.nn.modelimport.keras.utils; import lombok.NonNull; -import org.apache.commons.io.IOUtils; -import org.deeplearning4j.nn.api.Model; -import org.deeplearning4j.nn.conf.MultiLayerConfiguration; import org.deeplearning4j.nn.graph.ComputationGraph; import org.deeplearning4j.nn.modelimport.keras.Hdf5Archive; -import org.deeplearning4j.nn.modelimport.keras.KerasModel; import org.deeplearning4j.nn.modelimport.keras.config.KerasModelConfiguration; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; -import org.deeplearning4j.util.ModelSerializer; import org.nd4j.validation.Nd4jCommonValidator; import org.nd4j.validation.ValidationResult; -import java.io.BufferedReader; import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; import java.util.Collections; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipFile; /** * A utility for validating serialized Keras sequential and functional models for import into DL4J diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasActivationUtils.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasActivationUtils.java index bb2bb1ca0..f0ddfd912 100644 --- 
a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasActivationUtils.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasActivationUtils.java @@ -21,7 +21,6 @@ import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurat import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.nd4j.linalg.activations.Activation; import org.nd4j.linalg.activations.IActivation; -import org.nd4j.linalg.activations.impl.*; import java.util.Map; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasInitilizationUtils.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasInitilizationUtils.java index b86b83be1..b4b5e6564 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasInitilizationUtils.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasInitilizationUtils.java @@ -21,8 +21,7 @@ import org.deeplearning4j.nn.conf.distribution.*; import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; -import org.deeplearning4j.nn.weights.WeightInit; -import org.nd4j.linalg.primitives.Pair; +import org.deeplearning4j.nn.weights.*; import java.util.HashMap; import java.util.Map; @@ -42,76 +41,71 @@ public class KerasInitilizationUtils { * @return DL4J weight initialization enum * @see WeightInit */ - public static Pair mapWeightInitialization(String kerasInit, - KerasLayerConfiguration conf, - Map initConfig, - int kerasMajorVersion) + public 
static IWeightInit mapWeightInitialization(String kerasInit, + KerasLayerConfiguration conf, + Map initConfig, + int kerasMajorVersion) throws UnsupportedKerasConfigurationException, InvalidKerasConfigurationException { // TODO: Identity and VarianceScaling need "scale" factor - WeightInit init = null; - Distribution dist = null; if (kerasInit != null) { if (kerasInit.equals(conf.getINIT_GLOROT_NORMAL()) || kerasInit.equals(conf.getINIT_GLOROT_NORMAL_ALIAS())) { - init = WeightInit.XAVIER; + return WeightInit.XAVIER.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_GLOROT_UNIFORM()) || kerasInit.equals(conf.getINIT_GLOROT_UNIFORM_ALIAS())) { - init = WeightInit.XAVIER_UNIFORM; + return WeightInit.XAVIER_UNIFORM.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_LECUN_NORMAL()) || kerasInit.equals(conf.getINIT_LECUN_NORMAL_ALIAS())) { - init = WeightInit.LECUN_NORMAL; + return WeightInit.LECUN_NORMAL.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_LECUN_UNIFORM()) || kerasInit.equals(conf.getINIT_LECUN_UNIFORM_ALIAS())) { - init = WeightInit.LECUN_UNIFORM; + return WeightInit.LECUN_UNIFORM.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_HE_NORMAL()) || kerasInit.equals(conf.getINIT_HE_NORMAL_ALIAS())) { - init = WeightInit.RELU; + return WeightInit.RELU.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_HE_UNIFORM()) || kerasInit.equals(conf.getINIT_HE_UNIFORM_ALIAS())) { - init = WeightInit.RELU_UNIFORM; + return WeightInit.RELU_UNIFORM.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_ONE()) || kerasInit.equals(conf.getINIT_ONES()) || kerasInit.equals(conf.getINIT_ONES_ALIAS())) { - init = WeightInit.ONES; + return WeightInit.ONES.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_ZERO()) || kerasInit.equals(conf.getINIT_ZEROS()) || kerasInit.equals(conf.getINIT_ZEROS_ALIAS())) { - init = WeightInit.ZERO; + return 
WeightInit.ZERO.getWeightInitFunction(); } else if (kerasInit.equals(conf.getINIT_UNIFORM()) || kerasInit.equals(conf.getINIT_RANDOM_UNIFORM()) || kerasInit.equals(conf.getINIT_RANDOM_UNIFORM_ALIAS())) { if (kerasMajorVersion == 2) { double minVal = (double) initConfig.get(conf.getLAYER_FIELD_INIT_MINVAL()); double maxVal = (double) initConfig.get(conf.getLAYER_FIELD_INIT_MAXVAL()); - dist = new UniformDistribution(minVal, maxVal); + return new WeightInitDistribution(new UniformDistribution(minVal, maxVal)); } else { double scale = 0.05; if (initConfig.containsKey(conf.getLAYER_FIELD_INIT_SCALE())) scale = (double) initConfig.get(conf.getLAYER_FIELD_INIT_SCALE()); - dist = new UniformDistribution(-scale, scale); + return new WeightInitDistribution(new UniformDistribution(-scale, scale)); } - init = WeightInit.DISTRIBUTION; } else if (kerasInit.equals(conf.getINIT_NORMAL()) || kerasInit.equals(conf.getINIT_RANDOM_NORMAL()) || kerasInit.equals(conf.getINIT_RANDOM_NORMAL_ALIAS())) { if (kerasMajorVersion == 2) { double mean = (double) initConfig.get(conf.getLAYER_FIELD_INIT_MEAN()); double stdDev = (double) initConfig.get(conf.getLAYER_FIELD_INIT_STDDEV()); - dist = new NormalDistribution(mean, stdDev); + return new WeightInitDistribution(new NormalDistribution(mean, stdDev)); } else { double scale = 0.05; if (initConfig.containsKey(conf.getLAYER_FIELD_INIT_SCALE())) scale = (double) initConfig.get(conf.getLAYER_FIELD_INIT_SCALE()); - dist = new NormalDistribution(0, scale); + return new WeightInitDistribution(new NormalDistribution(0, scale)); } - init = WeightInit.DISTRIBUTION; } else if (kerasInit.equals(conf.getINIT_CONSTANT()) || kerasInit.equals(conf.getINIT_CONSTANT_ALIAS())) { double value = (double) initConfig.get(conf.getLAYER_FIELD_INIT_VALUE()); - dist = new ConstantDistribution(value); - init = WeightInit.DISTRIBUTION; + return new WeightInitDistribution(new ConstantDistribution(value)); } else if (kerasInit.equals(conf.getINIT_ORTHOGONAL()) || 
kerasInit.equals(conf.getINIT_ORTHOGONAL_ALIAS())) { if (kerasMajorVersion == 2) { @@ -121,34 +115,38 @@ public class KerasInitilizationUtils { } catch (Exception e) { gain = (int) initConfig.get(conf.getLAYER_FIELD_INIT_GAIN()); } - dist = new OrthogonalDistribution(gain); + return new WeightInitDistribution(new OrthogonalDistribution(gain)); } else { double scale = 1.1; if (initConfig.containsKey(conf.getLAYER_FIELD_INIT_SCALE())) scale = (double) initConfig.get(conf.getLAYER_FIELD_INIT_SCALE()); - dist = new OrthogonalDistribution(scale); + return new WeightInitDistribution(new OrthogonalDistribution(scale)); } - init = WeightInit.DISTRIBUTION; } else if (kerasInit.equals(conf.getINIT_TRUNCATED_NORMAL()) || kerasInit.equals(conf.getINIT_TRUNCATED_NORMAL_ALIAS())) { double mean = (double) initConfig.get(conf.getLAYER_FIELD_INIT_MEAN()); double stdDev = (double) initConfig.get(conf.getLAYER_FIELD_INIT_STDDEV()); - dist = new TruncatedNormalDistribution(mean, stdDev); - init = WeightInit.DISTRIBUTION; + return new WeightInitDistribution(new TruncatedNormalDistribution(mean, stdDev)); } else if (kerasInit.equals(conf.getINIT_IDENTITY()) || kerasInit.equals(conf.getINIT_IDENTITY_ALIAS())) { if (kerasMajorVersion == 2) { double gain = (double) initConfig.get(conf.getLAYER_FIELD_INIT_GAIN()); - if (gain != 1.) - log.warn("Scaled identity weight init not supported, setting gain=1"); + if (gain != 1.0) + if (gain != 1.0) { + return new WeightInitIdentity(gain); + } else { + return new WeightInitIdentity(); + } } else { double scale = 1.; if (initConfig.containsKey(conf.getLAYER_FIELD_INIT_SCALE())) scale = (double) initConfig.get(conf.getLAYER_FIELD_INIT_SCALE()); - if (scale != 1.) 
- log.warn("Scaled identity weight init not supported, setting scale=1"); + if (scale != 1.0) { + return new WeightInitIdentity(scale); + } else { + return new WeightInitIdentity(); + } } - init = WeightInit.IDENTITY; } else if (kerasInit.equals(conf.getINIT_VARIANCE_SCALING())) { double scale; try { @@ -156,32 +154,27 @@ public class KerasInitilizationUtils { } catch (Exception e) { scale = (int) initConfig.get(conf.getLAYER_FIELD_INIT_SCALE()); } - if (scale != 1.) - log.warn("Scaled identity weight init not supported, setting scale=1"); String mode = (String) initConfig.get(conf.getLAYER_FIELD_INIT_MODE()); String distribution = (String) initConfig.get(conf.getLAYER_FIELD_INIT_DISTRIBUTION()); switch (mode) { case "fan_in": if (distribution.equals("normal")) { - init = WeightInit.VAR_SCALING_NORMAL_FAN_IN; + return new WeightInitVarScalingNormalFanIn(scale); } else { - init = WeightInit.VAR_SCALING_UNIFORM_FAN_IN; + return new WeightInitVarScalingUniformFanIn(scale); } - break; case "fan_out": if (distribution.equals("normal")) { - init = WeightInit.VAR_SCALING_NORMAL_FAN_OUT; + return new WeightInitVarScalingNormalFanOut(scale); } else { - init = WeightInit.VAR_SCALING_UNIFORM_FAN_OUT; + return new WeightInitVarScalingUniformFanOut(scale); } - break; case "fan_avg": if (distribution.equals("normal")) { - init = WeightInit.VAR_SCALING_NORMAL_FAN_AVG; + return new WeightInitVarScalingNormalFanAvg(scale); } else { - init = WeightInit.VAR_SCALING_UNIFORM_FAN_AVG; + return new WeightInitVarScalingUniformFanAvg(scale); } - break; default: throw new InvalidKerasConfigurationException("Initialization argument 'mode' has to be either " + "fan_in, fan_out or fan_avg"); @@ -190,7 +183,7 @@ public class KerasInitilizationUtils { throw new UnsupportedKerasConfigurationException("Unknown keras weight initializer " + kerasInit); } } - return new Pair<>(init, dist); + throw new IllegalStateException("Error getting Keras weight initialization"); } /** @@ -202,7 +195,7 @@ public 
class KerasInitilizationUtils { * @throws InvalidKerasConfigurationException Invalid Keras config * @throws UnsupportedKerasConfigurationException Unsupported Keras config */ - public static Pair getWeightInitFromConfig(Map layerConfig, String initField, + public static IWeightInit getWeightInitFromConfig(Map layerConfig, String initField, boolean enforceTrainingConfig, KerasLayerConfiguration conf, int kerasMajorVersion) @@ -225,14 +218,14 @@ public class KerasInitilizationUtils { throw new UnsupportedKerasConfigurationException("Incomplete initialization class"); } } - Pair init; + IWeightInit init; try { init = mapWeightInitialization(kerasInit, conf, initMap, kerasMajorVersion); } catch (UnsupportedKerasConfigurationException e) { if (enforceTrainingConfig) throw e; else { - init = new Pair<>(WeightInit.XAVIER, null); + init = new WeightInitXavier(); log.warn("Unknown weight initializer " + kerasInit + " (Using XAVIER instead)."); } } diff --git a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasModelUtils.java b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasModelUtils.java index f752b5b03..b33fda9f4 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasModelUtils.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/main/java/org/deeplearning4j/nn/modelimport/keras/utils/KerasModelUtils.java @@ -21,7 +21,6 @@ import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import org.deeplearning4j.nn.api.Layer; import org.deeplearning4j.nn.api.Model; -import org.deeplearning4j.nn.conf.layers.wrapper.BaseWrapperLayer; import org.deeplearning4j.nn.graph.ComputationGraph; import org.deeplearning4j.nn.modelimport.keras.Hdf5Archive; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; diff --git 
a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/KerasTestUtils.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/KerasTestUtils.java index bd6561d37..27aa340e8 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/KerasTestUtils.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/KerasTestUtils.java @@ -16,7 +16,6 @@ package org.deeplearning4j.nn.modelimport.keras; -import org.deeplearning4j.nn.api.Layer; import org.deeplearning4j.nn.conf.layers.BaseLayer; import org.deeplearning4j.nn.conf.layers.samediff.AbstractSameDiffLayer; import org.nd4j.linalg.learning.regularization.L1Regularization; @@ -25,7 +24,6 @@ import org.nd4j.linalg.learning.regularization.Regularization; import java.util.List; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; public class KerasTestUtils { diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/MiscTests.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/MiscTests.java index 5c288b21c..dcfd53518 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/MiscTests.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/MiscTests.java @@ -22,8 +22,6 @@ import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.nd4j.linalg.io.ClassPathResource; -import org.nd4j.linalg.util.Nd4jValidator; import org.nd4j.resources.Resources; import org.nd4j.validation.ValidationResult; diff --git 
a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java index f0dfb3694..6043d7d48 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/FullModelComparisons.java @@ -21,7 +21,6 @@ import org.datavec.api.records.reader.SequenceRecordReader; import org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader; import org.datavec.api.split.NumberedFileInputSplit; import org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator; - import org.deeplearning4j.nn.layers.recurrent.LSTM; import org.deeplearning4j.nn.layers.recurrent.LastTimeStepLayer; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; @@ -30,7 +29,6 @@ import org.deeplearning4j.nn.modelimport.keras.KerasSequentialModel; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; -import org.junit.Assert; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras1ModelConfigurationTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras1ModelConfigurationTest.java index 6dce1b714..554a2c2d1 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras1ModelConfigurationTest.java +++ 
b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras1ModelConfigurationTest.java @@ -24,7 +24,6 @@ import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.KerasModel; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; import org.junit.Test; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import java.io.InputStream; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras2ModelConfigurationTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras2ModelConfigurationTest.java index 162dc235a..81103d315 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras2ModelConfigurationTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/Keras2ModelConfigurationTest.java @@ -30,11 +30,9 @@ import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; import org.junit.Test; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasInitilizationTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasInitilizationTest.java index 7072f1956..8ac231e12 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasInitilizationTest.java 
+++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasInitilizationTest.java @@ -25,6 +25,8 @@ import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.layers.core.KerasDense; import org.deeplearning4j.nn.weights.IWeightInit; import org.deeplearning4j.nn.weights.WeightInit; +import org.deeplearning4j.nn.weights.WeightInitIdentity; +import org.deeplearning4j.nn.weights.WeightInitVarScalingNormalFanIn; import org.junit.Test; import java.util.HashMap; @@ -94,11 +96,11 @@ public class KerasInitilizationTest extends BaseDL4JTest { WeightInit.RELU_UNIFORM.getWeightInitFunction(), WeightInit.ONES.getWeightInitFunction(), WeightInit.ZERO.getWeightInitFunction(), - WeightInit.IDENTITY.getWeightInitFunction(), + new WeightInitIdentity(0.2), WeightInit.DISTRIBUTION.getWeightInitFunction(new NormalDistribution(mean, stdDev)), WeightInit.DISTRIBUTION.getWeightInitFunction(new OrthogonalDistribution(gain)), WeightInit.DISTRIBUTION.getWeightInitFunction(new ConstantDistribution(value)), - WeightInit.VAR_SCALING_NORMAL_FAN_IN.getWeightInitFunction()}; + new WeightInitVarScalingNormalFanIn(0.2)}; } private Distribution[] dl4jDistributions() { diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasModelImportTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasModelImportTest.java index a015dc24f..b5d3c9ab6 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasModelImportTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/configurations/KerasModelImportTest.java @@ -17,22 +17,16 @@ package org.deeplearning4j.nn.modelimport.keras.configurations; import 
lombok.extern.slf4j.Slf4j; -import lombok.val; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.KerasModelImport; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; import org.junit.Test; -import org.nd4j.linalg.api.ndarray.INDArray; -import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; -import java.io.File; import java.io.IOException; -import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertNotNull; /** diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasLambdaTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasLambdaTest.java index 31611283f..97ae4318f 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasLambdaTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasLambdaTest.java @@ -31,7 +31,6 @@ import org.nd4j.autodiff.samediff.SDVariable; import org.nd4j.autodiff.samediff.SameDiff; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import java.io.File; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasModelEndToEndTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasModelEndToEndTest.java index 874931262..b33ff8d1f 100644 --- 
a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasModelEndToEndTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasModelEndToEndTest.java @@ -24,22 +24,19 @@ import org.deeplearning4j.eval.ROCMultiClass; import org.deeplearning4j.gradientcheck.GradientCheckUtil; import org.deeplearning4j.nn.api.Layer; import org.deeplearning4j.nn.api.layers.IOutputLayer; -import org.deeplearning4j.nn.conf.layers.CnnLossLayer; import org.deeplearning4j.nn.conf.layers.FeedForwardLayer; import org.deeplearning4j.nn.conf.layers.LossLayer; import org.deeplearning4j.nn.conf.layers.RnnOutputLayer; import org.deeplearning4j.nn.graph.ComputationGraph; -import org.deeplearning4j.nn.layers.recurrent.LSTM; -import org.deeplearning4j.nn.layers.recurrent.LastTimeStepLayer; -import org.deeplearning4j.nn.layers.wrapper.BaseWrapperLayer; -import org.deeplearning4j.nn.modelimport.keras.*; +import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; +import org.deeplearning4j.nn.modelimport.keras.Hdf5Archive; +import org.deeplearning4j.nn.modelimport.keras.KerasModel; +import org.deeplearning4j.nn.modelimport.keras.KerasSequentialModel; import org.deeplearning4j.nn.modelimport.keras.utils.KerasModelBuilder; import org.deeplearning4j.nn.modelimport.keras.utils.KerasModelUtils; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; import org.deeplearning4j.nn.transferlearning.FineTuneConfiguration; import org.deeplearning4j.nn.transferlearning.TransferLearning; -import org.deeplearning4j.nn.workspace.ArrayType; -import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; @@ -47,27 +44,25 @@ import org.junit.rules.TemporaryFolder; import org.nd4j.linalg.activations.Activation; import org.nd4j.linalg.activations.IActivation; import org.nd4j.linalg.activations.impl.*; -import 
org.nd4j.linalg.api.buffer.DataBuffer; import org.nd4j.linalg.api.buffer.DataType; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.linalg.learning.config.NoOp; import org.nd4j.linalg.lossfunctions.LossFunctions; import org.nd4j.resources.Resources; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.nio.file.Files; import java.nio.file.StandardCopyOption; -import java.util.*; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; /** * Unit tests for end-to-end Keras model import. diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000PredictTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000PredictTest.java index f1fcc3ded..8bd6e779d 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000PredictTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000PredictTest.java @@ -21,7 +21,6 @@ import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.graph.ComputationGraph; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.KerasLayer; -import org.deeplearning4j.nn.modelimport.keras.KerasModel; import org.deeplearning4j.nn.modelimport.keras.KerasModelImport; import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasSpaceToDepth; import org.deeplearning4j.nn.transferlearning.TransferLearning; 
@@ -31,11 +30,8 @@ import org.junit.Test; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.dataset.api.preprocessor.ImagePreProcessingScaler; import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.io.ClassPathResource; import java.io.File; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; /** * Import previously stored YOLO9000 Keras net from https://github.com/allanzelener/YAD2K. diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000Test.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000Test.java index 403610c10..dcfe7bfda 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000Test.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/e2e/KerasYolo9000Test.java @@ -26,7 +26,6 @@ import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import java.io.File; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasAtrousConvolution1DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasAtrousConvolution1DTest.java index 0a408ac83..eccaeb536 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasAtrousConvolution1DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasAtrousConvolution1DTest.java @@ -27,16 +27,11 @@ import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasAtrousC import 
org.deeplearning4j.nn.weights.IWeightInit; import org.deeplearning4j.nn.weights.WeightInitXavier; import org.junit.Test; -import org.nd4j.linalg.learning.regularization.L1Regularization; -import org.nd4j.linalg.learning.regularization.L2Regularization; -import org.nd4j.linalg.learning.regularization.Regularization; import java.util.HashMap; -import java.util.List; import java.util.Map; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; /** * @author Max Pumperla diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasConvolution3DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasConvolution3DTest.java index 4737ec128..ff0ba8f3d 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasConvolution3DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasConvolution3DTest.java @@ -28,9 +28,6 @@ import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasConvolu import org.deeplearning4j.nn.weights.IWeightInit; import org.deeplearning4j.nn.weights.WeightInitXavier; import org.junit.Test; -import org.nd4j.linalg.learning.regularization.L1Regularization; -import org.nd4j.linalg.learning.regularization.L2Regularization; -import org.nd4j.linalg.learning.regularization.Regularization; import java.util.ArrayList; import java.util.HashMap; @@ -39,7 +36,6 @@ import java.util.Map; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; /** * @author Max Pumperla diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping1DTest.java 
b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping1DTest.java index f356f674f..1676f6136 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping1DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping1DTest.java @@ -24,7 +24,6 @@ import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasCropping1D; import org.junit.Test; -import java.util.ArrayList; import java.util.HashMap; import java.util.Map; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping3DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping3DTest.java index 1a6f564b4..6ae3065b6 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping3DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasCropping3DTest.java @@ -16,13 +16,11 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution; -import org.deeplearning4j.nn.conf.layers.convolutional.Cropping2D; import org.deeplearning4j.nn.conf.layers.convolutional.Cropping3D; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; -import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasCropping2D; import 
org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasCropping3D; import org.junit.Test; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasDepthwiseConvolution2DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasDepthwiseConvolution2DTest.java index a79fab8da..364c50e72 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasDepthwiseConvolution2DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasDepthwiseConvolution2DTest.java @@ -30,15 +30,11 @@ import org.deeplearning4j.nn.weights.IWeightInit; import org.deeplearning4j.nn.weights.WeightInitXavier; import org.junit.Test; import org.nd4j.base.Preconditions; -import org.nd4j.linalg.learning.regularization.L1Regularization; -import org.nd4j.linalg.learning.regularization.L2Regularization; -import org.nd4j.linalg.learning.regularization.Regularization; import java.util.*; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; /** * @author Max Pumperla diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling1DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling1DTest.java index 182054900..aec4278e2 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling1DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling1DTest.java @@ -17,18 +17,14 @@ package 
org.deeplearning4j.nn.modelimport.keras.layers.convolution; import org.deeplearning4j.nn.conf.layers.Upsampling1D; -import org.deeplearning4j.nn.conf.layers.Upsampling2D; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasUpsampling1D; -import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasUpsampling2D; import org.junit.Test; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; import static org.junit.Assert.assertEquals; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling2DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling2DTest.java index 3c7b30b57..cea117f8f 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling2DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasUpsampling2DTest.java @@ -17,13 +17,11 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution; import org.deeplearning4j.nn.conf.layers.Upsampling2D; -import org.deeplearning4j.nn.conf.layers.ZeroPadding1DLayer; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import 
org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasUpsampling2D; -import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasZeroPadding1D; import org.junit.Test; import java.util.ArrayList; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasZeroPadding3DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasZeroPadding3DTest.java index 779d9ce51..c0a60defd 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasZeroPadding3DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/convolution/KerasZeroPadding3DTest.java @@ -17,12 +17,10 @@ package org.deeplearning4j.nn.modelimport.keras.layers.convolution; import org.deeplearning4j.nn.conf.layers.ZeroPadding3DLayer; -import org.deeplearning4j.nn.conf.layers.ZeroPaddingLayer; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; -import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasZeroPadding2D; import org.deeplearning4j.nn.modelimport.keras.layers.convolutional.KerasZeroPadding3D; import org.junit.Test; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDenseTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDenseTest.java index 334ab96d3..cca2515a8 100644 --- 
a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDenseTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasDenseTest.java @@ -26,16 +26,11 @@ import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.weights.IWeightInit; import org.deeplearning4j.nn.weights.WeightInitXavier; import org.junit.Test; -import org.nd4j.linalg.learning.regularization.L1Regularization; -import org.nd4j.linalg.learning.regularization.L2Regularization; -import org.nd4j.linalg.learning.regularization.Regularization; import java.util.HashMap; -import java.util.List; import java.util.Map; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; /** * @author Max Pumperla diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasPermuteTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasPermuteTest.java index 42cb79cfb..1f2400426 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasPermuteTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasPermuteTest.java @@ -24,10 +24,12 @@ import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.preprocessors.PermutePreprocessor; -import org.deeplearning4j.nn.modelimport.keras.preprocessors.ReshapePreprocessor; import org.junit.Test; -import java.util.*; +import java.util.ArrayList; +import 
java.util.HashMap; +import java.util.List; +import java.util.Map; import static org.junit.Assert.assertEquals; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshapeTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshapeTest.java index dafafea1d..19d5ce623 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshapeTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/core/KerasReshapeTest.java @@ -24,11 +24,11 @@ import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.exceptions.UnsupportedKerasConfigurationException; import org.deeplearning4j.nn.modelimport.keras.preprocessors.ReshapePreprocessor; +import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; import org.junit.Assert; import org.junit.Test; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import org.deeplearning4j.nn.workspace.LayerWorkspaceMgr; import java.util.*; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbeddingTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbeddingTest.java index abeba3da7..b171e063f 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbeddingTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/embeddings/KerasEmbeddingTest.java @@ -26,11 +26,7 @@ import org.junit.Test; import 
org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import static org.junit.Assert.assertEquals; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1DTest.java index f8dc975ea..428d5d99e 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected1DTest.java @@ -20,7 +20,6 @@ import org.deeplearning4j.nn.conf.ConvolutionMode; import org.deeplearning4j.nn.conf.dropout.Dropout; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.LocallyConnected1D; -import org.deeplearning4j.nn.conf.layers.LocallyConnected2D; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.KerasTestUtils; import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration; @@ -31,10 +30,8 @@ import org.junit.Test; import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; -import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; /** diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2DTest.java index b38b8f783..1ea69e06a 100644 --- 
a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/local/KerasLocallyConnected2DTest.java @@ -27,15 +27,14 @@ import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.KerasLayerConfiguration; import org.deeplearning4j.nn.weights.WeightInit; import org.junit.Test; -import org.nd4j.linalg.learning.regularization.L1Regularization; -import org.nd4j.linalg.learning.regularization.L2Regularization; -import org.nd4j.linalg.learning.regularization.Regularization; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; /** * @author Max Pumperla diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/pooling/KerasPooling3DTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/pooling/KerasPooling3DTest.java index cb6a66155..9026c7308 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/pooling/KerasPooling3DTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/pooling/KerasPooling3DTest.java @@ -19,7 +19,6 @@ package org.deeplearning4j.nn.modelimport.keras.layers.pooling; import org.deeplearning4j.nn.conf.ConvolutionMode; import org.deeplearning4j.nn.conf.layers.PoolingType; import org.deeplearning4j.nn.conf.layers.Subsampling3DLayer; -import org.deeplearning4j.nn.conf.layers.SubsamplingLayer; import 
org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.config.Keras1LayerConfiguration; import org.deeplearning4j.nn.modelimport.keras.config.Keras2LayerConfiguration; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTMTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTMTest.java index e2d0b7a03..3b82f14ae 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTMTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/layers/recurrent/KerasLSTMTest.java @@ -33,14 +33,13 @@ import org.deeplearning4j.nn.weights.IWeightInit; import org.deeplearning4j.nn.weights.WeightInitXavier; import org.junit.Assert; import org.junit.Test; -import org.nd4j.linalg.learning.regularization.L1Regularization; -import org.nd4j.linalg.learning.regularization.L2Regularization; -import org.nd4j.linalg.learning.regularization.Regularization; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; /** * @author Max Pumperla diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/optimizers/OptimizerImport.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/optimizers/OptimizerImport.java index 8819ca9b9..f2a693d9a 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/optimizers/OptimizerImport.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/optimizers/OptimizerImport.java @@ 
-16,15 +16,12 @@ package org.deeplearning4j.nn.modelimport.keras.optimizers; -import org.deeplearning4j.config.DL4JSystemProperties; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.KerasModel; import org.deeplearning4j.nn.modelimport.keras.KerasSequentialModel; -import org.deeplearning4j.nn.modelimport.keras.e2e.KerasModelEndToEndTest; import org.deeplearning4j.nn.modelimport.keras.utils.KerasModelBuilder; import org.deeplearning4j.util.DL4JFileUtils; import org.junit.Test; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import java.io.File; @@ -32,8 +29,6 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.StandardCopyOption; -import static java.io.File.createTempFile; - public class OptimizerImport extends BaseDL4JTest { @Test diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGeneratorImportTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGeneratorImportTest.java index 8753f772c..577e089f9 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGeneratorImportTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/sequence/TimeSeriesGeneratorImportTest.java @@ -18,9 +18,7 @@ package org.deeplearning4j.nn.modelimport.keras.preprocessing.sequence; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; -import org.deeplearning4j.nn.modelimport.keras.preprocessing.text.KerasTokenizer; import org.junit.Test; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import 
java.io.IOException; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerImportTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerImportTest.java index f229ec813..45114685b 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerImportTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerImportTest.java @@ -19,15 +19,11 @@ package org.deeplearning4j.nn.modelimport.keras.preprocessing.text; import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.deeplearning4j.nn.modelimport.keras.exceptions.InvalidKerasConfigurationException; import org.junit.Test; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import java.io.IOException; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; /** * Import Keras Tokenizer diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerTest.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerTest.java index bbcd00372..a4fb6994b 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerTest.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/preprocessing/text/TokenizerTest.java @@ -20,7 +20,6 @@ import org.deeplearning4j.nn.modelimport.keras.BaseDL4JTest; import org.junit.Test; import 
org.nd4j.linalg.api.ndarray.INDArray; -import java.util.Arrays; import java.util.HashMap; import java.util.Map; diff --git a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/weights/KerasWeightSettingTests.java b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/weights/KerasWeightSettingTests.java index 18cf3305d..7791e3417 100644 --- a/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/weights/KerasWeightSettingTests.java +++ b/deeplearning4j/deeplearning4j-modelimport/src/test/java/org/deeplearning4j/nn/modelimport/keras/weights/KerasWeightSettingTests.java @@ -29,7 +29,6 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; -import org.nd4j.linalg.io.ClassPathResource; import org.nd4j.resources.Resources; import java.io.File; diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLayer.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLayer.java index d9655a58f..e17535acc 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLayer.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/conf/layers/samediff/SameDiffLayer.java @@ -16,13 +16,11 @@ package org.deeplearning4j.nn.conf.layers.samediff; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.Setter; +import lombok.*; import org.deeplearning4j.nn.api.Layer; import org.deeplearning4j.nn.api.MaskState; import org.deeplearning4j.nn.conf.NeuralNetConfiguration; +import org.deeplearning4j.nn.weights.IWeightInit; import org.deeplearning4j.nn.weights.WeightInit; import org.deeplearning4j.optimize.api.TrainingListener; import 
org.nd4j.autodiff.samediff.SDVariable; @@ -32,6 +30,7 @@ import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.primitives.Pair; import java.util.Collection; +import java.util.HashMap; import java.util.Map; /** @@ -58,10 +57,12 @@ import java.util.Map; public abstract class SameDiffLayer extends AbstractSameDiffLayer { protected WeightInit weightInit; + protected Map paramWeightInit; protected SameDiffLayer(Builder builder) { super(builder); this.weightInit = builder.weightInit; + this.paramWeightInit = builder.paramWeightInit; } protected SameDiffLayer() { @@ -115,6 +116,7 @@ public abstract class SameDiffLayer extends AbstractSameDiffLayer { public static abstract class Builder> extends AbstractSameDiffLayer.Builder { protected WeightInit weightInit = WeightInit.XAVIER; + protected Map paramWeightInit; /** * @param weightInit Weight initialization to use for the layer @@ -123,5 +125,12 @@ public abstract class SameDiffLayer extends AbstractSameDiffLayer { this.setWeightInit(weightInit); return (T) this; } + + public T weightInit(@NonNull String param, @NonNull IWeightInit weightInit){ + if(paramWeightInit == null) + paramWeightInit = new HashMap<>(); + paramWeightInit.put(param, weightInit); + return (T) this; + } } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitIdentity.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitIdentity.java index 076fa2ac8..b25121cd3 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitIdentity.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitIdentity.java @@ -16,11 +16,14 @@ package org.deeplearning4j.nn.weights; +import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.indexing.INDArrayIndex; 
import org.nd4j.linalg.indexing.NDArrayIndex; +import org.nd4j.shade.jackson.annotation.JsonProperty; import java.util.Arrays; @@ -32,9 +35,17 @@ import java.util.Arrays; * * @author Adam Gibson */ -@EqualsAndHashCode +@Data +@NoArgsConstructor public class WeightInitIdentity implements IWeightInit { + private Double scale; + + public WeightInitIdentity(@JsonProperty("scale") Double scale){ + this.scale = scale; + } + + @Override public INDArray init(double fanIn, double fanOut, long[] shape, char order, INDArray paramView) { if (shape[0] != shape[1]) { @@ -59,6 +70,11 @@ public class WeightInitIdentity implements IWeightInit { } else { ret = Nd4j.createUninitialized(shape, order).assign(Nd4j.eye(shape[0])); } + + if(scale != null){ + ret.muli(scale); + } + INDArray flat = Nd4j.toFlattened(order, ret); paramView.assign(flat); return paramView.reshape(order, shape); @@ -82,13 +98,16 @@ public class WeightInitIdentity implements IWeightInit { indArrayIndices[i] = NDArrayIndex.point(shape[i] / 2); } - paramView.assign(Nd4j.zeros(paramView.shape())); + paramView.assign(0); final INDArray params =paramView.reshape(order, shape); for (int i = 0; i < shape[0]; i++) { indArrayIndices[0] = NDArrayIndex.point(i); indArrayIndices[1] = NDArrayIndex.point(i); params.put(indArrayIndices, Nd4j.ones(1)); } + if(scale != null){ + params.muli(scale); + } return params; } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitUtil.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitUtil.java index b110bc5a0..17034d408 100755 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitUtil.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitUtil.java @@ -19,6 +19,7 @@ package org.deeplearning4j.nn.weights; import org.apache.commons.math3.util.FastMath; import org.nd4j.linalg.api.ndarray.INDArray; +import 
org.nd4j.linalg.api.ops.random.impl.TruncatedNormalDistribution; import org.nd4j.linalg.api.rng.distribution.Distribution; import org.nd4j.linalg.api.rng.distribution.impl.OrthogonalDistribution; import org.nd4j.linalg.factory.Nd4j; @@ -146,14 +147,13 @@ public class WeightInitUtil { paramView.assign(flat); break; case VAR_SCALING_NORMAL_FAN_IN: - // TODO: needs to be truncated normal to match keras. - Nd4j.randn(paramView).divi(FastMath.sqrt(fanIn)); + Nd4j.exec(new TruncatedNormalDistribution(paramView, 0.0, Math.sqrt(1.0 / fanIn))); break; case VAR_SCALING_NORMAL_FAN_OUT: - Nd4j.randn(paramView).divi(FastMath.sqrt(fanOut)); + Nd4j.exec(new TruncatedNormalDistribution(paramView, 0.0, Math.sqrt(1.0 / fanOut))); break; case VAR_SCALING_NORMAL_FAN_AVG: - Nd4j.randn(paramView).divi(FastMath.sqrt((fanIn + fanOut) / 2)); + Nd4j.exec(new TruncatedNormalDistribution(paramView, 0.0, Math.sqrt(2.0 / (fanIn + fanOut)))); break; case VAR_SCALING_UNIFORM_FAN_IN: double scalingFanIn = 3.0 / Math.sqrt(fanIn); diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanAvg.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanAvg.java index 0be5af0e9..3b9698f10 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanAvg.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanAvg.java @@ -16,22 +16,39 @@ package org.deeplearning4j.nn.weights; +import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; import org.apache.commons.math3.util.FastMath; import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.random.impl.TruncatedNormalDistribution; import org.nd4j.linalg.factory.Nd4j; /** - * Gaussian distribution with mean 0, variance 1.0/((fanIn + fanOut)/2) + * Truncated aussian distribution with 
mean 0, variance 1.0/((fanIn + fanOut)/2) * * @author Adam Gibson */ -@EqualsAndHashCode +@Data +@NoArgsConstructor public class WeightInitVarScalingNormalFanAvg implements IWeightInit { + private Double scale; + + public WeightInitVarScalingNormalFanAvg(Double scale){ + this.scale = scale; + } + @Override public INDArray init(double fanIn, double fanOut, long[] shape, char order, INDArray paramView) { - Nd4j.randn(paramView).divi(FastMath.sqrt((fanIn + fanOut) / 2)); + double std; + if(scale == null){ + std = Math.sqrt(2.0 / (fanIn + fanOut)); + } else { + std = Math.sqrt(2.0 * scale / (fanIn + fanOut)); + } + + Nd4j.exec(new TruncatedNormalDistribution(paramView, 0.0, std)); return paramView.reshape(order, shape); } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanIn.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanIn.java index 3f89ff015..dca457de3 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanIn.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanIn.java @@ -16,23 +16,38 @@ package org.deeplearning4j.nn.weights; -import lombok.EqualsAndHashCode; -import org.apache.commons.math3.util.FastMath; +import lombok.Data; +import lombok.NoArgsConstructor; import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.random.impl.TruncatedNormalDistribution; import org.nd4j.linalg.factory.Nd4j; /** - * Gaussian distribution with mean 0, variance 1.0/(fanIn) + * Gaussian distribution with mean 0, variance {@code 1.0/(fanIn)}
+ * If a scale is provided, use variance {@code scale/(fanIn)} instead * * @author Adam Gibson */ -@EqualsAndHashCode +@Data +@NoArgsConstructor public class WeightInitVarScalingNormalFanIn implements IWeightInit { + private Double scale; + + public WeightInitVarScalingNormalFanIn(Double scale){ + this.scale = scale; + } + @Override public INDArray init(double fanIn, double fanOut, long[] shape, char order, INDArray paramView) { - // TODO: needs to be truncated normal to match keras. - Nd4j.randn(paramView).divi(FastMath.sqrt(fanIn)); + double std; + if(scale == null){ + std = Math.sqrt(1.0 / fanIn); + } else { + std = Math.sqrt(scale / fanIn); + } + + Nd4j.exec(new TruncatedNormalDistribution(paramView, 0.0, std)); return paramView.reshape(order, shape); } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanOut.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanOut.java index 6369a19c6..0af43ac88 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanOut.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingNormalFanOut.java @@ -16,22 +16,40 @@ package org.deeplearning4j.nn.weights; +import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; import org.apache.commons.math3.util.FastMath; import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.api.ops.random.impl.TruncatedNormalDistribution; import org.nd4j.linalg.factory.Nd4j; /** - * Gaussian distribution with mean 0, variance 1.0/(fanOut) + * Truncated normal distribution with mean 0, variance 1.0/(fanOut)
+ * If a scale is provided, variance is scale / fanOut * * @author Adam Gibson */ -@EqualsAndHashCode +@Data +@NoArgsConstructor public class WeightInitVarScalingNormalFanOut implements IWeightInit { + private Double scale; + + public WeightInitVarScalingNormalFanOut(Double scale){ + this.scale = scale; + } + @Override public INDArray init(double fanIn, double fanOut, long[] shape, char order, INDArray paramView) { - Nd4j.randn(paramView).divi(FastMath.sqrt(fanOut)); + double std; + if(scale == null){ + std = Math.sqrt(1.0 / fanOut); + } else { + std = Math.sqrt(scale / fanOut); + } + + Nd4j.exec(new TruncatedNormalDistribution(paramView, 0.0, std)); return paramView.reshape(order, shape); } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanAvg.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanAvg.java index afb1a1dc8..f2e050e6e 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanAvg.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanAvg.java @@ -16,7 +16,9 @@ package org.deeplearning4j.nn.weights; +import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; @@ -25,12 +27,22 @@ import org.nd4j.linalg.factory.Nd4j; * * @author Adam Gibson */ -@EqualsAndHashCode +@Data +@NoArgsConstructor public class WeightInitVarScalingUniformFanAvg implements IWeightInit { + private Double scale; + + public WeightInitVarScalingUniformFanAvg(Double scale){ + this.scale = scale; + } + @Override public INDArray init(double fanIn, double fanOut, long[] shape, char order, INDArray paramView) { double scalingFanAvg = 3.0 / Math.sqrt((fanIn + fanOut) / 2); + if(scale != null) + scalingFanAvg *= scale; + 
Nd4j.rand(paramView, Nd4j.getDistributions().createUniform(-scalingFanAvg, scalingFanAvg)); return paramView.reshape(order, shape); } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanIn.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanIn.java index 0cf26ecc6..7135394a7 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanIn.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanIn.java @@ -16,21 +16,34 @@ package org.deeplearning4j.nn.weights; +import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; /** - * Uniform U[-a,a] with a=3.0/(fanIn) + * Uniform U[-a,a] with a=3.0/(fanIn)
+ * If a scale is provided, a = 3.0 * scale / (fanIn) * * @author Adam Gibson */ -@EqualsAndHashCode +@NoArgsConstructor +@Data public class WeightInitVarScalingUniformFanIn implements IWeightInit { + private Double scale; + + public WeightInitVarScalingUniformFanIn(Double scale){ + this.scale = scale; + } + @Override public INDArray init(double fanIn, double fanOut, long[] shape, char order, INDArray paramView) { double scalingFanIn = 3.0 / Math.sqrt(fanIn); + if(scale != null) + scalingFanIn *= scale; + Nd4j.rand(paramView, Nd4j.getDistributions().createUniform(-scalingFanIn, scalingFanIn)); return paramView.reshape(order, shape); } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanOut.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanOut.java index 2d3b116fc..09bf2053d 100644 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanOut.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/weights/WeightInitVarScalingUniformFanOut.java @@ -16,21 +16,33 @@ package org.deeplearning4j.nn.weights; +import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.factory.Nd4j; /** - * Uniform U[-a,a] with a=3.0/(fanOut) + * Uniform U[-a,a] with a=3.0/(fanOut)
+ * If a scale is provided, a = 3.0 * scale / fanOut * * @author Adam Gibson */ -@EqualsAndHashCode +@Data +@NoArgsConstructor public class WeightInitVarScalingUniformFanOut implements IWeightInit { + private Double scale; + + public WeightInitVarScalingUniformFanOut(Double scale){ + this.scale = scale; + } + @Override public INDArray init(double fanIn, double fanOut, long[] shape, char order, INDArray paramView) { double scalingFanOut = 3.0 / Math.sqrt(fanOut); + if(scale != null) + scalingFanOut *= scale; Nd4j.rand(paramView, Nd4j.getDistributions().createUniform(-scalingFanOut, scalingFanOut)); return paramView.reshape(order, shape); } diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt index c563eda27..9ce9b46a3 100755 --- a/libnd4j/CMakeLists.txt +++ b/libnd4j/CMakeLists.txt @@ -25,7 +25,7 @@ elseif (APPLE) elseif(WIN32) set(X86_BUILD true) if (CUDA_BLAS) - set(CMAKE_CXX_FLAGS_RELEASE " /O2 -D_RELEASE=true /wd4804") + set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true /wd4804") set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc /wd4661 /wd4804 /wd4267 /wd4244 /wd4251 /wd4305") else() set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -std=c++11 -fmax-errors=2 -D_RELEASE=true") diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/shape/Shape.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/shape/Shape.java index 51711b3d2..44298ffa2 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/shape/Shape.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/shape/Shape.java @@ -3607,6 +3607,13 @@ public class Shape { return ArrayUtil.prodLong(shape); } + public static long lengthOf(int[] shape) { + if (shape.length == 0) + return 1L; + else + return ArrayUtil.prodLong(shape); + } + /** * Calculate the length of the buffer required to store the given shape with the given strides * diff --git 
a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/dataset/api/iterator/SamplingDataSetIterator.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/dataset/api/iterator/SamplingDataSetIterator.java index c33b37565..cc6fba068 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/dataset/api/iterator/SamplingDataSetIterator.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/dataset/api/iterator/SamplingDataSetIterator.java @@ -28,11 +28,6 @@ import java.util.List; * @author Adam Gibson */ public class SamplingDataSetIterator implements DataSetIterator { - - /** - * - */ - private static final long serialVersionUID = -2700563801361726914L; private DataSet sampleFrom; private int batchSize; private int totalNumberSamples; @@ -145,6 +140,4 @@ public class SamplingDataSetIterator implements DataSetIterator { numTimesSampled++; return ret; } - - } diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java index 25960a8a8..c95dc5ef2 100644 --- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java +++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/factory/Nd4j.java @@ -1164,26 +1164,15 @@ public class Nd4j { * @param type the opType to create * @return the created buffer */ - public static DataBuffer createBuffer(int[] shape, DataType type) { - long length = ArrayUtil.prodLong(shape); - - if (type == DataType.INT) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createInt(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.LONG) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? 
DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createLong(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.HALF) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createHalf(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else if (type == DataType.DOUBLE) - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createDouble(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); - else - return Nd4j.getMemoryManager().getCurrentWorkspace() == null ? DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true) : DATA_BUFFER_FACTORY_INSTANCE.createFloat(length, true, Nd4j.getMemoryManager().getCurrentWorkspace()); + public static DataBuffer createBuffer(@NonNull int[] shape, @NonNull DataType type) { + return createBuffer(ArrayUtil.toLongArray(shape), type); } /** * See {@link #createBuffer(int[], DataType)} */ - public static DataBuffer createBuffer(long[] shape, DataType type) { - long length = ArrayUtil.prodLong(shape); + public static DataBuffer createBuffer(@NonNull long[] shape, @NonNull DataType type) { + long length = Shape.lengthOf(shape); switch (type) { case BOOL: @@ -1229,14 +1218,14 @@ public class Nd4j { * @return the created buffer. 
*/ public static DataBuffer createBufferDetached(int[] shape, DataType type) { - return createBufferDetachedImpl( ArrayUtil.prodLong(shape), type); + return createBufferDetachedImpl( Shape.lengthOf(shape), type); } /** * See {@link #createBufferDetached(int[], DataType)} */ public static DataBuffer createBufferDetached(long[] shape, DataType type) { - return createBufferDetachedImpl( ArrayUtil.prodLong(shape), type); + return createBufferDetachedImpl( Shape.lengthOf(shape), type); } // used by createBufferDetached(long[] DataType) and createBufferDetached(int[] , DataType) From a856922fe97d9cc4284bb3130c1432c054acedcf Mon Sep 17 00:00:00 2001 From: Alex Black Date: Sat, 16 Nov 2019 23:09:41 +1100 Subject: [PATCH 15/15] #8409 Fix compgraph backprop issue with dual embedding layers from single input (#52) Signed-off-by: AlexDBlack --- .../nn/graph/TestComputationGraphNetwork.java | 19 +++++++++++++++++++ .../nn/graph/ComputationGraph.java | 7 ++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java index daf657d0a..3e330d248 100644 --- a/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java +++ b/deeplearning4j/deeplearning4j-core/src/test/java/org/deeplearning4j/nn/graph/TestComputationGraphNetwork.java @@ -2143,4 +2143,23 @@ public class TestComputationGraphNetwork extends BaseDL4JTest { INDArray in = Nd4j.create(DataType.FLOAT, 1, 3, 16, 16, 16); INDArray out = cg.outputSingle(in); } + + @Test + public void testDualEmbedding(){ + ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder() + .graphBuilder() + .addInputs("in") + .addLayer("e1", new EmbeddingLayer.Builder().nIn(10).nOut(5).build(), "in") + .addLayer("e2", new 
EmbeddingLayer.Builder().nIn(10).nOut(5).build(), "in") + .addLayer("out", new OutputLayer.Builder().nIn(10).nOut(2).activation(Activation.SOFTMAX).lossFunction(LossFunctions.LossFunction.MCXENT).build(), "e1", "e2") + .setOutputs("out") + .build(); + + ComputationGraph cg = new ComputationGraph(conf); + cg.init(); + + INDArray in = Nd4j.createFromArray(3).reshape(1, 1); + INDArray label = Nd4j.createFromArray(1, 0).reshape(1, 2); + cg.fit(new DataSet(in, label)); + } } diff --git a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java index 32d7bfb73..1be13ddf3 100755 --- a/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java +++ b/deeplearning4j/deeplearning4j-nn/src/main/java/org/deeplearning4j/nn/graph/ComputationGraph.java @@ -2734,7 +2734,12 @@ public class ComputationGraph implements Serializable, Model, NeuralNetwork { if (setVertexEpsilon[gv.getVertexIndex()]) { //This vertex: must output to multiple vertices... we want to add the epsilons here INDArray currentEps = gv.getEpsilon(); - gv.setEpsilon(currentEps.addi(epsilons[j++])); //TODO is this always safe? + if(currentEps == null){ + //Edge case: this can be null for dual embedding layer case - in -> e1, in -> e2 + gv.setEpsilon(currentEps); + } else { + gv.setEpsilon(currentEps.addi(epsilons[j++])); //TODO is this always safe? + } } else { gv.setEpsilon(epsilons[j++]); }