diff --git a/libnd4j/include/execution/cuda/ContextBuffers.cu b/libnd4j/include/execution/cuda/ContextBuffers.cu
index c8a149a5f..ed84d511a 100644
--- a/libnd4j/include/execution/cuda/ContextBuffers.cu
+++ b/libnd4j/include/execution/cuda/ContextBuffers.cu
@@ -30,13 +30,13 @@
 
 namespace nd4j {
     ContextBuffers::ContextBuffers() {
-        nd4j_printf("Creating ContextBuffers for device [%i]\n", AffinityManager::currentDeviceId());
+        //nd4j_printf("Creating ContextBuffers for device [%i]\n", AffinityManager::currentDeviceId());
         _deviceId = AffinityManager::currentDeviceId();
     }
 
     ContextBuffers::~ContextBuffers() {
         if (_allocated) {
-            nd4j_printf("Releasing ContextBuffers\n","");
+            //nd4j_printf("Releasing ContextBuffers\n","");
 
             if (_allocationPointer != nullptr)
                 cudaFree(_allocationPointer);
@@ -69,7 +69,7 @@ namespace nd4j {
     }
 
     void ContextBuffers::initialize() {
-        nd4j_printf("Initializing buffers on deviceId [%i]\n", AffinityManager::currentNativeDeviceId());
+        //nd4j_printf("Initializing buffers on deviceId [%i]\n", AffinityManager::currentNativeDeviceId());
 
         auto res = cudaMalloc(reinterpret_cast<void**>(&_reductionPointer), 1024 * 1024 * 8);
         if (res != 0)
diff --git a/libnd4j/include/helpers/impl/Parameters.cpp b/libnd4j/include/helpers/impl/Parameters.cpp
index f5bcb1014..d2678832f 100644
--- a/libnd4j/include/helpers/impl/Parameters.cpp
+++ b/libnd4j/include/helpers/impl/Parameters.cpp
@@ -19,6 +19,7 @@
 //
 
 #include "../benchmark/Parameters.h"
+#include <stdexcept>
 
 namespace nd4j {
     Parameters* Parameters::addIntParam(std::string string, int param) {
diff --git a/libnd4j/include/op_boilerplate.h b/libnd4j/include/op_boilerplate.h
index d9c8dee62..4f70d9bf2 100644
--- a/libnd4j/include/op_boilerplate.h
+++ b/libnd4j/include/op_boilerplate.h
@@ -1461,7 +1461,7 @@
 
 #ifdef _RELEASE
 
-#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast<void**>(&VARIABLE), LENGTH * sizeof(TT)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast<TT *>(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); }
+#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast<void**>(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast<TT *>(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); }
 #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast<void *>(VARIABLE));  if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; };
 
 #else
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
index 1bf40ba7b..354f360c3 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu
@@ -177,7 +177,9 @@ namespace helpers {
     }
 
     void invertLowerMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) {
+        NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
         BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix});
     }
 
     template <typename T>
@@ -195,7 +197,9 @@ namespace helpers {
     }
 
     void invertUpperMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) {
+        NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
         BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertUpperMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE);
+        NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix});
     }
 
 //    template <typename T>
@@ -242,11 +246,7 @@ namespace helpers {
 
     template <typename T, typename F>
     static __global__ void determinantKernel(T* compound, T* result, Nd4jLong len) {
-        __shared__ F tempRes;
-        if (blockIdx.x == 0) {
-            tempRes = (F)result[0];
-        }
-        __syncthreads();
+        F tempRes = (F)result[0];
 
         auto start = blockIdx.x * blockDim.x + threadIdx.x;
         auto step = blockDim.x * gridDim.x;
@@ -256,18 +256,14 @@ namespace helpers {
         }
         __syncthreads();
 
-        if (blockIdx.x == 0) {
+        if (threadIdx.x == 0) {
             result[0] = (T)tempRes;
         }
     }
 
         template <typename T, typename F>
         static __global__ void determinantLogKernel(T* compound, T* result, Nd4jLong len) {
-            __shared__ F tempRes;
-            if (blockIdx.x == 0) {
-                tempRes = (F)result[0];
-            }
-            __syncthreads();
+            F tempRes =  (F)result[0];
 
             auto start = blockIdx.x * blockDim.x + threadIdx.x;
             auto step = blockDim.x * gridDim.x;
@@ -277,7 +273,7 @@ namespace helpers {
             }
             __syncthreads();
 
-            if (blockIdx.x == 0) {
+            if (threadIdx.x == 0) {
                 result[0] = (T)math::nd4j_log<F,F>(math::nd4j_abs(tempRes));
             }
         }
@@ -520,7 +516,9 @@ namespace helpers {
     }
 
     int determinant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) {
+        NDArray::prepareSpecialUse({output}, {input});
         BUILD_SINGLE_SELECTOR(input->dataType(), return determinant_, (context, input, output), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({output}, {input});
     }
 
     template <typename T>
@@ -568,7 +566,9 @@ namespace helpers {
     }
 
     int logAbsDeterminant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) {
+        NDArray::prepareSpecialUse({output}, {input});
         BUILD_SINGLE_SELECTOR(input->dataType(), return logAbsDeterminant_, (context, input, output), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({output}, {input});
     }
 
     template <typename T>
@@ -631,26 +631,27 @@ namespace helpers {
         auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {output->rankOf() - 2, output->rankOf() - 1});
         auto stream = context->getCudaStream();
 
-//        PRAGMA_OMP_PARALLEL_FOR
         for (auto i = 0LL; i < packX.numberOfTads(); i++) {
-            fillMatrix<T, T><<<1, n2, 128, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n);
+            fillMatrix<T, T><<<1, n2, 1024, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n);
             matrix.tickWriteDevice();
             compound.assign(matrix);
             lup_<T>(context, &compound, nullptr, nullptr);
-            fillLowerUpperKernel<T><<<n, n, 128>>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), compound.specialBuffer(), compound.specialShapeInfo(), n);
+            fillLowerUpperKernel<T><<<n, n, 1024, *stream>>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), compound.specialBuffer(), compound.specialShapeInfo(), n);
             matrix.assign(0);
             invertUpperMatrix(&upper, &matrix); // U^{-1}
             compound.assign(0);
             invertLowerMatrix(&lower, &compound); // L{-1}
 
             nd4j::MmulHelper::mmul(&matrix, &compound, &upper, 1.0, 0.0);
-            returnMatrix<T, T><<<1, n2, 128, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n);
+            returnMatrix<T, T><<<1, n2, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n);
         }
         return Status::OK();
     }
 
     int inverse(nd4j::LaunchContext * context, NDArray* input, NDArray* output) {
+        NDArray::prepareSpecialUse({output}, {input});
         BUILD_SINGLE_SELECTOR(input->dataType(), return inverse_, (context, input, output), FLOAT_NATIVE);
+        NDArray::registerSpecialUse({output}, {input});
     }
 
     bool checkCholeskyInput(nd4j::LaunchContext * context, NDArray const* input) {
@@ -795,10 +796,12 @@ namespace helpers {
         double* output = outputBuf;
         double* input = inputBuf;
 
+        Nd4jLong* shapeOf = shape::shapeOf(tadShape);
+        Nd4jLong* strideOf = shape::stride(tadShape);
+
         for (auto i = blockIdx.x; i < batchNum; i += gridDim.x) {
             double* current = input + tadOffsets[i];
-            Nd4jLong* shapeOf = shape::shapeOf(tadShape);
-            Nd4jLong* strideOf = shape::stride(tadShape);
+
             auto zIndex = shape::getIndexOffset(i, outputShape, batchNum);
             for (auto e = threadIdx.x; e < n; e += blockDim.x) {
                 Nd4jLong diag[] = {e, e};
diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h
index b690f4f6e..6a543b35d 100644
--- a/libnd4j/include/templatemath.h
+++ b/libnd4j/include/templatemath.h
@@ -1057,6 +1057,9 @@ inline __device__ uint64_t nd4j_atomicAdd<uint64_t>(uint64_t* address, uint64_t
 
 template <>
 inline __device__ float16 nd4j_atomicAdd<float16>(float16* address, float16 val)  {
+#if __CUDA_ARCH__ >= 700
+    atomicAdd(reinterpret_cast<__half*>(address), val.data);
+#else
 	int* address_as_ull = (int*) address;
 
 	long addr = (long) address;
@@ -1086,6 +1089,7 @@ inline __device__ float16 nd4j_atomicAdd<float16>(float16* address, float16 val)
 
 	if (!misaligned) return old.B.H;
 	else return old.B.L;
+#endif
 }
 
 template <>
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
index 48996f2a5..bc716cc8e 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
@@ -1846,8 +1846,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_3) {
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
     auto z = result->at(0);
-    //z->printIndexedBuffer("Output ");
-    //exp.printIndexedBuffer("Expected ");
+    exp.printIndexedBuffer("Expected ");
+    z->printIndexedBuffer("Output ");
 
     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));