diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt index e54c3ebe6..712d123be 100755 --- a/libnd4j/CMakeLists.txt +++ b/libnd4j/CMakeLists.txt @@ -1,17 +1,23 @@ cmake_minimum_required(VERSION 3.15) project(libnd4j) set(CMAKE_VERBOSE_MAKEFILE OFF) -option(NATIVE "Optimize for build machine (might not work on others)" OFF) + set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) #ensure we create lib files set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) -option(CHECK_VECTORIZATION "checks for vectorization" OFF) -option(BUILD_TESTS "Build tests" OFF) + +option(SD_NATIVE "Optimize for build machine (might not work on others)" OFF) +option(SD_CHECK_VECTORIZATION "checks for vectorization" OFF) +option(SD_BUILD_TESTS "Build tests" OFF) +option(SD_STATIC_LIB "Build static library" OFF) +option(SD_SHARED_LIB "Build shared library" ON) +option(SD_SANITIZE "Enable Address Sanitizer" OFF) + option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF) set(FLATBUFFERS_BUILD_FLATC "OFF" CACHE STRING "Hack to disable flatc build" FORCE) set(CMAKE_CXX_STANDARD 11) -if (CUDA_BLAS) +if (SD_CUDA) enable_language(CUDA) set(CMAKE_CUDA_STANDARD 11) @@ -23,23 +29,23 @@ endif() # MSVC runtime lib can be either "MultiThreaded" or "MultiThreadedDLL", /MT and /MD respectively set(MSVC_RT_LIB "MultiThreadedDLL") -set(X86_BUILD false) +set(SD_X86_BUILD false) -if (NOT IOS_BUILD AND NOT ANDROID_BUILD AND NOT ${ARCH} MATCHES "power*" AND NOT ${ARCH} MATCHES "arm*") - set(X86_BUILD true) +if (NOT SD_IOS_BUILD AND NOT SD_ANDROID_BUILD AND NOT ${SD_ARCH} MATCHES "power*" AND NOT ${SD_ARCH} MATCHES "arm*") - set(SD_X86_BUILD true) +endif() wait
-O0 -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else") elseif (APPLE) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true -D_RELEASE=true") set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fPIC -Wno-braced-scalar-init -Wno-delete-non-virtual-dtor -Wno-unused-command-line-argument -Wno-dangling-else -D__APPLE_OS__=true") elseif(WIN32) - set(X86_BUILD true) - if (CUDA_BLAS) + set(SD_X86_BUILD true) + if (SD_CUDA) set(CMAKE_CXX_FLAGS_RELEASE "-D_RELEASE=true") set(CMAKE_CXX_FLAGS_DEBUG " /FS /EHsc") else() @@ -50,14 +56,14 @@ else() set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fPIC -fmax-errors=2 -D_RELEASE=true") set(CMAKE_CXX_FLAGS_DEBUG " -g -O0 -fPIC -fmax-errors=2") - if (CPU_BLAS) + if (SD_CPU) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=address") endif() endif() -if(NATIVE) +if(SD_NATIVE) IF(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64*") - set(X86_BUILD false) + set(SD_X86_BUILD false) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=native") ELSE() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") @@ -65,7 +71,7 @@ if(NATIVE) endif() -if(NOT CUDA_BLAS) +if(NOT SD_CUDA) # we need this definition to avoid global memory use within mkldnn add_definitions(-DDNNL_ENABLE_CONCURRENT_EXEC=true) @@ -91,7 +97,7 @@ if(NOT CUDA_BLAS) endif() # building cpu_features - if (X86_BUILD) + if (SD_X86_BUILD) add_definitions(-DCPU_FEATURES=true) set(BUILD_PIC "ON" CACHE STRING "Hack to enforce fPIC mode" FORCE) configure_file(./CMakeLists.txt.cpu_features.in cpu_features-download/CMakeLists.txt) @@ -153,7 +159,7 @@ endif() if (${HELPERS_cudnn}) - if (NOT CUDA_BLAS) + if (NOT SD_CUDA) message(FATAL_ERROR "Can't build cuDNN on non-CUDA platform") endif() @@ -215,12 +221,12 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) if (NOT DEFINED ENV{CLION_IDE}) message("NOT CLION") - include_directories(blas/ 
include/ include/helpers include/loops include/graph include/execution include/ops include/types include/array include/cnpy include/exceptions) + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) add_subdirectory(blas) - if(BUILD_TESTS) + if(SD_BUILD_TESTS) # tests are always compiled with all ops included - set(LIBND4J_ALL_OPS true) - set(LIBND4J_BUILD_MINIFIER true) + set(SD_ALL_OPS true) + set(SD_BUILD_MINIFIER true) add_subdirectory(tests_cpu) endif() endif () @@ -230,7 +236,7 @@ if ($ENV{CLION_IDE}) endif () if (MSVC_DEV) - set(LIBND4J_BUILD_MINIFIER false) + set(SD_BUILD_MINIFIER false) endif () set (CMAKE_INSTALL_PREFIX $ENV{ND4J_HOME}/nd4j-native-parent/nd4j-native/src/main/resources) diff --git a/libnd4j/CMakeSettings.json b/libnd4j/CMakeSettings.json index 867132ab2..fe7790fa0 100644 --- a/libnd4j/CMakeSettings.json +++ b/libnd4j/CMakeSettings.json @@ -9,7 +9,7 @@ ], "buildRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\build\\${name}", "installRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\install\\${name}", - "cmakeCommandArgs": " -DCUDA_BLAS=true -DLIBND4J_NAME=nd4jcuda -DMSVC_DEV=true -DCOMPUTE=61 -DBUILD_TESTS=true", + "cmakeCommandArgs": " -DSD_CUDA=true -DLIBND4J_NAME=nd4jcuda -DMSVC_DEV=true -DCOMPUTE=61 -DBUILD_TESTS=true", "buildCommandArgs": "-v", "ctestCommandArgs": "" }, @@ -20,7 +20,7 @@ "buildRoot": "${projectDir}\\out\\build\\${name}", "installRoot": "${projectDir}\\out\\install\\${name}", "cmakeExecutable": "/usr/bin/cmake", - "cmakeCommandArgs": "-DLIBND4J_ALL_OPS=true -DCMAKE_BUILD_TYPE=Debug -DCPU_BLAS=true -DLIBND4J_NAME=nd4jcpu -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug -DOPENBLAS_PATH=/usr/lib/openblas-base/ -DEXTENSION=avx2 ", + "cmakeCommandArgs": "-DSD_ALL_OPS=true -DCMAKE_BUILD_TYPE=Debug -DSD_CPU=true -DLIBND4J_NAME=nd4jcpu -DBUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Debug -DOPENBLAS_PATH=/usr/lib/openblas-base/ -DEXTENSION=avx2 ", "buildCommandArgs": "-j 4", "ctestCommandArgs": "", "inheritEnvironments": [ 
"linux_x64" ], diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt index 2a12d5b9c..2dccc680f 100755 --- a/libnd4j/blas/CMakeLists.txt +++ b/libnd4j/blas/CMakeLists.txt @@ -29,24 +29,24 @@ if(APPLE) link_directories(/lib) endif() -if (APPLE_BUILD) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DAPPLE_BUILD=true -mmacosx-version-min=10.10") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DAPPLE_BUILD=true -mmacosx-version-min=10.10") +if (SD_APPLE_BUILD) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_APPLE_BUILD=true -mmacosx-version-min=10.10") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSD_APPLE_BUILD=true -mmacosx-version-min=10.10") endif() -if (ARM_BUILD) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DARM_BUILD=true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DARM_BUILD=true") +if (SD_ARM_BUILD) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_ARM_BUILD=true") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSD_ARM_BUILD=true") endif() -if (ANDROID_BUILD) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DANDROID_BUILD=true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DANDROID_BUILD=true") +if (SD_ANDROID_BUILD) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_ANDROID_BUILD=true") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSD_ANDROID_BUILD=true") endif() -if (IOS_BUILD) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIOS_BUILD=true") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIOS_BUILD=true") +if (SD_IOS_BUILD) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_IOS_BUILD=true") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSD_IOS_BUILD=true") endif() if(WIN32) @@ -68,33 +68,33 @@ if(WIN32) SET(CMAKE_NINJA_FORCE_RESPONSE_FILE 1 CACHE INTERNAL "") endif() -if ("${LIBND4J_ALL_OPS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true") +if ("${SD_ALL_OPS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_ALL_OPS=true") else() - message("_OPS: ${LIBND4J_OPS_LIST}") - foreach(OP "${LIBND4J_OPS_LIST}") + message("_OPS: ${SD_OPS_LIST}") + foreach(OP "${SD_OPS_LIST}") message(STATUS "${OP}") endforeach() 
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LIBND4J_OPS_LIST}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SD_OPS_LIST}") endif() -IF(${ARCH} MATCHES "arm*") - set(ARCH_TUNE "-march=${ARCH}") -ELSEIF(${ARCH} MATCHES "power*") - set(ARCH_TUNE "-mcpu=${ARCH} -mtune=${ARCH} -D__POWER") -ELSEIF(${EXTENSION} MATCHES "avx2") +IF(${SD_ARCH} MATCHES "arm*") + set(ARCH_TUNE "-march=${SD_ARCH}") +ELSEIF(${SD_ARCH} MATCHES "power*") + set(ARCH_TUNE "-mcpu=${SD_ARCH} -mtune=${SD_ARCH} -D__POWER") +ELSEIF(${SD_EXTENSION} MATCHES "avx2") message("Building AVX2 binary...") set(ARCH_TUNE "-mmmx -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx -mavx2 -mfma -mf16c -mprefetchwt1 -DSD_F16C=true -DF_AVX2=true") ELSE() - if ("${ARCH}" STREQUAL "x86-64") + if ("${SD_ARCH}" STREQUAL "x86-64") message("Building x86_64 binary...") set(ARCH_TYPE "generic") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DF_X64=true") else() - set(ARCH_TYPE "${ARCH}") + set(ARCH_TYPE "${SD_ARCH}") endif() - IF(${EXTENSION} MATCHES "avx512") + IF(${SD_EXTENSION} MATCHES "avx512") message("Building AVX512 binary...") # we need to set flag here, that we can use hardware f16 conversion + tell that cpu features should be tracked set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmmx -msse -msse2 -msse3 -msse4.1 -msse4.2 -mavx -mavx2 -mfma -mf16c -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -mbmi -mbmi2 -mprefetchwt1 -mclflushopt -mxsavec -mxsaves -DSD_F16C=true -DF_AVX512=true") @@ -102,11 +102,11 @@ ELSE() if (NOT WIN32) # we don't want this definition for msvc - set(ARCH_TUNE "-march=${ARCH} -mtune=${ARCH_TYPE}") + set(ARCH_TUNE "-march=${SD_ARCH} -mtune=${ARCH_TYPE}") endif() ENDIF() -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang" AND X86_BUILD) +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang" AND SD_X86_BUILD) # apple clang but not ios-arm SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") @@ -129,10 +129,10 @@ IF(${CMAKE_SYSTEM_NAME} MATCHES "Linux") 
include_directories("/usr/include") include_directories("/usr/local/include") ENDIF(${CMAKE_SYSTEM_NAME} MATCHES "Linux") -if(!CUDA_BLAS) - if(!CPU_BLAS) - set(CUDA_BLAS FALSE) - set(CPU_BLAS TRUE) +if(!SD_CUDA) + if(!SD_CPU) + set(SD_CUDA FALSE) + set(SD_CPU TRUE) endif() endif() @@ -141,7 +141,7 @@ if (HAVE_MKLDNN) file(GLOB_RECURSE CUSTOMOPS_MKLDNN_SOURCES false ../include/ops/declarable/platform/mkldnn/*.cpp ../include/ops/declarable/platform/mkldnn/mkldnnUtils.h) endif() -if(CUDA_BLAS) +if(SD_CUDA) message("Build cublas") find_package(CUDA) add_definitions(-D__CUDABLAS__=true) @@ -154,7 +154,7 @@ if(CUDA_BLAS) include_directories(${CUDA_INCLUDE_DIRS}) message("CUDA found!") - if ("${EXPERIMENTAL}" STREQUAL "yes") + if ("${SD_EXPERIMENTAL}" STREQUAL "yes") message("Experimental mode ENABLED") set(CMAKE_CUDA_FLAGS " ${CMAKE_CUDA_FLAGS} -D__ND4J_EXPERIMENTAL__=true") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ND4J_EXPERIMENTAL__=true") @@ -218,6 +218,7 @@ if(CUDA_BLAS) file(GLOB_RECURSE HELPERS_SOURCES false ../include/helpers/impl/*.cpp ../include/helpers/*.cu ../include/helpers/*.cupp ../include/helpers/*.h) file(GLOB_RECURSE INDEXING_SOURCES false ../include/indexing/*.cpp ../include/indexing/*.h) file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/impl/*.cpp ../include/loops/*.h) + file(GLOB_RECURSE LEGACY_SOURCES false ../include/legacy/impl/*.cpp ../include/legacy/*.cu ../include/legacy/*.h) file(GLOB_RECURSE LOOPS_SOURCES_CUDA false ../include/loops/*.cu) if (HAVE_CUDNN) @@ -225,43 +226,41 @@ if(CUDA_BLAS) file(GLOB_RECURSE CUSTOMOPS_CUDNN_SOURCES false ../include/ops/declarable/platform/cudnn/*.cu) endif() - add_library(nd4jobj OBJECT cuda/NativeOps.cu cuda/NativeOpExecutioner.cu cuda/BlasVersionHelper.cu Environment.cpp ${LOOPS_SOURCES_CUDA} + add_library(nd4jobj OBJECT ${LOOPS_SOURCES_CUDA} ${LEGACY_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} - ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h - 
cpu/GraphExecutioner.cpp cuda/NDArray.cu cpu/NDArrayFactory.cpp - Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} + ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${EXCEPTIONS_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES} ${CUSTOMOPS_CUDNN_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES}) - add_library(${LIBND4J_NAME} SHARED $) + add_library(${SD_LIBRARY_NAME} SHARED $) if (WIN32) message("MSVC runtime for library: ${MSVC_RT_LIB}") endif() # static library is built only if we're going to build tests, skip otherwise - if (BUILD_TESTS) - add_library(${LIBND4J_NAME}static STATIC $) - set_property(TARGET ${LIBND4J_NAME}static PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") - install(TARGETS ${LIBND4J_NAME}static DESTINATION .) + if (SD_BUILD_TESTS OR SD_STATIC_LIB) + add_library(${SD_LIBRARY_NAME}static STATIC $) + set_property(TARGET ${SD_LIBRARY_NAME}static PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") + install(TARGETS ${SD_LIBRARY_NAME}static DESTINATION .) endif() # on windows we want to make sure we use MT or MD, but since we use it in one lib, we must use it everywhere to avoid conflicts set_property(TARGET nd4jobj PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") - set_property(TARGET ${LIBND4J_NAME} PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") + set_property(TARGET ${SD_LIBRARY_NAME} PROPERTY MSVC_RUNTIME_LIBRARY "${MSVC_RT_LIB}$<$:Debug>") if(WIN32) message("CUDA on Windows: enabling /EHsc") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /bigobj /std:c++14") endif() - target_link_libraries(${LIBND4J_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN}) + target_link_libraries(${SD_LIBRARY_NAME} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cuda) - install(TARGETS ${LIBND4J_NAME} DESTINATION .) 
+ install(TARGETS ${SD_LIBRARY_NAME} DESTINATION .) endif(CUDA_FOUND) -elseif(CPU_BLAS) +elseif(SD_CPU) - if ("${EXPERIMENTAL}" STREQUAL "yes") + if ("${SD_EXPERIMENTAL}" STREQUAL "yes") message("Experimental mode ENABLED") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ND4J_EXPERIMENTAL__=true") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true") @@ -279,15 +278,16 @@ elseif(CPU_BLAS) file(GLOB_RECURSE OPS_SOURCES false ../include/ops/impl/*.cpp ../include/ops/declarable/impl/*.cpp ../include/ops/*.h) file(GLOB_RECURSE INDEXING_SOURCES false ../include/indexing/*.cpp ../include/indexing/*.h) file(GLOB_RECURSE HELPERS_SOURCES false ../include/helpers/*.cpp ../include/helpers/*.h) + file(GLOB_RECURSE LEGACY_SOURCES false ../include/legacy/impl/*.cpp ../include/legacy/cpu/*.cpp ../include/legacy/*.h) file(GLOB_RECURSE LOOPS_SOURCES false ../include/loops/*.cpp ../include/loops/*.h) - if (X86_BUILD) + if (SD_X86_BUILD) # we disable platform optimizations for certains files for linux/macos set_source_files_properties(cpu/NativeOps.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic") set_source_files_properties(../include/helpers/impl/OpTracker.cpp PROPERTIES COMPILE_FLAGS "-march=x86-64 -mtune=generic") endif() - if(CHECK_VECTORIZATION) + if(SD_CHECK_VECTORIZATION) set(VECT_FILES cpu/NativeOps.cpp ${OPS_SOURCES} ${HELPERS_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${LOOPS_SOURCES}) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") @@ -315,33 +315,31 @@ elseif(CPU_BLAS) message("CPU BLAS") add_definitions(-D__CPUBLAS__=true) - add_library(nd4jobj OBJECT cpu/NativeOps.cpp cpu/GraphExecutioner.cpp - cpu/NativeOpExecutioner.cpp cpu/NDArray.cpp cpu/NDArrayFactory.cpp - ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h - Environment.cpp Environment.h ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} + add_library(nd4jobj OBJECT ${LEGACY_SOURCES} + ${LOOPS_SOURCES} ${HELPERS_SOURCES} ${EXEC_SOURCES} 
${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_MKLDNN_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${OPS_SOURCES} ${PERF_SOURCES}) if(IOS) - add_library(${LIBND4J_NAME} STATIC $) + add_library(${SD_LIBRARY_NAME} STATIC $) else() # static library is built only if we're going to build tests, skip otherwise - if (BUILD_TESTS) - add_library(${LIBND4J_NAME}static STATIC $) + if (SD_BUILD_TESTS OR SD_STATIC_LIB) + add_library(${SD_LIBRARY_NAME}static STATIC $) endif() - add_library(${LIBND4J_NAME} SHARED $) + add_library(${SD_LIBRARY_NAME} SHARED $) endif() # we're including {MKLDNN} here in case of building from sources. in future that'll replace {MKLDNN_LIBRARIES}. same applies to BLAS if (NOT BLAS_LIBRARIES) set(BLAS_LIBRARIES "") endif() - target_link_libraries(${LIBND4J_NAME} ${MKLDNN} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES}) + target_link_libraries(${SD_LIBRARY_NAME} ${MKLDNN} ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${BLAS_LIBRARIES} ${CPU_FEATURES}) - if ("${LIBND4J_ALL_OPS}" AND "${LIBND4J_BUILD_MINIFIER}") + if ("${SD_ALL_OPS}" AND "${SD_BUILD_MINIFIER}") message(STATUS "Building minifier...") add_executable(minifier ../minifier/minifier.cpp ../minifier/graphopt.cpp) - target_link_libraries(minifier ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES}) + target_link_libraries(minifier ${SD_LIBRARY_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES}) endif() if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 4.9) @@ -362,6 +360,6 @@ elseif(CPU_BLAS) SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -export-dynamic") endif() - install(TARGETS ${LIBND4J_NAME} DESTINATION .) + install(TARGETS ${SD_LIBRARY_NAME} DESTINATION .) 
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/cpu) endif() diff --git a/libnd4j/blas/NDArrayFactory.h b/libnd4j/blas/NDArrayFactory.h deleted file mode 100644 index bff199d08..000000000 --- a/libnd4j/blas/NDArrayFactory.h +++ /dev/null @@ -1,191 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * Copyright (c) 2019-2020 Konduit K.K. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// Created by raver119 on 2018-09-16. 
-// @author Oleg Semeniv -// - -#ifndef DEV_TESTS_NDARRAYFACTORY_H -#define DEV_TESTS_NDARRAYFACTORY_H - -#include -#include -#include -//#include -#include -#include - - -namespace nd4j { - class ND4J_EXPORT NDArrayFactory { - private: - template - static void memcpyFromVector(void *ptr, const std::vector &vector); - public: - template - static NDArray* empty_(nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - static NDArray* empty_(nd4j::DataType dataType, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray empty(nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - static NDArray empty(nd4j::DataType dataType, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray* valueOf(const std::initializer_list& shape, T value, char order = 'c', nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray* valueOf(const std::vector& shape, T value, char order = 'c', nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - static NDArray* valueOf(const std::vector& shape, const NDArray& value, char order = 'c', nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray* linspace(T from, T to, Nd4jLong numElements); - - - template - static NDArray* create_(const T value, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray* create_(nd4j::DataType dtype, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray create(const T value, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray create(nd4j::DataType dtype, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - template - static NDArray create(DataType type, const T scalar, nd4j::LaunchContext * context = nd4j::LaunchContext 
::defaultContext()); - - - template - static NDArray* vector(Nd4jLong length, T startingValue = (T) 0, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray* create_(char order, const std::vector &shape, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - static NDArray* create_( char order, const std::vector &shape, nd4j::DataType dataType, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray* create_(char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray create(char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray create(char order, const std::vector &shape, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray create(char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray create(const std::vector &values, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - -#ifndef __JAVACPP_HACK__ - // this method only available out of javacpp - /** - * This constructor creates vector of T - * - * @param values - */ - - template - static NDArray create(char order, const std::initializer_list& shape, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray create(T* buffer, char order, const std::initializer_list& shape, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - template - static NDArray create(char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - /** - * This method creates NDArray from .npy file - * 
@param fileName - * @return - */ - static NDArray fromNpyFile(const char *fileName); - - /** - * This factory create array from utf8 string - * @return NDArray default dataType UTF8 - */ - static NDArray string(const char *string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray* string_(const char *string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray* string_(const std::string &string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray string(const std::string& string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - - /** - * This factory create array from utf16 string - * @return NDArray default dataType UTF16 - */ - static NDArray string(const char16_t* u16string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_(const char16_t* u16string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_(const std::u16string& u16string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string(const std::u16string& u16string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - - /** - * This factory create array from utf32 string - * @return NDArray default dataType UTF32 - */ - static NDArray string(const char32_t* u32string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_(const char32_t* u32string, nd4j::DataType dtype = 
nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_(const std::u32string& u32string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string(const std::u32string& u32string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - - /** - * This factory create array from vector of utf8 strings - * @return NDArray default dataType UTF8 - */ - static NDArray string( const std::vector &shape, const std::initializer_list &strings, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray string( const std::vector &shape, const std::initializer_list &string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray string( const std::vector &shape, const std::vector &strings, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray string( const std::vector &shape, const std::vector &string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray* string_( const std::vector &shape, const std::initializer_list &strings, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray* string_( const std::vector &shape, const std::initializer_list &string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray* string_( const std::vector &shape, const std::vector &strings, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static NDArray* 
string_( const std::vector &shape, const std::vector &string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - - /** - * This factory create array from vector of utf16 strings - * @return NDArray default dataType UTF16 - */ - static NDArray string( const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string( const std::vector& shape, const std::vector& strings, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string( const std::vector& shape, const std::vector& string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::vector& strings, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::vector& string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - - /** - * This factory create array from vector of utf32 strings - * @return NDArray 
default dataType UTF32 - */ - static NDArray string( const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string( const std::vector& shape, const std::vector& strings, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray string( const std::vector& shape, const std::vector& string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::vector& strings, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - static NDArray* string_( const std::vector& shape, const std::vector& string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - - - static ResultSet createSetOfArrs(const Nd4jLong numOfArrs, const void* buffer, const Nd4jLong* shapeInfo, const Nd4jLong* offsets, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - -#endif - }; -} - -#endif //DEV_TESTS_NDARRAYFACTORY_H diff --git a/libnd4j/blas/cpu/NDArray.macro b/libnd4j/blas/cpu/NDArray.macro deleted file 
mode 100644 index ae6db6962..000000000 --- a/libnd4j/blas/cpu/NDArray.macro +++ /dev/null @@ -1,148 +0,0 @@ -################################################################################ -# Copyright (c) 2015-2018 Skymind, Inc. -# -# This program and the accompanying materials are made available under the -# terms of the Apache License, Version 2.0 which is available at -# https://www.apache.org/licenses/LICENSE-2.0. -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-License-Identifier: Apache-2.0 -################################################################################ - -#ifndef NDARRAY_MACRO -#define NDARRAY_MACRO - -#include - -//NDArray *other, T *extraParams -BUILD_CALL_1(template void NDArray::template applyPairwiseTransform, float, (NDArray* other, float* extraParams), PAIRWISE_TRANSFORM_OPS) -BUILD_CALL_1(template void NDArray::applyPairwiseTransform, float16, (NDArray* other, float16* extraParams), PAIRWISE_TRANSFORM_OPS) -BUILD_CALL_1(template void NDArray::applyPairwiseTransform, double, (NDArray* other, double* extraParams), PAIRWISE_TRANSFORM_OPS) - -// NDArray *other, NDArray *target, T *extraParams -BUILD_CALL_1(template void nd4j::NDArray::applyPairwiseTransform, float, (NDArray* other, NDArray* target, float* extraParams), PAIRWISE_TRANSFORM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyPairwiseTransform, float16, (NDArray* other, NDArray* target, float16* extraParams), PAIRWISE_TRANSFORM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyPairwiseTransform, double, (NDArray* other, NDArray* target, double* extraParams), PAIRWISE_TRANSFORM_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::applyScalar, float16, (NDArray& scalar, NDArray* target, 
float16 *extraParams) const, SCALAR_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyScalar, float16, (float16 scalar, NDArray* target, float16 *extraParams) const, SCALAR_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::applyScalar, float, (NDArray& scalar, NDArray* target, float *extraParams) const, SCALAR_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyScalar, float, (float scalar, NDArray* target, float *extraParams) const, SCALAR_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::applyScalar, double, (NDArray& scalar, NDArray* target, double *extraParams) const, SCALAR_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyScalar, double, (double scalar, NDArray* target, double *extraParams) const, SCALAR_OPS) - - - -BUILD_CALL_1(template float16 nd4j::NDArray::reduceNumber, float16, (float16 *extraParams) const, REDUCE_OPS) -BUILD_CALL_1(template float nd4j::NDArray::reduceNumber, float, (float *extraParams) const, REDUCE_OPS) -BUILD_CALL_1(template double nd4j::NDArray::reduceNumber, double, (double *extraParams) const, REDUCE_OPS) - -BUILD_CALL_1(template Nd4jLong nd4j::NDArray::indexReduceNumber, float16, (float16 *extraParams), INDEX_REDUCE_OPS) -BUILD_CALL_1(template Nd4jLong nd4j::NDArray::indexReduceNumber, float, (float *extraParams), INDEX_REDUCE_OPS) -BUILD_CALL_1(template Nd4jLong nd4j::NDArray::indexReduceNumber, double, (double *extraParams), INDEX_REDUCE_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::applyBroadcast, float16, (std::initializer_list list, const nd4j::NDArray* a, nd4j::NDArray* b, float16* c), BROADCAST_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyBroadcast, float, (std::initializer_list list, const nd4j::NDArray* a, nd4j::NDArray* b, float* c), BROADCAST_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyBroadcast, double, (std::initializer_list list, const nd4j::NDArray* a, nd4j::NDArray* b, double* c), BROADCAST_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::applyTrueBroadcast, float16,(const 
nd4j::NDArray* a, nd4j::NDArray* target, const bool checkTargetShape, float16* c) const, BROADCAST_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyTrueBroadcast, float, (const nd4j::NDArray* a, nd4j::NDArray* target, const bool checkTargetShape, float* c) const, BROADCAST_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyTrueBroadcast, double, (const nd4j::NDArray* a, nd4j::NDArray* target, const bool checkTargetShape, double* c) const, BROADCAST_OPS) - -BUILD_CALL_1(template nd4j::NDArray* nd4j::NDArray::applyTrueBroadcast, float16, (const nd4j::NDArray* a, float16* c) const, BROADCAST_OPS) -BUILD_CALL_1(template nd4j::NDArray* nd4j::NDArray::applyTrueBroadcast, float, (const nd4j::NDArray* a, float* c) const, BROADCAST_OPS) -BUILD_CALL_1(template nd4j::NDArray* nd4j::NDArray::applyTrueBroadcast, double, (const nd4j::NDArray* a, double* c) const, BROADCAST_OPS) - -BUILD_CALL_1(template nd4j::NDArray nd4j::NDArray::applyTrueBroadcast, float16, (const nd4j::NDArray& a, float16* c) const, BROADCAST_OPS) -BUILD_CALL_1(template nd4j::NDArray nd4j::NDArray::applyTrueBroadcast, float, (const nd4j::NDArray& a, float* c) const, BROADCAST_OPS) -BUILD_CALL_1(template nd4j::NDArray nd4j::NDArray::applyTrueBroadcast, double, (const nd4j::NDArray& a, double* c) const, BROADCAST_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::applyTransform, float16, (NDArray* target, float16* extraParams), TRANSFORM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyTransform, float, (NDArray* target, float* extraParams), TRANSFORM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyTransform, double, (NDArray* target, double* extraParams), TRANSFORM_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::applyTransform, float16, (float16* extraParams), TRANSFORM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyTransform, float, (float* extraParams), TRANSFORM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyTransform, double, (double* extraParams), TRANSFORM_OPS) - 
-BUILD_CALL_1(template void nd4j::NDArray::applyRandom, float16, (nd4j::random::RandomBuffer *buffer, NDArray* y, NDArray* z, float16* extraParams), RANDOM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyRandom, float, (nd4j::random::RandomBuffer *buffer, NDArray* y, NDArray* z, float* extraParams), RANDOM_OPS) -BUILD_CALL_1(template void nd4j::NDArray::applyRandom, double, (nd4j::random::RandomBuffer *buffer, NDArray* y, NDArray* z, double* extraParams), RANDOM_OPS) - -BUILD_CALL_1(template NDArray nd4j::NDArray::transform, float16, (float16* extraParams) const, TRANSFORM_OPS) -BUILD_CALL_1(template NDArray nd4j::NDArray::transform, float, (float* extraParams) const, TRANSFORM_OPS) -BUILD_CALL_1(template NDArray nd4j::NDArray::transform, double, (double* extraParams) const, TRANSFORM_OPS) - -BUILD_CALL_1(template NDArray *nd4j::NDArray::template reduceAlongDimension, float, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template reduceAlongDimension, float16, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template reduceAlongDimension, double, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) - -BUILD_CALL_1(template NDArray nd4j::NDArray::template reduceAlongDims, float, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) -BUILD_CALL_1(template NDArray nd4j::NDArray::template reduceAlongDims, float16, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) -BUILD_CALL_1(template NDArray nd4j::NDArray::template reduceAlongDims, double, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) - -BUILD_CALL_1(template NDArray *nd4j::NDArray::template reduceAlongDimension, 
float, (const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template reduceAlongDimension, float16, (const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template reduceAlongDimension, double, (const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::template reduceAlongDimension, float, (NDArray* target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, float * extras) const, REDUCE_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template reduceAlongDimension, float16, (NDArray* target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, float16 * extras) const, REDUCE_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template reduceAlongDimension, double, (NDArray* target, const std::vector& dimension, const bool keepDims, const bool supportOldShapes, double * extras) const, REDUCE_OPS) - -BUILD_CALL_1(template NDArray *nd4j::NDArray::template varianceAlongDimension, float, (const bool biasCorrected, const std::initializer_list& dimensions) const, SUMMARY_STATS_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template varianceAlongDimension, float16, (const bool biasCorrected, const std::initializer_list& dimensions) const, SUMMARY_STATS_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template varianceAlongDimension, double, (const bool biasCorrected, const std::initializer_list& dimensions) const, SUMMARY_STATS_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::template varianceAlongDimension, float, (const NDArray *target, const bool biasCorrected, const std::initializer_list& dimensions), SUMMARY_STATS_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template varianceAlongDimension, float16, 
(const NDArray *target,const bool biasCorrected, const std::initializer_list& dimensions), SUMMARY_STATS_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template varianceAlongDimension, double, (const NDArray *target, const bool biasCorrected, const std::initializer_list& dimensions), SUMMARY_STATS_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::template varianceAlongDimension, float, (const NDArray *target, const bool biasCorrected, const std::vector& dimensions), SUMMARY_STATS_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template varianceAlongDimension, float16, (const NDArray *target,const bool biasCorrected, const std::vector& dimensions), SUMMARY_STATS_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template varianceAlongDimension, double, (const NDArray *target, const bool biasCorrected, const std::vector& dimensions), SUMMARY_STATS_OPS) - -BUILD_CALL_1(template float nd4j::NDArray::template varianceNumber, float, (bool biasCorrected), SUMMARY_STATS_OPS) -BUILD_CALL_1(template float16 nd4j::NDArray::template varianceNumber, float16, (bool biasCorrected), SUMMARY_STATS_OPS) -BUILD_CALL_1(template double nd4j::NDArray::template varianceNumber, double, (bool biasCorrected), SUMMARY_STATS_OPS) - -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyReduce3, float, (const NDArray* other, const float* extraParams) const, REDUCE3_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyReduce3, float16, (const NDArray* other, const float16* extraParams) const, REDUCE3_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyReduce3, double, (const NDArray* other, const double* extraParams) const, REDUCE3_OPS) - -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyReduce3, float, (const NDArray* other, const std::vector &dims, const float* extraParams) const, REDUCE3_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyReduce3, float16, (const NDArray* other, const std::vector &dims, const float16* extraParams) 
const, REDUCE3_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyReduce3, double, (const NDArray* other, const std::vector &dims, const double* extraParams) const, REDUCE3_OPS) - -BUILD_CALL_1(template void nd4j::NDArray::template applyIndexReduce, float, (const NDArray* target, const std::vector & alpha, const float* beta) const, INDEX_REDUCE_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template applyIndexReduce, float16, (const NDArray* target, const std::vector & alpha, const float16* beta) const, INDEX_REDUCE_OPS) -BUILD_CALL_1(template void nd4j::NDArray::template applyIndexReduce, double, (const NDArray* target, const std::vector & alpha, const double* beta) const, INDEX_REDUCE_OPS) - -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyIndexReduce, float, (const std::vector & alpha, const float* beta) const, INDEX_REDUCE_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyIndexReduce, float16, (const std::vector & alpha, const float16* beta) const, INDEX_REDUCE_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyIndexReduce, double, (const std::vector & alpha, const double* beta) const, INDEX_REDUCE_OPS) - -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyAllReduce3, float, (const nd4j::NDArray* alpha, const std::vector & beta, float const* gamma) const, REDUCE3_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyAllReduce3, float16, (const nd4j::NDArray* alpha, const std::vector & beta, float16 const* gamma) const, REDUCE3_OPS) -BUILD_CALL_1(template NDArray *nd4j::NDArray::template applyAllReduce3, double, (const nd4j::NDArray* alpha, const std::vector & beta, double const* gamma) const, REDUCE3_OPS) - -template NDArray mmul(const NDArray& left, const NDArray& right); -template NDArray mmul(const NDArray& left, const NDArray& right); -template NDArray mmul(const NDArray& left, const NDArray& right); - -// template NDArray operator-(const float, const NDArray&); -// template 
NDArray operator-(const float16, const NDArray&); -// template NDArray operator-(const double, const NDArray&); - -// template NDArray operator+(const float, const NDArray&); -// template NDArray operator+(const float16, const NDArray&); -// template NDArray operator+(const double, const NDArray&); - - -#endif \ No newline at end of file diff --git a/libnd4j/buildnativeoperations.sh b/libnd4j/buildnativeoperations.sh index ae3fac13a..380238554 100755 --- a/libnd4j/buildnativeoperations.sh +++ b/libnd4j/buildnativeoperations.sh @@ -173,7 +173,7 @@ fi case "$OS" in linux-armhf) export RPI_BIN=$RPI_HOME/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/arm-linux-gnueabihf - export CMAKE_COMMAND="$CMAKE_COMMAND -D CMAKE_TOOLCHAIN_FILE=cmake/rpi.cmake -DARM_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -D CMAKE_TOOLCHAIN_FILE=cmake/rpi.cmake -DSD_ARM_BUILD=true" if [ -z "$ARCH" ]; then ARCH="armv7-r" fi @@ -183,7 +183,7 @@ case "$OS" in if [ -z "$ARCH" ]; then ARCH="armv8-a" fi - export CMAKE_COMMAND="$CMAKE_COMMAND -DARM_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DSD_ARM_BUILD=true" ;; android-arm) @@ -194,7 +194,7 @@ case "$OS" in export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/" export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang" export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-arm/" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-arm.cmake -DANDROID_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-arm.cmake -DSD_ANDROID_BUILD=true" ;; android-arm64) @@ -205,7 +205,7 @@ case "$OS" in export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/" export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang" export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-arm64/" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-arm64.cmake -DANDROID_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND 
-DCMAKE_TOOLCHAIN_FILE=cmake/android-arm64.cmake -DSD_ANDROID_BUILD=true" ;; android-x86) @@ -216,7 +216,7 @@ case "$OS" in export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/" export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang" export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-x86/" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-x86.cmake -DANDROID_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-x86.cmake -DSD_ANDROID_BUILD=true" ;; android-x86_64) @@ -227,7 +227,7 @@ case "$OS" in export ANDROID_CPP="$ANDROID_NDK/sources/cxx-stl/llvm-libc++/" export ANDROID_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/$KERNEL/bin/clang" export ANDROID_ROOT="$ANDROID_NDK/platforms/android-21/arch-x86_64/" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-x86_64.cmake -DANDROID_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/android-x86_64.cmake -DSD_ANDROID_BUILD=true" ;; ios-x86_64) @@ -240,7 +240,7 @@ case "$OS" in fi XCODE_PATH="$(xcode-select --print-path)" export IOS_SDK="$XCODE_PATH/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator$IOS_VERSION.sdk" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-x86_64.cmake --debug-trycompile -DIOS_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-x86_64.cmake --debug-trycompile -DSD_IOS_BUILD=true" ;; ios-x86) @@ -253,7 +253,7 @@ case "$OS" in fi XCODE_PATH="$(xcode-select --print-path)" export IOS_SDK="$XCODE_PATH/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator$IOS_VERSION.sdk" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-x86.cmake --debug-trycompile -DIOS_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-x86.cmake --debug-trycompile -DSD_IOS_BUILD=true" ;; ios-arm64) @@ -266,7 +266,7 @@ case "$OS" in fi 
XCODE_PATH="$(xcode-select --print-path)" export IOS_SDK="$XCODE_PATH/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS$IOS_VERSION.sdk" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-arm64.cmake --debug-trycompile -DIOS_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-arm64.cmake --debug-trycompile -DSD_IOS_BUILD=true" ;; ios-arm) @@ -279,7 +279,7 @@ case "$OS" in fi XCODE_PATH="$(xcode-select --print-path)" export IOS_SDK="$XCODE_PATH/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS$IOS_VERSION.sdk" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-arm.cmake --debug-trycompile -DIOS_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-arm.cmake --debug-trycompile -DSD_IOS_BUILD=true" ;; ios-armv7) @@ -289,7 +289,7 @@ case "$OS" in LIBTYPE="static" ARCH="armv7" export IOS_SDK="/Applications/Xcode.app/Contents/Developer/Platforms/${iPhoneOS}.platform/Developer/SDKs/${iPhoneOS}${IOS_VERSION}.sdk" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-armv7.cmake --debug-trycompile -DIOS_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_TOOLCHAIN_FILE=cmake/ios-armv7.cmake --debug-trycompile -DSD_IOS_BUILD=true" ;; linux*) @@ -299,7 +299,7 @@ case "$OS" in export CC=clang export CXX=clang++ PARALLEL="true" - export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_MACOSX_RPATH=ON -DAPPLE_BUILD=true" + export CMAKE_COMMAND="$CMAKE_COMMAND -DCMAKE_MACOSX_RPATH=ON -DSD_APPLE_BUILD=true" ;; windows*) @@ -376,7 +376,7 @@ fi OPERATIONS_ARG= if [ -z "$OPERATIONS" ]; then - OPERATIONS_ARG="-DLIBND4J_ALL_OPS=true" + OPERATIONS_ARG="-DSD_ALL_OPS=true" else OPERATIONS_ARG=$OPERATIONS fi @@ -386,9 +386,9 @@ if [ -z "$EXPERIMENTAL" ]; then fi if [ "$CHIP" == "cpu" ]; then - BLAS_ARG="-DCPU_BLAS=true -DBLAS=TRUE" + BLAS_ARG="-DSD_CPU=true -DBLAS=TRUE" else - BLAS_ARG="-DCUDA_BLAS=true -DBLAS=TRUE" + BLAS_ARG="-DSD_CUDA=true -DBLAS=TRUE" fi if [ -z 
"$NAME" ]; then @@ -400,9 +400,9 @@ if [ -z "$NAME" ]; then fi if [ "$LIBTYPE" == "dynamic" ]; then - SHARED_LIBS_ARG="-DBUILD_SHARED_LIBS=OFF" + SHARED_LIBS_ARG="-DSD_SHARED_LIB=OFF" else - SHARED_LIBS_ARG="-DBUILD_SHARED_LIBS=ON" + SHARED_LIBS_ARG="-DSD_SHARED_LIB=ON" fi if [ "$BUILD" == "release" ]; then @@ -429,24 +429,24 @@ if [ "$PACKAGING" == "msi" ]; then fi EXPERIMENTAL_ARG=""; -MINIFIER_ARG="-DLIBND4J_BUILD_MINIFIER=false" -TESTS_ARG="-DBUILD_TESTS=OFF" -NAME_ARG="-DLIBND4J_NAME=$NAME" +MINIFIER_ARG="-DSD_BUILD_MINIFIER=false" +TESTS_ARG="-DSD_BUILD_TESTS=OFF" +NAME_ARG="-DSD_LIBRARY_NAME=$NAME" if [ "$EXPERIMENTAL" == "yes" ]; then - EXPERIMENTAL_ARG="-DEXPERIMENTAL=yes" + EXPERIMENTAL_ARG="-DSD_EXPERIMENTAL=yes" fi if [ "$MINIFIER" == "true" ]; then - MINIFIER_ARG="-DLIBND4J_BUILD_MINIFIER=true" + MINIFIER_ARG="-DSD_BUILD_MINIFIER=true" fi if [ "$TESTS" == "true" ]; then - MINIFIER_ARG="-DLIBND4J_BUILD_MINIFIER=true" - TESTS_ARG="-DBUILD_TESTS=ON" + MINIFIER_ARG="-DSD_BUILD_MINIFIER=true" + TESTS_ARG="-DSD_BUILD_TESTS=ON" fi -ARCH_ARG="-DARCH=$ARCH -DEXTENSION=$CHIP_EXTENSION" +ARCH_ARG="-DSD_ARCH=$ARCH -DSD_EXTENSION=$CHIP_EXTENSION" CUDA_COMPUTE="-DCOMPUTE=$COMPUTE" @@ -537,7 +537,7 @@ echo CHECK_VECTORIZATION = "$CHECK_VECTORIZATION" echo HELPERS = "$HELPERS" mkbuilddir pwd -eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DCHECK_VECTORIZATION="${CHECK_VECTORIZATION}" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. 
+eval $CMAKE_COMMAND "$BLAS_ARG" "$ARCH_ARG" "$NAME_ARG" -DSD_CHECK_VECTORIZATION="${CHECK_VECTORIZATION}" $HELPERS "$SHARED_LIBS_ARG" "$MINIFIER_ARG" "$OPERATIONS_ARG" "$BUILD_TYPE" "$PACKAGING_ARG" "$EXPERIMENTAL_ARG" "$TESTS_ARG" "$CUDA_COMPUTE" -DOPENBLAS_PATH="$OPENBLAS_PATH" -DDEV=FALSE -DCMAKE_NEED_RESPONSE=YES -DMKL_MULTI_THREADED=TRUE ../.. if [ "$PARALLEL" == "true" ]; then MAKE_ARGUMENTS="$MAKE_ARGUMENTS -j $MAKEJ" diff --git a/libnd4j/include/array/ArrayOptions.h b/libnd4j/include/array/ArrayOptions.h index 484228fb7..1f0c25705 100644 --- a/libnd4j/include/array/ArrayOptions.h +++ b/libnd4j/include/array/ArrayOptions.h @@ -21,9 +21,9 @@ #ifndef ND4J_ARRAY_OPTIONS_H #define ND4J_ARRAY_OPTIONS_H -#include -#include -#include +#include +#include +#include #include #include #include @@ -87,7 +87,7 @@ #define ARRAY_UNSIGNED 8388608 -namespace nd4j { +namespace sd { class ND4J_EXPORT ArrayOptions { private: @@ -104,7 +104,7 @@ namespace nd4j { static FORCEINLINE _CUDA_HD bool isSparseArray(Nd4jLong *shapeInfo); static FORCEINLINE _CUDA_HD bool isUnsigned(Nd4jLong *shapeInfo); - static FORCEINLINE _CUDA_HD nd4j::DataType dataType(const Nd4jLong *shapeInfo); + static FORCEINLINE _CUDA_HD sd::DataType dataType(const Nd4jLong *shapeInfo); static FORCEINLINE _CUDA_HD SpaceType spaceType(Nd4jLong *shapeInfo); static FORCEINLINE _CUDA_HD SpaceType spaceType(const Nd4jLong *shapeInfo); @@ -119,7 +119,7 @@ namespace nd4j { static FORCEINLINE _CUDA_HD void resetDataType(Nd4jLong *shapeInfo); - static FORCEINLINE _CUDA_HD void setDataType(Nd4jLong *shapeInfo, const nd4j::DataType dataType); + static FORCEINLINE _CUDA_HD void setDataType(Nd4jLong *shapeInfo, const sd::DataType dataType); static FORCEINLINE _CUDA_HD void copyDataType(Nd4jLong* to, const Nd4jLong* from); }; @@ -155,34 +155,34 @@ namespace nd4j { return hasPropertyBitSet(shapeInfo, ARRAY_UNSIGNED); } - FORCEINLINE _CUDA_HD nd4j::DataType ArrayOptions::dataType(const Nd4jLong *shapeInfo) { + FORCEINLINE 
_CUDA_HD sd::DataType ArrayOptions::dataType(const Nd4jLong *shapeInfo) { /*if (hasPropertyBitSet(shapeInfo, ARRAY_QUANTIZED)) - return nd4j::DataType::QINT8; + return sd::DataType::QINT8; else */if (hasPropertyBitSet(shapeInfo, ARRAY_FLOAT)) - return nd4j::DataType::FLOAT32; + return sd::DataType::FLOAT32; else if (hasPropertyBitSet(shapeInfo, ARRAY_DOUBLE)) - return nd4j::DataType::DOUBLE; + return sd::DataType::DOUBLE; else if (hasPropertyBitSet(shapeInfo, ARRAY_HALF)) - return nd4j::DataType::HALF; + return sd::DataType::HALF; else if (hasPropertyBitSet(shapeInfo, ARRAY_BHALF)) - return nd4j::DataType::BFLOAT16; + return sd::DataType::BFLOAT16; else if (hasPropertyBitSet(shapeInfo, ARRAY_BOOL)) - return nd4j::DataType ::BOOL; + return sd::DataType ::BOOL; else if (hasPropertyBitSet(shapeInfo, ARRAY_UNSIGNED)) { if (hasPropertyBitSet(shapeInfo, ARRAY_CHAR)) - return nd4j::DataType ::UINT8; + return sd::DataType ::UINT8; else if (hasPropertyBitSet(shapeInfo, ARRAY_SHORT)) - return nd4j::DataType ::UINT16; + return sd::DataType ::UINT16; else if (hasPropertyBitSet(shapeInfo, ARRAY_INT)) - return nd4j::DataType ::UINT32; + return sd::DataType ::UINT32; else if (hasPropertyBitSet(shapeInfo, ARRAY_LONG)) - return nd4j::DataType ::UINT64; + return sd::DataType ::UINT64; else if (hasPropertyBitSet(shapeInfo, ARRAY_UTF8)) - return nd4j::DataType ::UTF8; + return sd::DataType ::UTF8; else if (hasPropertyBitSet(shapeInfo, ARRAY_UTF16)) - return nd4j::DataType ::UTF16; + return sd::DataType ::UTF16; else if (hasPropertyBitSet(shapeInfo, ARRAY_UTF32)) - return nd4j::DataType ::UTF32; + return sd::DataType ::UTF32; else { //shape::printShapeInfoLinear("Bad unsigned datatype (not)stored in shape", const_cast(shapeInfo)); #ifndef __CUDA_ARCH__ @@ -191,19 +191,19 @@ namespace nd4j { } } else if (hasPropertyBitSet(shapeInfo, ARRAY_CHAR)) - return nd4j::DataType::INT8; + return sd::DataType::INT8; else if (hasPropertyBitSet(shapeInfo, ARRAY_SHORT)) - return nd4j::DataType::INT16; 
+ return sd::DataType::INT16; else if (hasPropertyBitSet(shapeInfo, ARRAY_INT)) - return nd4j::DataType::INT32; + return sd::DataType::INT32; else if (hasPropertyBitSet(shapeInfo, ARRAY_LONG)) - return nd4j::DataType::INT64; + return sd::DataType::INT64; else if (hasPropertyBitSet(shapeInfo, ARRAY_UTF8)) - return nd4j::DataType::UTF8; + return sd::DataType::UTF8; else if (hasPropertyBitSet(shapeInfo, ARRAY_UTF16)) - return nd4j::DataType::UTF16; + return sd::DataType::UTF16; else if (hasPropertyBitSet(shapeInfo, ARRAY_UTF32)) - return nd4j::DataType::UTF32; + return sd::DataType::UTF32; else { //shape::printShapeInfoLinear("Bad signed datatype (not)stored in shape", const_cast(shapeInfo)); #ifndef __CUDA_ARCH__ @@ -296,63 +296,63 @@ namespace nd4j { unsetPropertyBit(shapeInfo, ARRAY_UNSIGNED); } - FORCEINLINE _CUDA_HD void ArrayOptions::setDataType(Nd4jLong *shapeInfo, const nd4j::DataType dataType) { + FORCEINLINE _CUDA_HD void ArrayOptions::setDataType(Nd4jLong *shapeInfo, const sd::DataType dataType) { resetDataType(shapeInfo); - if (dataType == nd4j::DataType::UINT8 || - dataType == nd4j::DataType::UINT16 || - dataType == nd4j::DataType::UINT32 || - dataType == nd4j::DataType::UINT64) { + if (dataType == sd::DataType::UINT8 || + dataType == sd::DataType::UINT16 || + dataType == sd::DataType::UINT32 || + dataType == sd::DataType::UINT64) { setPropertyBit(shapeInfo, ARRAY_UNSIGNED); } switch (dataType) { - case nd4j::DataType::BOOL: + case sd::DataType::BOOL: setPropertyBit(shapeInfo, ARRAY_BOOL); break; - case nd4j::DataType::HALF: + case sd::DataType::HALF: setPropertyBit(shapeInfo, ARRAY_HALF); break; - case nd4j::DataType::BFLOAT16: + case sd::DataType::BFLOAT16: setPropertyBit(shapeInfo, ARRAY_BHALF); break; - case nd4j::DataType::FLOAT32: + case sd::DataType::FLOAT32: setPropertyBit(shapeInfo, ARRAY_FLOAT); break; - case nd4j::DataType::DOUBLE: + case sd::DataType::DOUBLE: setPropertyBit(shapeInfo, ARRAY_DOUBLE); break; - case nd4j::DataType::INT8: + case 
sd::DataType::INT8: setPropertyBit(shapeInfo, ARRAY_CHAR); break; - case nd4j::DataType::INT16: + case sd::DataType::INT16: setPropertyBit(shapeInfo, ARRAY_SHORT); break; - case nd4j::DataType::INT32: + case sd::DataType::INT32: setPropertyBit(shapeInfo, ARRAY_INT); break; - case nd4j::DataType::INT64: + case sd::DataType::INT64: setPropertyBit(shapeInfo, ARRAY_LONG); break; - case nd4j::DataType::UINT8: + case sd::DataType::UINT8: setPropertyBit(shapeInfo, ARRAY_CHAR); break; - case nd4j::DataType::UINT16: + case sd::DataType::UINT16: setPropertyBit(shapeInfo, ARRAY_SHORT); break; - case nd4j::DataType::UINT32: + case sd::DataType::UINT32: setPropertyBit(shapeInfo, ARRAY_INT); break; - case nd4j::DataType::UINT64: + case sd::DataType::UINT64: setPropertyBit(shapeInfo, ARRAY_LONG); break; - case nd4j::DataType::UTF8: + case sd::DataType::UTF8: setPropertyBit(shapeInfo, ARRAY_UTF8); break; - case nd4j::DataType::UTF16: + case sd::DataType::UTF16: setPropertyBit(shapeInfo, ARRAY_UTF16); break; - case nd4j::DataType::UTF32: + case sd::DataType::UTF32: setPropertyBit(shapeInfo, ARRAY_UTF32); break; default: diff --git a/libnd4j/include/array/ArrayType.h b/libnd4j/include/array/ArrayType.h index d4d6c9729..83e80bc0f 100644 --- a/libnd4j/include/array/ArrayType.h +++ b/libnd4j/include/array/ArrayType.h @@ -21,7 +21,7 @@ #ifndef ND4J_ARRAY_TYPE_H #define ND4J_ARRAY_TYPE_H -namespace nd4j { +namespace sd { enum ArrayType { DENSE = 1, SPARSE = 2, diff --git a/libnd4j/include/array/ByteOrder.h b/libnd4j/include/array/ByteOrder.h index 5cc490c85..121be9e9d 100644 --- a/libnd4j/include/array/ByteOrder.h +++ b/libnd4j/include/array/ByteOrder.h @@ -21,7 +21,7 @@ #ifndef LIBND4J_BYTEORDER_H #define LIBND4J_BYTEORDER_H -namespace nd4j { +namespace sd { enum ByteOrder { LE = 0, BE = 1, diff --git a/libnd4j/include/array/ByteOrderUtils.h b/libnd4j/include/array/ByteOrderUtils.h index 4250ec9a3..0f335ea65 100644 --- a/libnd4j/include/array/ByteOrderUtils.h +++ 
b/libnd4j/include/array/ByteOrderUtils.h @@ -23,12 +23,12 @@ #include #include "ByteOrder.h" -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT ByteOrderUtils { public: - static ByteOrder fromFlatByteOrder(nd4j::graph::ByteOrder order); + static ByteOrder fromFlatByteOrder(sd::graph::ByteOrder order); }; } diff --git a/libnd4j/include/array/ConstantDataBuffer.h b/libnd4j/include/array/ConstantDataBuffer.h index fd191b53b..e8bafe114 100644 --- a/libnd4j/include/array/ConstantDataBuffer.h +++ b/libnd4j/include/array/ConstantDataBuffer.h @@ -20,11 +20,11 @@ #ifndef LIBND4J_CONSTANTDATABUFFER_H #define LIBND4J_CONSTANTDATABUFFER_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT ConstantDataBuffer { private: Nd4jPointer _primaryBuffer = nullptr; diff --git a/libnd4j/include/array/ConstantDescriptor.h b/libnd4j/include/array/ConstantDescriptor.h index f05f98dac..589ba2353 100644 --- a/libnd4j/include/array/ConstantDescriptor.h +++ b/libnd4j/include/array/ConstantDescriptor.h @@ -24,11 +24,11 @@ #include #include #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT ConstantDescriptor { private: std::vector _integerValues; @@ -63,9 +63,9 @@ namespace nd4j { namespace std { template<> - class ND4J_EXPORT hash { + class ND4J_EXPORT hash { public: - size_t operator()(const nd4j::ConstantDescriptor &k) const; + size_t operator()(const sd::ConstantDescriptor &k) const; }; } diff --git a/libnd4j/include/array/ConstantHolder.h b/libnd4j/include/array/ConstantHolder.h index d0824483e..a404e5808 100644 --- a/libnd4j/include/array/ConstantHolder.h +++ b/libnd4j/include/array/ConstantHolder.h @@ -27,13 +27,13 @@ #include #include -namespace nd4j { +namespace sd { class ConstantHolder { private: int _deviceId = 0; std::mutex _mutex; - std::map _buffers; + std::map _buffers; public: ConstantHolder(const ConstantHolder& other); ConstantHolder() = default; @@ -42,17 
+42,17 @@ namespace nd4j { ConstantHolder& operator=(const ConstantHolder& other) = default; ConstantHolder& operator=(ConstantHolder&& other) = default; - bool hasBuffer(nd4j::DataType dataType); + bool hasBuffer(sd::DataType dataType); template bool hasBuffer(); - void addBuffer(ConstantDataBuffer &pointer, nd4j::DataType dataType); + void addBuffer(ConstantDataBuffer &pointer, sd::DataType dataType); template void addBuffer(ConstantDataBuffer &pointer); - ConstantDataBuffer* getConstantDataBuffer(nd4j::DataType dataType); + ConstantDataBuffer* getConstantDataBuffer(sd::DataType dataType); template ConstantDataBuffer* getConstantDataBuffer(); diff --git a/libnd4j/include/array/DataBuffer.h b/libnd4j/include/array/DataBuffer.h index cd27c20b8..59ffe3045 100644 --- a/libnd4j/include/array/DataBuffer.h +++ b/libnd4j/include/array/DataBuffer.h @@ -23,14 +23,14 @@ #define DEV_TESTS_DATABUFFER_H #include -#include -#include -#include +#include +#include +#include #include #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT DataBuffer { diff --git a/libnd4j/include/array/DataType.h b/libnd4j/include/array/DataType.h index 8ec55342e..cf8baf7d0 100644 --- a/libnd4j/include/array/DataType.h +++ b/libnd4j/include/array/DataType.h @@ -21,7 +21,7 @@ #ifndef ND4J_DATATYPE_H #define ND4J_DATATYPE_H -namespace nd4j { +namespace sd { enum DataType { INHERIT = 0, BOOL = 1, diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h index abc804f5e..44f555533 100644 --- a/libnd4j/include/array/DataTypeConversions.h +++ b/libnd4j/include/array/DataTypeConversions.h @@ -21,17 +21,17 @@ #ifndef LIBND4J_DATATYPECONVERSIONS_H #define LIBND4J_DATATYPECONVERSIONS_H -#include +#include #include -#include +#include #include #include #include #include -#include +#include #include -namespace nd4j { +namespace sd { template class ND4J_EXPORT DataTypeConversions { private: diff --git a/libnd4j/include/array/DataTypeUtils.h 
b/libnd4j/include/array/DataTypeUtils.h index c307ecd4e..bd89605d1 100644 --- a/libnd4j/include/array/DataTypeUtils.h +++ b/libnd4j/include/array/DataTypeUtils.h @@ -26,20 +26,20 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include //#include -//#include +//#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT DataTypeUtils { public: static int asInt(DataType type); static DataType fromInt(int dtype); - static DataType fromFlatDataType(nd4j::graph::DType dtype); + static DataType fromFlatDataType(sd::graph::DType dtype); FORCEINLINE static std::string asString(DataType dataType); template @@ -70,21 +70,21 @@ namespace nd4j { FORCEINLINE static _CUDA_HD size_t sizeOf(DataType type); FORCEINLINE static _CUDA_HD size_t sizeOf(const Nd4jLong* shapeInfo); - FORCEINLINE static _CUDA_HD bool isR(nd4j::DataType dataType); + FORCEINLINE static _CUDA_HD bool isR(sd::DataType dataType); - FORCEINLINE static _CUDA_HD bool isZ(nd4j::DataType dataType); + FORCEINLINE static _CUDA_HD bool isZ(sd::DataType dataType); - FORCEINLINE static _CUDA_HD bool isB(nd4j::DataType dataType); + FORCEINLINE static _CUDA_HD bool isB(sd::DataType dataType); - FORCEINLINE static _CUDA_HD bool isU(nd4j::DataType dataType); + FORCEINLINE static _CUDA_HD bool isU(sd::DataType dataType); - FORCEINLINE static _CUDA_HD bool isS(nd4j::DataType dataType); + FORCEINLINE static _CUDA_HD bool isS(sd::DataType dataType); - FORCEINLINE static nd4j::DataType pickPairwiseResultType(nd4j::DataType typeX, nd4j::DataType typeY); + FORCEINLINE static sd::DataType pickPairwiseResultType(sd::DataType typeX, sd::DataType typeY); - FORCEINLINE static nd4j::DataType pickPairwiseResultType(const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2); + FORCEINLINE static sd::DataType pickPairwiseResultType(const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2); - FORCEINLINE static nd4j::DataType pickFloatingType(nd4j::DataType typeX); + FORCEINLINE 
static sd::DataType pickFloatingType(sd::DataType typeX); template FORCEINLINE static std::vector convertVector(const std::vector &vector); @@ -106,38 +106,38 @@ namespace nd4j { ///// IMLEMENTATION OF INLINE METHODS ///// ////////////////////////////////////////////////////////////////////////// - FORCEINLINE nd4j::DataType DataTypeUtils::pickFloatingType(nd4j::DataType typeX) { + FORCEINLINE sd::DataType DataTypeUtils::pickFloatingType(sd::DataType typeX) { // if proposed dataType is already floating point - return it if (isR(typeX)) return typeX; return Environment::getInstance()->defaultFloatDataType(); } - FORCEINLINE bool DataTypeUtils::isR(nd4j::DataType dataType) { - return dataType == nd4j::DataType::FLOAT32 || dataType == nd4j::DataType::BFLOAT16 || dataType == nd4j::DataType::HALF || dataType == nd4j::DataType::DOUBLE; + FORCEINLINE bool DataTypeUtils::isR(sd::DataType dataType) { + return dataType == sd::DataType::FLOAT32 || dataType == sd::DataType::BFLOAT16 || dataType == sd::DataType::HALF || dataType == sd::DataType::DOUBLE; } - FORCEINLINE bool DataTypeUtils::isB(nd4j::DataType dataType) { - return dataType == nd4j::DataType::BOOL; + FORCEINLINE bool DataTypeUtils::isB(sd::DataType dataType) { + return dataType == sd::DataType::BOOL; } - FORCEINLINE bool DataTypeUtils::isS(nd4j::DataType dataType) { - return dataType == nd4j::DataType::UTF8 || dataType == nd4j::DataType::UTF16 || dataType == nd4j::DataType::UTF32; + FORCEINLINE bool DataTypeUtils::isS(sd::DataType dataType) { + return dataType == sd::DataType::UTF8 || dataType == sd::DataType::UTF16 || dataType == sd::DataType::UTF32; } - FORCEINLINE bool DataTypeUtils::isZ(nd4j::DataType dataType) { + FORCEINLINE bool DataTypeUtils::isZ(sd::DataType dataType) { return !isR(dataType) && !isB(dataType) && !isS(dataType); } - FORCEINLINE bool DataTypeUtils::isU(nd4j::DataType dataType) { - return dataType == nd4j::DataType::UINT8 || dataType == nd4j::DataType::UINT16 || dataType == 
nd4j::DataType::UINT32 || dataType == nd4j::DataType::UINT64; + FORCEINLINE bool DataTypeUtils::isU(sd::DataType dataType) { + return dataType == sd::DataType::UINT8 || dataType == sd::DataType::UINT16 || dataType == sd::DataType::UINT32 || dataType == sd::DataType::UINT64; } - FORCEINLINE nd4j::DataType DataTypeUtils::pickPairwiseResultType(nd4j::DataType typeX, nd4j::DataType typeY) { + FORCEINLINE sd::DataType DataTypeUtils::pickPairwiseResultType(sd::DataType typeX, sd::DataType typeY) { // if both dtypes are the same - just return it if (typeX == typeY) return typeX; - auto nd4j_max = [](nd4j::DataType typeX, nd4j::DataType typeY) { + auto nd4j_max = [](sd::DataType typeX, sd::DataType typeY) { return typeX > typeY?typeX:typeY; }; auto rX = isR(typeX); @@ -154,7 +154,7 @@ namespace nd4j { // if both data types are float - return biggest one if (rX && rY) { // if we allow precision boost, then we pick bigger data type - if (nd4j::Environment::getInstance()->precisionBoostAllowed()) { + if (sd::Environment::getInstance()->precisionBoostAllowed()) { return nd4j_max(typeX, typeY); } else { // and we return first operand otherwise @@ -165,7 +165,7 @@ namespace nd4j { // if that's not real type, we apply same rules if (!rX && !rY) { - if (nd4j::Environment::getInstance()->precisionBoostAllowed()) { + if (sd::Environment::getInstance()->precisionBoostAllowed()) { return nd4j_max(typeX, typeY); } else { // and we return first operand otherwise @@ -177,7 +177,7 @@ namespace nd4j { } /////////////////////////////////////////////////////////////////// -FORCEINLINE nd4j::DataType DataTypeUtils::pickPairwiseResultType(const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2) { +FORCEINLINE sd::DataType DataTypeUtils::pickPairwiseResultType(const Nd4jLong* shapeInfo1, const Nd4jLong* shapeInfo2) { return pickPairwiseResultType(ArrayOptions::dataType(shapeInfo1), ArrayOptions::dataType(shapeInfo2)); } @@ -420,31 +420,31 @@ FORCEINLINE _CUDA_HD T DataTypeUtils::eps() { return 
result; } - FORCEINLINE _CUDA_HD size_t DataTypeUtils::sizeOfElement(nd4j::DataType type) { + FORCEINLINE _CUDA_HD size_t DataTypeUtils::sizeOfElement(sd::DataType type) { switch (type) { - case nd4j::DataType::UINT8: - case nd4j::DataType::INT8: - case nd4j::DataType::FLOAT8: - case nd4j::DataType::QINT8: - case nd4j::DataType::BOOL: return (size_t) 1; + case sd::DataType::UINT8: + case sd::DataType::INT8: + case sd::DataType::FLOAT8: + case sd::DataType::QINT8: + case sd::DataType::BOOL: return (size_t) 1; - case nd4j::DataType::BFLOAT16: - case nd4j::DataType::HALF: - case nd4j::DataType::INT16: - case nd4j::DataType::QINT16: - case nd4j::DataType::UINT16: return (size_t) 2; + case sd::DataType::BFLOAT16: + case sd::DataType::HALF: + case sd::DataType::INT16: + case sd::DataType::QINT16: + case sd::DataType::UINT16: return (size_t) 2; - case nd4j::DataType::UTF8: - case nd4j::DataType::UTF16: - case nd4j::DataType::UTF32: - case nd4j::DataType::INT32: - case nd4j::DataType::UINT32: - case nd4j::DataType::HALF2: - case nd4j::DataType::FLOAT32: return (size_t) 4; + case sd::DataType::UTF8: + case sd::DataType::UTF16: + case sd::DataType::UTF32: + case sd::DataType::INT32: + case sd::DataType::UINT32: + case sd::DataType::HALF2: + case sd::DataType::FLOAT32: return (size_t) 4; - case nd4j::DataType::UINT64: - case nd4j::DataType::INT64: - case nd4j::DataType::DOUBLE: return (size_t) 8; + case sd::DataType::UINT64: + case sd::DataType::INT64: + case sd::DataType::DOUBLE: return (size_t) 8; default: { nd4j_printf("Unknown DataType used: [%i]\n", asInt(type)); @@ -456,41 +456,41 @@ FORCEINLINE _CUDA_HD T DataTypeUtils::eps() { } template - FORCEINLINE _CUDA_HD nd4j::DataType nd4j::DataTypeUtils::fromT() { + FORCEINLINE _CUDA_HD sd::DataType sd::DataTypeUtils::fromT() { if (std::is_same::value) { - return nd4j::DataType::BOOL; + return sd::DataType::BOOL; } else if (std::is_same::value) { - return nd4j::DataType::UTF8; + return sd::DataType::UTF8; } else if 
(std::is_same::value) { - return nd4j::DataType::UTF16; + return sd::DataType::UTF16; } else if (std::is_same::value) { - return nd4j::DataType::UTF32; + return sd::DataType::UTF32; } else if (std::is_same::value) { - return nd4j::DataType::FLOAT32; + return sd::DataType::FLOAT32; } else if (std::is_same::value) { - return nd4j::DataType::HALF; + return sd::DataType::HALF; } else if (std::is_same::value) { - return nd4j::DataType::BFLOAT16; + return sd::DataType::BFLOAT16; } else if (std::is_same::value) { - return nd4j::DataType::DOUBLE; + return sd::DataType::DOUBLE; } else if (std::is_same::value) { - return nd4j::DataType::INT8; + return sd::DataType::INT8; } else if (std::is_same::value) { - return nd4j::DataType::INT16; + return sd::DataType::INT16; } else if (std::is_same::value) { - return nd4j::DataType::INT32; + return sd::DataType::INT32; } else if (std::is_same::value) { - return nd4j::DataType::INT64; + return sd::DataType::INT64; } else if (std::is_same::value) { - return nd4j::DataType::UINT8; + return sd::DataType::UINT8; } else if (std::is_same::value) { - return nd4j::DataType::UINT16; + return sd::DataType::UINT16; } else if (std::is_same::value) { - return nd4j::DataType::UINT32; + return sd::DataType::UINT32; } else if (std::is_same::value) { - return nd4j::DataType::UINT64; + return sd::DataType::UINT64; } else { - return nd4j::DataType::INHERIT; + return sd::DataType::INHERIT; } } } diff --git a/libnd4j/include/array/ExtraArguments.h b/libnd4j/include/array/ExtraArguments.h index e1f5a69bd..131e8cd92 100644 --- a/libnd4j/include/array/ExtraArguments.h +++ b/libnd4j/include/array/ExtraArguments.h @@ -21,14 +21,14 @@ #ifndef DEV_TESTS_EXTRAARGUMENTS_H #define DEV_TESTS_EXTRAARGUMENTS_H -#include +#include #include #include #include -#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT ExtraArguments { private: std::vector _fpArgs; @@ -54,7 +54,7 @@ namespace nd4j { template void* argumentsAsT(Nd4jLong offset = 0); - 
void* argumentsAsT(nd4j::DataType dataType, Nd4jLong offset = 0); + void* argumentsAsT(sd::DataType dataType, Nd4jLong offset = 0); size_t length(); }; diff --git a/libnd4j/include/array/InteropDataBuffer.h b/libnd4j/include/array/InteropDataBuffer.h index 3cbfc2f94..27b17aabb 100644 --- a/libnd4j/include/array/InteropDataBuffer.h +++ b/libnd4j/include/array/InteropDataBuffer.h @@ -18,7 +18,7 @@ // @author raver119@gmail.com // -#include +#include #include #include #include @@ -26,7 +26,7 @@ #ifndef LIBND4J_INTEROPDATABUFFER_H #define LIBND4J_INTEROPDATABUFFER_H -namespace nd4j { +namespace sd { /** * This class is a wrapper for DataBuffer, suitable for sharing DataBuffer between front-end and back-end languages */ @@ -37,7 +37,7 @@ namespace nd4j { public: InteropDataBuffer(InteropDataBuffer &dataBuffer, uint64_t length, uint64_t offset); InteropDataBuffer(std::shared_ptr databuffer); - InteropDataBuffer(size_t elements, nd4j::DataType dtype, bool allocateBoth); + InteropDataBuffer(size_t elements, sd::DataType dtype, bool allocateBoth); ~InteropDataBuffer() = default; #ifndef __JAVACPP_HACK__ diff --git a/libnd4j/blas/NDArray.h b/libnd4j/include/array/NDArray.h similarity index 84% rename from libnd4j/blas/NDArray.h rename to libnd4j/include/array/NDArray.h index be82cf3f8..853ff9f4a 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/include/array/NDArray.h @@ -17,11 +17,11 @@ #ifndef NDARRAY_H #define NDARRAY_H -#include +#include #include #include -#include -#include "NativeOpExecutioner.h" +#include +#include "legacy/NativeOpExecutioner.h" #include #include #include @@ -32,13 +32,13 @@ #include #include #include -#include +#include #include #include #include #include -#include -#include +#include +#include #include #include #include @@ -47,7 +47,7 @@ #include -namespace nd4j { +namespace sd { template ::value>::type> ND4J_EXPORT NDArray operator+(const NDArray& arr, const T& scalar); @@ -116,7 +116,7 @@ namespace nd4j { void templatedSet(void *buffer, const 
Nd4jLong xOffset, const void *value); template - void templatedSet(void *buffer, const Nd4jLong xOfsset, nd4j::DataType dtype, const void *value); + void templatedSet(void *buffer, const Nd4jLong xOfsset, sd::DataType dtype, const void *value); template void templatedAssign(void *xBuffer, const Nd4jLong xOffset, const void *yBuffer, const Nd4jLong yOffset) const; @@ -161,7 +161,7 @@ namespace nd4j { /** * pointer on device launch context (with all data needed there). */ - nd4j::LaunchContext * _context = nd4j::LaunchContext::defaultContext(); + sd::LaunchContext * _context = sd::LaunchContext::defaultContext(); // indicates if array's buffer is within workspace bool _isAttached = false; @@ -174,7 +174,7 @@ namespace nd4j { /** * type of array elements */ - nd4j::DataType _dataType = FLOAT32; + sd::DataType _dataType = FLOAT32; /** * deviceID where this NDArray belongs to @@ -191,72 +191,72 @@ namespace nd4j { * do not allocate memory, memory for array is passed from outside */ #ifndef __JAVACPP_HACK__ - NDArray(std::shared_ptr buffer, const ShapeDescriptor& descriptor, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const Nd4jLong offset = 0); + NDArray(std::shared_ptr buffer, const ShapeDescriptor& descriptor, sd::LaunchContext* context = sd::LaunchContext::defaultContext(), const Nd4jLong offset = 0); - NDArray(std::shared_ptr buffer, const char order, const std::vector &shape, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(std::shared_ptr buffer, const char order, const std::vector &shape, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * This contructors create scalar array containing string utf8 * */ - NDArray(const char* str, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()) + NDArray(const char* str, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext* context = sd::LaunchContext::defaultContext()) : 
NDArray(std::string(str), dtype, context) { } - NDArray(const std::string& string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const std::string& string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * This contructors create scalar array containing string utf16 * */ - NDArray(const char16_t* u16string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()) + NDArray(const char16_t* u16string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()) : NDArray(std::u16string(u16string), dtype, context) { } - NDArray(const std::u16string& u16string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const std::u16string& u16string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * This contructors create scalar array containing string utf32 * */ - NDArray(const char32_t* u32string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()) + NDArray(const char32_t* u32string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()) : NDArray(std::u32string(u32string), dtype, context) { } - NDArray(const std::u32string& u32string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const std::u32string& u32string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * This contructors create array from vector of utf8 strings * */ - NDArray(const std::vector& shape, const std::vector& strings, nd4j::DataType dtype = nd4j::DataType::UTF8, 
nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - NDArray(const std::vector& shape, const std::vector& string, nd4j::DataType dtype = nd4j::DataType::UTF8, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const std::vector& shape, const std::vector& strings, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + NDArray(const std::vector& shape, const std::vector& string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * This contructors create array from vector of utf16 strings * */ - NDArray(const std::vector& shape, const std::vector& strings, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - NDArray(const std::vector& shape, const std::vector& string, nd4j::DataType dtype = nd4j::DataType::UTF16, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const std::vector& shape, const std::vector& strings, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + NDArray(const std::vector& shape, const std::vector& string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * This contructors create array from vector of utf32 strings * */ - NDArray(const std::vector& shape, const std::vector& strings, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); - NDArray(const std::vector& shape, const std::vector& string, nd4j::DataType dtype = nd4j::DataType::UTF32, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const std::vector& shape, const std::vector& strings, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + NDArray(const 
std::vector& shape, const std::vector& string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); #endif /** * do not allocate memory, memory for array is passed from outside */ - NDArray(void *buffer, Nd4jLong* shapeInfo, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false); + NDArray(void *buffer, Nd4jLong* shapeInfo, sd::LaunchContext* context = sd::LaunchContext::defaultContext(), const bool isBuffAlloc = false); /** * do not allocate memory, memory for array is passed from outside * we suppose the content of both (device and host) buffers is identical */ - NDArray(void *buffer, void *bufferD, Nd4jLong* shapeInfo, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false, const bool isBuffDAlloc = false); + NDArray(void *buffer, void *bufferD, Nd4jLong* shapeInfo, sd::LaunchContext* context = sd::LaunchContext::defaultContext(), const bool isBuffAlloc = false, const bool isBuffDAlloc = false); /** * copy constructor @@ -271,34 +271,34 @@ namespace nd4j { /** * constructor, create array stored at given workspace */ - NDArray(nd4j::LaunchContext * context); + NDArray(sd::LaunchContext * context); /** * constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently */ - NDArray(Nd4jLong* shapeInfo, const bool copyStrides = false, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(Nd4jLong* shapeInfo, const bool copyStrides = false, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to be zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently * set dtype as array 
type */ - NDArray(Nd4jLong* shapeInfo, const nd4j::DataType dtype, const bool copyStrides = false, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(Nd4jLong* shapeInfo, const sd::DataType dtype, const bool copyStrides = false, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * this constructor creates new array using shape information contained in vector argument */ - NDArray(const char order, const std::vector &shape, nd4j::DataType dtype = DOUBLE, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const char order, const std::vector &shape, sd::DataType dtype = DOUBLE, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * This constructor creates new array with elements copied from data and using shape information stored in shape, elements from data will be casted to dtype */ - NDArray(const char order, const std::vector &shape, const std::vector& data, nd4j::DataType dtype = DOUBLE, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); + NDArray(const char order, const std::vector &shape, const std::vector& data, sd::DataType dtype = DOUBLE, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); /** * this constructor creates new array using given buffer (without memory allocation) and shape information stored in shape */ - NDArray(void *buffer, const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false); + NDArray(void *buffer, const char order, const std::vector &shape, sd::DataType dtype, sd::LaunchContext* context = sd::LaunchContext::defaultContext(), const bool isBuffAlloc = false); /** * This method returns new array with the same shape & data type @@ -317,12 +317,12 @@ namespace nd4j { * this constructor creates new NDArray with shape matching "other" array, * doesn't copy "other" elements into new array !!! 
*/ - explicit NDArray(const NDArray* other, const bool copyStrides = false, nd4j::LaunchContext* context = nd4j::LaunchContext ::defaultContext()); + explicit NDArray(const NDArray* other, const bool copyStrides = false, sd::LaunchContext* context = sd::LaunchContext ::defaultContext()); /** * this constructor creates scalar(and set its value = 0) or empty array depending on bool argument isScalar */ - NDArray(nd4j::DataType dtype, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isScalar = true); + NDArray(sd::DataType dtype, sd::LaunchContext* context = sd::LaunchContext::defaultContext(), const bool isScalar = true); /** * This method blocks until asynchronous operation finishes @@ -392,7 +392,7 @@ namespace nd4j { void operator delete(void* p); - void setContext(nd4j::LaunchContext * context); + void setContext(sd::LaunchContext * context); /** * create a new array by replicating current array by repeats times along given dimension @@ -438,7 +438,7 @@ namespace nd4j { /** * returns _context */ - nd4j::LaunchContext * getContext() const { + sd::LaunchContext * getContext() const { return _context; } @@ -626,17 +626,17 @@ namespace nd4j { * keepDims - if true then put unities in place of reduced dimensions */ - NDArray reduceAlongDimension(nd4j::reduce::FloatOps op, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; - NDArray reduceAlongDimension(nd4j::reduce::FloatOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; - NDArray reduceAlongDimension(nd4j::reduce::SameOps op, const 
std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; - NDArray reduceAlongDimension(nd4j::reduce::SameOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; - NDArray reduceAlongDimension(nd4j::reduce::BoolOps op, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; - NDArray reduceAlongDimension(nd4j::reduce::BoolOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; - NDArray reduceAlongDimension(nd4j::reduce::LongOps op, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; - NDArray reduceAlongDimension(nd4j::reduce::LongOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; + NDArray reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list& dimensions, const bool keepDims = false, const bool supportOldShapes = false) const; /** * method reduces array by excluding its shapes along dimensions present in 
given dimensions vector @@ -645,37 +645,37 @@ namespace nd4j { * keepDims - if true then put unities in place of reduced dimensions * extras - extra parameters */ - void reduceAlongDimension(nd4j::reduce::FloatOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; - void reduceAlongDimension(nd4j::reduce::SameOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; - void reduceAlongDimension(nd4j::reduce::BoolOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; - void reduceAlongDimension(nd4j::reduce::LongOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; + void reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; + void reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; + void reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; + void reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector& dimensions, const bool keepDims = false, const bool supportOldShapes = false, const bool checkTargetShape = true) const; /** * return variance of array elements set * biasCorrected - if true bias correction will be applied */ - NDArray varianceNumber(nd4j::variance::Ops op, 
bool biasCorrected = true); + NDArray varianceNumber(sd::variance::Ops op, bool biasCorrected = true); /** * apply scalar operation to array * extraParams - extra parameters for operation * returns scalar array */ - NDArray reduceNumber(nd4j::reduce::FloatOps ops, void *extraParams = nullptr) const; - NDArray reduceNumber(nd4j::reduce::SameOps ops, void *extraParams = nullptr) const; - NDArray reduceNumber(nd4j::reduce::BoolOps ops, void *extraParams = nullptr) const; - NDArray reduceNumber(nd4j::reduce::LongOps ops, void *extraParams = nullptr) const; + NDArray reduceNumber(sd::reduce::FloatOps ops, void *extraParams = nullptr) const; + NDArray reduceNumber(sd::reduce::SameOps ops, void *extraParams = nullptr) const; + NDArray reduceNumber(sd::reduce::BoolOps ops, void *extraParams = nullptr) const; + NDArray reduceNumber(sd::reduce::LongOps ops, void *extraParams = nullptr) const; - void reduceNumber(nd4j::reduce::FloatOps ops, NDArray& target, void *extraParams = nullptr) const; - void reduceNumber(nd4j::reduce::SameOps ops, NDArray& target, void *extraParams = nullptr) const; - void reduceNumber(nd4j::reduce::BoolOps ops, NDArray& target, void *extraParams = nullptr) const; - void reduceNumber(nd4j::reduce::LongOps ops, NDArray& target, void *extraParams = nullptr) const; + void reduceNumber(sd::reduce::FloatOps ops, NDArray& target, void *extraParams = nullptr) const; + void reduceNumber(sd::reduce::SameOps ops, NDArray& target, void *extraParams = nullptr) const; + void reduceNumber(sd::reduce::BoolOps ops, NDArray& target, void *extraParams = nullptr) const; + void reduceNumber(sd::reduce::LongOps ops, NDArray& target, void *extraParams = nullptr) const; /** * returns element index which corresponds to some condition imposed by operation * extraParams - extra parameters for operation */ - NDArray indexReduceNumber(nd4j::indexreduce::Ops op, ExtraArguments *extraParams = nullptr); + NDArray indexReduceNumber(sd::indexreduce::Ops op, ExtraArguments 
*extraParams = nullptr); /** * returns index of max element in a given array (optionally: along given dimension(s)) @@ -687,31 +687,31 @@ namespace nd4j { void makeBothActual() const { syncToDevice(); syncToHost(); } - void applyTransform(nd4j::transform::FloatOps op, NDArray& target, ExtraArguments *extraParams = nullptr); - void applyTransform(nd4j::transform::SameOps op, NDArray& target, ExtraArguments *extraParams = nullptr); - void applyTransform(nd4j::transform::AnyOps op, NDArray& target, ExtraArguments *extraParams = nullptr); - void applyTransform(nd4j::transform::BoolOps op, NDArray& target, ExtraArguments *extraParams = nullptr); - void applyTransform(nd4j::transform::StrictOps op, NDArray& target, ExtraArguments *extraParams = nullptr); + void applyTransform(sd::transform::FloatOps op, NDArray& target, ExtraArguments *extraParams = nullptr); + void applyTransform(sd::transform::SameOps op, NDArray& target, ExtraArguments *extraParams = nullptr); + void applyTransform(sd::transform::AnyOps op, NDArray& target, ExtraArguments *extraParams = nullptr); + void applyTransform(sd::transform::BoolOps op, NDArray& target, ExtraArguments *extraParams = nullptr); + void applyTransform(sd::transform::StrictOps op, NDArray& target, ExtraArguments *extraParams = nullptr); /** * apply OpName transformation to this array and store result in new array to be returned * extraParams - extra parameters for operation */ - NDArray transform(nd4j::transform::FloatOps op, void *extraParams = nullptr) const &; - NDArray transform(nd4j::transform::SameOps op, void *extraParams = nullptr) const &; - NDArray transform(nd4j::transform::BoolOps op, void *extraParams = nullptr) const &; - NDArray transform(nd4j::transform::StrictOps op, void *extraParams = nullptr) const &; - NDArray transform(nd4j::transform::FloatOps op, void *extraParams = nullptr) &&; - NDArray transform(nd4j::transform::SameOps op, void *extraParams = nullptr) &&; - NDArray transform(nd4j::transform::BoolOps op, 
void *extraParams = nullptr) &&; - NDArray transform(nd4j::transform::StrictOps op, void *extraParams = nullptr) &&; + NDArray transform(sd::transform::FloatOps op, void *extraParams = nullptr) const &; + NDArray transform(sd::transform::SameOps op, void *extraParams = nullptr) const &; + NDArray transform(sd::transform::BoolOps op, void *extraParams = nullptr) const &; + NDArray transform(sd::transform::StrictOps op, void *extraParams = nullptr) const &; + NDArray transform(sd::transform::FloatOps op, void *extraParams = nullptr) &&; + NDArray transform(sd::transform::SameOps op, void *extraParams = nullptr) &&; + NDArray transform(sd::transform::BoolOps op, void *extraParams = nullptr) &&; + NDArray transform(sd::transform::StrictOps op, void *extraParams = nullptr) &&; /** * apply pairwise OpName transformation based on "this" and "other" arras elements, store result in this array * other - second array necessary for pairwise operation * extraParams - extra parameters for operation */ - void applyPairwiseTransform(nd4j::pairwise::Ops op, const NDArray& other, ExtraArguments *extraParams = nullptr); + void applyPairwiseTransform(sd::pairwise::Ops op, const NDArray& other, ExtraArguments *extraParams = nullptr); /** * apply pairwise OpName transformation based on "this" and "other" arras elements, store result in target array @@ -719,11 +719,11 @@ namespace nd4j { * target - where to store result * extraParams - extra parameters for operation */ - void applyPairwiseTransform(nd4j::pairwise::Ops op, const NDArray& other, NDArray& target, ExtraArguments *extraParams = nullptr) const; + void applyPairwiseTransform(sd::pairwise::Ops op, const NDArray& other, NDArray& target, ExtraArguments *extraParams = nullptr) const; - void applyPairwiseTransform(nd4j::pairwise::BoolOps op, const NDArray& other, NDArray& target, ExtraArguments *extraParams = nullptr) const; + void applyPairwiseTransform(sd::pairwise::BoolOps op, const NDArray& other, NDArray& target, ExtraArguments 
*extraParams = nullptr) const; - void applyPairwiseTransform(nd4j::pairwise::IntOps op, const NDArray& other, NDArray&target, ExtraArguments *extraParams = nullptr) const; + void applyPairwiseTransform(sd::pairwise::IntOps op, const NDArray& other, NDArray&target, ExtraArguments *extraParams = nullptr) const; /** * apply operation which requires broadcasting, broadcast a smaller array (tad) along bigger one (this) @@ -732,23 +732,23 @@ namespace nd4j { * target - where to store result * extraParams - extra parameters for operation */ - void applyBroadcast(nd4j::broadcast::Ops op, const std::initializer_list dimensions, const NDArray& tad, NDArray& target, ExtraArguments* extraArgs = nullptr); + void applyBroadcast(sd::broadcast::Ops op, const std::initializer_list dimensions, const NDArray& tad, NDArray& target, ExtraArguments* extraArgs = nullptr); - void applyBroadcast(nd4j::broadcast::Ops op, const std::vector &dimensions, const NDArray &tad, NDArray &target, ExtraArguments *extraArgs = nullptr); + void applyBroadcast(sd::broadcast::Ops op, const std::vector &dimensions, const NDArray &tad, NDArray &target, ExtraArguments *extraArgs = nullptr); - void applyBroadcast(nd4j::broadcast::BoolOps op, const std::vector &dimensions, const NDArray &tad, NDArray &target, ExtraArguments *extraArgs = nullptr); + void applyBroadcast(sd::broadcast::BoolOps op, const std::vector &dimensions, const NDArray &tad, NDArray &target, ExtraArguments *extraArgs = nullptr); - void applyBroadcast(nd4j::broadcast::IntOps op, const std::vector &dimensions, const NDArray& tad, NDArray &target, ExtraArguments *extraArgs = nullptr); + void applyBroadcast(sd::broadcast::IntOps op, const std::vector &dimensions, const NDArray& tad, NDArray &target, ExtraArguments *extraArgs = nullptr); /** * apply operation which requires broadcasting, broadcast one tensor along another, also this method checks the possibility of broadcasting * other - input array * extraParams - extra parameters for operation 
*/ - NDArray applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs = nullptr) const &; - NDArray applyTrueBroadcast(nd4j::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs = nullptr) const &; - NDArray applyTrueBroadcast(nd4j::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs = nullptr) &&; - NDArray applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs = nullptr) &&; + NDArray applyTrueBroadcast(sd::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs = nullptr) const &; + NDArray applyTrueBroadcast(sd::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs = nullptr) const &; + NDArray applyTrueBroadcast(sd::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs = nullptr) &&; + NDArray applyTrueBroadcast(sd::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs = nullptr) &&; /** * apply operation which requires broadcasting, broadcast one tensor along another, also this method checks the possibility of broadcasting @@ -757,11 +757,11 @@ namespace nd4j { * checkTargetShape - if true check whether target shape is suitable for broadcasting * extraParams - extra parameters for operation */ - void applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape = true, ExtraArguments *extraArgs = nullptr) const; + void applyTrueBroadcast(sd::BroadcastOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape = true, ExtraArguments *extraArgs = nullptr) const; - void applyTrueBroadcast(nd4j::BroadcastBoolOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape = true, ExtraArguments *extraArgs = nullptr) const; + void applyTrueBroadcast(sd::BroadcastBoolOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape = true, ExtraArguments *extraArgs = nullptr) const; - void 
applyTrueBroadcast(nd4j::BroadcastIntOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape = true, ExtraArguments *extraArgs = nullptr) const; + void applyTrueBroadcast(sd::BroadcastIntOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape = true, ExtraArguments *extraArgs = nullptr) const; /** @@ -771,13 +771,13 @@ namespace nd4j { * extraParams - extra parameters for operation */ template - void applyScalar(nd4j::scalar::Ops op, const T scalar, NDArray& target, ExtraArguments *extraParams = nullptr); + void applyScalar(sd::scalar::Ops op, const T scalar, NDArray& target, ExtraArguments *extraParams = nullptr); template - void applyScalar(nd4j::scalar::BoolOps op, const T scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; + void applyScalar(sd::scalar::BoolOps op, const T scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; template - void applyScalar(nd4j::scalar::IntOps op, const T scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; + void applyScalar(sd::scalar::IntOps op, const T scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; /** * apply a scalar operation to an array @@ -785,11 +785,11 @@ namespace nd4j { * target - where to store result * extraParams - extra parameters for operation */ - void applyScalarArr(nd4j::scalar::Ops op, const NDArray& scalar, NDArray& target, ExtraArguments *extraParams = nullptr); + void applyScalarArr(sd::scalar::Ops op, const NDArray& scalar, NDArray& target, ExtraArguments *extraParams = nullptr); - void applyScalarArr(nd4j::scalar::BoolOps op, const NDArray& scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; + void applyScalarArr(sd::scalar::BoolOps op, const NDArray& scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; - void applyScalarArr(nd4j::scalar::IntOps op, const NDArray& scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; + void 
applyScalarArr(sd::scalar::IntOps op, const NDArray& scalar, NDArray& target, ExtraArguments *extraParams = nullptr) const; #if defined(__CUDABLAS__) //&& defined(BUILD_TESTS) template @@ -840,7 +840,7 @@ namespace nd4j { * dimensions - vector of dimensions to reduce along * extraArgs - extra parameters for operation */ - NDArray applyIndexReduce(nd4j::indexreduce::Ops op, const std::vector& dimensions, const ExtraArguments *extraParams = nullptr) const; + NDArray applyIndexReduce(sd::indexreduce::Ops op, const std::vector& dimensions, const ExtraArguments *extraParams = nullptr) const; /** * reduces dimensions in array relying on index operation OpName @@ -848,14 +848,14 @@ namespace nd4j { * dimensions - vector of dimensions to reduce along * extraArgs - extra parameters for operation */ - void applyIndexReduce(nd4j::indexreduce::Ops op, NDArray& target, const std::vector& dimensions, const ExtraArguments *extraParams = nullptr) const; + void applyIndexReduce(sd::indexreduce::Ops op, NDArray& target, const std::vector& dimensions, const ExtraArguments *extraParams = nullptr) const; /** * apply reduce3 operation OpName to this and other array, return result in new output array * other - input array * extraArgs - extra parameters for operation */ - NDArray applyReduce3(nd4j::reduce3::Ops op, const NDArray& other, const ExtraArguments* extraParams = nullptr) const; + NDArray applyReduce3(sd::reduce3::Ops op, const NDArray& other, const ExtraArguments* extraParams = nullptr) const; /** * apply reduce3 operation OpName to this and other array, return result in new output array @@ -863,7 +863,7 @@ namespace nd4j { * dimensions - vector of dimensions to reduce along (tads not axis) * extraArgs - extra parameters for operation */ - NDArray applyAllReduce3(nd4j::reduce3::Ops op, const NDArray& other, const std::vector& dimensions, const ExtraArguments* extraParams = nullptr) const; + NDArray applyAllReduce3(sd::reduce3::Ops op, const NDArray& other, const std::vector& 
dimensions, const ExtraArguments* extraParams = nullptr) const; /** * apply reduce3 (exec) operation OpName to this and other array, return result in new output array @@ -871,18 +871,18 @@ namespace nd4j { * dimensions - vector of dimensions to reduce along (same as reduceAlongDimension) * extraArgs - extra parameters for operation */ - NDArray applyReduce3(nd4j::reduce3::Ops op, const NDArray& other, const std::vector& dimensions, const ExtraArguments* extraParams = nullptr) const; + NDArray applyReduce3(sd::reduce3::Ops op, const NDArray& other, const std::vector& dimensions, const ExtraArguments* extraParams = nullptr) const; /** * returns variance along given dimensions * biasCorrected - if true bias correction will be applied * dimensions - vector of dimensions to calculate variance along */ - NDArray varianceAlongDimension(nd4j::variance::Ops op, const bool biasCorrected, const std::vector& dimensions) const; - NDArray varianceAlongDimension(nd4j::variance::Ops op, const bool biasCorrected, const std::initializer_list& dimensions) const; + NDArray varianceAlongDimension(sd::variance::Ops op, const bool biasCorrected, const std::vector& dimensions) const; + NDArray varianceAlongDimension(sd::variance::Ops op, const bool biasCorrected, const std::initializer_list& dimensions) const; - void varianceAlongDimension(nd4j::variance::Ops op, NDArray& target, const bool biasCorrected, const std::vector& dimensions) const; - void varianceAlongDimension(nd4j::variance::Ops op, NDArray& target, const bool biasCorrected, const std::initializer_list& dimensions) const; + void varianceAlongDimension(sd::variance::Ops op, NDArray& target, const bool biasCorrected, const std::vector& dimensions) const; + void varianceAlongDimension(sd::variance::Ops op, NDArray& target, const bool biasCorrected, const std::initializer_list& dimensions) const; #endif @@ -1224,7 +1224,7 @@ namespace nd4j { * set _shapeInfo */ void setShapeInfo(const Nd4jLong *shapeInfo); - void 
setShapeInfo(const Nd4jLong *shapeInfo, const nd4j::DataType dtype); + void setShapeInfo(const Nd4jLong *shapeInfo, const sd::DataType dtype); void setShapeInfo(const ShapeDescriptor& descriptor); void setShapeInfo(const ConstantDataBuffer& shapeBuffer); @@ -1271,7 +1271,7 @@ namespace nd4j { * set _shapeInfo */ FORCEINLINE void setShapeInfo(Nd4jLong *shapeInfo); - FORCEINLINE void setShapeInfo(Nd4jLong *shapeInfo, const nd4j::DataType dtype); + FORCEINLINE void setShapeInfo(Nd4jLong *shapeInfo, const sd::DataType dtype); /** * returns the value of "dim" dimension @@ -1537,13 +1537,13 @@ void NDArray::setShapeInfo(Nd4jLong *shapeInfo) { _length = shape::length(_shapeInfo); } else { - _dataType = nd4j::DataType::INHERIT; + _dataType = sd::DataType::INHERIT; _length = 0; } } ////////////////////////////////////////////////////////////////////////// -void NDArray::setShapeInfo(Nd4jLong *shapeInfo, const nd4j::DataType dtype) { +void NDArray::setShapeInfo(Nd4jLong *shapeInfo, const sd::DataType dtype) { auto buffer = ConstantShapeHelper::getInstance()->bufferForShapeInfo(shapeInfo); _shapeInfo = buffer.primaryAsT(); _shapeInfoD = buffer.specialAsT(); @@ -1556,7 +1556,7 @@ void NDArray::setShapeInfo(Nd4jLong *shapeInfo, const nd4j::DataType dtype) { _length = shape::length(_shapeInfo); } else { - _dataType = nd4j::DataType::INHERIT; + _dataType = sd::DataType::INHERIT; _length = 0; } } @@ -1981,7 +1981,7 @@ Nd4jLong* NDArray::getSpecialShapeInfo() const{ #if defined(__CUDACC__) //&& defined(BUILD_TESTS) // for CUDA we need stil stuff inline -#include "cuda/NDArrayLambda.hpp" +#include #endif } diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/include/array/NDArray.hXX similarity index 87% rename from libnd4j/blas/NDArray.hpp rename to libnd4j/include/array/NDArray.hXX index 46cc1ab21..02c186e38 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/include/array/NDArray.hXX @@ -21,14 +21,14 @@ #define __NDARRAY__HPP__ #include -#include -#include -#include -#include 
+#include +#include +#include +#include #include -#include +#include -namespace nd4j { +namespace sd { template <> ND4J_EXPORT utf8string NDArray::e(const Nd4jLong i) const; @@ -57,7 +57,7 @@ NDArray::NDArray(const NDArray& other) { } //////////////////////////////////////////////////////////////////////// -NDArray::NDArray(const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext * context) { +NDArray::NDArray(const char order, const std::vector &shape, sd::DataType dtype, sd::LaunchContext * context) { if ((int) shape.size() > MAX_RANK) throw std::invalid_argument("Rank of NDArray can't exceed 32"); @@ -76,7 +76,7 @@ NDArray::NDArray(const char order, const std::vector &shape, nd4j::Dat } //////////////////////////////////////////////////////////////////////// -NDArray::NDArray(const char order, const std::vector &shape, const std::vector& data, nd4j::DataType dtype, nd4j::LaunchContext * context) { +NDArray::NDArray(const char order, const std::vector &shape, const std::vector& data, sd::DataType dtype, sd::LaunchContext * context) { if ((int) shape.size() > MAX_RANK) throw std::invalid_argument("Rank of NDArray can't exceed 32"); @@ -109,7 +109,7 @@ NDArray::NDArray(const char order, const std::vector &shape, const std //////////////////////////////////////////////////////////////////////// -NDArray::NDArray(const NDArray *other, const bool copyStrides, nd4j::LaunchContext* context) { +NDArray::NDArray(const NDArray *other, const bool copyStrides, sd::LaunchContext* context) { _context = context; _offset = 0; @@ -125,7 +125,7 @@ NDArray::NDArray(const NDArray *other, const bool copyStrides, nd4j::LaunchConte } //////////////////////////////////////////////////////////////////////// -NDArray::NDArray(void* buffer, const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext * context, const bool isBuffAlloc) { +NDArray::NDArray(void* buffer, const char order, const std::vector &shape, sd::DataType dtype, 
sd::LaunchContext * context, const bool isBuffAlloc) { if (shape.empty()) throw std::runtime_error("NDArray constructor: input shape is empty !"); @@ -144,7 +144,7 @@ NDArray::NDArray(void* buffer, const char order, const std::vector &sh //////////////////////////////////////////////////////////////////////// // creates new NDArray using shape information from "shapeInfo" array, set all elements in new array to be zeros -NDArray::NDArray(Nd4jLong* shapeInfo, const nd4j::DataType dtype, const bool copyStrides, nd4j::LaunchContext * context) { +NDArray::NDArray(Nd4jLong* shapeInfo, const sd::DataType dtype, const bool copyStrides, sd::LaunchContext * context) { if (shapeInfo == nullptr) throw std::runtime_error("NDArray constructor: can't be initalized without shapeinfo"); @@ -168,7 +168,7 @@ NDArray::NDArray(Nd4jLong* shapeInfo, const nd4j::DataType dtype, const bool cop //////////////////////////////////////////////////////////////////////// // scalar constructor -NDArray::NDArray(nd4j::DataType dtype, nd4j::LaunchContext* context, const bool isScalar) { +NDArray::NDArray(sd::DataType dtype, sd::LaunchContext* context, const bool isScalar) { _context = context; _offset = 0; @@ -203,7 +203,7 @@ NDArray::NDArray(NDArray&& other) noexcept { //////////////////////////////////////////////////////////////////////// //constructor, create empty array at given workspace -NDArray::NDArray(nd4j::LaunchContext * context) { +NDArray::NDArray(sd::LaunchContext * context) { _buffer = std::make_shared(); _shapeInfo = nullptr; _shapeInfoD = nullptr; @@ -214,12 +214,12 @@ NDArray::NDArray(nd4j::LaunchContext * context) { //////////////////////////////////////////////////////////////////////// // creates new NDArray using shape information from "shapeInfo" array, set all elements in new array to be zeros, set dtype as array type -NDArray::NDArray(Nd4jLong* shapeInfo, const bool copyStrides, nd4j::LaunchContext * context): +NDArray::NDArray(Nd4jLong* shapeInfo, const bool copyStrides, 
sd::LaunchContext * context): NDArray(shapeInfo, ArrayOptions::dataType(shapeInfo), copyStrides, context) { } //////////////////////////////////////////////////////////////////////// -NDArray::NDArray(std::shared_ptr buffer, const ShapeDescriptor& descriptor, nd4j::LaunchContext* context, const Nd4jLong offset) { +NDArray::NDArray(std::shared_ptr buffer, const ShapeDescriptor& descriptor, sd::LaunchContext* context, const Nd4jLong offset) { _context = context; _offset = offset; @@ -233,7 +233,7 @@ NDArray::NDArray(std::shared_ptr buffer, const ShapeDescriptor& desc //////////////////////////////////////////////////////////////////////// // do not allocate memory, memory for array is passed from outside -NDArray::NDArray(void *buffer, Nd4jLong *shapeInfo, nd4j::LaunchContext * context, const bool isBuffAlloc) { +NDArray::NDArray(void *buffer, Nd4jLong *shapeInfo, sd::LaunchContext * context, const bool isBuffAlloc) { if (buffer == nullptr && ArrayOptions::arrayType(shapeInfo) != ArrayType::EMPTY) throw std::runtime_error("NDArray constructor: can't be initalized with nullptr buffer !"); @@ -262,7 +262,7 @@ NDArray::NDArray(void *buffer, Nd4jLong *shapeInfo, nd4j::LaunchContext * contex //////////////////////////////////////////////////////////////////////// // do not allocate memory, memory for array is passed from outside // we suppose the content of both (device and host) buffers is identical -NDArray::NDArray(void *buffer, void* bufferD, Nd4jLong *shapeInfo, nd4j::LaunchContext * context, const bool isBuffAlloc, const bool isBuffDAlloc) { +NDArray::NDArray(void *buffer, void* bufferD, Nd4jLong *shapeInfo, sd::LaunchContext * context, const bool isBuffAlloc, const bool isBuffDAlloc) { if (shapeInfo == nullptr) throw std::runtime_error("NDArray constructor cuda: can't be initalized without shapeinfo"); @@ -280,7 +280,7 @@ NDArray::NDArray(void *buffer, void* bufferD, Nd4jLong *shapeInfo, nd4j::LaunchC } 
////////////////////////////////////////////////////////////////////////// -NDArray::NDArray(std::shared_ptr buffer, const char order, const std::vector &shape, nd4j::LaunchContext* context) { +NDArray::NDArray(std::shared_ptr buffer, const char order, const std::vector &shape, sd::LaunchContext* context) { if (shape.empty()) throw std::runtime_error("NDArray constructor: input shape is empty !"); @@ -299,7 +299,7 @@ NDArray::NDArray(std::shared_ptr buffer, const char order, const std } ///////////////////////////////////////////////////////////////////////// // u16 string constructors -NDArray::NDArray(const std::u16string& u16string, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::u16string& u16string, sd::DataType dtype, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dtype)) { throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); @@ -351,7 +351,7 @@ NDArray::NDArray(const std::u16string& u16string, nd4j::DataType dtype, nd4j::La ///////////////////////////////////////////////////////////////////////// // u32 string constructors -NDArray::NDArray(const std::u32string& u32string, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::u32string& u32string, sd::DataType dtype, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dtype)) { throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); @@ -403,7 +403,7 @@ NDArray::NDArray(const std::u32string& u32string, nd4j::DataType dtype, nd4j::La ///////////////////////////////////////////////////////////////////////// // u8 string constructors ///////////////////////////////////////////////////////////////////////// -NDArray::NDArray(const std::string& str, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::string& str, sd::DataType dtype, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dtype)) { throw 
std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); @@ -455,7 +455,7 @@ NDArray::NDArray(const std::string& str, nd4j::DataType dtype, nd4j::LaunchConte } ///////////////////////////////////////////////////////////////////////// // constructors for vector of strings -NDArray::NDArray(const std::vector& shape, const std::vector& string, const nd4j::DataType dataType, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::vector& shape, const std::vector& string, const sd::DataType dataType, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dataType)) throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); @@ -521,7 +521,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector& string, const nd4j::DataType dataType, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::vector& shape, const std::vector& string, const sd::DataType dataType, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dataType)) throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); @@ -589,7 +589,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector& string, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::vector& shape, const std::vector& string, sd::DataType dtype, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dtype)) throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); @@ -654,7 +654,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector& string, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::vector& shape, const std::vector& string, sd::DataType dtype, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dtype)) throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string 
dataTypes have to be used"); @@ -720,7 +720,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector& string, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::vector& shape, const std::vector& string, sd::DataType dtype, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dtype)) throw std::invalid_argument("NDArray::NDArray: invalid DataType, only string dataTypes have to be used"); @@ -786,7 +786,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector& shape, const std::vector& string, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray::NDArray(const std::vector& shape, const std::vector& string, sd::DataType dtype, sd::LaunchContext* context) { if (!DataTypeUtils::isS(dtype)) throw std::invalid_argument("NDArray::NDArray: invalid DataType used"); @@ -1221,7 +1221,7 @@ void NDArray::assign(const T& value, bool allowParallelism) { auto temp = NDArrayFactory::create(dataType(), value, this->getContext()); NDArray::prepareSpecialUse({this}, {&temp}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::CopyPws, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), temp.buffer(), temp.shapeInfo(), temp.specialBuffer(), temp.getSpecialShapeInfo(), nullptr, allowParallelism); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::CopyPws, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), temp.buffer(), temp.shapeInfo(), temp.specialBuffer(), temp.getSpecialShapeInfo(), nullptr, allowParallelism); NDArray::registerSpecialUse({this}, {&temp}); } template ND4J_EXPORT void NDArray::assign(const double& value, bool allowParallelism); @@ -1254,7 +1254,7 @@ NDArray* NDArray::detach() { } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::varianceNumber(nd4j::variance::Ops 
op, bool biasCorrected) { +NDArray NDArray::varianceNumber(sd::variance::Ops op, bool biasCorrected) { NDArray res(DataTypeUtils::pickFloatingType(dataType()), getContext()); @@ -1273,7 +1273,7 @@ NDArray NDArray::sumNumber() const { NDArray res(dataType(), getContext()); NDArray::prepareSpecialUse({&res}, {this}); - NativeOpExecutioner::execReduceSameScalar(getContext(), nd4j::reduce::SameOps::Sum, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, res.buffer(), res.shapeInfo(), res.specialBuffer(), res.specialShapeInfo()); + NativeOpExecutioner::execReduceSameScalar(getContext(), sd::reduce::SameOps::Sum, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, res.buffer(), res.shapeInfo(), res.specialBuffer(), res.specialShapeInfo()); NDArray::registerSpecialUse({&res}, {this}); return res; @@ -1288,7 +1288,7 @@ NDArray NDArray::meanNumber() const { NDArray res(DataTypeUtils::pickFloatingType(dataType()), getContext()); NDArray::prepareSpecialUse({&res}, {this}); - NativeOpExecutioner::execReduceFloatScalar(getContext(), nd4j::reduce::FloatOps::Mean, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, res.buffer(), res.shapeInfo(), res.specialBuffer(), res.specialShapeInfo()); + NativeOpExecutioner::execReduceFloatScalar(getContext(), sd::reduce::FloatOps::Mean, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, res.buffer(), res.shapeInfo(), res.specialBuffer(), res.specialShapeInfo()); NDArray::registerSpecialUse({&res}, {this}); return res; } @@ -1297,21 +1297,21 @@ NDArray NDArray::meanNumber() const { bool NDArray::hasNaNs() { if (isS()) throw std::runtime_error("NDArray::hasNaNs: you can't use this method on String array!"); - return this->reduceNumber(nd4j::reduce::IsNan, nullptr).e(0) > 0; + return this->reduceNumber(sd::reduce::IsNan, nullptr).e(0) > 0; } ////////////////////////////////////////////////////////////////////////// bool 
NDArray::hasInfs() { if (isS()) throw std::runtime_error("NDArray::hasInfs: you can't use this method on String array!"); - return this->reduceNumber(nd4j::reduce::IsInf, nullptr).e(0) > 0; + return this->reduceNumber(sd::reduce::IsInf, nullptr).e(0) > 0; } ////////////////////////////////////////////////////////////////////////// bool NDArray::isFinite() { if (isS()) throw std::runtime_error("NDArray::isFinite: you can't use this method on String array!"); - return this->reduceNumber(nd4j::reduce::IsInfOrNan, nullptr).e(0) == 0; + return this->reduceNumber(sd::reduce::IsInfOrNan, nullptr).e(0) == 0; } ////////////////////////////////////////////////////////////////////////// @@ -1336,11 +1336,11 @@ void NDArray::templatedSet(void *buffer, const Nd4jLong offset, const void *valu BUILD_DOUBLE_TEMPLATE(template ND4J_EXPORT void NDArray::templatedSet, (void *buffer, const Nd4jLong offset, const void *value), LIBND4J_TYPES, LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// -void NDArray::setContext(nd4j::LaunchContext *context) { +void NDArray::setContext(sd::LaunchContext *context) { _context = context; if (getContext() == nullptr) - _context = nd4j::LaunchContext ::defaultContext(); // empty context for default cases + _context = sd::LaunchContext ::defaultContext(); // empty context for default cases } ////////////////////////////////////////////////////////////////////////// @@ -1351,7 +1351,7 @@ void* NDArray::bufferWithOffset(Nd4jLong offset) const { ////////////////////////////////////////////////////////////////////////// // eventually method reduces array by excluding its shapes along axes present in dimensions vector -NDArray NDArray::reduceAlongDimension(nd4j::reduce::FloatOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { 
std::vector copy(dimensions); @@ -1365,7 +1365,7 @@ NDArray NDArray::reduceAlongDimension(nd4j::reduce::FloatOps op, const std::vect } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceAlongDimension(nd4j::reduce::SameOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { std::vector copy(dimensions); @@ -1379,7 +1379,7 @@ NDArray NDArray::reduceAlongDimension(nd4j::reduce::SameOps op, const std::vecto } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceAlongDimension(nd4j::reduce::BoolOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { std::vector copy(dimensions); @@ -1393,7 +1393,7 @@ NDArray NDArray::reduceAlongDimension(nd4j::reduce::BoolOps op, const std::vecto } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceAlongDimension(nd4j::reduce::LongOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const { std::vector copy(dimensions); @@ -1408,27 +1408,27 @@ NDArray NDArray::reduceAlongDimension(nd4j::reduce::LongOps op, const std::vecto ////////////////////////////////////////////////////////////////////////// // method reduces array by excluding its shapes along axes present in dimensions vector -NDArray NDArray::reduceAlongDimension(nd4j::reduce::FloatOps op, const std::initializer_list& dimensions, const bool keepDims, const bool 
supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::FloatOps op, const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const { return reduceAlongDimension(op, std::vector(dimensions), keepDims, supportOldShapes); } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceAlongDimension(nd4j::reduce::SameOps op, const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::SameOps op, const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const { return reduceAlongDimension(op, std::vector(dimensions), keepDims, supportOldShapes); } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceAlongDimension(nd4j::reduce::BoolOps op, const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::BoolOps op, const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const { return reduceAlongDimension(op, std::vector(dimensions), keepDims, supportOldShapes); } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceAlongDimension(nd4j::reduce::LongOps op, const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const { +NDArray NDArray::reduceAlongDimension(sd::reduce::LongOps op, const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const { return reduceAlongDimension(op, std::vector(dimensions), keepDims, supportOldShapes); } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceNumber(nd4j::reduce::FloatOps op, void *extraParams) const { +NDArray NDArray::reduceNumber(sd::reduce::FloatOps op, void *extraParams) const { if (isS()) 
throw std::runtime_error("NDArray::reduceNumber FloatOps: you can't use this method on String array!"); @@ -1443,7 +1443,7 @@ NDArray NDArray::reduceNumber(nd4j::reduce::FloatOps op, void *extraParams) cons } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceNumber(nd4j::reduce::SameOps op, void *extraParams) const { +NDArray NDArray::reduceNumber(sd::reduce::SameOps op, void *extraParams) const { if (isS()) throw std::runtime_error("NDArray::reduceNumber SameOps: you can't use this method on String array!"); @@ -1457,7 +1457,7 @@ NDArray NDArray::reduceNumber(nd4j::reduce::SameOps op, void *extraParams) const } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceNumber(nd4j::reduce::BoolOps op, void *extraParams) const { +NDArray NDArray::reduceNumber(sd::reduce::BoolOps op, void *extraParams) const { if (isS()) throw std::runtime_error("NDArray::reduceNumber BoolOps: you can't use this method on String array!"); @@ -1472,7 +1472,7 @@ NDArray NDArray::reduceNumber(nd4j::reduce::BoolOps op, void *extraParams) const } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::reduceNumber(nd4j::reduce::LongOps op, void *extraParams) const { +NDArray NDArray::reduceNumber(sd::reduce::LongOps op, void *extraParams) const { if (isS()) throw std::runtime_error("NDArray::reduceNumber LongOps: you can't use this method on String array!"); @@ -1487,7 +1487,7 @@ NDArray NDArray::reduceNumber(nd4j::reduce::LongOps op, void *extraParams) const } ////////////////////////////////////////////////////////////////////////// -void NDArray::reduceNumber(nd4j::reduce::FloatOps op, NDArray& target, void *extraParams) const { +void NDArray::reduceNumber(sd::reduce::FloatOps op, NDArray& target, void *extraParams) const { if (isS()) throw std::runtime_error("NDArray::reduceNumber FloatOps: you can't use this method on String array!"); if(target.lengthOf() != 
1 || target.dataType() != DataTypeUtils::pickFloatingType(dataType())) @@ -1499,7 +1499,7 @@ void NDArray::reduceNumber(nd4j::reduce::FloatOps op, NDArray& target, void *ext } ////////////////////////////////////////////////////////////////////////// -void NDArray::reduceNumber(nd4j::reduce::SameOps op, NDArray& target, void *extraParams) const { +void NDArray::reduceNumber(sd::reduce::SameOps op, NDArray& target, void *extraParams) const { if (isS()) throw std::runtime_error("NDArray::reduceNumber SameOps: you can't use this method on String array!"); @@ -1512,7 +1512,7 @@ void NDArray::reduceNumber(nd4j::reduce::SameOps op, NDArray& target, void *extr } ////////////////////////////////////////////////////////////////////////// -void NDArray::reduceNumber(nd4j::reduce::BoolOps op, NDArray& target, void *extraParams) const { +void NDArray::reduceNumber(sd::reduce::BoolOps op, NDArray& target, void *extraParams) const { if (isS()) throw std::runtime_error("NDArray::reduceNumber BoolOps: you can't use this method on String array!"); @@ -1525,7 +1525,7 @@ void NDArray::reduceNumber(nd4j::reduce::BoolOps op, NDArray& target, void *extr } ////////////////////////////////////////////////////////////////////////// -void NDArray::reduceNumber(nd4j::reduce::LongOps op, NDArray& target, void *extraParams) const { +void NDArray::reduceNumber(sd::reduce::LongOps op, NDArray& target, void *extraParams) const { if (isS()) throw std::runtime_error("NDArray::reduceNumber LongOps: you can't use this method on String array!"); @@ -1538,7 +1538,7 @@ void NDArray::reduceNumber(nd4j::reduce::LongOps op, NDArray& target, void *extr } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::indexReduceNumber(nd4j::indexreduce::Ops op, ExtraArguments *extraParams) { +NDArray NDArray::indexReduceNumber(sd::indexreduce::Ops op, ExtraArguments *extraParams) { if (isS()) throw std::runtime_error("NDArray::indexReduceNumber: you can't use this method on 
String array!"); @@ -1612,7 +1612,7 @@ void NDArray::printBuffer(const char* msg, Nd4jLong limit, const bool sync) cons } else if (this->isZ()) { for (Nd4jLong e = 0; e < limit; e++) { - if (this->dataType() != nd4j::DataType::INT64 && this->dataType() != nd4j::DataType::UINT64) + if (this->dataType() != sd::DataType::INT64 && this->dataType() != sd::DataType::UINT64) printf("%d", this->e(e)); else printf("%llu", this->e(e)); @@ -1661,19 +1661,19 @@ void NDArray::printLinearBuffer() const { printf("["); - if (this->dataType() == nd4j::DataType::INT32) { + if (this->dataType() == sd::DataType::INT32) { for(Nd4jLong e = 0; e < len; e++) printf("%d, ", this->bufferAsT()[e * ews]); } - else if(this->dataType() == nd4j::DataType::INT64) { + else if(this->dataType() == sd::DataType::INT64) { for(Nd4jLong e = 0; e < len; e++) printf("%lld, ", this->bufferAsT()[e * ews]); } - else if(this->dataType() == nd4j::DataType::FLOAT32) { + else if(this->dataType() == sd::DataType::FLOAT32) { for(Nd4jLong e = 0; e < len; e++) printf("%.3f, ", this->bufferAsT()[e * ews]); } - else if(this->dataType() == nd4j::DataType::DOUBLE) { + else if(this->dataType() == sd::DataType::DOUBLE) { for(Nd4jLong e = 0; e < len; e++) printf("%.3f, ", this->bufferAsT()[e * ews]); } @@ -2129,14 +2129,14 @@ bool NDArray::isIdentityMatrix() { const double eps = 1e-5f; for(Nd4jLong i=0; i(i,i) - 1.f) > eps) + if(sd::math::nd4j_abs(e(i,i) - 1.f) > eps) return false; for(Nd4jLong i=0; i(i,j)) > eps) + if(sd::math::nd4j_abs(e(i,j)) > eps) return false; } } @@ -2450,16 +2450,16 @@ void NDArray::operator+=(const NDArray& other) { if (isS()) throw std::runtime_error("NDArray::operator+=: you can't use this method on String array!"); if (!Environment::getInstance()->isExperimentalBuild() && this->dataType() != other.dataType() && (this->dataType() != DataType::BOOL || other.dataType() != BOOL)) - throw nd4j::datatype_exception::build("NDArray operator+=: Cannot add different types", this->dataType(), 
other.dataType()); + throw sd::datatype_exception::build("NDArray operator+=: Cannot add different types", this->dataType(), other.dataType()); if (this->lengthOf() != 1 && other.lengthOf() == 1) { NDArray::prepareSpecialUse({this}, {this, &other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Add, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Add, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else if (other.lengthOf() == lengthOf() && this->rankOf() == other.rankOf()) { NDArray::prepareSpecialUse({this}, {this, &other}); - NativeOpExecutioner::execPairwiseTransform(getContext(), nd4j::pairwise::Add, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(getContext(), sd::pairwise::Add, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else{ @@ -2468,11 +2468,11 @@ void NDArray::operator+=(const NDArray& other) { throw std::invalid_argument("NDArray::operator+=: the shapes of this and other arrays are not suitable for broadcast operation !"); if(shape::equalsTypesAndShapesSoft(getShapeInfo(), 
bShape)) { - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), other, *this, false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), other, *this, false); } else { NDArray result(bShape, true, getContext()); - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), other, result, false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), other, result, false); *this = std::move(result); // move assignment operator, zero cost copy } } @@ -2484,16 +2484,16 @@ void NDArray::operator-=(const NDArray& other) { throw std::runtime_error("NDArray::operator-=: you can't use this method on String array!"); if (!Environment::getInstance()->isExperimentalBuild() && this->dataType() != other.dataType() && (this->dataType() != DataType::BOOL || other.dataType() != BOOL)) - throw nd4j::datatype_exception::build("NDArray operator-=: Cannot subtract different types", this->dataType(), other.dataType()); + throw sd::datatype_exception::build("NDArray operator-=: Cannot subtract different types", this->dataType(), other.dataType()); if (lengthOf() != 1 && other.lengthOf() == 1) { NDArray::prepareSpecialUse({this}, {this, &other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Subtract, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Subtract, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else if (other.lengthOf() == lengthOf() && this->rankOf() == other.rankOf()) { NDArray::prepareSpecialUse({this}, {this, &other}); - 
NativeOpExecutioner::execPairwiseTransform(getContext(), nd4j::pairwise::Subtract, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(getContext(), sd::pairwise::Subtract, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else{ @@ -2502,11 +2502,11 @@ void NDArray::operator-=(const NDArray& other) { throw std::invalid_argument("NDArray::operator-=: the shapes of this and other arrays are not suitable for broadcast operation !"); if(shape::equalsTypesAndShapesSoft(getShapeInfo(), bShape)) { - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Subtract(), other, *this, false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Subtract(), other, *this, false); } else { NDArray result(bShape, true, getContext()); - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Subtract(), other, result, false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Subtract(), other, result, false); *this = std::move(result); // move assignment operator, zero cost copy } } @@ -2517,16 +2517,16 @@ void NDArray::operator*=(const NDArray& other) { if (isS()) throw std::runtime_error("NDArray::operator*=: you can't use this method on String array!"); if (!Environment::getInstance()->isExperimentalBuild() && this->dataType() != other.dataType() && (this->dataType() != DataType::BOOL || other.dataType() != BOOL)) - throw nd4j::datatype_exception::build("NDArray operator*=: Cannot multiply different types", this->dataType(), other.dataType()); + throw sd::datatype_exception::build("NDArray operator*=: Cannot multiply different 
types", this->dataType(), other.dataType()); if (lengthOf() != 1 && other.lengthOf() == 1) { NDArray::prepareSpecialUse({this}, {this, &other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Multiply, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Multiply, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else if (other.lengthOf() == lengthOf() && this->rankOf() == other.rankOf()) { NDArray::prepareSpecialUse({this}, {this, &other}); - NativeOpExecutioner::execPairwiseTransform(getContext(), nd4j::pairwise::Multiply, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(getContext(), sd::pairwise::Multiply, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else{ @@ -2535,11 +2535,11 @@ void NDArray::operator*=(const NDArray& other) { throw std::invalid_argument("NDArray::operator*=: the shapes of this and other arrays are not suitable for broadcast operation !"); if(shape::equalsTypesAndShapesSoft(_shapeInfo, bShape)) { - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Multiply(), other, *this, 
false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Multiply(), other, *this, false); } else { NDArray result(bShape, true, getContext()); - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Multiply(), other, result, false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Multiply(), other, result, false); *this = std::move(result); // move assignment operator, zero cost copy } } @@ -2553,17 +2553,17 @@ void NDArray::operator/=(const NDArray& other) { throw std::runtime_error("NDArray::operator/=: you can't divide by bool array!"); if (!Environment::getInstance()->isExperimentalBuild() && this->dataType() != other.dataType()) { - throw nd4j::datatype_exception::build("NDArray operator/=: Cannot divide different types", this->dataType(), other.dataType()); + throw sd::datatype_exception::build("NDArray operator/=: Cannot divide different types", this->dataType(), other.dataType()); } if (lengthOf() != 1 && other.lengthOf() == 1) { NDArray::prepareSpecialUse({this}, {this, &other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Divide, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Divide, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else if (other.lengthOf() == lengthOf() && this->rankOf() == other.rankOf()) { NDArray::prepareSpecialUse({this}, {this, &other}); - NativeOpExecutioner::execPairwiseTransform(getContext(), nd4j::pairwise::Divide, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), 
other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(getContext(), sd::pairwise::Divide, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {this, &other}); } else{ @@ -2572,11 +2572,11 @@ void NDArray::operator/=(const NDArray& other) { throw std::invalid_argument("NDArray::operator/=: the shapes of this and other arrays are not suitable for broadcast operation !"); if(shape::equalsTypesAndShapesSoft(_shapeInfo, bShape)) { - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Divide(), other, *this, false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Divide(), other, *this, false); } else { NDArray result(bShape, true, getContext()); - this->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Divide(), other, result, false); + this->applyTrueBroadcast(sd::BroadcastOpsTuple::Divide(), other, result, false); *this = std::move(result); // move assignment operator, zero cost copy } } @@ -2592,7 +2592,7 @@ void NDArray::operator+=(const T value) { NDArray::prepareSpecialUse({this}, {&other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Add, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Add, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {}); } @@ 
-2614,7 +2614,7 @@ void NDArray::operator-=(const T value) { NDArray::prepareSpecialUse({this}, {&other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Subtract, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Subtract, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {}); } @@ -2634,7 +2634,7 @@ void NDArray::operator*=(const T scalar) { auto other = NDArrayFactory::create(this->dataType(), scalar, getContext()); NDArray::prepareSpecialUse({this}, {&other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Multiply, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Multiply, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {}); } @@ -2657,7 +2657,7 @@ void NDArray::operator/=(const T scalar) { auto other = NDArrayFactory::create(this->dataType(), scalar, getContext()); NDArray::prepareSpecialUse({this}, {&other}); - NativeOpExecutioner::execScalar(getContext(), nd4j::scalar::Divide, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), 
getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(getContext(), sd::scalar::Divide, buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({this}, {}); } template ND4J_EXPORT void NDArray::operator/=(const double scalar); @@ -2680,7 +2680,7 @@ NDArray NDArray::operator-() const & { NDArray result(getShapeInfo(), false, getContext()); NDArray::prepareSpecialUse({&result}, {this}); - NativeOpExecutioner::execTransformSame(getContext(), nd4j::transform::Neg, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), nullptr, nullptr, nullptr); + NativeOpExecutioner::execTransformSame(getContext(), sd::transform::Neg, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), nullptr, nullptr, nullptr); NDArray::registerSpecialUse({&result}, {this}); return result; @@ -2692,7 +2692,7 @@ NDArray NDArray::operator-() && { throw std::runtime_error("NDArray::negative-: you can't use this method on String array!"); NDArray::prepareSpecialUse({this}, {this}); - NativeOpExecutioner::execTransformSame(getContext(), nd4j::transform::Neg, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr, nullptr, nullptr); + NativeOpExecutioner::execTransformSame(getContext(), sd::transform::Neg, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), buffer(), getShapeInfo(), specialBuffer(), getSpecialShapeInfo(), nullptr, nullptr, nullptr); 
NDArray::registerSpecialUse({this}, {this}); return std::move(*this); @@ -2795,7 +2795,7 @@ NDArray NDArray::quantize(const NDArray& array) { } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape, ExtraArguments *extraArgs) const { +void NDArray::applyTrueBroadcast(sd::BroadcastOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape, ExtraArguments *extraArgs) const { if (isS()) throw std::runtime_error("NDArray::applyTrueBroadcast: you can't use this method on String array!"); @@ -2837,7 +2837,7 @@ void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& othe } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyTrueBroadcast(nd4j::BroadcastBoolOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape, ExtraArguments *extraArgs) const { +void NDArray::applyTrueBroadcast(sd::BroadcastBoolOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape, ExtraArguments *extraArgs) const { if (isS()) throw std::runtime_error("NDArray::applyTrueBroadcast bool: you can't use this method on String array!"); @@ -2874,7 +2874,7 @@ void NDArray::applyTrueBroadcast(nd4j::BroadcastBoolOpsTuple op, const NDArray& } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyTrueBroadcast(nd4j::BroadcastIntOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape, ExtraArguments *extraArgs) const { +void NDArray::applyTrueBroadcast(sd::BroadcastIntOpsTuple op, const NDArray& other, NDArray& target, const bool checkTargetShape, ExtraArguments *extraArgs) const { if (isS()) throw std::runtime_error("NDArray::applyTrueBroadcast bool: you can't use this method on String array!"); @@ -2912,7 +2912,7 @@ void 
NDArray::applyTrueBroadcast(nd4j::BroadcastIntOpsTuple op, const NDArray& o } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs) const & { +NDArray NDArray::applyTrueBroadcast(sd::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs) const & { if (isEmpty() || other.isEmpty()) { if (isEmpty()) return NDArray(*this); @@ -2931,7 +2931,7 @@ NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& o } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs) const & { +NDArray NDArray::applyTrueBroadcast(sd::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs) const & { if (isEmpty() || other.isEmpty()) { if (isEmpty()) return NDArray(*this); @@ -2955,7 +2955,7 @@ NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, NDArray&& other, } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs) && { +NDArray NDArray::applyTrueBroadcast(sd::BroadcastOpsTuple op, const NDArray& other, ExtraArguments *extraArgs) && { if (isEmpty() || other.isEmpty()) { if (isEmpty()) return NDArray(*this); @@ -2979,7 +2979,7 @@ NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray& o } ////////////////////////////////////////////////////////////////////////// -NDArray NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs) && { +NDArray NDArray::applyTrueBroadcast(sd::BroadcastOpsTuple op, NDArray&& other, ExtraArguments *extraArgs) && { if (isEmpty() || other.isEmpty()) { if (isEmpty()) return NDArray(*this); @@ -3012,7 +3012,7 @@ NDArray 
NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, NDArray&& other, } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyBroadcast(nd4j::broadcast::Ops op, const std::vector& dimensions, const NDArray& other, NDArray& target, ExtraArguments* extraArgs) { +void NDArray::applyBroadcast(sd::broadcast::Ops op, const std::vector& dimensions, const NDArray& other, NDArray& target, ExtraArguments* extraArgs) { if (isS()) throw std::runtime_error("NDArray::applyBroadcast: you can't use this method on String array!"); if(((op == broadcast::Divide || op == broadcast::FloorDiv || op == broadcast::FloorMod) && other.isB()) || (op == broadcast::ReverseDivide && this->isB())) @@ -3057,8 +3057,8 @@ void NDArray::applyBroadcast(nd4j::broadcast::Ops op, const std::vector& di if (tadLength != min->lengthOf()) throw std::runtime_error("NDArray::applyBroadcast method: tad length mismatch !"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(target.shapeInfo(), copy); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(target.shapeInfo(), copy); NDArray::prepareSpecialUse({&target}, {this, &other}); if(max == this) @@ -3069,7 +3069,7 @@ void NDArray::applyBroadcast(nd4j::broadcast::Ops op, const std::vector& di } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyBroadcast(nd4j::broadcast::BoolOps op, const std::vector& dimensions, const NDArray& other, NDArray& target, ExtraArguments* extraArgs) { +void NDArray::applyBroadcast(sd::broadcast::BoolOps op, const std::vector& dimensions, const NDArray& other, NDArray& target, ExtraArguments* extraArgs) { if (isS()) throw std::runtime_error("NDArray::applyBroadcast BoolOps: you can't use this method on String array!"); 
if(isEmpty() || other.isEmpty()) { @@ -3114,8 +3114,8 @@ void NDArray::applyBroadcast(nd4j::broadcast::BoolOps op, const std::vector if (tadLength != min->lengthOf()) throw std::runtime_error("Tad length mismatch"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(target.shapeInfo(), copy); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(target.shapeInfo(), copy); // TODO: eventually we want separate tads here NDArray::prepareSpecialUse({&target}, {this, &other}); @@ -3128,7 +3128,7 @@ void NDArray::applyBroadcast(nd4j::broadcast::BoolOps op, const std::vector ////////////////////////////////////////////////////////////////////////// -void NDArray::applyBroadcast(nd4j::broadcast::IntOps op, const std::vector& dimensions, const NDArray& other, NDArray& target, ExtraArguments* extraArgs) { +void NDArray::applyBroadcast(sd::broadcast::IntOps op, const std::vector& dimensions, const NDArray& other, NDArray& target, ExtraArguments* extraArgs) { if (!isZ()) throw std::runtime_error("NDArray::applyBroadcast IntOps: you can't use this method on non-Integer array!"); if(isEmpty() || other.isEmpty()) { @@ -3173,8 +3173,8 @@ void NDArray::applyBroadcast(nd4j::broadcast::IntOps op, const std::vector& if (tadLength != min->lengthOf()) throw std::runtime_error("Tad length mismatch"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(target.shapeInfo(), copy); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(max->shapeInfo(), copy); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(target.shapeInfo(), copy); // TODO: eventually we want separate tads here 
NDArray::prepareSpecialUse({&target}, {this, &other}); @@ -3186,15 +3186,15 @@ void NDArray::applyBroadcast(nd4j::broadcast::IntOps op, const std::vector& } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyBroadcast(nd4j::broadcast::Ops op, const std::initializer_list dimensions, const NDArray& tadArray, NDArray& target, ExtraArguments* extraArgs) { +void NDArray::applyBroadcast(sd::broadcast::Ops op, const std::initializer_list dimensions, const NDArray& tadArray, NDArray& target, ExtraArguments* extraArgs) { std::vector vec(dimensions); applyBroadcast(op, vec, tadArray, target, extraArgs); } //////////////////////////////////////////////////////////////////////// void* NDArray::operator new(size_t i) { - if (nd4j::memory::MemoryRegistrator::getInstance()->hasWorkspaceAttached()) { - nd4j::memory::Workspace* ws = nd4j::memory::MemoryRegistrator::getInstance()->getWorkspace(); + if (sd::memory::MemoryRegistrator::getInstance()->hasWorkspaceAttached()) { + sd::memory::Workspace* ws = sd::memory::MemoryRegistrator::getInstance()->getWorkspace(); return ws->allocateBytes((Nd4jLong) i); } else { @@ -3206,7 +3206,7 @@ void* NDArray::operator new(size_t i) { //////////////////////////////////////////////////////////////////////// void NDArray::operator delete(void* p) { - if (!nd4j::memory::MemoryRegistrator::getInstance()->hasWorkspaceAttached()) + if (!sd::memory::MemoryRegistrator::getInstance()->hasWorkspaceAttached()) free(p); } @@ -3265,7 +3265,7 @@ bool NDArray::reshapei(const char order, const std::vector& cshape, co if (i != j) shapeLength *= shape_[j]; - Nd4jLong realShape = nd4j::math::nd4j_abs(lengthOf() / shapeLength); + Nd4jLong realShape = sd::math::nd4j_abs(lengthOf() / shapeLength); auto thisNewShape = new Nd4jLong[shape.size()]; for (int j = 0; j < (int) shape.size(); j++) @@ -3290,7 +3290,7 @@ bool NDArray::reshapei(const char order, const std::vector& cshape, co if(platformBuffer() == nullptr || arrLength 
!= this->lengthOf()) { this->printShapeInfo("Mismatched shape"); - nd4j::Logger::printv("Shape requested: ", shape); + sd::Logger::printv("Shape requested: ", shape); nd4j_debug("Requested length in reshape: %i; Existing length: %i;\n", arrLength, this->lengthOf()); throw std::runtime_error("NDArray::reshapei: bad input shape!"); } @@ -3332,13 +3332,13 @@ void NDArray::nullify() { //////////////////////////////////////////////////////////////////////// template -void NDArray::templatedSet(void *buffer, const Nd4jLong xOfsset, nd4j::DataType dtype, const void *value) { +void NDArray::templatedSet(void *buffer, const Nd4jLong xOfsset, sd::DataType dtype, const void *value) { BUILD_SINGLE_PARTIAL_SELECTOR(dtype, templatedSet< , T>(buffer, xOfsset, value), LIBND4J_TYPES); } -BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT void NDArray::templatedSet, (void *buffer, const Nd4jLong xOfsset, nd4j::DataType dtype, const void *value), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT void NDArray::templatedSet, (void *buffer, const Nd4jLong xOfsset, sd::DataType dtype, const void *value), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////// -void NDArray::applyPairwiseTransform(nd4j::pairwise::Ops op, const NDArray& other, NDArray& target, ExtraArguments *extraParams) const{ +void NDArray::applyPairwiseTransform(sd::pairwise::Ops op, const NDArray& other, NDArray& target, ExtraArguments *extraParams) const{ if (isS()) throw std::runtime_error("NDArray::applyPairwiseTransform: you can't use this method on String array!"); if (other.lengthOf() != target.lengthOf()) @@ -3355,7 +3355,7 @@ void NDArray::applyPairwiseTransform(nd4j::pairwise::Ops op, const NDArray& othe } //////////////////////////////////////////////////////////////////////// -void NDArray::applyPairwiseTransform(nd4j::pairwise::BoolOps op, const NDArray& other, NDArray& target, ExtraArguments *extraParams) const{ +void NDArray::applyPairwiseTransform(sd::pairwise::BoolOps 
op, const NDArray& other, NDArray& target, ExtraArguments *extraParams) const{ if (isS()) throw std::runtime_error("NDArray::applyPairwiseTransform BoolOps: you can't use this method on String array!"); if (other.lengthOf() != target.lengthOf()) @@ -3371,7 +3371,7 @@ void NDArray::applyPairwiseTransform(nd4j::pairwise::BoolOps op, const NDArray& } //////////////////////////////////////////////////////////////////////// -void NDArray::applyPairwiseTransform(nd4j::pairwise::IntOps op, const NDArray& other, NDArray& target, ExtraArguments *extraParams) const{ +void NDArray::applyPairwiseTransform(sd::pairwise::IntOps op, const NDArray& other, NDArray& target, ExtraArguments *extraParams) const{ if (isS()) throw std::runtime_error("NDArray::applyPairwiseTransform IntOps: you can't use this method on String array!"); if (other.lengthOf() != target.lengthOf()) @@ -3387,7 +3387,7 @@ void NDArray::applyPairwiseTransform(nd4j::pairwise::IntOps op, const NDArray& o } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyPairwiseTransform(nd4j::pairwise::Ops op, const NDArray& other, ExtraArguments *extraParams) { +void NDArray::applyPairwiseTransform(sd::pairwise::Ops op, const NDArray& other, ExtraArguments *extraParams) { applyPairwiseTransform(op, other, *this, extraParams); } @@ -3401,7 +3401,7 @@ void NDArray::templatedDoubleAssign(void *xBuffer, const Nd4jLong xOffset, const BUILD_DOUBLE_TEMPLATE(template ND4J_EXPORT void NDArray::templatedDoubleAssign, (void *xBuffer, const Nd4jLong xOffset, const void *yBuffer, const Nd4jLong yOffset) const, LIBND4J_TYPES, LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////// -void NDArray::varianceAlongDimension(nd4j::variance::Ops op, NDArray& target, const bool biasCorrected, const std::vector& dimensions) const { +void NDArray::varianceAlongDimension(sd::variance::Ops op, NDArray& target, const bool biasCorrected, const std::vector& dimensions) const { if 
(isS()) throw std::runtime_error("NDArray::varianceAlongDimension: you can't use this method on String array!"); @@ -3415,8 +3415,8 @@ void NDArray::varianceAlongDimension(nd4j::variance::Ops op, NDArray& target, co NativeOpExecutioner::execSummaryStatsScalar(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), biasCorrected); else { std::vector copy(dimensions); - auto pDims = nd4j::Environment::getInstance()->isCPU() ? copy.data() : nullptr; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimensions); + auto pDims = sd::Environment::getInstance()->isCPU() ? copy.data() : nullptr; + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimensions); NativeOpExecutioner::execSummaryStats(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.buffer(), target.shapeInfo(), target.getSpecialBuffer(), target.specialShapeInfo(), pDims, dimensions.size(), packX.platformShapeInfo(), packX.platformOffsets(), biasCorrected); synchronize("NDArray::varianceAlongDimension"); } @@ -3425,7 +3425,7 @@ void NDArray::varianceAlongDimension(nd4j::variance::Ops op, NDArray& target, co } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::varianceAlongDimension(nd4j::variance::Ops op, const bool biasCorrected, const std::vector& dimensions) const { +NDArray NDArray::varianceAlongDimension(sd::variance::Ops op, const bool biasCorrected, const std::vector& dimensions) const { if (isS()) throw std::runtime_error("NDArray::varianceAlongDimension: you can't use this method on String array!"); @@ -3442,12 +3442,12 @@ NDArray NDArray::varianceAlongDimension(nd4j::variance::Ops op, const bool biasC } //////////////////////////////////////////////////////////////////////// -NDArray 
NDArray::varianceAlongDimension(nd4j::variance::Ops op, const bool biasCorrected, const std::initializer_list& dimensions) const { +NDArray NDArray::varianceAlongDimension(sd::variance::Ops op, const bool biasCorrected, const std::initializer_list& dimensions) const { return varianceAlongDimension(op, biasCorrected, std::vector(dimensions)); } //////////////////////////////////////////////////////////////////////// -void NDArray::varianceAlongDimension(nd4j::variance::Ops op, NDArray &target, const bool biasCorrected, const std::initializer_list& dimensions) const { +void NDArray::varianceAlongDimension(sd::variance::Ops op, NDArray &target, const bool biasCorrected, const std::initializer_list& dimensions) const { varianceAlongDimension(op, target, biasCorrected, std::vector(dimensions)); } @@ -3554,7 +3554,7 @@ bool NDArray::equalsTo(const NDArray *other, double eps) const { return true; } else { // regular numeric types - NDArray tmp(nd4j::DataType::FLOAT32, getContext()); // scalar = 0 + NDArray tmp(sd::DataType::FLOAT32, getContext()); // scalar = 0 ExtraArguments extras({0.0, 0.0, eps}); @@ -3795,7 +3795,7 @@ NDArray NDArray::e(const Nd4jLong i) const { ////////////////////////////////////////////////////////////////////////// // perform array transformation -void NDArray::applyTransform(nd4j::transform::FloatOps op, NDArray& target, ExtraArguments *extraParams) { +void NDArray::applyTransform(sd::transform::FloatOps op, NDArray& target, ExtraArguments *extraParams) { if (isS()) throw std::runtime_error("NDArray::applyTransform FloatOps: you can't use this method on String array!"); @@ -3809,7 +3809,7 @@ void NDArray::applyTransform(nd4j::transform::FloatOps op, NDArray& target, Extr } //////////////////////////////////////////////////////////////////////// -void NDArray::applyTransform(nd4j::transform::AnyOps op, NDArray& target, ExtraArguments *extraParams) { +void NDArray::applyTransform(sd::transform::AnyOps op, NDArray& target, ExtraArguments 
*extraParams) { if (isS()) throw std::runtime_error("NDArray::applyTransform AnyOps: you can't use this method on String array!"); @@ -3820,7 +3820,7 @@ void NDArray::applyTransform(nd4j::transform::AnyOps op, NDArray& target, ExtraA } //////////////////////////////////////////////////////////////////////// -void NDArray::applyTransform(nd4j::transform::SameOps op, NDArray& target, ExtraArguments *extraParams) { +void NDArray::applyTransform(sd::transform::SameOps op, NDArray& target, ExtraArguments *extraParams) { if (isS()) throw std::runtime_error("NDArray::applyTransform SameOps: you can't use this method on String array!"); @@ -3834,7 +3834,7 @@ void NDArray::applyTransform(nd4j::transform::SameOps op, NDArray& target, Extra } //////////////////////////////////////////////////////////////////////// -void NDArray::applyTransform(nd4j::transform::StrictOps op, NDArray& target, ExtraArguments *extraParams) { +void NDArray::applyTransform(sd::transform::StrictOps op, NDArray& target, ExtraArguments *extraParams) { if (isS()) throw std::runtime_error("NDArray::applyTransform StrictOps: you can't use this method on String array!"); @@ -3847,7 +3847,7 @@ void NDArray::applyTransform(nd4j::transform::StrictOps op, NDArray& target, Ext } //////////////////////////////////////////////////////////////////////// -void NDArray::applyTransform(nd4j::transform::BoolOps op, NDArray& target, ExtraArguments *extraParams) { +void NDArray::applyTransform(sd::transform::BoolOps op, NDArray& target, ExtraArguments *extraParams) { if (isS()) throw std::runtime_error("NDArray::applyTransform BoolOps: you can't use this method on String array!"); @@ -3860,7 +3860,7 @@ void NDArray::applyTransform(nd4j::transform::BoolOps op, NDArray& target, Extra } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::FloatOps op, void *extraParams) const & { +NDArray NDArray::transform(sd::transform::FloatOps op, void *extraParams) 
const & { if (isS()) throw std::runtime_error("NDArray::transform FloatOps: you can't use this method on String array!"); @@ -3874,7 +3874,7 @@ NDArray NDArray::transform(nd4j::transform::FloatOps op, void *extraParams) cons } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::FloatOps op, void *extraParams) && { +NDArray NDArray::transform(sd::transform::FloatOps op, void *extraParams) && { if (isS()) throw std::runtime_error("NDArray::transform SameOps: you can't use this method on String array!"); @@ -3886,7 +3886,7 @@ NDArray NDArray::transform(nd4j::transform::FloatOps op, void *extraParams) && { } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::SameOps op, void *extraParams) const & { +NDArray NDArray::transform(sd::transform::SameOps op, void *extraParams) const & { if (isS()) throw std::runtime_error("NDArray::transform SameOps: you can't use this method on String array!"); @@ -3900,7 +3900,7 @@ NDArray NDArray::transform(nd4j::transform::SameOps op, void *extraParams) const } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::SameOps op, void *extraParams) && { +NDArray NDArray::transform(sd::transform::SameOps op, void *extraParams) && { if (isS()) throw std::runtime_error("NDArray::transform SameOps: you can't use this method on String array!"); @@ -3912,7 +3912,7 @@ NDArray NDArray::transform(nd4j::transform::SameOps op, void *extraParams) && { } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::StrictOps op, void *extraParams) const & { +NDArray NDArray::transform(sd::transform::StrictOps op, void *extraParams) const & { if (!this->isR()) throw std::runtime_error("Source array must have one of FLOAT types"); @@ -3926,7 +3926,7 @@ NDArray NDArray::transform(nd4j::transform::StrictOps 
op, void *extraParams) con } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::StrictOps op, void *extraParams) && { +NDArray NDArray::transform(sd::transform::StrictOps op, void *extraParams) && { if (!this->isR()) throw std::runtime_error("Source array must have one of FLOAT types"); @@ -3938,11 +3938,11 @@ NDArray NDArray::transform(nd4j::transform::StrictOps op, void *extraParams) && } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::BoolOps op, void *extraParams) const & { +NDArray NDArray::transform(sd::transform::BoolOps op, void *extraParams) const & { if (isS()) throw std::runtime_error("NDArray::transform BoolOps: you can't use this method on String array!"); - NDArray result(ordering(), getShapeAsVector(), nd4j::DataType::BOOL, getContext()); + NDArray result(ordering(), getShapeAsVector(), sd::DataType::BOOL, getContext()); NDArray::prepareSpecialUse({&result}, {this}); NativeOpExecutioner::execTransformBool(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), result.buffer(), result.shapeInfo(), result.specialBuffer(), result.specialShapeInfo(), extraParams, nullptr, nullptr); @@ -3952,7 +3952,7 @@ NDArray NDArray::transform(nd4j::transform::BoolOps op, void *extraParams) const } //////////////////////////////////////////////////////////////////////// -NDArray NDArray::transform(nd4j::transform::BoolOps op, void *extraParams) && { +NDArray NDArray::transform(sd::transform::BoolOps op, void *extraParams) && { if (isS()) throw std::runtime_error("NDArray::transform BoolOps: you can't use this method on String array!"); @@ -3964,7 +3964,7 @@ NDArray NDArray::transform(nd4j::transform::BoolOps op, void *extraParams) && { } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyScalarArr(nd4j::scalar::Ops op, const NDArray& scalar, NDArray& 
target, ExtraArguments *extraParams) { +void NDArray::applyScalarArr(sd::scalar::Ops op, const NDArray& scalar, NDArray& target, ExtraArguments *extraParams) { if (isS()) throw std::runtime_error("NDArray::applyScalarArr: you can't use this method on String array!"); if (scalar.lengthOf() != 1) @@ -3979,7 +3979,7 @@ void NDArray::applyScalarArr(nd4j::scalar::Ops op, const NDArray& scalar, NDArra } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyScalarArr(nd4j::scalar::BoolOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { +void NDArray::applyScalarArr(sd::scalar::BoolOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { if (isS()) throw std::runtime_error("NDArray::applyScalarArr BoolOps: you can't use this method on String array!"); if (!target.isB()) @@ -3995,7 +3995,7 @@ void NDArray::applyScalarArr(nd4j::scalar::BoolOps op, const NDArray& scalar, ND } ////////////////////////////////////////////////////////////////////////// -void NDArray::applyScalarArr(nd4j::scalar::IntOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { +void NDArray::applyScalarArr(sd::scalar::IntOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { if (isS()) throw std::runtime_error("NDArray::applyScalarArr IntOps: you can't use this method on String array!"); @@ -4013,69 +4013,69 @@ void NDArray::applyScalarArr(nd4j::scalar::IntOps op, const NDArray& scalar, NDA //////////////////////////////////////////////////////////////////////// template -void NDArray::applyScalar(nd4j::scalar::IntOps op, const T scalar, NDArray& target, ExtraArguments *extraParams) const { +void NDArray::applyScalar(sd::scalar::IntOps op, const T scalar, NDArray& target, ExtraArguments *extraParams) const { NDArray scalarArr = NDArrayFactory::create(this->dataType(), scalar, getContext()); applyScalarArr(op, scalarArr, target, extraParams); } 
-template <> ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { throw std::runtime_error("NDArray::applyScalar method: do not use me!");} -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const double scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const float scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const float16 scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const bfloat16 scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const Nd4jLong scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const int scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const int16_t scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const int8_t scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const uint8_t scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::IntOps op, const bool scalar, NDArray &target, ExtraArguments *extraParams) const; +template <> ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { throw std::runtime_error("NDArray::applyScalar method: do not use me!");} +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const 
double scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const float scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const float16 scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const bfloat16 scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const Nd4jLong scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const int scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const int16_t scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const int8_t scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const uint8_t scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::IntOps op, const bool scalar, NDArray &target, ExtraArguments *extraParams) const; //////////////////////////////////////////////////////////////////////// template -void NDArray::applyScalar(nd4j::scalar::Ops op, const T scalar, NDArray& target, ExtraArguments *extraParams) { +void NDArray::applyScalar(sd::scalar::Ops op, const T scalar, NDArray& target, ExtraArguments *extraParams) { auto scalarArr = NDArrayFactory::create(dataType(), scalar, this->getContext()); applyScalarArr(op, scalarArr, target, extraParams); } -template <> ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) { throw 
std::runtime_error("NDArray::applyScalar method: do not use me!");} -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const double scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const float scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const float16 scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const bfloat16 scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const Nd4jLong scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const int scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const int16_t scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const int8_t scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const uint8_t scalar, NDArray &target, ExtraArguments *extraParams); -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::Ops op, const bool scalar, NDArray &target, ExtraArguments *extraParams); +template <> ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) { throw std::runtime_error("NDArray::applyScalar method: do not use me!");} +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const double scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const float scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops 
op, const float16 scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const bfloat16 scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const Nd4jLong scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const int scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const int16_t scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const int8_t scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const uint8_t scalar, NDArray &target, ExtraArguments *extraParams); +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::Ops op, const bool scalar, NDArray &target, ExtraArguments *extraParams); //////////////////////////////////////////////////////////////////////// template -void NDArray::applyScalar(nd4j::scalar::BoolOps op, const T scalar, NDArray &target, ExtraArguments *extraParams) const { +void NDArray::applyScalar(sd::scalar::BoolOps op, const T scalar, NDArray &target, ExtraArguments *extraParams) const { NDArray scalarArr = NDArrayFactory::create(scalar, getContext()); applyScalarArr(op, scalarArr, target, extraParams); } -template <> ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { throw std::runtime_error("NDArray::applyScalar method: do not use me!");} -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const double scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const float scalar, NDArray &target, ExtraArguments *extraParams) const; 
-template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const float16 scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const bfloat16 scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const Nd4jLong scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const int scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const int16_t scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const int8_t scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const uint8_t scalar, NDArray &target, ExtraArguments *extraParams) const; -template ND4J_EXPORT void NDArray::applyScalar(nd4j::scalar::BoolOps op, const bool scalar, NDArray &target, ExtraArguments *extraParams) const; +template <> ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const NDArray& scalar, NDArray &target, ExtraArguments *extraParams) const { throw std::runtime_error("NDArray::applyScalar method: do not use me!");} +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const double scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const float scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const float16 scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const bfloat16 scalar, NDArray &target, ExtraArguments *extraParams) const; 
+template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const Nd4jLong scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const int scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const int16_t scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const int8_t scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const uint8_t scalar, NDArray &target, ExtraArguments *extraParams) const; +template ND4J_EXPORT void NDArray::applyScalar(sd::scalar::BoolOps op, const bool scalar, NDArray &target, ExtraArguments *extraParams) const; //////////////////////////////////////////////////////////////////////// -void NDArray::applyIndexReduce(nd4j::indexreduce::Ops op, NDArray& target, const std::vector& dimensions, const ExtraArguments *extraParams) const { +void NDArray::applyIndexReduce(sd::indexreduce::Ops op, NDArray& target, const std::vector& dimensions, const ExtraArguments *extraParams) const { if (isS()) throw std::runtime_error("NDArray::applyIndexReduce: you can't use this method on String array!"); - if (target.dataType() != nd4j::DataType::INT64 && target.dataType() != nd4j::DataType::INT32) + if (target.dataType() != sd::DataType::INT64 && target.dataType() != sd::DataType::INT32) throw std::runtime_error("NDArray::applyIndexReduce operations return INT32/INT64"); void* params = extraParams != nullptr ? const_cast(extraParams)->argumentsAsT(this->dataType()) : nullptr; @@ -4088,8 +4088,8 @@ void NDArray::applyIndexReduce(nd4j::indexreduce::Ops op, NDArray& target, const else { std::vector copy = dimensions; shape::checkDimensions(rankOf(), copy); - auto pDims = nd4j::Environment::getInstance()->isCPU() ? 
copy.data() : nullptr; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(getShapeInfo(), copy); + auto pDims = sd::Environment::getInstance()->isCPU() ? copy.data() : nullptr; + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(getShapeInfo(), copy); NativeOpExecutioner::execIndexReduce(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), params, target.buffer(), target.shapeInfo(), target.specialBuffer(), target.specialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets()); synchronize("NDArray::applyIndexReduce"); } @@ -4099,7 +4099,7 @@ void NDArray::applyIndexReduce(nd4j::indexreduce::Ops op, NDArray& target, const //////////////////////////////////////////////////////////////////////// // reduce dimensions in this array relying on index operations -NDArray NDArray::applyIndexReduce(nd4j::indexreduce::Ops op, const std::vector& dimensions, const ExtraArguments* extraParams ) const { +NDArray NDArray::applyIndexReduce(sd::indexreduce::Ops op, const std::vector& dimensions, const ExtraArguments* extraParams ) const { std::vector copy = dimensions; auto newShape = ShapeUtils::evalReduceShapeInfo('c', copy, *this, DataType::INT64, false, false, getContext()->getWorkspace()); @@ -4112,7 +4112,7 @@ NDArray NDArray::applyIndexReduce(nd4j::indexreduce::Ops op, const std::vector& dimensions, const ExtraArguments* extraParams) const { +NDArray NDArray::applyReduce3(sd::reduce3::Ops op, const NDArray& other, const std::vector& dimensions, const ExtraArguments* extraParams) const { if (isS()) throw std::runtime_error("NDArray::applyReduce3: you can't use this method on String array!"); @@ -4162,10 +4162,10 @@ NDArray NDArray::applyReduce3(nd4j::reduce3::Ops op, const NDArray& other, const } else { - auto pDims = nd4j::Environment::getInstance()->isCPU() ? copy.data() : nullptr; + auto pDims = sd::Environment::getInstance()->isCPU() ? 
copy.data() : nullptr; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(getShapeInfo(), copy); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(other.getShapeInfo(), copy); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(getShapeInfo(), copy); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(other.getShapeInfo(), copy); if(!shape::equalsSoft(packX.primaryShapeInfo(), packY.primaryShapeInfo()) || (packX.numberOfTads() != packY.numberOfTads() && packX.numberOfTads() != 1 && packY.numberOfTads() != 1)) throw std::runtime_error("NDArray::applyReduce3 cuda method: arrays tads are inconsistent !"); @@ -4180,7 +4180,7 @@ NDArray NDArray::applyReduce3(nd4j::reduce3::Ops op, const NDArray& other, const //////////////////////////////////////////////////////////////////////// // apply reduce3 (execAll) operations to this and other array, return result in new output array -NDArray NDArray::applyAllReduce3(nd4j::reduce3::Ops op, const NDArray& other, const std::vector& dimensions, const ExtraArguments* extraParams) const { +NDArray NDArray::applyAllReduce3(sd::reduce3::Ops op, const NDArray& other, const std::vector& dimensions, const ExtraArguments* extraParams) const { if (isS()) throw std::runtime_error("NDArray::applyAllReduce3: you can't use this method on String array!"); if(dataType() != other.dataType()) @@ -4207,7 +4207,7 @@ NDArray NDArray::applyAllReduce3(nd4j::reduce3::Ops op, const NDArray& other, co // create dynamic array of extra parameters if array extraParams is empty (==nullptr) void* params = extraParams != nullptr ? const_cast(extraParams)->argumentsAsT(dataType()) : nullptr; - auto pDims = nd4j::Environment::getInstance()->isCPU() ? copy.data() : nullptr; + auto pDims = sd::Environment::getInstance()->isCPU() ? 
copy.data() : nullptr; NDArray::prepareSpecialUse({&result}, {this, &other}); NativeOpExecutioner::execReduce3All(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), params, other.getBuffer(), other.getShapeInfo(), other.getSpecialBuffer(), other.getSpecialShapeInfo(), result.buffer(), result.shapeInfo(), result.specialBuffer(), result.specialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets(), packY.platformShapeInfo(), packY.platformOffsets()); @@ -4218,7 +4218,7 @@ NDArray NDArray::applyAllReduce3(nd4j::reduce3::Ops op, const NDArray& other, co ////////////////////////////////////////////////////////////////////////// // method reduces array by excluding its shapes along axes present in dimensions vector -void NDArray::reduceAlongDimension(nd4j::reduce::FloatOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { +void NDArray::reduceAlongDimension(sd::reduce::FloatOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { if (isS()) throw std::runtime_error("NDArray::reduceAlongDimension FloatOps: you can't use this method on String array!"); @@ -4239,7 +4239,7 @@ void NDArray::reduceAlongDimension(nd4j::reduce::FloatOps op, NDArray& target, c NativeOpExecutioner::execReduceFloatScalar(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(),nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo()); } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(getShapeInfo(), copy); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(getShapeInfo(), copy); NativeOpExecutioner::execReduceFloat(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, 
target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), copy.data(), copy.size(), packX.platformShapeInfo(), packX.platformOffsets()); } synchronize("NDArray::reduceAlongDimension FloatOps"); @@ -4249,7 +4249,7 @@ void NDArray::reduceAlongDimension(nd4j::reduce::FloatOps op, NDArray& target, c ////////////////////////////////////////////////////////////////////////// // method reduces array by excluding its shapes along axes present in dimensions vector -void NDArray::reduceAlongDimension(nd4j::reduce::SameOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { +void NDArray::reduceAlongDimension(sd::reduce::SameOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { if (isS()) throw std::runtime_error("NDArray::reduceAlongDimension SameOps: you can't use this method on String array!"); @@ -4270,8 +4270,8 @@ void NDArray::reduceAlongDimension(nd4j::reduce::SameOps op, NDArray& target, co NativeOpExecutioner::execReduceSameScalar(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo()); } else { //if (!isEmpty()) { - auto pDims = nd4j::Environment::getInstance()->isCPU() ? copy.data() : nullptr; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); + auto pDims = sd::Environment::getInstance()->isCPU() ? 
copy.data() : nullptr; + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); NativeOpExecutioner::execReduceSame(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets()); } synchronize("NDArray::reduceAlongDimension SameOps"); @@ -4281,7 +4281,7 @@ void NDArray::reduceAlongDimension(nd4j::reduce::SameOps op, NDArray& target, co ////////////////////////////////////////////////////////////////////////// // method reduces array by excluding its shapes along axes present in dimensions vector -void NDArray::reduceAlongDimension(nd4j::reduce::LongOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { +void NDArray::reduceAlongDimension(sd::reduce::LongOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { if (isS()) throw std::runtime_error("NDArray::reduceAlongDimension LongOps: you can't use this method on String array!"); @@ -4302,8 +4302,8 @@ void NDArray::reduceAlongDimension(nd4j::reduce::LongOps op, NDArray& target, co NativeOpExecutioner::execReduceLongScalar(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo()); } else { - auto pDims = nd4j::Environment::getInstance()->isCPU() ? copy.data() : nullptr; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); + auto pDims = sd::Environment::getInstance()->isCPU() ? 
copy.data() : nullptr; + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); NativeOpExecutioner::execReduceLong(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets()); } synchronize("NDArray::reduceAlongDimension LongOps"); @@ -4313,7 +4313,7 @@ void NDArray::reduceAlongDimension(nd4j::reduce::LongOps op, NDArray& target, co ////////////////////////////////////////////////////////////////////////// // method reduces array by excluding its shapes along axes present in dimensions vector -void NDArray::reduceAlongDimension(nd4j::reduce::BoolOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { +void NDArray::reduceAlongDimension(sd::reduce::BoolOps op, NDArray& target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, const bool checkTargetShape) const { if (isS()) throw std::runtime_error("NDArray::reduceAlongDimension BoolOps cuda: you can't use this method on String array!"); @@ -4334,8 +4334,8 @@ void NDArray::reduceAlongDimension(nd4j::reduce::BoolOps op, NDArray& target, co NativeOpExecutioner::execReduceBoolScalar(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo()); } else { - auto pDims = nd4j::Environment::getInstance()->isCPU() ? copy.data() : nullptr; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); + auto pDims = sd::Environment::getInstance()->isCPU() ? 
copy.data() : nullptr; + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); NativeOpExecutioner::execReduceBool(getContext(), op, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), nullptr, target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), pDims, copy.size(), packX.platformShapeInfo(), packX.platformOffsets()); } synchronize("NDArray::reduceAlongDimension LongOps"); @@ -4506,10 +4506,10 @@ void NDArray::addRowVector(const NDArray& row, NDArray& target) const { int dimension = 1; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({&target}, {this, &row}); - NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({&target}, {this, &row}); } @@ -4525,10 +4525,10 @@ void NDArray::subRowVector(const NDArray& row, NDArray& target) const { int dimension = 1; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({&target}, {this, &row}); - NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Ops::Subtract, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), &dimension, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Ops::Subtract, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), &dimension, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({&target}, {this, &row}); } @@ -4545,10 +4545,10 @@ void NDArray::mulRowVector(const NDArray &row, NDArray &target) const { int dimension = 1; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({&target}, {this, &row}); - NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Ops::Multiply, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Ops::Multiply, getBuffer(), getShapeInfo(), getSpecialBuffer(), 
getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({&target}, {this, &row}); } @@ -4566,10 +4566,10 @@ void NDArray::divRowVector(const NDArray &row, NDArray &target) const { int dimension = 1; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({&target}, {this, &row}); - NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Divide, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Divide, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({&target}, {this, &row}); } @@ -4584,10 +4584,10 @@ void NDArray::addiRowVector(const NDArray& row) { int dimension = 1; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({this}, {&row}); - 
NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), this->buffer(), this->shapeInfo(), this->specialBuffer(), this->specialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), row.getBuffer(), row.getShapeInfo(), row.getSpecialBuffer(), row.getSpecialShapeInfo(), this->buffer(), this->shapeInfo(), this->specialBuffer(), this->specialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({this}, {&row}); } @@ -4602,10 +4602,10 @@ void NDArray::addColumnVector(const NDArray &column, NDArray &target) const { int dimension = 0; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({&target}, {this, &column}); - NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), column.getBuffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), column.getBuffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(), target.getBuffer(), target.getShapeInfo(), target.getSpecialBuffer(), 
target.getSpecialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({&target}, {this, &column}); } @@ -4619,10 +4619,10 @@ void NDArray::addiColumnVector(const NDArray &column) { int dimension = 0; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({this}, {&column}); - NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), column.getBuffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(), this->buffer(), this->shapeInfo(), this->specialBuffer(), this->specialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Ops::Add, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), column.getBuffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(), this->buffer(), this->shapeInfo(), this->specialBuffer(), this->specialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({this}, {&column}); } @@ -4636,10 +4636,10 @@ void NDArray::muliColumnVector(const NDArray& column) { int dimension = 0; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), dimension); NDArray::prepareSpecialUse({this}, {&column}); - NativeOpExecutioner::execBroadcast(getContext(), nd4j::broadcast::Ops::Multiply, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), column.getBuffer(), column.getShapeInfo(), column.getSpecialBuffer(), 
column.getSpecialShapeInfo(), this->buffer(), this->shapeInfo(), this->specialBuffer(), this->specialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); + NativeOpExecutioner::execBroadcast(getContext(), sd::broadcast::Ops::Multiply, getBuffer(), getShapeInfo(), getSpecialBuffer(), getSpecialShapeInfo(), column.getBuffer(), column.getShapeInfo(), column.getSpecialBuffer(), column.getSpecialShapeInfo(), this->buffer(), this->shapeInfo(), this->specialBuffer(), this->specialShapeInfo(), nullptr, 1, packX.platformShapeInfo(), packX.platformOffsets(), nullptr, nullptr); NDArray::registerSpecialUse({this}, {&column}); } @@ -4821,7 +4821,7 @@ NDArray NDArray::tensorAlongDimension(Nd4jLong index, const std::vector& di if (index >= numTads) throw std::runtime_error("Can't get index higher than total number of TADs"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); NDArray array(_buffer, ShapeDescriptor(packX.primaryShapeInfo()), getContext(), packX.primaryOffsets()[index] + getBufferOffset()); array._isView = true; @@ -4940,13 +4940,13 @@ void NDArray::setShapeInfo(const Nd4jLong *shapeInfo) { _dataType = ArrayOptions::dataType(_shapeInfo); } else { - _dataType = nd4j::DataType::INHERIT; + _dataType = sd::DataType::INHERIT; _shapeInfoD = _shapeInfo = nullptr; } } //////////////////////////////////////////////////////////////////////// -void NDArray::setShapeInfo(const Nd4jLong *shapeInfo, const nd4j::DataType dtype) { +void NDArray::setShapeInfo(const Nd4jLong *shapeInfo, const sd::DataType dtype) { if (shapeInfo != nullptr) { @@ -4967,7 +4967,7 @@ void NDArray::setShapeInfo(const Nd4jLong *shapeInfo, const nd4j::DataType dtype _dataType = dtype; } else { - _dataType = nd4j::DataType::INHERIT; + _dataType = sd::DataType::INHERIT; _shapeInfoD = _shapeInfo = nullptr; } } @@ -5022,7 
+5022,7 @@ NDArray operator+(NDArray&& arr, const T& scalar) { auto tmp = NDArrayFactory::create(arr.dataType(), scalar, arr.getContext()); NDArray::prepareSpecialUse({&arr}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Add, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Add, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&arr}, {&arr, &tmp}); return std::move(arr); @@ -5044,7 +5044,7 @@ NDArray operator+(const NDArray& arr, const T& scalar) { NDArray result(arr.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr.dataType(), DataTypeUtils::fromT()), false, arr.getContext()); NDArray::prepareSpecialUse({&result}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Add, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Add, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&result}, {&arr, &tmp}); return result; @@ -5092,7 +5092,7 @@ NDArray operator-(NDArray&& arr, const T& 
scalar) { auto tmp = NDArrayFactory::create(arr.dataType(), scalar, arr.getContext()); NDArray::prepareSpecialUse({&arr}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Subtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Subtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&arr}, {&arr, &tmp}); return std::move(arr); @@ -5111,7 +5111,7 @@ NDArray operator-(const NDArray& arr, const T& scalar) { NDArray result(arr.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr.dataType(), DataTypeUtils::fromT()), false, arr.getContext()); NDArray::prepareSpecialUse({&result}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Subtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Subtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&result}, {&arr, &tmp}); return result; @@ -5135,7 +5135,7 @@ NDArray operator-(const T& scalar, NDArray&& arr) { auto tmp = 
NDArrayFactory::create(arr.dataType(), scalar, arr.getContext()); NDArray::prepareSpecialUse({&arr}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::ReverseSubtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.getBuffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::ReverseSubtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.getBuffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&arr}, {&arr, &tmp}); return std::move(arr); @@ -5158,7 +5158,7 @@ NDArray operator-(const T& scalar, const NDArray& arr) { NDArray result(arr.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr.dataType(), DataTypeUtils::fromT()), false, arr.getContext()); NDArray::prepareSpecialUse({&result}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::ReverseSubtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.getBuffer(), result.getShapeInfo(), result.specialBuffer(), result.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::ReverseSubtract, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.getBuffer(), result.getShapeInfo(), result.specialBuffer(), result.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&result}, {&arr, &tmp}); return result; @@ -5183,7 +5183,7 @@ NDArray operator*(NDArray&& arr, const T& scalar) { auto tmp = 
NDArrayFactory::create(arr.dataType(), scalar, arr.getContext()); NDArray::prepareSpecialUse({&arr}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Multiply, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Multiply, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&arr}, {&arr, &tmp}); return std::move(arr); @@ -5206,7 +5206,7 @@ NDArray operator*(const NDArray& arr, const T& scalar) { NDArray result(arr.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr.dataType(), DataTypeUtils::fromT()), false, arr.getContext()); NDArray::prepareSpecialUse({&result}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Multiply, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Multiply, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&result}, {&arr, &tmp}); return result; @@ -5259,7 +5259,7 @@ NDArray operator/(NDArray&& arr, const T& scalar) { auto tmp = 
NDArrayFactory::create(arr.dataType(), scalar, arr.getContext()); NDArray::prepareSpecialUse({&arr}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Divide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Divide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.buffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&arr}, {&arr, &tmp}); return std::move(arr); @@ -5281,7 +5281,7 @@ NDArray operator/(const NDArray& arr, const T& scalar) { NDArray result(arr.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr.dataType(), DataTypeUtils::fromT()), false, arr.getContext()); NDArray::prepareSpecialUse({&result}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::Divide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::Divide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.buffer(), result.getShapeInfo(), result.specialBuffer(), result.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&result}, {&arr, &tmp}); return result; @@ -5306,7 +5306,7 @@ NDArray operator/(const T& scalar, NDArray&& arr) { auto tmp = NDArrayFactory::create(arr.dataType(), 
scalar, arr.getContext()); NDArray::prepareSpecialUse({&arr}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::ReverseDivide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.getBuffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::ReverseDivide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), arr.getBuffer(), arr.getShapeInfo(), arr.specialBuffer(), arr.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&arr}, {&arr, &tmp}); return std::move(arr); @@ -5330,7 +5330,7 @@ NDArray operator/(const T& scalar, const NDArray& arr) { NDArray result(arr.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr.dataType(), DataTypeUtils::fromT()), false, arr.getContext()); NDArray::prepareSpecialUse({&result}, {&arr, &tmp}); - NativeOpExecutioner::execScalar(arr.getContext(), nd4j::scalar::ReverseDivide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.getBuffer(), result.getShapeInfo(), result.specialBuffer(), result.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); + NativeOpExecutioner::execScalar(arr.getContext(), sd::scalar::ReverseDivide, arr.getBuffer(), arr.getShapeInfo(), arr.getSpecialBuffer(), arr.getSpecialShapeInfo(), result.getBuffer(), result.getShapeInfo(), result.specialBuffer(), result.specialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo(), nullptr); NDArray::registerSpecialUse({&result}, {&arr, &tmp}); return result; @@ -5347,7 +5347,7 @@ NDArray operator+(T1&& arr1, T2&& arr2) { if (arr1.isS() || arr2.isS()) throw std::runtime_error("operator+(T&& arr1, 
T&& arr2): you can't use this method on String arrays!"); if (!Environment::getInstance()->isExperimentalBuild() && arr1.dataType() != arr2.dataType() && (arr1.dataType() != DataType::BOOL || arr2.dataType() != BOOL)) - throw nd4j::datatype_exception::build("operator+(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); + throw sd::datatype_exception::build("operator+(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); PointersManager pointersManager(arr1.getContext(), "operator+(T&& arr1, T&& arr2)"); @@ -5365,7 +5365,7 @@ NDArray operator+(T1&& arr1, T2&& arr2) { result = new NDArray(arr1.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr1.getShapeInfo(), arr2.getShapeInfo()), false, arr1.getContext()); NDArray::prepareSpecialUse({result}, {&arr1, &arr2}); - NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), nd4j::pairwise::Add, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), sd::pairwise::Add, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({result}, {&arr1, &arr2}); if(!isArr1Rvalue && !isArr2Rvalue) { @@ -5377,7 +5377,7 @@ NDArray operator+(T1&& arr1, T2&& arr2) { return std::move(*result); } - return std::forward(arr1).applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), std::forward(arr2)); + return std::forward(arr1).applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), std::forward(arr2)); } template 
ND4J_EXPORT NDArray operator+(NDArray& arr1, NDArray& arr2); template ND4J_EXPORT NDArray operator+(NDArray& arr1, NDArray&& arr2); @@ -5397,7 +5397,7 @@ NDArray operator-(T1&& arr1, T2&& arr2) { if (arr1.isS() || arr2.isS()) throw std::runtime_error("operator-(T&& arr1, T&& arr2): you can't use this method on String arrays!"); if (!Environment::getInstance()->isExperimentalBuild() && arr1.dataType() != arr2.dataType() && (arr1.dataType() != DataType::BOOL || arr2.dataType() != BOOL)) - throw nd4j::datatype_exception::build("operator-(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); + throw sd::datatype_exception::build("operator-(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); PointersManager pointersManager(arr1.getContext(), "operator-(T&& arr1, T&& arr2)"); @@ -5415,7 +5415,7 @@ NDArray operator-(T1&& arr1, T2&& arr2) { result = new NDArray(arr1.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr1.getShapeInfo(), arr2.getShapeInfo()), false, arr1.getContext()); NDArray::prepareSpecialUse({result}, {&arr1, &arr2}); - NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), nd4j::pairwise::Subtract, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), sd::pairwise::Subtract, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({result}, {&arr1, &arr2}); if(!isArr1Rvalue && !isArr2Rvalue) { @@ -5427,7 +5427,7 @@ 
NDArray operator-(T1&& arr1, T2&& arr2) { return std::move(*result); } - return std::forward(arr1).applyTrueBroadcast(nd4j::BroadcastOpsTuple::Subtract(), std::forward(arr2)); + return std::forward(arr1).applyTrueBroadcast(sd::BroadcastOpsTuple::Subtract(), std::forward(arr2)); } template ND4J_EXPORT NDArray operator-(NDArray& arr1, NDArray& arr2); template ND4J_EXPORT NDArray operator-(NDArray& arr1, NDArray&& arr2); @@ -5447,7 +5447,7 @@ NDArray operator*(T1&& arr1, T2&& arr2) { if (arr1.isS() || arr2.isS()) throw std::runtime_error("operator*(T&& arr1, T&& arr2): you can't use this method on String arrays!"); if (!Environment::getInstance()->isExperimentalBuild() && arr1.dataType() != arr2.dataType() && (arr1.dataType() != DataType::BOOL || arr2.dataType() != BOOL)) - throw nd4j::datatype_exception::build("operator*(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); + throw sd::datatype_exception::build("operator*(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); PointersManager pointersManager(arr1.getContext(), "operator*(T&& arr1, T&& arr2)"); @@ -5465,7 +5465,7 @@ NDArray operator*(T1&& arr1, T2&& arr2) { result = new NDArray(arr1.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr1.getShapeInfo(), arr2.getShapeInfo()), false, arr1.getContext()); NDArray::prepareSpecialUse({result}, {&arr1, &arr2}); - NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), nd4j::pairwise::Multiply, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), sd::pairwise::Multiply, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), 
arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({result}, {&arr1, &arr2}); if(!isArr1Rvalue && !isArr2Rvalue) { @@ -5477,7 +5477,7 @@ NDArray operator*(T1&& arr1, T2&& arr2) { return std::move(*result); } - return std::forward(arr1).applyTrueBroadcast(nd4j::BroadcastOpsTuple::Multiply(), std::forward(arr2)); + return std::forward(arr1).applyTrueBroadcast(sd::BroadcastOpsTuple::Multiply(), std::forward(arr2)); } template ND4J_EXPORT NDArray operator*(NDArray& arr1, NDArray& arr2); template ND4J_EXPORT NDArray operator*(NDArray& arr1, NDArray&& arr2); @@ -5497,7 +5497,7 @@ NDArray operator/(T1&& arr1, T2&& arr2) { if (arr1.isS() || arr2.isS()) throw std::runtime_error("operator/(T&& arr1, T&& arr2): you can't use this method on String arrays!"); if (!Environment::getInstance()->isExperimentalBuild() && arr1.dataType() != arr2.dataType() && (arr1.dataType() != DataType::BOOL || arr2.dataType() != BOOL)) - throw nd4j::datatype_exception::build("operator/(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); + throw sd::datatype_exception::build("operator/(T&& arr1, T&& arr2): Cannot multiply different types", arr1.dataType(), arr2.dataType()); PointersManager pointersManager(arr1.getContext(), "operator/(T&& arr1, T&& arr2)"); @@ -5515,7 +5515,7 @@ NDArray operator/(T1&& arr1, T2&& arr2) { result = new NDArray(arr1.getShapeInfo(), DataTypeUtils::pickPairwiseResultType(arr1.getShapeInfo(), arr2.getShapeInfo()), false, arr1.getContext()); NDArray::prepareSpecialUse({result}, {&arr1, &arr2}); - NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), nd4j::pairwise::Divide, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), 
result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); + NativeOpExecutioner::execPairwiseTransform(arr1.getContext(), sd::pairwise::Divide, arr1.getBuffer(), arr1.getShapeInfo(), arr1.getSpecialBuffer(), arr1.getSpecialShapeInfo(), arr2.getBuffer(), arr2.getShapeInfo(), arr2.getSpecialBuffer(), arr2.getSpecialShapeInfo(), result->buffer(), result->getShapeInfo(), result->specialBuffer(), result->getSpecialShapeInfo(), nullptr); NDArray::registerSpecialUse({result}, {&arr1, &arr2}); if(!isArr1Rvalue && !isArr2Rvalue) { @@ -5527,7 +5527,7 @@ NDArray operator/(T1&& arr1, T2&& arr2) { return std::move(*result); } - return std::forward(arr1).applyTrueBroadcast(nd4j::BroadcastOpsTuple::Divide(), std::forward(arr2)); + return std::forward(arr1).applyTrueBroadcast(sd::BroadcastOpsTuple::Divide(), std::forward(arr2)); } template ND4J_EXPORT NDArray operator/(NDArray& arr1, NDArray& arr2); template ND4J_EXPORT NDArray operator/(NDArray& arr1, NDArray&& arr2); @@ -5580,7 +5580,7 @@ template ND4J_EXPORT NDArray operator/(NDArray&& arr1, N // for(int i=0; i(i,j)*e(i,k); -// if(nd4j::math::nd4j_abs(dot) > eps ) +// if(sd::math::nd4j_abs(dot) > eps ) // return false; // dot = 0.f; @@ -5589,7 +5589,7 @@ template ND4J_EXPORT NDArray operator/(NDArray&& arr1, N // for(int j=0; j(i,j)*e(i,j); -// if(dot != 0.f && nd4j::math::nd4j_abs(nd4j::math::nd4j_sqrt(dot) - 1.f) > eps) +// if(dot != 0.f && sd::math::nd4j_abs(sd::math::nd4j_sqrt(dot) - 1.f) > eps) // return false; // dot = 0.f; @@ -5601,7 +5601,7 @@ template ND4J_EXPORT NDArray operator/(NDArray&& arr1, N // for(int j=0; j(i,j)*e(k,j); -// if(nd4j::math::nd4j_abs(dot) > eps ) +// if(sd::math::nd4j_abs(dot) > eps ) // return false; // dot = 0.; @@ -5611,7 +5611,7 @@ template ND4J_EXPORT NDArray operator/(NDArray&& arr1, N // for(int j=0; j(i,j)*e(i,j); -// if(dot!= 0. && nd4j::math::nd4j_abs(nd4j::math::nd4j_sqrt(dot) - 1.) > eps) +// if(dot!= 0. 
&& sd::math::nd4j_abs(sd::math::nd4j_sqrt(dot) - 1.) > eps) // return false; // dot = 0.; // } diff --git a/libnd4j/include/array/NDArrayFactory.h b/libnd4j/include/array/NDArrayFactory.h new file mode 100644 index 000000000..f25c68fb4 --- /dev/null +++ b/libnd4j/include/array/NDArrayFactory.h @@ -0,0 +1,191 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * Copyright (c) 2019-2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// Created by raver119 on 2018-09-16. 
+// @author Oleg Semeniv +// + +#ifndef DEV_TESTS_NDARRAYFACTORY_H +#define DEV_TESTS_NDARRAYFACTORY_H + +#include +#include +#include +//#include +#include +#include + + +namespace sd { + class ND4J_EXPORT NDArrayFactory { + private: + template + static void memcpyFromVector(void *ptr, const std::vector &vector); + public: + template + static NDArray* empty_(sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + static NDArray* empty_(sd::DataType dataType, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray empty(sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + static NDArray empty(sd::DataType dataType, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray* valueOf(const std::initializer_list& shape, T value, char order = 'c', sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray* valueOf(const std::vector& shape, T value, char order = 'c', sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + static NDArray* valueOf(const std::vector& shape, const NDArray& value, char order = 'c', sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray* linspace(T from, T to, Nd4jLong numElements); + + + template + static NDArray* create_(const T value, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray* create_(sd::DataType dtype, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray create(const T value, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray create(sd::DataType dtype, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + template + static NDArray create(DataType type, const T scalar, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + + template + static NDArray* 
vector(Nd4jLong length, T startingValue = (T) 0, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray* create_(char order, const std::vector &shape, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + static NDArray* create_( char order, const std::vector &shape, sd::DataType dataType, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray* create_(char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray create(char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray create(char order, const std::vector &shape, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray create(char order, const std::vector &shape, sd::DataType dtype, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray create(const std::vector &values, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + +#ifndef __JAVACPP_HACK__ + // this method only available out of javacpp + /** + * This constructor creates vector of T + * + * @param values + */ + + template + static NDArray create(char order, const std::initializer_list& shape, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray create(T* buffer, char order, const std::initializer_list& shape, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + template + static NDArray create(char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + /** + * This method creates NDArray from .npy file + * @param fileName + * @return + */ + static NDArray fromNpyFile(const char *fileName); + + /** + * This 
factory create array from utf8 string + * @return NDArray default dataType UTF8 + */ + static NDArray string(const char *string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray* string_(const char *string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray* string_(const std::string &string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray string(const std::string& string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + + /** + * This factory create array from utf16 string + * @return NDArray default dataType UTF16 + */ + static NDArray string(const char16_t* u16string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_(const char16_t* u16string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_(const std::u16string& u16string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string(const std::u16string& u16string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + + /** + * This factory create array from utf32 string + * @return NDArray default dataType UTF32 + */ + static NDArray string(const char32_t* u32string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_(const char32_t* u32string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_(const std::u32string& u32string, sd::DataType dtype = 
sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string(const std::u32string& u32string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + + /** + * This factory create array from vector of utf8 strings + * @return NDArray default dataType UTF8 + */ + static NDArray string( const std::vector &shape, const std::initializer_list &strings, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray string( const std::vector &shape, const std::initializer_list &string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray string( const std::vector &shape, const std::vector &strings, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray string( const std::vector &shape, const std::vector &string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray* string_( const std::vector &shape, const std::initializer_list &strings, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray* string_( const std::vector &shape, const std::initializer_list &string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray* string_( const std::vector &shape, const std::vector &strings, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static NDArray* string_( const std::vector &shape, const std::vector &string, sd::DataType dtype = sd::DataType::UTF8, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + + /** + * This factory create array from vector of utf16 strings + * 
@return NDArray default dataType UTF16 + */ + static NDArray string( const std::vector& shape, const std::initializer_list& strings, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string( const std::vector& shape, const std::initializer_list& string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string( const std::vector& shape, const std::vector& strings, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string( const std::vector& shape, const std::vector& string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::initializer_list& strings, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::initializer_list& string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::vector& strings, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::vector& string, sd::DataType dtype = sd::DataType::UTF16, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + + /** + * This factory create array from vector of utf32 strings + * @return NDArray default dataType UTF32 + */ + static NDArray string( const std::vector& shape, const std::initializer_list& strings, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string( const std::vector& shape, const std::initializer_list& string, 
sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string( const std::vector& shape, const std::vector& strings, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray string( const std::vector& shape, const std::vector& string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::initializer_list& strings, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::initializer_list& string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::vector& strings, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + static NDArray* string_( const std::vector& shape, const std::vector& string, sd::DataType dtype = sd::DataType::UTF32, sd::LaunchContext* context = sd::LaunchContext::defaultContext()); + + + static ResultSet createSetOfArrs(const Nd4jLong numOfArrs, const void* buffer, const Nd4jLong* shapeInfo, const Nd4jLong* offsets, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + +#endif + }; +} + +#endif //DEV_TESTS_NDARRAYFACTORY_H diff --git a/libnd4j/blas/cuda/NDArrayLambda.hpp b/libnd4j/include/array/NDArrayLambda.hXX similarity index 99% rename from libnd4j/blas/cuda/NDArrayLambda.hpp rename to libnd4j/include/array/NDArrayLambda.hXX index 15028dfaa..718a35527 100644 --- a/libnd4j/blas/cuda/NDArrayLambda.hpp +++ b/libnd4j/include/array/NDArrayLambda.hXX @@ -17,8 +17,8 @@ #ifndef CUDA_LAMBDA_HELPER #define CUDA_LAMBDA_HELPER -#include -#include +#include +#include #include #include #include 
diff --git a/libnd4j/include/array/NDArrayList.h b/libnd4j/include/array/NDArrayList.h index 7d6bc12b1..e446213f2 100644 --- a/libnd4j/include/array/NDArrayList.h +++ b/libnd4j/include/array/NDArrayList.h @@ -26,25 +26,25 @@ #include #include #include -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT NDArrayList { private: // workspace where chunks belong to - //nd4j::memory::Workspace* _workspace = nullptr; - nd4j::LaunchContext * _context = nd4j::LaunchContext ::defaultContext(); + //sd::memory::Workspace* _workspace = nullptr; + sd::LaunchContext * _context = sd::LaunchContext ::defaultContext(); // numeric and symbolic ids of this list std::pair _id; std::string _name; - nd4j::DataType _dtype; + sd::DataType _dtype; // stored chunks - MAP_IMPL _chunks; + MAP_IMPL _chunks; // just a counter, for stored elements std::atomic _elements; @@ -65,7 +65,7 @@ namespace nd4j { NDArrayList(int height, bool expandable = false); ~NDArrayList(); - nd4j::DataType dataType(); + sd::DataType dataType(); NDArray* read(int idx); NDArray* readRaw(int idx); @@ -82,8 +82,8 @@ namespace nd4j { std::pair& id(); std::string& name(); - //nd4j::memory::Workspace* workspace(); - nd4j::LaunchContext * context(); + //sd::memory::Workspace* workspace(); + sd::LaunchContext * context(); NDArrayList* clone(); bool equals(NDArrayList& other); diff --git a/libnd4j/include/array/ResultSet.h b/libnd4j/include/array/ResultSet.h index ddcfdcdf9..8215ac940 100644 --- a/libnd4j/include/array/ResultSet.h +++ b/libnd4j/include/array/ResultSet.h @@ -19,7 +19,7 @@ // // PLESE NOTE: It will delete all stored NDArrays upon destructor call // -// Created by raver119 on 07.09.17. 
+// @author raver119@gmail.com // #ifndef LIBND4J_RESULTSET_H @@ -27,22 +27,25 @@ #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { class NDArray; // forward declaration of template class NDArray class ND4J_EXPORT ResultSet { private: - std::vector _content; + std::vector _content; Nd4jStatus _status = ND4J_STATUS_OK; bool _removable = true; public: - // default constructor - ResultSet(const nd4j::graph::FlatResult* result = nullptr); + explicit ResultSet(); + +#ifndef __JAVACPP_HACK__ + ResultSet(const sd::graph::FlatResult* result); +#endif ResultSet(const ResultSet& other) noexcept; @@ -57,9 +60,9 @@ namespace nd4j { ~ResultSet(); int size(); - nd4j::NDArray* at(const unsigned long idx) const; - nd4j::NDArray* operator[](const unsigned long idx) const; - void push_back(nd4j::NDArray* array); + sd::NDArray* at(const unsigned long idx) const; + sd::NDArray* operator[](const unsigned long idx) const; + void push_back(sd::NDArray* array); Nd4jStatus status(); void setStatus(Nd4jStatus status); diff --git a/libnd4j/include/array/ShapeDescriptor.h b/libnd4j/include/array/ShapeDescriptor.h index 4eeaf66b9..6e2299ba0 100644 --- a/libnd4j/include/array/ShapeDescriptor.h +++ b/libnd4j/include/array/ShapeDescriptor.h @@ -23,12 +23,12 @@ #include #include -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT ShapeDescriptor { @@ -44,7 +44,7 @@ class ND4J_EXPORT ShapeDescriptor { public: ShapeDescriptor(const ShapeDescriptor &other); ShapeDescriptor(const Nd4jLong *shapeInfo, bool inheritDtype = true); - explicit ShapeDescriptor(const Nd4jLong *shapeInfo, const nd4j::DataType dtypeOverride); + explicit ShapeDescriptor(const Nd4jLong *shapeInfo, const sd::DataType dtypeOverride); explicit ShapeDescriptor(const Nd4jLong *shapeInfo, const Nd4jLong *dtypeOverride); explicit ShapeDescriptor(const Nd4jLong *shapeInfo, const Nd4jLong *dtypeOverride, const Nd4jLong 
*orderOverride); explicit ShapeDescriptor(const DataType type, const Nd4jLong length); @@ -91,9 +91,9 @@ class ND4J_EXPORT ShapeDescriptor { namespace std { template<> - class ND4J_EXPORT hash { + class ND4J_EXPORT hash { public: - size_t operator()(const nd4j::ShapeDescriptor &k) const; + size_t operator()(const sd::ShapeDescriptor &k) const; }; } diff --git a/libnd4j/include/array/ShapeList.h b/libnd4j/include/array/ShapeList.h index 7994e3a6f..2d0fde4ad 100644 --- a/libnd4j/include/array/ShapeList.h +++ b/libnd4j/include/array/ShapeList.h @@ -22,10 +22,10 @@ #define LIBND4J_SHAPELIST_H #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT ShapeList { protected: std::vector _shapes; diff --git a/libnd4j/include/array/SpaceType.h b/libnd4j/include/array/SpaceType.h index 3d24af12a..b6c6dfbbc 100644 --- a/libnd4j/include/array/SpaceType.h +++ b/libnd4j/include/array/SpaceType.h @@ -21,7 +21,7 @@ #ifndef ND4J_SPACE_TYPE_H #define ND4J_SPACE_TYPE_H -namespace nd4j { +namespace sd { enum SpaceType { CONTINUOUS = 1, COMPLEX = 2, diff --git a/libnd4j/include/array/SparseType.h b/libnd4j/include/array/SparseType.h index 8c11a9242..3b77a1626 100644 --- a/libnd4j/include/array/SparseType.h +++ b/libnd4j/include/array/SparseType.h @@ -21,7 +21,7 @@ #ifndef LIBND4J_SPARSETYPE_H #define LIBND4J_SPARSETYPE_H -namespace nd4j { +namespace sd { enum SparseType { CSR = 1, CSC = 2, diff --git a/libnd4j/include/array/TadDescriptor.h b/libnd4j/include/array/TadDescriptor.h index ab05cbfb7..01ea1caa1 100644 --- a/libnd4j/include/array/TadDescriptor.h +++ b/libnd4j/include/array/TadDescriptor.h @@ -22,9 +22,9 @@ #define DEV_TESTS_TADDESCRIPTOR_H #include "ShapeDescriptor.h" -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT TadDescriptor { private: ShapeDescriptor _originalShape; @@ -62,9 +62,9 @@ namespace nd4j { namespace std { template<> - class ND4J_EXPORT hash { + class ND4J_EXPORT hash { public: - size_t 
operator()(const nd4j::TadDescriptor &k) const; + size_t operator()(const sd::TadDescriptor &k) const; }; } diff --git a/libnd4j/include/array/TadPack.h b/libnd4j/include/array/TadPack.h index 2d195277d..09b084548 100644 --- a/libnd4j/include/array/TadPack.h +++ b/libnd4j/include/array/TadPack.h @@ -23,7 +23,7 @@ #include "ConstantDataBuffer.h" -namespace nd4j { +namespace sd { class ND4J_EXPORT TadPack { private: ConstantDataBuffer _tadShape; diff --git a/libnd4j/include/array/cpu/DataBuffer.cpp b/libnd4j/include/array/cpu/DataBuffer.cpp index ccd782adc..2575e2ba4 100644 --- a/libnd4j/include/array/cpu/DataBuffer.cpp +++ b/libnd4j/include/array/cpu/DataBuffer.cpp @@ -19,10 +19,10 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include "../DataBuffer.h" -#include +#include +#include -namespace nd4j { +namespace sd { void DataBuffer::expand(const uint64_t size) { if (size > _lenInBytes) { // allocate new buffer diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/include/array/cpu/NDArray.cpp similarity index 98% rename from libnd4j/blas/cpu/NDArray.cpp rename to libnd4j/include/array/cpu/NDArray.cpp index efae00ea6..7d91d1373 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/include/array/cpu/NDArray.cpp @@ -17,15 +17,15 @@ #ifndef NDARRAY_CPP #define NDARRAY_CPP -#include "../NDArray.h" -#include "../NDArrayFactory.h" -#include "NativeOpExecutioner.h" -#include +#include +#include +#include +#include #include #include -#include +#include #include -#include +#include #include #include #include @@ -38,16 +38,16 @@ #include #include #include -#include +#include #include #include #include #include -#include +#include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/array/cpu/NDArray.macro b/libnd4j/include/array/cpu/NDArray.macro new file mode 100644 index 000000000..5fbb56378 --- /dev/null +++ b/libnd4j/include/array/cpu/NDArray.macro @@ -0,0 +1,148 @@ 
+################################################################################ +# Copyright (c) 2015-2018 Skymind, Inc. +# +# This program and the accompanying materials are made available under the +# terms of the Apache License, Version 2.0 which is available at +# https://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +#ifndef NDARRAY_MACRO +#define NDARRAY_MACRO + +#include + +//NDArray *other, T *extraParams +BUILD_CALL_1(template void NDArray::template applyPairwiseTransform, float, (NDArray* other, float* extraParams), PAIRWISE_TRANSFORM_OPS) +BUILD_CALL_1(template void NDArray::applyPairwiseTransform, float16, (NDArray* other, float16* extraParams), PAIRWISE_TRANSFORM_OPS) +BUILD_CALL_1(template void NDArray::applyPairwiseTransform, double, (NDArray* other, double* extraParams), PAIRWISE_TRANSFORM_OPS) + +// NDArray *other, NDArray *target, T *extraParams +BUILD_CALL_1(template void sd::NDArray::applyPairwiseTransform, float, (NDArray* other, NDArray* target, float* extraParams), PAIRWISE_TRANSFORM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyPairwiseTransform, float16, (NDArray* other, NDArray* target, float16* extraParams), PAIRWISE_TRANSFORM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyPairwiseTransform, double, (NDArray* other, NDArray* target, double* extraParams), PAIRWISE_TRANSFORM_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyScalar, float16, (NDArray& scalar, NDArray* target, float16 *extraParams) const, SCALAR_OPS) +BUILD_CALL_1(template void sd::NDArray::applyScalar, float16, (float16 scalar, 
NDArray* target, float16 *extraParams) const, SCALAR_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyScalar, float, (NDArray& scalar, NDArray* target, float *extraParams) const, SCALAR_OPS) +BUILD_CALL_1(template void sd::NDArray::applyScalar, float, (float scalar, NDArray* target, float *extraParams) const, SCALAR_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyScalar, double, (NDArray& scalar, NDArray* target, double *extraParams) const, SCALAR_OPS) +BUILD_CALL_1(template void sd::NDArray::applyScalar, double, (double scalar, NDArray* target, double *extraParams) const, SCALAR_OPS) + + + +BUILD_CALL_1(template float16 sd::NDArray::reduceNumber, float16, (float16 *extraParams) const, REDUCE_OPS) +BUILD_CALL_1(template float sd::NDArray::reduceNumber, float, (float *extraParams) const, REDUCE_OPS) +BUILD_CALL_1(template double sd::NDArray::reduceNumber, double, (double *extraParams) const, REDUCE_OPS) + +BUILD_CALL_1(template Nd4jLong sd::NDArray::indexReduceNumber, float16, (float16 *extraParams), INDEX_REDUCE_OPS) +BUILD_CALL_1(template Nd4jLong sd::NDArray::indexReduceNumber, float, (float *extraParams), INDEX_REDUCE_OPS) +BUILD_CALL_1(template Nd4jLong sd::NDArray::indexReduceNumber, double, (double *extraParams), INDEX_REDUCE_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyBroadcast, float16, (std::initializer_list list, const sd::NDArray* a, sd::NDArray* b, float16* c), BROADCAST_OPS) +BUILD_CALL_1(template void sd::NDArray::applyBroadcast, float, (std::initializer_list list, const sd::NDArray* a, sd::NDArray* b, float* c), BROADCAST_OPS) +BUILD_CALL_1(template void sd::NDArray::applyBroadcast, double, (std::initializer_list list, const sd::NDArray* a, sd::NDArray* b, double* c), BROADCAST_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyTrueBroadcast, float16,(const sd::NDArray* a, sd::NDArray* target, const bool checkTargetShape, float16* c) const, BROADCAST_OPS) +BUILD_CALL_1(template void sd::NDArray::applyTrueBroadcast, float, (const 
sd::NDArray* a, sd::NDArray* target, const bool checkTargetShape, float* c) const, BROADCAST_OPS) +BUILD_CALL_1(template void sd::NDArray::applyTrueBroadcast, double, (const sd::NDArray* a, sd::NDArray* target, const bool checkTargetShape, double* c) const, BROADCAST_OPS) + +BUILD_CALL_1(template sd::NDArray* sd::NDArray::applyTrueBroadcast, float16, (const sd::NDArray* a, float16* c) const, BROADCAST_OPS) +BUILD_CALL_1(template sd::NDArray* sd::NDArray::applyTrueBroadcast, float, (const sd::NDArray* a, float* c) const, BROADCAST_OPS) +BUILD_CALL_1(template sd::NDArray* sd::NDArray::applyTrueBroadcast, double, (const sd::NDArray* a, double* c) const, BROADCAST_OPS) + +BUILD_CALL_1(template sd::NDArray sd::NDArray::applyTrueBroadcast, float16, (const sd::NDArray& a, float16* c) const, BROADCAST_OPS) +BUILD_CALL_1(template sd::NDArray sd::NDArray::applyTrueBroadcast, float, (const sd::NDArray& a, float* c) const, BROADCAST_OPS) +BUILD_CALL_1(template sd::NDArray sd::NDArray::applyTrueBroadcast, double, (const sd::NDArray& a, double* c) const, BROADCAST_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyTransform, float16, (NDArray* target, float16* extraParams), TRANSFORM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyTransform, float, (NDArray* target, float* extraParams), TRANSFORM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyTransform, double, (NDArray* target, double* extraParams), TRANSFORM_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyTransform, float16, (float16* extraParams), TRANSFORM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyTransform, float, (float* extraParams), TRANSFORM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyTransform, double, (double* extraParams), TRANSFORM_OPS) + +BUILD_CALL_1(template void sd::NDArray::applyRandom, float16, (sd::random::RandomBuffer *buffer, NDArray* y, NDArray* z, float16* extraParams), RANDOM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyRandom, float, (sd::random::RandomBuffer *buffer, 
NDArray* y, NDArray* z, float* extraParams), RANDOM_OPS) +BUILD_CALL_1(template void sd::NDArray::applyRandom, double, (sd::random::RandomBuffer *buffer, NDArray* y, NDArray* z, double* extraParams), RANDOM_OPS) + +BUILD_CALL_1(template NDArray sd::NDArray::transform, float16, (float16* extraParams) const, TRANSFORM_OPS) +BUILD_CALL_1(template NDArray sd::NDArray::transform, float, (float* extraParams) const, TRANSFORM_OPS) +BUILD_CALL_1(template NDArray sd::NDArray::transform, double, (double* extraParams) const, TRANSFORM_OPS) + +BUILD_CALL_1(template NDArray *sd::NDArray::template reduceAlongDimension, float, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template reduceAlongDimension, float16, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template reduceAlongDimension, double, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) + +BUILD_CALL_1(template NDArray sd::NDArray::template reduceAlongDims, float, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) +BUILD_CALL_1(template NDArray sd::NDArray::template reduceAlongDims, float16, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) +BUILD_CALL_1(template NDArray sd::NDArray::template reduceAlongDims, double, (const std::vector& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) + +BUILD_CALL_1(template NDArray *sd::NDArray::template reduceAlongDimension, float, (const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template reduceAlongDimension, float16, (const std::initializer_list& dimensions, const bool keepDims, const bool 
supportOldShapes) const, REDUCE_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template reduceAlongDimension, double, (const std::initializer_list& dimensions, const bool keepDims, const bool supportOldShapes) const, REDUCE_OPS) + +BUILD_CALL_1(template void sd::NDArray::template reduceAlongDimension, float, (NDArray* target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, float * extras) const, REDUCE_OPS) +BUILD_CALL_1(template void sd::NDArray::template reduceAlongDimension, float16, (NDArray* target, const std::vector& dimensions, const bool keepDims, const bool supportOldShapes, float16 * extras) const, REDUCE_OPS) +BUILD_CALL_1(template void sd::NDArray::template reduceAlongDimension, double, (NDArray* target, const std::vector& dimension, const bool keepDims, const bool supportOldShapes, double * extras) const, REDUCE_OPS) + +BUILD_CALL_1(template NDArray *sd::NDArray::template varianceAlongDimension, float, (const bool biasCorrected, const std::initializer_list& dimensions) const, SUMMARY_STATS_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template varianceAlongDimension, float16, (const bool biasCorrected, const std::initializer_list& dimensions) const, SUMMARY_STATS_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template varianceAlongDimension, double, (const bool biasCorrected, const std::initializer_list& dimensions) const, SUMMARY_STATS_OPS) + +BUILD_CALL_1(template void sd::NDArray::template varianceAlongDimension, float, (const NDArray *target, const bool biasCorrected, const std::initializer_list& dimensions), SUMMARY_STATS_OPS) +BUILD_CALL_1(template void sd::NDArray::template varianceAlongDimension, float16, (const NDArray *target,const bool biasCorrected, const std::initializer_list& dimensions), SUMMARY_STATS_OPS) +BUILD_CALL_1(template void sd::NDArray::template varianceAlongDimension, double, (const NDArray *target, const bool biasCorrected, const std::initializer_list& dimensions), SUMMARY_STATS_OPS) 
+ +BUILD_CALL_1(template void sd::NDArray::template varianceAlongDimension, float, (const NDArray *target, const bool biasCorrected, const std::vector& dimensions), SUMMARY_STATS_OPS) +BUILD_CALL_1(template void sd::NDArray::template varianceAlongDimension, float16, (const NDArray *target,const bool biasCorrected, const std::vector& dimensions), SUMMARY_STATS_OPS) +BUILD_CALL_1(template void sd::NDArray::template varianceAlongDimension, double, (const NDArray *target, const bool biasCorrected, const std::vector& dimensions), SUMMARY_STATS_OPS) + +BUILD_CALL_1(template float sd::NDArray::template varianceNumber, float, (bool biasCorrected), SUMMARY_STATS_OPS) +BUILD_CALL_1(template float16 sd::NDArray::template varianceNumber, float16, (bool biasCorrected), SUMMARY_STATS_OPS) +BUILD_CALL_1(template double sd::NDArray::template varianceNumber, double, (bool biasCorrected), SUMMARY_STATS_OPS) + +BUILD_CALL_1(template NDArray *sd::NDArray::template applyReduce3, float, (const NDArray* other, const float* extraParams) const, REDUCE3_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyReduce3, float16, (const NDArray* other, const float16* extraParams) const, REDUCE3_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyReduce3, double, (const NDArray* other, const double* extraParams) const, REDUCE3_OPS) + +BUILD_CALL_1(template NDArray *sd::NDArray::template applyReduce3, float, (const NDArray* other, const std::vector &dims, const float* extraParams) const, REDUCE3_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyReduce3, float16, (const NDArray* other, const std::vector &dims, const float16* extraParams) const, REDUCE3_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyReduce3, double, (const NDArray* other, const std::vector &dims, const double* extraParams) const, REDUCE3_OPS) + +BUILD_CALL_1(template void sd::NDArray::template applyIndexReduce, float, (const NDArray* target, const std::vector & alpha, const float* 
beta) const, INDEX_REDUCE_OPS) +BUILD_CALL_1(template void sd::NDArray::template applyIndexReduce, float16, (const NDArray* target, const std::vector & alpha, const float16* beta) const, INDEX_REDUCE_OPS) +BUILD_CALL_1(template void sd::NDArray::template applyIndexReduce, double, (const NDArray* target, const std::vector & alpha, const double* beta) const, INDEX_REDUCE_OPS) + +BUILD_CALL_1(template NDArray *sd::NDArray::template applyIndexReduce, float, (const std::vector & alpha, const float* beta) const, INDEX_REDUCE_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyIndexReduce, float16, (const std::vector & alpha, const float16* beta) const, INDEX_REDUCE_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyIndexReduce, double, (const std::vector & alpha, const double* beta) const, INDEX_REDUCE_OPS) + +BUILD_CALL_1(template NDArray *sd::NDArray::template applyAllReduce3, float, (const sd::NDArray* alpha, const std::vector & beta, float const* gamma) const, REDUCE3_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyAllReduce3, float16, (const sd::NDArray* alpha, const std::vector & beta, float16 const* gamma) const, REDUCE3_OPS) +BUILD_CALL_1(template NDArray *sd::NDArray::template applyAllReduce3, double, (const sd::NDArray* alpha, const std::vector & beta, double const* gamma) const, REDUCE3_OPS) + +template NDArray mmul(const NDArray& left, const NDArray& right); +template NDArray mmul(const NDArray& left, const NDArray& right); +template NDArray mmul(const NDArray& left, const NDArray& right); + +// template NDArray operator-(const float, const NDArray&); +// template NDArray operator-(const float16, const NDArray&); +// template NDArray operator-(const double, const NDArray&); + +// template NDArray operator+(const float, const NDArray&); +// template NDArray operator+(const float16, const NDArray&); +// template NDArray operator+(const double, const NDArray&); + + +#endif \ No newline at end of file diff --git 
a/libnd4j/blas/cpu/NDArrayLambda.hpp b/libnd4j/include/array/cpu/NDArrayLambda.hpp similarity index 100% rename from libnd4j/blas/cpu/NDArrayLambda.hpp rename to libnd4j/include/array/cpu/NDArrayLambda.hpp diff --git a/libnd4j/include/array/cuda/DataBuffer.cu b/libnd4j/include/array/cuda/DataBuffer.cu index 28e0c432f..922b6967b 100644 --- a/libnd4j/include/array/cuda/DataBuffer.cu +++ b/libnd4j/include/array/cuda/DataBuffer.cu @@ -20,14 +20,14 @@ // #include "../DataBuffer.h" -#include -#include +#include +#include #include #include #include #include -namespace nd4j { +namespace sd { void DataBuffer::expand(const uint64_t size) { if (size > _lenInBytes) { // allocate new buffer @@ -67,19 +67,19 @@ namespace nd4j { void DataBuffer::allocateSpecial() { if (_specialBuffer == nullptr && getLenInBytes() > 0) { - auto deviceId = nd4j::AffinityManager::currentDeviceId(); + auto deviceId = sd::AffinityManager::currentDeviceId(); if (_workspace == nullptr) - if (!nd4j::memory::MemoryCounter::getInstance()->validate(getLenInBytes())) - throw nd4j::allocation_exception::build("Requested amount exceeds device limits", nd4j::memory::MemoryCounter::getInstance()->deviceLimit(deviceId), getLenInBytes()); + if (!sd::memory::MemoryCounter::getInstance()->validate(getLenInBytes())) + throw sd::allocation_exception::build("Requested amount exceeds device limits", sd::memory::MemoryCounter::getInstance()->deviceLimit(deviceId), getLenInBytes()); ALLOCATE_SPECIAL(_specialBuffer, _workspace, getLenInBytes(), int8_t); _isOwnerSpecial = true; if (_workspace == nullptr) { - nd4j::memory::MemoryCounter::getInstance()->countIn(deviceId, getLenInBytes()); - nd4j::memory::MemoryCounter::getInstance()->countIn(nd4j::memory::MemoryType::DEVICE, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countIn(deviceId, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countIn(sd::memory::MemoryType::DEVICE, getLenInBytes()); } } } @@ -135,8 +135,8 @@ void 
DataBuffer::deleteSpecial() { // count out towards DataBuffer device, only if we're not in workspace if (_workspace == nullptr) { - nd4j::memory::MemoryCounter::getInstance()->countOut(_deviceId, getLenInBytes()); - nd4j::memory::MemoryCounter::getInstance()->countOut(nd4j::memory::MemoryType::DEVICE, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countOut(_deviceId, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countOut(sd::memory::MemoryType::DEVICE, getLenInBytes()); } } } diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/include/array/cuda/NDArray.cu similarity index 98% rename from libnd4j/blas/cuda/NDArray.cu rename to libnd4j/include/array/cuda/NDArray.cu index 81c8070b3..fcd0b6a9d 100644 --- a/libnd4j/blas/cuda/NDArray.cu +++ b/libnd4j/include/array/cuda/NDArray.cu @@ -17,14 +17,14 @@ #ifndef NDARRAY_CPP #define NDARRAY_CPP -#include "../NDArray.h" -#include "../NDArrayFactory.h" -#include "NativeOpExecutioner.h" +#include +#include +#include #include #include -#include +#include #include -#include +#include #include #include #include @@ -37,17 +37,17 @@ #include #include #include -#include +#include #include #include #include -#include +#include #include -#include -#include "../NDArray.hpp" -#include +#include +#include +#include -namespace nd4j { +namespace sd { void* NDArray::platformBuffer() { return specialBuffer(); } void* NDArray::getPlatformBuffer() const { return getSpecialBuffer(); } @@ -569,6 +569,6 @@ template void NDArray::printCurrentBuffer(const bool host, const char* m #endif -} // end namespace nd4j +} // end namespace sd #endif diff --git a/libnd4j/include/array/impl/ByteOrderUtils.cpp b/libnd4j/include/array/impl/ByteOrderUtils.cpp index e23734cab..0220ccac8 100644 --- a/libnd4j/include/array/impl/ByteOrderUtils.cpp +++ b/libnd4j/include/array/impl/ByteOrderUtils.cpp @@ -21,8 +21,8 @@ #include -namespace nd4j { - ByteOrder ByteOrderUtils::fromFlatByteOrder(nd4j::graph::ByteOrder order) { +namespace sd { + 
ByteOrder ByteOrderUtils::fromFlatByteOrder(sd::graph::ByteOrder order) { return (ByteOrder) order; } } \ No newline at end of file diff --git a/libnd4j/include/array/impl/ConstantDataBuffer.cpp b/libnd4j/include/array/impl/ConstantDataBuffer.cpp index 90a631392..20c842266 100644 --- a/libnd4j/include/array/impl/ConstantDataBuffer.cpp +++ b/libnd4j/include/array/impl/ConstantDataBuffer.cpp @@ -20,7 +20,7 @@ #include "../ConstantDataBuffer.h" -namespace nd4j { +namespace sd { ConstantDataBuffer::ConstantDataBuffer(Nd4jPointer primary, Nd4jPointer special, Nd4jLong numEelements, Nd4jLong sizeOf) { _primaryBuffer = primary; _specialBuffer = special; diff --git a/libnd4j/include/array/impl/ConstantDescriptor.cpp b/libnd4j/include/array/impl/ConstantDescriptor.cpp index d53ef0adc..ebb27090d 100644 --- a/libnd4j/include/array/impl/ConstantDescriptor.cpp +++ b/libnd4j/include/array/impl/ConstantDescriptor.cpp @@ -19,10 +19,10 @@ // #include -#include +#include #include -namespace nd4j { +namespace sd { ConstantDescriptor::ConstantDescriptor(double* values, int length) { for (int e = 0; e < length; e++) _floatValues.emplace_back(values[e]); @@ -77,7 +77,7 @@ namespace nd4j { } namespace std { - size_t hash::operator()(const nd4j::ConstantDescriptor &k) const { + size_t hash::operator()(const sd::ConstantDescriptor &k) const { using std::hash; // Compute individual hash values for first, // second and third and combine them using XOR diff --git a/libnd4j/include/array/impl/ConstantHolder.cpp b/libnd4j/include/array/impl/ConstantHolder.cpp index b3adf8516..08637862c 100644 --- a/libnd4j/include/array/impl/ConstantHolder.cpp +++ b/libnd4j/include/array/impl/ConstantHolder.cpp @@ -18,17 +18,17 @@ // Created by raver on 5/17/2019. 
// -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { ConstantHolder::ConstantHolder(const ConstantHolder& other) { _buffers = other._buffers; _deviceId = other._deviceId; } - bool ConstantHolder::hasBuffer(nd4j::DataType dataType) { + bool ConstantHolder::hasBuffer(sd::DataType dataType) { return _buffers.count(dataType) > 0; } @@ -42,7 +42,7 @@ namespace nd4j { } BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT bool ConstantHolder::hasBuffer, (void), LIBND4J_TYPES); - void ConstantHolder::addBuffer(ConstantDataBuffer &pointer, nd4j::DataType dataType) { + void ConstantHolder::addBuffer(ConstantDataBuffer &pointer, sd::DataType dataType) { _buffers[dataType] = pointer; } @@ -52,7 +52,7 @@ namespace nd4j { } BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT void ConstantHolder::addBuffer, (ConstantDataBuffer& cb), LIBND4J_TYPES); - ConstantDataBuffer* ConstantHolder::getConstantDataBuffer(nd4j::DataType dataType) { + ConstantDataBuffer* ConstantHolder::getConstantDataBuffer(sd::DataType dataType) { if (!hasBuffer(dataType)) throw std::runtime_error("Requested dataType is absent in storage"); diff --git a/libnd4j/include/array/impl/DataBuffer.cpp b/libnd4j/include/array/impl/DataBuffer.cpp index 36758c684..262460e8c 100644 --- a/libnd4j/include/array/impl/DataBuffer.cpp +++ b/libnd4j/include/array/impl/DataBuffer.cpp @@ -26,7 +26,7 @@ #include #include -namespace nd4j { +namespace sd { ///// IMLEMENTATION OF COMMON METHODS ///// @@ -41,7 +41,7 @@ namespace nd4j { _workspace = nullptr; _isOwnerPrimary = false; _isOwnerSpecial = false; - _deviceId = nd4j::AffinityManager::currentDeviceId(); + _deviceId = sd::AffinityManager::currentDeviceId(); setCountersToZero(); } @@ -83,7 +83,7 @@ namespace nd4j { _workspace = workspace; _isOwnerPrimary = isOwnerPrimary; _isOwnerSpecial = isOwnerSpecial; - _deviceId = nd4j::AffinityManager::currentDeviceId(); + _deviceId = sd::AffinityManager::currentDeviceId(); setCountersToZero(); @@ -115,7 +115,7 @@ namespace 
nd4j { _dataType = dataType; _workspace = workspace; - _deviceId = nd4j::AffinityManager::currentDeviceId(); + _deviceId = sd::AffinityManager::currentDeviceId(); setCountersToZero(); @@ -134,7 +134,7 @@ namespace nd4j { _primaryBuffer = nullptr; _specialBuffer = nullptr; - _deviceId = nd4j::AffinityManager::currentDeviceId(); + _deviceId = sd::AffinityManager::currentDeviceId(); setCountersToZero(); @@ -234,17 +234,17 @@ namespace nd4j { void DataBuffer::allocatePrimary() { if (_primaryBuffer == nullptr && getLenInBytes() > 0) { - auto deviceId = nd4j::AffinityManager::currentDeviceId(); + auto deviceId = sd::AffinityManager::currentDeviceId(); // check if this allocation won't bring us above limit if (_workspace == nullptr) { if (Environment::getInstance()->isCPU()) { // on cpu backend we validate against device 0 for now - if (!nd4j::memory::MemoryCounter::getInstance()->validate(getLenInBytes())) - throw nd4j::allocation_exception::build("Requested amount exceeds HOST device limits", nd4j::memory::MemoryCounter::getInstance()->deviceLimit(deviceId), getLenInBytes()); + if (!sd::memory::MemoryCounter::getInstance()->validate(getLenInBytes())) + throw sd::allocation_exception::build("Requested amount exceeds HOST device limits", sd::memory::MemoryCounter::getInstance()->deviceLimit(deviceId), getLenInBytes()); } else { // in heterogenous mode we valdate against device group - if (!nd4j::memory::MemoryCounter::getInstance()->validateGroup(nd4j::memory::MemoryType::HOST, getLenInBytes())) - throw nd4j::allocation_exception::build("Requested amount exceeds HOST group limits", nd4j::memory::MemoryCounter::getInstance()->groupLimit(nd4j::memory::MemoryType::HOST), getLenInBytes()); + if (!sd::memory::MemoryCounter::getInstance()->validateGroup(sd::memory::MemoryType::HOST, getLenInBytes())) + throw sd::allocation_exception::build("Requested amount exceeds HOST group limits", sd::memory::MemoryCounter::getInstance()->groupLimit(sd::memory::MemoryType::HOST), 
getLenInBytes()); } } @@ -254,9 +254,9 @@ namespace nd4j { // count in towards current deviceId if we're not in workspace mode if (_workspace == nullptr) { if (Environment::getInstance()->isCPU()) // we don't want this counter to be added to CUDA device - nd4j::memory::MemoryCounter::getInstance()->countIn(deviceId, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countIn(deviceId, getLenInBytes()); - nd4j::memory::MemoryCounter::getInstance()->countIn(nd4j::memory::MemoryType::HOST, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countIn(sd::memory::MemoryType::HOST, getLenInBytes()); } } } @@ -280,9 +280,9 @@ namespace nd4j { // count out towards DataBuffer device, only if we're not in workspace if (_workspace == nullptr) { if (Environment::getInstance()->isCPU()) - nd4j::memory::MemoryCounter::getInstance()->countOut(_deviceId, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countOut(_deviceId, getLenInBytes()); - nd4j::memory::MemoryCounter::getInstance()->countOut(nd4j::memory::MemoryType::HOST, getLenInBytes()); + sd::memory::MemoryCounter::getInstance()->countOut(sd::memory::MemoryType::HOST, getLenInBytes()); } } } diff --git a/libnd4j/include/array/impl/DataTypeUtils.cpp b/libnd4j/include/array/impl/DataTypeUtils.cpp index cdf688b25..481fa4149 100644 --- a/libnd4j/include/array/impl/DataTypeUtils.cpp +++ b/libnd4j/include/array/impl/DataTypeUtils.cpp @@ -22,12 +22,12 @@ #include #include -namespace nd4j { +namespace sd { DataType DataTypeUtils::fromInt(int val) { return (DataType) val; } - DataType DataTypeUtils::fromFlatDataType(nd4j::graph::DType dtype) { + DataType DataTypeUtils::fromFlatDataType(sd::graph::DType dtype) { return (DataType) dtype; } diff --git a/libnd4j/include/array/impl/ExtraArguments.cpp b/libnd4j/include/array/impl/ExtraArguments.cpp index f9174ea0f..084f327cc 100644 --- a/libnd4j/include/array/impl/ExtraArguments.cpp +++ b/libnd4j/include/array/impl/ExtraArguments.cpp @@ -29,7 +29,7 @@ 
#include #endif -namespace nd4j { +namespace sd { ExtraArguments::ExtraArguments(std::initializer_list arguments) { _fpArgs = arguments; } @@ -122,7 +122,7 @@ namespace nd4j { BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT void *ExtraArguments::argumentsAsT, (Nd4jLong offset), LIBND4J_TYPES); - void* ExtraArguments::argumentsAsT(nd4j::DataType dataType, Nd4jLong offset) { + void* ExtraArguments::argumentsAsT(sd::DataType dataType, Nd4jLong offset) { if (_fpArgs.empty() && _intArgs.empty()) return nullptr; diff --git a/libnd4j/include/array/impl/InteropDataBuffer.cpp b/libnd4j/include/array/impl/InteropDataBuffer.cpp index cffc1462b..d0a381612 100644 --- a/libnd4j/include/array/impl/InteropDataBuffer.cpp +++ b/libnd4j/include/array/impl/InteropDataBuffer.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { InteropDataBuffer::InteropDataBuffer(InteropDataBuffer &dataBuffer, uint64_t length, uint64_t offset) { _dataBuffer = dataBuffer.getDataBuffer(); @@ -39,7 +39,7 @@ namespace nd4j { _dataBuffer = databuffer; } - InteropDataBuffer::InteropDataBuffer(size_t elements, nd4j::DataType dtype, bool allocateBoth) { + InteropDataBuffer::InteropDataBuffer(size_t elements, sd::DataType dtype, bool allocateBoth) { if (elements == 0) { _dataBuffer = std::make_shared(); _dataBuffer->setDataType(dtype); @@ -95,7 +95,7 @@ namespace nd4j { } void InteropDataBuffer::prepareSpecialUse(const std::vector& writeList, const std::vector& readList, bool synchronizeWritables) { - auto currentDeviceId = nd4j::AffinityManager::currentDeviceId(); + auto currentDeviceId = sd::AffinityManager::currentDeviceId(); for (const auto &v:readList) { if (v == nullptr) continue; diff --git a/libnd4j/blas/cpu/NDArrayFactory.cpp b/libnd4j/include/array/impl/NDArrayFactory.cpp similarity index 74% rename from libnd4j/blas/cpu/NDArrayFactory.cpp rename to libnd4j/include/array/impl/NDArrayFactory.cpp index 736452b48..870fdc198 100644 --- a/libnd4j/blas/cpu/NDArrayFactory.cpp +++ 
b/libnd4j/include/array/impl/NDArrayFactory.cpp @@ -20,30 +20,30 @@ // @author Oleg Semeniv // -#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// template <> - ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context) { + ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context) { if ((int) shape.size() > MAX_RANK) throw std::invalid_argument("NDArrayFactory::create: rank of NDArray can't exceed 32 !"); - ShapeDescriptor descriptor(nd4j::DataType::BOOL, order, shape); + ShapeDescriptor descriptor(sd::DataType::BOOL, order, shape); if (descriptor.arrLength() != data.size()) { nd4j_printf("NDArrayFactory::create: data size [%i] doesn't match shape length [%lld]\n", data.size(), descriptor.arrLength()); @@ -54,7 +54,7 @@ namespace nd4j { ALLOCATE(hostBuffer, context->getWorkspace(), data.size(), bool); std::copy(data.begin(), data.end(), hostBuffer); - std::shared_ptr buffer = std::make_shared(hostBuffer, data.size() * sizeof(bool), nd4j::DataType::BOOL, true, context->getWorkspace()); + std::shared_ptr buffer = std::make_shared(hostBuffer, data.size() * sizeof(bool), sd::DataType::BOOL, true, context->getWorkspace()); NDArray result(buffer, descriptor, context); @@ -63,7 +63,7 @@ namespace nd4j { //////////////////////////////////////////////////////////////////////// template - NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context) { if ((int) shape.size() > MAX_RANK) throw 
std::invalid_argument("NDArrayFactory::create: rank of NDArray can't exceed 32 !"); @@ -81,25 +81,25 @@ namespace nd4j { return result; } - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, nd4j::LaunchContext * context); + 
template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::vector& data, sd::LaunchContext * context); //////////////////////////////////////////////////////////////////////// template -NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, 
nd4j::LaunchContext * context) { +NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, sd::LaunchContext * context) { return create_(order, shape, DataTypeUtils::fromT(), context); } -BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray* NDArrayFactory::create_, (const char order, const std::vector &shape, nd4j::LaunchContext * context), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray* NDArrayFactory::create_, (const char order, const std::vector &shape, sd::LaunchContext * context), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////// template @@ -128,44 +128,44 @@ template ND4J_EXPORT void NDArrayFactory::memcpyFromVector(void *ptr, const std: #ifndef __JAVACPP_HACK__ //////////////////////////////////////////////////////////////////////// template - NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const T value, const char order, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const T value, const char order, sd::LaunchContext * context) { return valueOf(std::vector(shape), value, order); } - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const double value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const float value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const float16 value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const bfloat16 value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const Nd4jLong value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT 
NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const int value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const uint8_t value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const int8_t value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const int16_t value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const bool value, const char order, nd4j::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const double value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const float value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const float16 value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const bfloat16 value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const Nd4jLong value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const int value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const uint8_t value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, 
const int8_t value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const int16_t value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::initializer_list& shape, const bool value, const char order, sd::LaunchContext * context); //////////////////////////////////////////////////////////////////////// template - NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context) { std::vector vec(data); return create(order, shape, vec, context); } - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template 
ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, nd4j::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray 
NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const char order, const std::vector &shape, const std::initializer_list& data, sd::LaunchContext * context); #endif //////////////////////////////////////////////////////////////////////// template - NDArray* NDArrayFactory::create_(const T scalar, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::create_(const T scalar, sd::LaunchContext * context) { std::shared_ptr buffer = std::make_shared(1 * sizeof(T), DataTypeUtils::fromT(), context->getWorkspace(), true); @@ -178,22 +178,22 @@ template ND4J_EXPORT void NDArrayFactory::memcpyFromVector(void *ptr, const std: return res; } - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const double scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const float scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const float16 scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const bfloat16 scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const Nd4jLong scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const int scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const bool scalar, 
nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const int8_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint8_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint16_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint32_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint64_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::create_(const int16_t scalar, nd4j::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const double scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const float scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const float16 scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const bfloat16 scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const Nd4jLong scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const int scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const bool scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const int8_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint8_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint16_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint32_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::create_(const uint64_t scalar, sd::LaunchContext * context); + template 
ND4J_EXPORT NDArray* NDArrayFactory::create_(const int16_t scalar, sd::LaunchContext * context); template - NDArray NDArrayFactory::create(nd4j::DataType type, const T scalar, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::create(sd::DataType type, const T scalar, sd::LaunchContext * context) { if (type == DataTypeUtils::fromT()) return NDArrayFactory::create(scalar, context); @@ -204,23 +204,23 @@ template ND4J_EXPORT void NDArrayFactory::memcpyFromVector(void *ptr, const std: return res; } -// BUILD_DOUBLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::create, (DataType type, const T scalar, nd4j::LaunchContext * context), LIBND4J_TYPES); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const double scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const float scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const float16 scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const bfloat16 scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const Nd4jLong scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const int scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const int8_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint8_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint16_t scalar, nd4j::LaunchContext* workspace); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint32_t scalar, nd4j::LaunchContext* workspace); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint64_t scalar, 
nd4j::LaunchContext* workspace); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const int16_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const bool scalar, nd4j::LaunchContext * context); +// BUILD_DOUBLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::create, (DataType type, const T scalar, sd::LaunchContext * context), LIBND4J_TYPES); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const double scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const float scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const float16 scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const bfloat16 scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const Nd4jLong scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const int scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const int8_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint8_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint16_t scalar, sd::LaunchContext* workspace); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint32_t scalar, sd::LaunchContext* workspace); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const uint64_t scalar, sd::LaunchContext* workspace); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const int16_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(DataType type, const bool scalar, sd::LaunchContext * 
context); template - NDArray NDArrayFactory::create(const T scalar, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::create(const T scalar, sd::LaunchContext * context) { std::shared_ptr buffer = std::make_shared(1 * sizeof(T), DataTypeUtils::fromT(), context->getWorkspace(), true); @@ -233,73 +233,73 @@ template ND4J_EXPORT void NDArrayFactory::memcpyFromVector(void *ptr, const std: return res; } - template ND4J_EXPORT NDArray NDArrayFactory::create(const double scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const float scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const float16 scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const bfloat16 scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const Nd4jLong scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const int scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const int8_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const uint8_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const int16_t scalar, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray NDArrayFactory::create(const uint16_t scalar, nd4j::LaunchContext* workspace); - template ND4J_EXPORT NDArray NDArrayFactory::create(const uint32_t scalar, nd4j::LaunchContext* workspace); - template ND4J_EXPORT NDArray NDArrayFactory::create(const uint64_t scalar, nd4j::LaunchContext* workspace); - template ND4J_EXPORT NDArray NDArrayFactory::create(const bool scalar, nd4j::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const double scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const float scalar, 
sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const float16 scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const bfloat16 scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const Nd4jLong scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const int scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const int8_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const uint8_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const int16_t scalar, sd::LaunchContext * context); + template ND4J_EXPORT NDArray NDArrayFactory::create(const uint16_t scalar, sd::LaunchContext* workspace); + template ND4J_EXPORT NDArray NDArrayFactory::create(const uint32_t scalar, sd::LaunchContext* workspace); + template ND4J_EXPORT NDArray NDArrayFactory::create(const uint64_t scalar, sd::LaunchContext* workspace); + template ND4J_EXPORT NDArray NDArrayFactory::create(const bool scalar, sd::LaunchContext * context); //////////////////////////////////////////////////////////////////////// template -NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context) { +NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context) { return new NDArray(NDArrayFactory::create(order, shape, data, context)); } -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const 
char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, nd4j::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* 
NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); +template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const std::vector &shape, const std::vector &data, sd::LaunchContext * context); //////////////////////////////////////////////////////////////////////// template <> - ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, NDArray* value, const char order, nd4j::LaunchContext * context) { + ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, NDArray* value, const char order, 
sd::LaunchContext * context) { auto result = create_(order, shape, value->dataType(), context); result->assign(*value); return result; } template <> - ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, NDArray& value, const char order, nd4j::LaunchContext * context) { + ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, NDArray& value, const char order, sd::LaunchContext * context) { auto result = create_(order, shape, value.dataType(), context); result->assign(value); return result; } template - NDArray* NDArrayFactory::valueOf(const std::vector& shape, const T value, const char order, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::valueOf(const std::vector& shape, const T value, const char order, sd::LaunchContext * context) { auto result = create_(order, shape, DataTypeUtils::fromT()); result->assign(value); return result; } - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const double value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const float value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const float16 value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const bfloat16 value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const Nd4jLong value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const int value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const int16_t value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* 
NDArrayFactory::valueOf(const std::vector& shape, const int8_t value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const uint8_t value, const char order, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const bool value, const char order, nd4j::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const double value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const float value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const float16 value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const bfloat16 value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const Nd4jLong value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const int value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const int16_t value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const int8_t value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const uint8_t value, const char order, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::valueOf(const std::vector& shape, const bool value, const char order, sd::LaunchContext * context); //////////////////////////////////////////////////////////////////////// @@ 
-331,7 +331,7 @@ template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const st //////////////////////////////////////////////////////////////////////// template - NDArray* NDArrayFactory::vector(Nd4jLong length, const T value, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::vector(Nd4jLong length, const T value, sd::LaunchContext * context) { std::shared_ptr buffer = std::make_shared(length * sizeof(T), DataTypeUtils::fromT(), context->getWorkspace(), true); @@ -344,37 +344,37 @@ template ND4J_EXPORT NDArray* NDArrayFactory::create_(const char order, const st return res; } - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const double startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const float startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const float16 startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const bfloat16 startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const Nd4jLong startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const int startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint8_t startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint16_t startingValue, nd4j::LaunchContext *workspace); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint32_t startingValue, nd4j::LaunchContext *workspace); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint64_t startingValue, nd4j::LaunchContext *workspace); - template ND4J_EXPORT NDArray* 
NDArrayFactory::vector(Nd4jLong length, const int8_t startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const int16_t startingValue, nd4j::LaunchContext * context); - template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const bool startingValue, nd4j::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const double startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const float startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const float16 startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const bfloat16 startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const Nd4jLong startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const int startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint8_t startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint16_t startingValue, sd::LaunchContext *workspace); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint32_t startingValue, sd::LaunchContext *workspace); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const uint64_t startingValue, sd::LaunchContext *workspace); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const int8_t startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong length, const int16_t startingValue, sd::LaunchContext * context); + template ND4J_EXPORT NDArray* NDArrayFactory::vector(Nd4jLong 
length, const bool startingValue, sd::LaunchContext * context); //////////////////////////////////////////////////////////////////////// template - NDArray NDArrayFactory::create(const char order, const std::initializer_list& shape, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::create(const char order, const std::initializer_list& shape, sd::LaunchContext * context) { std::vector vec(shape); return create(order, vec, context); } - BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::create, (const char, const std::initializer_list&, nd4j::LaunchContext * context), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::create, (const char, const std::initializer_list&, sd::LaunchContext * context), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////// template - NDArray NDArrayFactory::create(const char order, const std::vector &shape, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::create(const char order, const std::vector &shape, sd::LaunchContext * context) { return create(order, shape, DataTypeUtils::fromT(), context); } - BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::create, (const char order, const std::vector &shape, nd4j::LaunchContext * context), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::create, (const char order, const std::vector &shape, sd::LaunchContext * context), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////// -NDArray NDArrayFactory::create(const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext* context) { +NDArray NDArrayFactory::create(const char order, const std::vector &shape, sd::DataType dtype, sd::LaunchContext* context) { if ((int) shape.size() > MAX_RANK) throw std::invalid_argument("NDArrayFactory::create: rank of NDArray can't exceed 32"); @@ -392,7 +392,7 @@ NDArray NDArrayFactory::create(const char 
order, const std::vector &sh //////////////////////////////////////////////////////////////////////// -NDArray NDArrayFactory::create(nd4j::DataType dtype, nd4j::LaunchContext * context) { +NDArray NDArrayFactory::create(sd::DataType dtype, sd::LaunchContext * context) { std::shared_ptr buffer = std::make_shared(DataTypeUtils::sizeOfElement(dtype), dtype, context->getWorkspace(), true); @@ -403,7 +403,7 @@ NDArray NDArrayFactory::create(nd4j::DataType dtype, nd4j::LaunchContext * conte return res; } -NDArray* NDArrayFactory::create_(nd4j::DataType dtype, nd4j::LaunchContext * context) { +NDArray* NDArrayFactory::create_(sd::DataType dtype, sd::LaunchContext * context) { auto result = new NDArray(); *result = NDArrayFactory::create(dtype, context); return result; @@ -411,7 +411,7 @@ NDArray* NDArrayFactory::create_(nd4j::DataType dtype, nd4j::LaunchContext * con //////////////////////////////////////////////////////////////////////// template -NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context) { +NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context) { std::shared_ptr buffer = std::make_shared(values.size() * sizeof(T), DataTypeUtils::fromT(), context->getWorkspace(), true); @@ -424,21 +424,21 @@ NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext return res; } -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const 
std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, nd4j::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &values, sd::LaunchContext * context); //////////////////////////////////////////////////////////////////////// template - NDArray* NDArrayFactory::empty_(nd4j::LaunchContext * context) { + NDArray* 
NDArrayFactory::empty_(sd::LaunchContext * context) { auto shapeInfo = ShapeBuilders::createScalarShapeInfo(DataTypeUtils::fromT(), context->getWorkspace()); ArrayOptions::setPropertyBit(shapeInfo, ARRAY_EMPTY); auto result = new NDArray(nullptr, shapeInfo, context, false); @@ -447,11 +447,11 @@ template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &val return result; } - BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray* NDArrayFactory::empty_, (nd4j::LaunchContext * context), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray* NDArrayFactory::empty_, (sd::LaunchContext * context), LIBND4J_TYPES); - NDArray* NDArrayFactory::empty_(nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::empty_(sd::DataType dataType, sd::LaunchContext * context) { if (context == nullptr) - context = nd4j::LaunchContext ::defaultContext(); + context = sd::LaunchContext ::defaultContext(); auto shapeInfo = ShapeBuilders::createScalarShapeInfo(dataType, context->getWorkspace()); ArrayOptions::setPropertyBit(shapeInfo, ARRAY_EMPTY); @@ -464,13 +464,13 @@ template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &val //////////////////////////////////////////////////////////////////////// template - NDArray NDArrayFactory::empty(nd4j::LaunchContext * context) { + NDArray NDArrayFactory::empty(sd::LaunchContext * context) { return empty(DataTypeUtils::fromT(), context); } - BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::empty, (nd4j::LaunchContext * context), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT NDArray NDArrayFactory::empty, (sd::LaunchContext * context), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::empty(nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::empty(sd::DataType dataType, sd::LaunchContext * context) { auto shapeInfo = 
ShapeBuilders::createScalarShapeInfo(dataType, context->getWorkspace()); ArrayOptions::setPropertyBit(shapeInfo, ARRAY_EMPTY); NDArray result(nullptr, shapeInfo, context, false); @@ -481,21 +481,21 @@ template ND4J_EXPORT NDArray NDArrayFactory::create(const std::vector &val } //////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::valueOf(const std::vector& shape, const NDArray& value, const char order, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::valueOf(const std::vector& shape, const NDArray& value, const char order, sd::LaunchContext * context) { auto res = NDArrayFactory::create_(order, shape, value.dataType(), context); res->assign(const_cast(value)); return res; } //////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::create_( const char order, const std::vector &shape, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::create_( const char order, const std::vector &shape, sd::DataType dataType, sd::LaunchContext * context) { return new NDArray(order, shape, dataType, context); } //////////////////////////////////////////////////////////////////////// template -NDArray NDArrayFactory::create(T* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context) { +NDArray NDArrayFactory::create(T* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context) { if ((int) shape.size() > MAX_RANK) throw std::invalid_argument("NDArrayFactory::create: Rank of NDArray can't exceed 32"); @@ -510,89 +510,89 @@ NDArray NDArrayFactory::create(T* buffer, const char order, const std::initializ return result; } -template ND4J_EXPORT NDArray NDArrayFactory::create(double* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(float* buffer, const char order, const std::initializer_list& shape, 
nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(float16* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(bfloat16* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(Nd4jLong * buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(int* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(bool* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(uint8_t * buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(int8_t* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); -template ND4J_EXPORT NDArray NDArrayFactory::create(int16_t* buffer, const char order, const std::initializer_list& shape, nd4j::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(double* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(float* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(float16* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(bfloat16* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(Nd4jLong * buffer, const char order, const 
std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(int* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(bool* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(uint8_t * buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(int8_t* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); +template ND4J_EXPORT NDArray NDArrayFactory::create(int16_t* buffer, const char order, const std::initializer_list& shape, sd::LaunchContext * context); ///////////////////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const char16_t* u16string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string(const char16_t* u16string, sd::DataType dtype, sd::LaunchContext* context) { return NDArray(u16string, dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_(const char16_t* u16string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_(const char16_t* u16string, sd::DataType dtype, sd::LaunchContext* context) { return string_(std::u16string(u16string), dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_(const std::u16string& u16string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_(const std::u16string& u16string, sd::DataType dtype, sd::LaunchContext* context) { auto res = new NDArray(); *res = NDArray(u16string, dtype, context); return res; } 
///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const std::u16string& u16string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string(const std::u16string& u16string, sd::DataType dtype, sd::LaunchContext* context) { return NDArray(u16string, dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const char32_t* u32string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string(const char32_t* u32string, sd::DataType dtype, sd::LaunchContext* context) { return NDArray(u32string, dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_(const char32_t* u32string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_(const char32_t* u32string, sd::DataType dtype, sd::LaunchContext* context) { return string_(std::u32string(u32string), dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_(const std::u32string& u32string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_(const std::u32string& u32string, sd::DataType dtype, sd::LaunchContext* context) { auto res = new NDArray(); *res = NDArray(u32string, dtype, context); return res; } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const std::u32string& u32string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string(const std::u32string& u32string, sd::DataType dtype, sd::LaunchContext* context) { return NDArray(u32string, dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const char* str, nd4j::DataType dtype, nd4j::LaunchContext* context) { + 
NDArray NDArrayFactory::string(const char* str, sd::DataType dtype, sd::LaunchContext* context) { return NDArray(str, dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_(const char* str, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_(const char* str, sd::DataType dtype, sd::LaunchContext* context) { return string_(std::string(str), dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_(const std::string& str, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_(const std::string& str, sd::DataType dtype, sd::LaunchContext* context) { auto res = new NDArray(); *res = NDArray(str, dtype, context); return res; } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const std::string& str, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string(const std::string& str, sd::DataType dtype, sd::LaunchContext* context) { return NDArray(str, dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const std::vector &shape, const std::initializer_list &strings, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::string(const std::vector &shape, const std::initializer_list &strings, sd::DataType dataType, sd::LaunchContext * context) { return NDArray(shape, std::vector(strings), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector &shape, const std::vector &strings, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::string( const std::vector &shape, const std::vector &strings, sd::DataType dataType, sd::LaunchContext * context) { return NDArray( 
shape, strings, dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector &shape, const std::initializer_list &string, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::string( const std::vector &shape, const std::initializer_list &string, sd::DataType dataType, sd::LaunchContext * context) { return NDArray( shape, std::vector(string), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector &shape, const std::initializer_list &strings, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::string_( const std::vector &shape, const std::initializer_list &strings, sd::DataType dataType, sd::LaunchContext * context) { return NDArrayFactory::string_( shape, std::vector(strings), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector &shape, const std::vector &strings, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::string_( const std::vector &shape, const std::vector &strings, sd::DataType dataType, sd::LaunchContext * context) { std::vector vec(strings.size()); int cnt = 0; for (auto s:strings) @@ -601,37 +601,37 @@ template ND4J_EXPORT NDArray NDArrayFactory::create(int16_t* buffer, const char return NDArrayFactory::string_( shape, vec, dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector &shape, const std::initializer_list &string, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::string_( const std::vector &shape, const std::initializer_list &string, sd::DataType dataType, sd::LaunchContext * context) { return NDArrayFactory::string_( shape, std::vector(string), 
dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector &shape, const std::vector &string, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray NDArrayFactory::string( const std::vector &shape, const std::vector &string, sd::DataType dataType, sd::LaunchContext * context) { return NDArray(shape, string, dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_(const std::vector &shape, const std::vector &string, nd4j::DataType dataType, nd4j::LaunchContext * context) { + NDArray* NDArrayFactory::string_(const std::vector &shape, const std::vector &string, sd::DataType dataType, sd::LaunchContext * context) { auto res = new NDArray(); *res = NDArray( shape, string, dataType, context); return res; } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string(const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string(const std::vector& shape, const std::initializer_list& strings, sd::DataType dataType, sd::LaunchContext* context) { return NDArray( shape, std::vector(strings), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector& shape, const std::vector& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string( const std::vector& shape, const std::vector& strings, sd::DataType dataType, sd::LaunchContext* context) { return NDArray( shape, strings, dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray 
NDArrayFactory::string( const std::vector& shape, const std::initializer_list& string, sd::DataType dataType, sd::LaunchContext* context) { return NDArray( shape, std::vector(string), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& strings, sd::DataType dataType, sd::LaunchContext* context) { return NDArrayFactory::string_( shape, std::vector(strings), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector& shape, const std::vector& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const std::vector& shape, const std::vector& strings, sd::DataType dataType, sd::LaunchContext* context) { std::vector vec(strings.size()); int cnt = 0; for (auto s : strings) @@ -640,37 +640,37 @@ template ND4J_EXPORT NDArray NDArrayFactory::create(int16_t* buffer, const char return NDArrayFactory::string_( shape, vec, dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& string, sd::DataType dataType, sd::LaunchContext* context) { return NDArrayFactory::string_( shape, std::vector(string), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector& shape, const std::vector& string, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const 
std::vector& shape, const std::vector& string, sd::DataType dataType, sd::LaunchContext* context) { auto res = new NDArray(); *res = NDArray( shape, string, dataType, context); return res; } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector& shape, const std::vector& string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string( const std::vector& shape, const std::vector& string, sd::DataType dtype, sd::LaunchContext* context) { return NDArray( shape, string, dtype, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string( const std::vector& shape, const std::initializer_list& strings, sd::DataType dataType, sd::LaunchContext* context) { return NDArray( shape, std::vector(strings), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector& shape, const std::vector& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string( const std::vector& shape, const std::vector& strings, sd::DataType dataType, sd::LaunchContext* context) { return NDArray( shape, strings, dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray NDArrayFactory::string( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string( const std::vector& shape, const std::initializer_list& string, sd::DataType dataType, sd::LaunchContext* context) { return NDArray(shape, std::vector(string), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* 
NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& strings, sd::DataType dataType, sd::LaunchContext* context) { return NDArrayFactory::string_( shape, std::vector(strings), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector& shape, const std::vector& strings, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const std::vector& shape, const std::vector& strings, sd::DataType dataType, sd::LaunchContext* context) { std::vector vec(strings.size()); int cnt = 0; for (auto s : strings) @@ -678,23 +678,23 @@ template ND4J_EXPORT NDArray NDArrayFactory::create(int16_t* buffer, const char return NDArrayFactory::string_( shape, vec, dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& string, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const std::vector& shape, const std::initializer_list& string, sd::DataType dataType, sd::LaunchContext* context) { return NDArrayFactory::string_( shape, std::vector(string), dataType, context); } ///////////////////////////////////////////////////////////////////////// - NDArray* NDArrayFactory::string_( const std::vector& shape, const std::vector& string, nd4j::DataType dataType, nd4j::LaunchContext* context) { + NDArray* NDArrayFactory::string_( const std::vector& shape, const std::vector& string, sd::DataType dataType, sd::LaunchContext* context) { auto res = new NDArray(); *res = NDArray( shape, string, dataType, context); return res; } ///////////////////////////////////////////////////////////////////////// - NDArray 
NDArrayFactory::string(const std::vector& shape, const std::vector& string, nd4j::DataType dtype, nd4j::LaunchContext* context) { + NDArray NDArrayFactory::string(const std::vector& shape, const std::vector& string, sd::DataType dtype, sd::LaunchContext* context) { return NDArray( shape, string, dtype, context); } NDArray NDArrayFactory::fromNpyFile(const char *fileName) { - auto size = nd4j::graph::getFileSize(fileName); + auto size = sd::graph::getFileSize(fileName); if (size < 0) throw std::runtime_error("File doesn't exit"); @@ -705,7 +705,7 @@ template ND4J_EXPORT NDArray NDArrayFactory::create(int16_t* buffer, const char auto length = shape::length(shape); int8_t *buffer = nullptr; - nd4j::memory::Workspace *workspace = nullptr; + sd::memory::Workspace *workspace = nullptr; auto byteLen = length * DataTypeUtils::sizeOfElement(ArrayOptions::dataType(shape)); ALLOCATE(buffer, workspace, byteLen, int8_t); diff --git a/libnd4j/include/array/impl/NDArrayList.cpp b/libnd4j/include/array/impl/NDArrayList.cpp index 81ac9ac2d..640392211 100644 --- a/libnd4j/include/array/impl/NDArrayList.cpp +++ b/libnd4j/include/array/impl/NDArrayList.cpp @@ -24,7 +24,7 @@ #include #include -namespace nd4j { +namespace sd { NDArrayList::NDArrayList(int height, bool expandable) { _expandable = expandable; _elements.store(0); @@ -47,7 +47,7 @@ namespace nd4j { return new NDArray(readRaw(idx)->dup()); } - nd4j::DataType NDArrayList::dataType() { + sd::DataType NDArrayList::dataType() { return _dtype; } @@ -144,7 +144,7 @@ namespace nd4j { NDArray* NDArrayList::stack() { // FIXME: this is bad for perf, but ok as poc - nd4j::ops::stack op; + sd::ops::stack op; std::vector inputs; std::vector targs; std::vector iargs({0}); @@ -175,7 +175,7 @@ namespace nd4j { return _name; } - nd4j::LaunchContext * NDArrayList::context() { + sd::LaunchContext * NDArrayList::context() { return _context; } diff --git a/libnd4j/include/array/impl/ResultSet.cpp b/libnd4j/include/array/impl/ResultSet.cpp index 
33c04f851..3300cc380 100644 --- a/libnd4j/include/array/impl/ResultSet.cpp +++ b/libnd4j/include/array/impl/ResultSet.cpp @@ -21,39 +21,40 @@ #include #include -namespace nd4j { - ResultSet::ResultSet(const nd4j::graph::FlatResult* result) { - if (result != nullptr) { - for (int e = 0; e < result->variables()->size(); e++) { - auto var = result->variables()->Get(e); +namespace sd { + ResultSet::ResultSet() { + // + } - NDArray* array; + ResultSet::ResultSet(const sd::graph::FlatResult* result) { + for (int e = 0; e < result->variables()->size(); e++) { + auto var = result->variables()->Get(e); - if (var->ndarray() != nullptr) { - array = nd4j::graph::FlatUtils::fromFlatArray(var->ndarray()); - } else if (var->shape() != nullptr) { - std::vector shapeInfo; - for (int i = 0; i < var->shape()->size(); i++) { - shapeInfo.emplace_back(var->shape()->Get(i)); - } + NDArray* array; - // we just create empty array here - int s0 = shapeInfo.at(0); - - std::vector shape; - for (int i = 0; i < s0; i++) { - shape.emplace_back(shapeInfo.at(i + 1)); - } - - array = new NDArray((char) shapeInfo.at(shapeInfo.size() - 1), shape, DataTypeUtils::fromFlatDataType(var->dtype())); - } else { - nd4j_printf("Either shape or NDArray should be defined in FlatResult variable\n",""); - throw std::runtime_error("Empty variable"); + if (var->ndarray() != nullptr) { + array = sd::graph::FlatUtils::fromFlatArray(var->ndarray()); + } else if (var->shape() != nullptr) { + std::vector shapeInfo; + for (int i = 0; i < var->shape()->size(); i++) { + shapeInfo.emplace_back(var->shape()->Get(i)); } + // we just create empty array here + int s0 = shapeInfo.at(0); - _content.push_back(array); + std::vector shape; + for (int i = 0; i < s0; i++) { + shape.emplace_back(shapeInfo.at(i + 1)); + } + + array = new NDArray((char) shapeInfo.at(shapeInfo.size() - 1), shape, DataTypeUtils::fromFlatDataType(var->dtype())); + } else { + nd4j_printf("Either shape or NDArray should be defined in FlatResult 
variable\n",""); + throw std::runtime_error("Empty variable"); } + + _content.push_back(array); } } @@ -123,15 +124,15 @@ namespace nd4j { return (int) _content.size(); } - nd4j::NDArray* ResultSet::at(const unsigned long idx) const { + sd::NDArray* ResultSet::at(const unsigned long idx) const { return _content.at(idx); } - nd4j::NDArray* ResultSet::operator[](const unsigned long idx) const { + sd::NDArray* ResultSet::operator[](const unsigned long idx) const { return _content[idx]; } - void ResultSet::push_back(nd4j::NDArray *array) { + void ResultSet::push_back(sd::NDArray *array) { _content.emplace_back(array); } diff --git a/libnd4j/include/array/impl/ShapeDescriptor.cpp b/libnd4j/include/array/impl/ShapeDescriptor.cpp index 3891fcbb8..3ef096312 100644 --- a/libnd4j/include/array/impl/ShapeDescriptor.cpp +++ b/libnd4j/include/array/impl/ShapeDescriptor.cpp @@ -18,11 +18,11 @@ // @author raver119@gmail.com // -#include "../ShapeDescriptor.h" -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////// // equal to operator @@ -226,7 +226,7 @@ namespace nd4j { _strides.emplace_back(shapeInfo[e + 1 + _rank]); } - ShapeDescriptor::ShapeDescriptor(const Nd4jLong *shapeInfo, const nd4j::DataType dtypeOverride) + ShapeDescriptor::ShapeDescriptor(const Nd4jLong *shapeInfo, const sd::DataType dtypeOverride) : ShapeDescriptor::ShapeDescriptor(shapeInfo, false) { _dataType = dtypeOverride; } @@ -356,14 +356,14 @@ namespace nd4j { } namespace std { - size_t hash::operator()(const nd4j::ShapeDescriptor &k) const { + size_t hash::operator()(const sd::ShapeDescriptor &k) const { auto res = std::hash()(k.arrLength()); res ^= std::hash()(k.order()) + 0x9e3779b9 + (res << 6) + (res >> 2); res ^= k.dataType() + 0x9e3779b9 + (res << 6) + (res >> 2); res ^= std::hash()(k.rank()) + 0x9e3779b9 + (res << 6) + (res >> 2); res ^= std::hash()(k.ews()) + 0x9e3779b9 + (res << 6) + (res >> 2); - 
auto shapes = const_cast(k).shape(); - auto strides = const_cast(k).strides(); + auto shapes = const_cast(k).shape(); + auto strides = const_cast(k).strides(); for (auto s: shapes) { res ^= std::hash()(s) + 0x9e3779b9 + (res << 6) + (res >> 2); } diff --git a/libnd4j/include/array/impl/ShapeList.cpp b/libnd4j/include/array/impl/ShapeList.cpp index 1a1e02adb..1a883cc7e 100644 --- a/libnd4j/include/array/impl/ShapeList.cpp +++ b/libnd4j/include/array/impl/ShapeList.cpp @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include -namespace nd4j { +namespace sd { //ShapeList::ShapeList(bool autoRemovable) { // _autoremovable = autoRemovable; // } diff --git a/libnd4j/include/array/impl/TadDescriptor.cpp b/libnd4j/include/array/impl/TadDescriptor.cpp index a5043bb7c..e2ec7480e 100644 --- a/libnd4j/include/array/impl/TadDescriptor.cpp +++ b/libnd4j/include/array/impl/TadDescriptor.cpp @@ -22,7 +22,7 @@ #include #include "../TadDescriptor.h" -namespace nd4j { +namespace sd { TadDescriptor::TadDescriptor(const TadDescriptor &other) { _originalShape = other._originalShape; _axis = other._axis; @@ -79,13 +79,13 @@ namespace nd4j { } namespace std { - size_t hash::operator()(const nd4j::TadDescriptor &k) const { + size_t hash::operator()(const sd::TadDescriptor &k) const { // Compute individual hash values for first, // second and third and combine them using XOR // and bit shifting: auto res = std::hash()((int)k.areUnitiesinShape()); - res ^= std::hash()(k.originalShapeConst()) + 0x9e3779b9 + (res << 6) + (res >> 2); - auto axes = const_cast(k).axis(); + res ^= std::hash()(k.originalShapeConst()) + 0x9e3779b9 + (res << 6) + (res >> 2); + auto axes = const_cast(k).axis(); for (auto a: axes) { res ^= std::hash()(a) + 0x9e3779b9 + (res << 6) + (res >> 2); } diff --git a/libnd4j/include/array/impl/TadPack.cpp b/libnd4j/include/array/impl/TadPack.cpp index 6bfc76eb1..1bd5b8f70 100644 --- a/libnd4j/include/array/impl/TadPack.cpp +++ 
b/libnd4j/include/array/impl/TadPack.cpp @@ -19,10 +19,10 @@ // #include "../TadPack.h" -#include +#include #include -namespace nd4j { +namespace sd { TadPack::TadPack(ConstantDataBuffer &shapes, ConstantDataBuffer &offets, Nd4jLong numTads) { _tadShape = shapes; _tadOffsets = offets; @@ -49,11 +49,11 @@ namespace nd4j { } Nd4jLong* TadPack::platformShapeInfo() const { - return nd4j::Environment::getInstance()->isCPU() ? primaryShapeInfo() : specialShapeInfo(); + return sd::Environment::getInstance()->isCPU() ? primaryShapeInfo() : specialShapeInfo(); } Nd4jLong* TadPack::platformOffsets() const { - return nd4j::Environment::getInstance()->isCPU() ? primaryOffsets() : specialOffsets(); + return sd::Environment::getInstance()->isCPU() ? primaryOffsets() : specialOffsets(); } int TadPack::shapeInfoLength() const { diff --git a/libnd4j/include/cblas.h b/libnd4j/include/cblas.h index 0d484a2e0..18970a9b0 100755 --- a/libnd4j/include/cblas.h +++ b/libnd4j/include/cblas.h @@ -48,7 +48,7 @@ #endif #ifndef CBLAS_H -#include +#include #ifdef __cplusplus extern "C" { diff --git a/libnd4j/include/cblas_enum_conversion.h b/libnd4j/include/cblas_enum_conversion.h index b2a854f41..6ff6fe557 100755 --- a/libnd4j/include/cblas_enum_conversion.h +++ b/libnd4j/include/cblas_enum_conversion.h @@ -30,7 +30,7 @@ enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; */ -#include +#include #ifdef __cplusplus extern "C" { diff --git a/libnd4j/include/cnpy/cnpy.h b/libnd4j/include/cnpy/cnpy.h index d66320cae..ea847c3e7 100644 --- a/libnd4j/include/cnpy/cnpy.h +++ b/libnd4j/include/cnpy/cnpy.h @@ -49,8 +49,8 @@ #include #include #include -#include -#include +#include +#include #include @@ -238,7 +238,7 @@ namespace cnpy { ND4J_EXPORT npz_t npzLoad(std::string fname); - ND4J_EXPORT nd4j::DataType dataTypeFromHeader(char *data); + ND4J_EXPORT sd::DataType dataTypeFromHeader(char *data); /** * Parse 
the numpy header from * the given file diff --git a/libnd4j/include/exceptions/allocation_exception.h b/libnd4j/include/exceptions/allocation_exception.h index 458650037..1e9b6653b 100644 --- a/libnd4j/include/exceptions/allocation_exception.h +++ b/libnd4j/include/exceptions/allocation_exception.h @@ -23,8 +23,8 @@ #include #include -#include -#include +#include +#include #if defined(_MSC_VER) @@ -33,7 +33,7 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT allocation_exception : public std::runtime_error { public: allocation_exception(std::string message); diff --git a/libnd4j/include/exceptions/cuda_exception.h b/libnd4j/include/exceptions/cuda_exception.h index 5150033e8..2dc98eec3 100644 --- a/libnd4j/include/exceptions/cuda_exception.h +++ b/libnd4j/include/exceptions/cuda_exception.h @@ -23,7 +23,7 @@ #include #include -#include +#include #if defined(_MSC_VER) @@ -32,7 +32,7 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT cuda_exception : public std::runtime_error { public: cuda_exception(std::string message); diff --git a/libnd4j/include/exceptions/datatype_exception.h b/libnd4j/include/exceptions/datatype_exception.h index 171a2b13b..74829d54c 100644 --- a/libnd4j/include/exceptions/datatype_exception.h +++ b/libnd4j/include/exceptions/datatype_exception.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #if defined(_MSC_VER) @@ -33,16 +33,16 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT datatype_exception : public std::runtime_error { public: datatype_exception(std::string message); ~datatype_exception() = default; - static datatype_exception build(std::string message, nd4j::DataType actual); - static datatype_exception build(std::string message, nd4j::DataType expected, nd4j::DataType actual); - static datatype_exception build(std::string message, nd4j::DataType expected, nd4j::DataType actualX, nd4j::DataType actualY); + static datatype_exception build(std::string message, sd::DataType actual); + 
static datatype_exception build(std::string message, sd::DataType expected, sd::DataType actual); + static datatype_exception build(std::string message, sd::DataType expected, sd::DataType actualX, sd::DataType actualY); }; } diff --git a/libnd4j/include/exceptions/graph_exception.h b/libnd4j/include/exceptions/graph_exception.h index 440fa5aa4..7c9345a4d 100644 --- a/libnd4j/include/exceptions/graph_exception.h +++ b/libnd4j/include/exceptions/graph_exception.h @@ -23,8 +23,8 @@ #include #include -#include -#include +#include +#include #if defined(_MSC_VER) @@ -33,7 +33,7 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT graph_exception : public std::runtime_error { protected: Nd4jLong _graphId; diff --git a/libnd4j/include/exceptions/graph_execution_exception.h b/libnd4j/include/exceptions/graph_execution_exception.h index 92b02e2ee..37f8e636e 100644 --- a/libnd4j/include/exceptions/graph_execution_exception.h +++ b/libnd4j/include/exceptions/graph_execution_exception.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_GRAPH_EXECUTION_EXCEPTION_H #define DEV_TESTS_GRAPH_EXECUTION_EXCEPTION_H -#include -#include +#include +#include #include #include -#include +#include #if defined(_MSC_VER) @@ -34,7 +34,7 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT graph_execution_exception: public graph_exception { public: explicit graph_execution_exception(Nd4jLong graphId); diff --git a/libnd4j/include/exceptions/graph_exists_exception.h b/libnd4j/include/exceptions/graph_exists_exception.h index 985770ad3..63554c31b 100644 --- a/libnd4j/include/exceptions/graph_exists_exception.h +++ b/libnd4j/include/exceptions/graph_exists_exception.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_GRAPH_EXISTS_EXCEPTION_H #define DEV_TESTS_GRAPH_EXISTS_EXCEPTION_H -#include -#include +#include +#include #include #include -#include +#include #if defined(_MSC_VER) @@ -34,7 +34,7 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT graph_exists_exception: public 
graph_exception { public: explicit graph_exists_exception(Nd4jLong graphId); diff --git a/libnd4j/include/exceptions/impl/allocation_exception.cpp b/libnd4j/include/exceptions/impl/allocation_exception.cpp index 85c3e72aa..46f2ef5c8 100644 --- a/libnd4j/include/exceptions/impl/allocation_exception.cpp +++ b/libnd4j/include/exceptions/impl/allocation_exception.cpp @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { allocation_exception::allocation_exception(std::string message) : std::runtime_error(message){ // } diff --git a/libnd4j/include/exceptions/impl/cuda_exception.cpp b/libnd4j/include/exceptions/impl/cuda_exception.cpp index d1bb4d53c..91de6c251 100644 --- a/libnd4j/include/exceptions/impl/cuda_exception.cpp +++ b/libnd4j/include/exceptions/impl/cuda_exception.cpp @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { cuda_exception::cuda_exception(std::string message) : std::runtime_error(message){ // } diff --git a/libnd4j/include/exceptions/impl/datatype_exception.cpp b/libnd4j/include/exceptions/impl/datatype_exception.cpp index 0dfe5ab38..9aab37951 100644 --- a/libnd4j/include/exceptions/impl/datatype_exception.cpp +++ b/libnd4j/include/exceptions/impl/datatype_exception.cpp @@ -21,19 +21,19 @@ #include #include -namespace nd4j { +namespace sd { datatype_exception::datatype_exception(std::string message) : std::runtime_error(message){ // } - datatype_exception datatype_exception::build(std::string message, nd4j::DataType expected, nd4j::DataType actual) { + datatype_exception datatype_exception::build(std::string message, sd::DataType expected, sd::DataType actual) { auto exp = DataTypeUtils::asString(expected); auto act = DataTypeUtils::asString(actual); message += "; Expected: [" + exp + "]; Actual: [" + act + "]"; return datatype_exception(message); } - datatype_exception datatype_exception::build(std::string message, nd4j::DataType expected, nd4j::DataType actualX, nd4j::DataType actualY) 
{ + datatype_exception datatype_exception::build(std::string message, sd::DataType expected, sd::DataType actualX, sd::DataType actualY) { auto exp = DataTypeUtils::asString(expected); auto actX = DataTypeUtils::asString(actualX); auto actY = DataTypeUtils::asString(actualY); @@ -41,7 +41,7 @@ namespace nd4j { return datatype_exception(message); } - datatype_exception datatype_exception::build(std::string message, nd4j::DataType actual) { + datatype_exception datatype_exception::build(std::string message, sd::DataType actual) { auto act = DataTypeUtils::asString(actual); message += "; Actual: [" + act + "]"; return datatype_exception(message); diff --git a/libnd4j/include/exceptions/impl/graph_exception.cpp b/libnd4j/include/exceptions/impl/graph_exception.cpp index 48be2cd75..fa2210a1d 100644 --- a/libnd4j/include/exceptions/impl/graph_exception.cpp +++ b/libnd4j/include/exceptions/impl/graph_exception.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { graph_exception::graph_exception(std::string message, Nd4jLong graphId) : std::runtime_error(message) { this->_message = message; this->_graphId = graphId; diff --git a/libnd4j/include/exceptions/impl/graph_execution_exception.cpp b/libnd4j/include/exceptions/impl/graph_execution_exception.cpp index 6f38695d8..086796517 100644 --- a/libnd4j/include/exceptions/impl/graph_execution_exception.cpp +++ b/libnd4j/include/exceptions/impl/graph_execution_exception.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { graph_execution_exception::graph_execution_exception(Nd4jLong graphId) : graph_exception(StringUtils::buildGraphErrorMessage("Caught exception during graph execution", graphId), graphId) { _graphId = graphId; } diff --git a/libnd4j/include/exceptions/impl/graph_exists_exception.cpp b/libnd4j/include/exceptions/impl/graph_exists_exception.cpp index 1b3b1a84e..535a74a6a 100644 --- a/libnd4j/include/exceptions/impl/graph_exists_exception.cpp +++ 
b/libnd4j/include/exceptions/impl/graph_exists_exception.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { graph_exists_exception::graph_exists_exception(Nd4jLong graphId) : graph_exception(StringUtils::buildGraphErrorMessage("Graph with given ID already exists", graphId), graphId) { _graphId = graphId; } diff --git a/libnd4j/include/exceptions/impl/no_results_exception.cpp b/libnd4j/include/exceptions/impl/no_results_exception.cpp index 7bc18cac7..ce3122ffb 100644 --- a/libnd4j/include/exceptions/impl/no_results_exception.cpp +++ b/libnd4j/include/exceptions/impl/no_results_exception.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { no_results_exception::no_results_exception(Nd4jLong graphId) : graph_exception(StringUtils::buildGraphErrorMessage("Got no results after graph execution", graphId), graphId) { _graphId = graphId; } diff --git a/libnd4j/include/exceptions/impl/unknown_graph_exception.cpp b/libnd4j/include/exceptions/impl/unknown_graph_exception.cpp index f30e1a1bd..ad73f3d33 100644 --- a/libnd4j/include/exceptions/impl/unknown_graph_exception.cpp +++ b/libnd4j/include/exceptions/impl/unknown_graph_exception.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { unknown_graph_exception::unknown_graph_exception(Nd4jLong graphId) : graph_exception(StringUtils::buildGraphErrorMessage("Unknown graph", graphId), graphId) { _graphId = graphId; } diff --git a/libnd4j/include/exceptions/no_results_exception.h b/libnd4j/include/exceptions/no_results_exception.h index 0fa1bb167..b2687854b 100644 --- a/libnd4j/include/exceptions/no_results_exception.h +++ b/libnd4j/include/exceptions/no_results_exception.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_NO_RESULTS_EXCEPTION_H #define DEV_TESTS_NO_RESULTS_EXCEPTION_H -#include -#include +#include +#include #include #include -#include +#include #if defined(_MSC_VER) @@ -34,7 +34,7 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT 
no_results_exception: public graph_exception { public: explicit no_results_exception(Nd4jLong graphId); diff --git a/libnd4j/include/exceptions/unknown_graph_exception.h b/libnd4j/include/exceptions/unknown_graph_exception.h index 83efc9dcf..917aeb757 100644 --- a/libnd4j/include/exceptions/unknown_graph_exception.h +++ b/libnd4j/include/exceptions/unknown_graph_exception.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_UNKNOWN_GRAPH_EXCEPTION_H #define DEV_TESTS_UNKNOWN_GRAPH_EXCEPTION_H -#include -#include +#include +#include #include #include -#include +#include #if defined(_MSC_VER) @@ -34,7 +34,7 @@ #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT unknown_graph_exception: public graph_exception { public: explicit unknown_graph_exception(Nd4jLong graphId); diff --git a/libnd4j/include/execution/AffinityManager.h b/libnd4j/include/execution/AffinityManager.h index 463d6942e..757f637ce 100644 --- a/libnd4j/include/execution/AffinityManager.h +++ b/libnd4j/include/execution/AffinityManager.h @@ -21,12 +21,12 @@ #ifndef LIBND4J_AFFINITYMANAGER_H #define LIBND4J_AFFINITYMANAGER_H -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT AffinityManager { private: static std::atomic _lastDevice; diff --git a/libnd4j/include/execution/CallableInterface.h b/libnd4j/include/execution/CallableInterface.h index 7e5502af1..aad83b379 100644 --- a/libnd4j/include/execution/CallableInterface.h +++ b/libnd4j/include/execution/CallableInterface.h @@ -21,7 +21,7 @@ #ifndef SAMEDIFF_CALLABLEINTERFACE_H #define SAMEDIFF_CALLABLEINTERFACE_H -#include +#include #include #include #include diff --git a/libnd4j/include/execution/CallableWithArguments.h b/libnd4j/include/execution/CallableWithArguments.h index ebf1f0019..28ef8433e 100644 --- a/libnd4j/include/execution/CallableWithArguments.h +++ b/libnd4j/include/execution/CallableWithArguments.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include namespace samediff { 
class CallableWithArguments { diff --git a/libnd4j/include/execution/ContextBuffers.h b/libnd4j/include/execution/ContextBuffers.h index 67c428d27..c14671e42 100644 --- a/libnd4j/include/execution/ContextBuffers.h +++ b/libnd4j/include/execution/ContextBuffers.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_CONTEXTBUFFERS_H #define LIBND4J_CONTEXTBUFFERS_H -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT ContextBuffers { private: void* _reductionPointer = nullptr; diff --git a/libnd4j/include/execution/ErrorReference.h b/libnd4j/include/execution/ErrorReference.h index 2b68d5855..b71090248 100644 --- a/libnd4j/include/execution/ErrorReference.h +++ b/libnd4j/include/execution/ErrorReference.h @@ -22,7 +22,7 @@ #define DEV_TESTS_ERRORREFERENCE_H #include -#include +#include namespace sd { class ND4J_EXPORT ErrorReference { diff --git a/libnd4j/include/execution/Executor.h b/libnd4j/include/execution/Executor.h index 26d5365ad..a9eaa6ad3 100644 --- a/libnd4j/include/execution/Executor.h +++ b/libnd4j/include/execution/Executor.h @@ -21,7 +21,7 @@ #ifndef SD_EXECUTOR_H #define SD_EXECUTOR_H -namespace nd4j { +namespace sd { class Executor { public: static void execute() { diff --git a/libnd4j/include/execution/LaunchContext.h b/libnd4j/include/execution/LaunchContext.h index 689d79369..e2efa1418 100644 --- a/libnd4j/include/execution/LaunchContext.h +++ b/libnd4j/include/execution/LaunchContext.h @@ -35,9 +35,9 @@ #include "config.h" #endif -#include +#include #include -#include +#include #include #include #include @@ -46,7 +46,7 @@ -namespace nd4j { +namespace sd { class ND4J_EXPORT LaunchContext { @@ -68,7 +68,7 @@ class ND4J_EXPORT LaunchContext { bool _isAllocated = false; #endif // CUDA - nd4j::memory::Workspace* _workspace = nullptr; + sd::memory::Workspace* _workspace = nullptr; int _deviceID = 0; public: @@ -100,8 +100,8 @@ class ND4J_EXPORT LaunchContext { LaunchContext(Nd4jPointer cudaStream, Nd4jPointer 
reductionPointer = nullptr, Nd4jPointer scalarPointer = nullptr, Nd4jPointer allocationPointer = nullptr); LaunchContext(); ~LaunchContext(); - nd4j::memory::Workspace* getWorkspace() const { return _workspace; } - void setWorkspace(nd4j::memory::Workspace* theWorkspace) { + sd::memory::Workspace* getWorkspace() const { return _workspace; } + void setWorkspace(sd::memory::Workspace* theWorkspace) { _workspace = theWorkspace; } diff --git a/libnd4j/include/execution/Threads.h b/libnd4j/include/execution/Threads.h index 3a1fd8951..2ea8295a8 100644 --- a/libnd4j/include/execution/Threads.h +++ b/libnd4j/include/execution/Threads.h @@ -21,10 +21,10 @@ #define SAMEDIFF_THREADS_H #include -#include -#include -#include -#include +#include +#include +#include +#include namespace samediff { class ND4J_EXPORT ThreadsHelper { @@ -107,7 +107,7 @@ namespace samediff { * @param increment * @return */ - static int parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + static int parallel_for(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = sd::Environment::getInstance()->maxMasterThreads()); /** * This function executes 1 dimensional loop for a given number of threads @@ -119,7 +119,7 @@ namespace samediff { * @param numThreads * @return */ - static int parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + static int parallel_tad(FUNC_1D function, int64_t start, int64_t stop, int64_t increment = 1, uint32_t numThreads = sd::Environment::getInstance()->maxMasterThreads()); /** * This method will execute function splitting 2 nested loops space with multiple threads @@ -134,7 +134,7 @@ namespace samediff { * @param inc_y * @return */ - static int parallel_for(FUNC_2D function, int64_t start_x, int64_t stop_x, int64_t inc_x, 
int64_t start_y, int64_t stop_y, int64_t inc_y, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads(), bool debug = false); + static int parallel_for(FUNC_2D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, uint64_t numThreads = sd::Environment::getInstance()->maxMasterThreads(), bool debug = false); /** * This method will execute function splitting 3 nested loops space with multiple threads @@ -152,7 +152,7 @@ namespace samediff { * @param inc_z * @return */ - static int parallel_for(FUNC_3D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + static int parallel_for(FUNC_3D function, int64_t start_x, int64_t stop_x, int64_t inc_x, int64_t start_y, int64_t stop_y, int64_t inc_y, int64_t start_z, int64_t stop_z, int64_t inc_z, uint64_t numThreads = sd::Environment::getInstance()->maxMasterThreads()); /** * @@ -160,18 +160,18 @@ namespace samediff { * @param numThreads * @return */ - static int parallel_do(FUNC_DO function, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + static int parallel_do(FUNC_DO function, uint64_t numThreads = sd::Environment::getInstance()->maxMasterThreads()); - static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + static int64_t parallel_long(FUNC_RL function, FUNC_AL aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = sd::Environment::getInstance()->maxMasterThreads()); - static double parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + static double 
parallel_double(FUNC_RD function, FUNC_AD aggregator, int64_t start, int64_t stop, int64_t increment = 1, uint64_t numThreads = sd::Environment::getInstance()->maxMasterThreads()); /** * This method will execute function in parallel preserving the parts to be aligned increment size * PLEASE NOTE: this function can use smaller number of threads than requested. * */ - static int parallel_aligned_increment(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, size_t type_size = sizeof(float), uint32_t req_numThreads = nd4j::Environment::getInstance()->maxMasterThreads()); + static int parallel_aligned_increment(FUNC_1D function, int64_t start, int64_t stop, int64_t increment, size_t type_size = sizeof(float), uint32_t req_numThreads = sd::Environment::getInstance()->maxMasterThreads()); }; } diff --git a/libnd4j/include/execution/cpu/AffinityManager.cpp b/libnd4j/include/execution/cpu/AffinityManager.cpp index 7927982a6..32df63d0d 100644 --- a/libnd4j/include/execution/cpu/AffinityManager.cpp +++ b/libnd4j/include/execution/cpu/AffinityManager.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { int AffinityManager::currentDeviceId() { return 0; } diff --git a/libnd4j/include/execution/cpu/ContextBuffers.cpp b/libnd4j/include/execution/cpu/ContextBuffers.cpp index 0038990c2..3b1c566a8 100644 --- a/libnd4j/include/execution/cpu/ContextBuffers.cpp +++ b/libnd4j/include/execution/cpu/ContextBuffers.cpp @@ -20,7 +20,7 @@ #include #include -namespace nd4j { +namespace sd { ContextBuffers::ContextBuffers() { _deviceId = AffinityManager::currentDeviceId(); } diff --git a/libnd4j/include/execution/cpu/LaunchContext.cpp b/libnd4j/include/execution/cpu/LaunchContext.cpp index 1c34f25d9..6217e0707 100644 --- a/libnd4j/include/execution/cpu/LaunchContext.cpp +++ b/libnd4j/include/execution/cpu/LaunchContext.cpp @@ -19,21 +19,21 @@ // #include -#include +#include #include #include -#if defined(IOS_BUILD) || defined(APPLE_BUILD) || defined(ANDROID_BUILD) 
-nd4j::ContextBuffers contextBuffers = nd4j::ContextBuffers(); +#if defined(SD_IOS_BUILD) || defined(SD_APPLE_BUILD) || defined(SD_ANDROID_BUILD) +sd::ContextBuffers contextBuffers = sd::ContextBuffers(); #else -thread_local nd4j::ContextBuffers contextBuffers = nd4j::ContextBuffers(); +thread_local sd::ContextBuffers contextBuffers = sd::ContextBuffers(); #endif #ifdef HAVE_MKLDNN #include #endif -namespace nd4j { +namespace sd { LaunchContext::~LaunchContext() { #ifdef HAVE_MKLDNN diff --git a/libnd4j/include/execution/cuda/AffinityManager.cu b/libnd4j/include/execution/cuda/AffinityManager.cu index 2743fb710..cdfe7c107 100644 --- a/libnd4j/include/execution/cuda/AffinityManager.cu +++ b/libnd4j/include/execution/cuda/AffinityManager.cu @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #include #include -#include +#include thread_local int globalThreadToDevice = -1; -namespace nd4j { +namespace sd { std::mutex AffinityManager::_currentMutex; std::mutex AffinityManager::_numberMutex; int AffinityManager::_numberOfDevices = -1; diff --git a/libnd4j/include/execution/cuda/ContextBuffers.cu b/libnd4j/include/execution/cuda/ContextBuffers.cu index e018cf807..9411a27d5 100644 --- a/libnd4j/include/execution/cuda/ContextBuffers.cu +++ b/libnd4j/include/execution/cuda/ContextBuffers.cu @@ -20,15 +20,15 @@ #include #include -#include -#include +#include +#include #include #include #include #include -namespace nd4j { +namespace sd { ContextBuffers::ContextBuffers() { //nd4j_printf("Creating ContextBuffers for device [%i]\n", AffinityManager::currentDeviceId()); _deviceId = AffinityManager::currentDeviceId(); diff --git a/libnd4j/include/execution/cuda/LaunchContext.cu b/libnd4j/include/execution/cuda/LaunchContext.cu index 3145ca8d3..28193c3b0 100644 --- a/libnd4j/include/execution/cuda/LaunchContext.cu +++ b/libnd4j/include/execution/cuda/LaunchContext.cu @@ -19,15 +19,15 @@ // #include -#include +#include #include #include #include #include 
-thread_local nd4j::ContextBuffers contextBuffers = nd4j::ContextBuffers(); +thread_local sd::ContextBuffers contextBuffers = sd::ContextBuffers(); -namespace nd4j { +namespace sd { std::vector> LaunchContext::_contexts = std::vector>(); std::mutex LaunchContext::_mutex; diff --git a/libnd4j/include/execution/impl/BlockingQueue.cpp b/libnd4j/include/execution/impl/BlockingQueue.cpp index ff483fd28..21c3b4c6a 100644 --- a/libnd4j/include/execution/impl/BlockingQueue.cpp +++ b/libnd4j/include/execution/impl/BlockingQueue.cpp @@ -19,7 +19,7 @@ // #include -#include +#include #include namespace samediff { diff --git a/libnd4j/include/execution/impl/ThreadPool.cpp b/libnd4j/include/execution/impl/ThreadPool.cpp index 5d9e2d5eb..b02c4c4d5 100644 --- a/libnd4j/include/execution/impl/ThreadPool.cpp +++ b/libnd4j/include/execution/impl/ThreadPool.cpp @@ -78,7 +78,7 @@ namespace samediff { ThreadPool::ThreadPool() { // TODO: number of threads must reflect number of cores for UMA system. In case of NUMA it should be per-device pool // FIXME: on mobile phones this feature must NOT be used - _available = nd4j::Environment::getInstance()->maxThreads(); + _available = sd::Environment::getInstance()->maxThreads(); _queues.resize(_available.load()); _threads.resize(_available.load()); diff --git a/libnd4j/include/execution/impl/Threads.cpp b/libnd4j/include/execution/impl/Threads.cpp index 94710731e..2d0ae1144 100644 --- a/libnd4j/include/execution/impl/Threads.cpp +++ b/libnd4j/include/execution/impl/Threads.cpp @@ -22,18 +22,18 @@ #include #include #include -#include -#include +#include +#include namespace samediff { int ThreadsHelper::numberOfThreads(int maxThreads, uint64_t numberOfElements) { // let's see how many threads we actually need first - auto optimalThreads = nd4j::math::nd4j_max(1, numberOfElements / 1024); + auto optimalThreads = sd::math::nd4j_max(1, numberOfElements / 1024); // now return the smallest value - return nd4j::math::nd4j_min(optimalThreads, 
maxThreads); + return sd::math::nd4j_min(optimalThreads, maxThreads); } Span3::Span3(int64_t startX, int64_t stopX, int64_t incX, int64_t startY, int64_t stopY, int64_t incY, int64_t startZ, int64_t stopZ, int64_t incZ) { @@ -264,7 +264,7 @@ namespace samediff { int ThreadsHelper::numberOfThreads2d(int maxThreads, uint64_t iters_x, uint64_t iters_y) { // in some cases there's nothing to think about, part 1 if (iters_x < maxThreads && iters_y < maxThreads) - return nd4j::math::nd4j_max(iters_x, iters_y); + return sd::math::nd4j_max(iters_x, iters_y); auto remX = iters_x % maxThreads; auto remY = iters_y % maxThreads; diff --git a/libnd4j/include/execution/impl/Ticket.cpp b/libnd4j/include/execution/impl/Ticket.cpp index 5bf911fd0..98cb05376 100644 --- a/libnd4j/include/execution/impl/Ticket.cpp +++ b/libnd4j/include/execution/impl/Ticket.cpp @@ -31,7 +31,7 @@ namespace samediff { Ticket::Ticket() { _acquired = true; - _interfaces.resize(nd4j::Environment::getInstance()->maxThreads()); + _interfaces.resize(sd::Environment::getInstance()->maxThreads()); } bool Ticket::acquired() { diff --git a/libnd4j/include/graph/ArgumentsList.h b/libnd4j/include/graph/ArgumentsList.h index eb1800444..75bdf857a 100644 --- a/libnd4j/include/graph/ArgumentsList.h +++ b/libnd4j/include/graph/ArgumentsList.h @@ -21,13 +21,13 @@ #ifndef LIBND4J_INPUTLIST_H #define LIBND4J_INPUTLIST_H -#include -#include -#include +#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT ArgumentsList { protected: diff --git a/libnd4j/include/graph/Context.h b/libnd4j/include/graph/Context.h index 51f7bfa2b..96d7e8b12 100644 --- a/libnd4j/include/graph/Context.h +++ b/libnd4j/include/graph/Context.h @@ -23,7 +23,7 @@ #define LIBND4J_CONTEXT_H #include -#include +#include #include #include #include @@ -39,26 +39,26 @@ #include #endif -namespace nd4j { +namespace sd { namespace graph { /** * This class defines input desired for any given 
node/operation within graph */ - class ND4J_EXPORT Context : public nd4j::graph::ContextPrototype { + class ND4J_EXPORT Context : public sd::graph::ContextPrototype { protected: - nd4j::memory::Workspace* _workspace = nullptr; - nd4j::graph::VariableSpace* _variableSpace = nullptr; + sd::memory::Workspace* _workspace = nullptr; + sd::graph::VariableSpace* _variableSpace = nullptr; std::pair _executionTime; - nd4j::random::RandomBuffer* _rng = nullptr; + sd::random::RandomBuffer* _rng = nullptr; - nd4j::DataType _dataType = nd4j::DataType::FLOAT32; + sd::DataType _dataType = sd::DataType::FLOAT32; // branch for divergent_op int _branch = 0; // temporary context for standalone ops execution LaunchContext* _context = nullptr; - std::vector _dataTypes; + std::vector _dataTypes; // fields for fast execution (out-of-graph ops use) std::vector _fastpath_in; @@ -87,30 +87,30 @@ namespace nd4j { Nd4jLong getOuterTime(); Nd4jLong getInnerTime(); - nd4j::DataType dataType() override; + sd::DataType dataType() override; - nd4j::DataType dataType(int index) override; - void setDataType(int index, nd4j::DataType type) override; + sd::DataType dataType(int index) override; + void setDataType(int index, sd::DataType type) override; // these methods are related to Workspace abstraction bool hasWorkspaceProvided(); - void attachWorkspace(nd4j::memory::Workspace* workspace); + void attachWorkspace(sd::memory::Workspace* workspace); void forgetWorkspace(); // these methods return full-time workspace - nd4j::memory::Workspace* getWorkspace(); - nd4j::memory::Workspace* workspace(); - nd4j::memory::Workspace* fWorkspace(); + sd::memory::Workspace* getWorkspace(); + sd::memory::Workspace* workspace(); + sd::memory::Workspace* fWorkspace(); // this method returns workspace for temporary allocations - nd4j::memory::Workspace* tWorkspace(); + sd::memory::Workspace* tWorkspace(); // this method returns workspace for object allocations - nd4j::memory::Workspace* oWorkspace(); + 
sd::memory::Workspace* oWorkspace(); void setVariableSpace(VariableSpace* variableSpace); - nd4j::random::RandomBuffer* getRNG(); - void setRNG(nd4j::random::RandomBuffer* rng); + sd::random::RandomBuffer* getRNG(); + void setRNG(sd::random::RandomBuffer* rng); void setTargetEngine(samediff::Engine engine); @@ -206,12 +206,12 @@ namespace nd4j { void setTArguments(double *arguments, int numberOfArguments); void setIArguments(Nd4jLong *arguments, int numberOfArguments); void setBArguments(bool *arguments, int numberOfArguments); - void setDArguments(nd4j::DataType *arguments, int numberOfArguments); + void setDArguments(sd::DataType *arguments, int numberOfArguments); void setTArguments(const std::vector &tArgs); void setIArguments(const std::vector &tArgs); void setBArguments(const std::vector &tArgs); - void setDArguments(const std::vector &dArgs); + void setDArguments(const std::vector &dArgs); /** * This method purges fastpath in/out contents and releases all the handles. diff --git a/libnd4j/include/graph/ContextPrototype.h b/libnd4j/include/graph/ContextPrototype.h index fac664598..57d773dbb 100644 --- a/libnd4j/include/graph/ContextPrototype.h +++ b/libnd4j/include/graph/ContextPrototype.h @@ -23,10 +23,10 @@ #define ND4J_CONTEXT_PROTOTYPE_H #include -#include +#include #include -#include -#include +#include +#include #include #include #include @@ -35,7 +35,7 @@ #include #endif -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT ContextPrototype { @@ -47,10 +47,10 @@ namespace nd4j { std::vector _iArgs; std::vector _bArgs; std::vector _axis; - std::vector _dArgs; + std::vector _dArgs; // TODO: remove this field - nd4j::DataType _dataType = nd4j::DataType::FLOAT32; + sd::DataType _dataType = sd::DataType::FLOAT32; bool _isInplace; // opNum for legacy XYZ ops @@ -58,17 +58,17 @@ namespace nd4j { uint64_t _rootSeed; RandomGenerator _randomGenerator; - std::vector _dataTypes; + std::vector _dataTypes; - nd4j::ops::OpDescriptor* _opDescriptor; - 
bool _useMKLDNN = nd4j::Environment::getInstance()->isUseMKLDNN(); + sd::ops::OpDescriptor* _opDescriptor; + bool _useMKLDNN = sd::Environment::getInstance()->isUseMKLDNN(); // target engine for execution samediff::Engine _engine = DEFAULT_ENGINE; samediff::ExecutionMode _execMode = samediff::ExecutionMode::MODE_UNDEFINED; public: - explicit ContextPrototype(nd4j::ops::OpDescriptor* opDescriptor = nullptr, int nodeId = 1, bool inPlace = false); + explicit ContextPrototype(sd::ops::OpDescriptor* opDescriptor = nullptr, int nodeId = 1, bool inPlace = false); ~ContextPrototype() = default; int getNodeId(); @@ -77,11 +77,11 @@ namespace nd4j { // this method returns true, if inputs are defined bool hasVariablesFilled(); - void setOpDescriptor(nd4j::ops::OpDescriptor* opDescriptor); + void setOpDescriptor(sd::ops::OpDescriptor* opDescriptor); - virtual nd4j::DataType dataType(); - virtual nd4j::DataType dataType(int index); - virtual void setDataType(int index, nd4j::DataType type); + virtual sd::DataType dataType(); + virtual sd::DataType dataType(int index); + virtual void setDataType(int index, sd::DataType type); bool isInplace(); void markInplace(bool reallyInplace); @@ -96,7 +96,7 @@ namespace nd4j { std::vector* getTArguments(); std::vector* getIArguments(); std::vector* getBArguments(); - std::vector* getDArguments(); + std::vector* getDArguments(); std::vector* getAxis(); samediff::Engine engine(); diff --git a/libnd4j/include/graph/ExecutionResult.h b/libnd4j/include/graph/ExecutionResult.h index 850974943..b1f16032c 100644 --- a/libnd4j/include/graph/ExecutionResult.h +++ b/libnd4j/include/graph/ExecutionResult.h @@ -29,7 +29,7 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { class ExecutionResult { private: diff --git a/libnd4j/include/graph/ExecutorConfiguration.h b/libnd4j/include/graph/ExecutorConfiguration.h index be96c057d..40f299f02 100644 --- a/libnd4j/include/graph/ExecutorConfiguration.h +++ 
b/libnd4j/include/graph/ExecutorConfiguration.h @@ -22,22 +22,22 @@ #define LIBND4J_EXECUTORCONFIGURATION_H #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT ExecutorConfiguration { public: - nd4j::graph::ProfilingMode _profilingMode; - nd4j::graph::ExecutionMode _executionMode; - nd4j::graph::OutputMode _outputMode; + sd::graph::ProfilingMode _profilingMode; + sd::graph::ExecutionMode _executionMode; + sd::graph::OutputMode _outputMode; bool _timestats; Nd4jLong _footprintForward = 0L; Nd4jLong _footprintBackward = 0L; Direction _direction = Direction_FORWARD_ONLY; - explicit ExecutorConfiguration(const nd4j::graph::FlatConfiguration *conf = nullptr); + explicit ExecutorConfiguration(const sd::graph::FlatConfiguration *conf = nullptr); ~ExecutorConfiguration() = default; ExecutorConfiguration* clone(); diff --git a/libnd4j/include/graph/FlatUtils.h b/libnd4j/include/graph/FlatUtils.h index 939db1fb7..1b2a02dca 100644 --- a/libnd4j/include/graph/FlatUtils.h +++ b/libnd4j/include/graph/FlatUtils.h @@ -22,12 +22,12 @@ #define LIBND4J_FLATUTILS_H #include -#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT FlatUtils { public: @@ -35,7 +35,7 @@ namespace nd4j { static std::pair fromLongPair(LongPair* pair); - static NDArray* fromFlatArray(const nd4j::graph::FlatArray* flatArray); + static NDArray* fromFlatArray(const sd::graph::FlatArray* flatArray); static flatbuffers::Offset toFlatArray(flatbuffers::FlatBufferBuilder &builder, NDArray &array); }; diff --git a/libnd4j/include/graph/FlowPath.h b/libnd4j/include/graph/FlowPath.h index 3f72c695c..597520249 100644 --- a/libnd4j/include/graph/FlowPath.h +++ b/libnd4j/include/graph/FlowPath.h @@ -21,16 +21,16 @@ #ifndef LIBND4J_FLOWPATH_H #define LIBND4J_FLOWPATH_H -#include +#include #include #include -#include +#include #include #include #include -#include +#include -namespace 
nd4j { +namespace sd { namespace graph { class ND4J_EXPORT FlowPath { private: diff --git a/libnd4j/include/graph/FrameState.h b/libnd4j/include/graph/FrameState.h index ea3011a6d..1c0edbc0b 100644 --- a/libnd4j/include/graph/FrameState.h +++ b/libnd4j/include/graph/FrameState.h @@ -22,10 +22,10 @@ #define LIBND4J_FRAMESTATE_H #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT FrameState { private: diff --git a/libnd4j/include/graph/Graph.h b/libnd4j/include/graph/Graph.h index 5145ba9b0..a160872fd 100644 --- a/libnd4j/include/graph/Graph.h +++ b/libnd4j/include/graph/Graph.h @@ -37,7 +37,7 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT Graph { @@ -51,10 +51,10 @@ namespace nd4j { // vector holds ID's of top nodes only std::vector *_nodes; - MAP_IMPL *_mapped; + MAP_IMPL *_mapped; - MAP_IMPL *> *_onion; - MAP_IMPL _unmapped; + MAP_IMPL *> *_onion; + MAP_IMPL _unmapped; std::vector _unmappedMap; // macOS? 
std::mutex _mutexPreprocessing; @@ -68,11 +68,11 @@ namespace nd4j { std::vector _scopes; //////////////////////////////////////// - Nd4jStatus validateNode(nd4j::graph::Node *node); + Nd4jStatus validateNode(sd::graph::Node *node); void expandOnion(int newLayer); - void injectNode(nd4j::graph::Node *node); + void injectNode(sd::graph::Node *node); void pushToOutputOnce(int id); @@ -105,39 +105,39 @@ namespace nd4j { int numberOfPlaceholders(); - std::vector* getPlaceholders(); + std::vector* getPlaceholders(); /** * This method returns pointer to thread_local VariableSpace * @return */ - nd4j::graph::VariableSpace *getVariableSpace(); + sd::graph::VariableSpace *getVariableSpace(); /** * This method adds given node to the graph * * @param node */ - void addNode(nd4j::graph::Node *node); + void addNode(sd::graph::Node *node); /** * This method returns layered representation of the graph * * @return */ - MAP_IMPL *> *getOnion(); + MAP_IMPL *> *getOnion(); /** * This method returns map of all nodes of the graph * @return */ - MAP_IMPL* getMapped(); + MAP_IMPL* getMapped(); /** * This method returns outputs of this graph * @return */ - std::vector *fetchOutputs(); + std::vector *fetchOutputs(); /** * This method returns pointer to ExecutorConfiguration @@ -156,7 +156,7 @@ namespace nd4j { * This method returns all nodes at once (order is NOT guaranteed) * @return */ - std::vector *getAllNodes(); + std::vector *getAllNodes(); /** * This method prints out Graph op-by-op, and respective inputs @@ -166,7 +166,7 @@ namespace nd4j { /** * This method collect all ops from the graph into ops vector */ - std::vector getOperations(); + std::vector getOperations(); /** * This method returns Scope ptr specified with id diff --git a/libnd4j/blas/GraphExecutioner.h b/libnd4j/include/graph/GraphExecutioner.h similarity index 89% rename from libnd4j/blas/GraphExecutioner.h rename to libnd4j/include/graph/GraphExecutioner.h index 6ea459b2f..148b27951 100644 --- 
a/libnd4j/blas/GraphExecutioner.h +++ b/libnd4j/include/graph/GraphExecutioner.h @@ -31,13 +31,13 @@ #include #include #include -#include +#include #define TF_INPUT "Placeholder" #define TF_CONST "Const" #define TF_VAR "VariableV2" -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT GraphExecutioner { @@ -45,7 +45,7 @@ namespace nd4j { public: - //static Nd4jStatus executeFlatNode(nd4j::graph::Graph *graph, nd4j::graph::Node *node, nd4j::graph::VariableSpace *variableSpace); + //static Nd4jStatus executeFlatNode(sd::graph::Graph *graph, sd::graph::Node *node, sd::graph::VariableSpace *variableSpace); static Nd4jStatus executeFlatNode(Graph *graph, Node *node, VariableSpace *variableSpace); @@ -62,7 +62,7 @@ namespace nd4j { * @param pointer Pointer to FlatBuffer * @return pointer to FlatBuffer with result */ - static nd4j::graph::ResultWrapper* executeFlatBuffer(Nd4jPointer pointer); + static sd::graph::ResultWrapper* executeFlatBuffer(Nd4jPointer pointer); static flatbuffers::Offset execute(Graph *graph, flatbuffers::FlatBufferBuilder &builder, const FlatInferenceRequest* request); diff --git a/libnd4j/include/graph/GraphHolder.h b/libnd4j/include/graph/GraphHolder.h index 3465d182e..07e091f42 100644 --- a/libnd4j/include/graph/GraphHolder.h +++ b/libnd4j/include/graph/GraphHolder.h @@ -19,14 +19,14 @@ // #include -#include +#include #include #include #include #include #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT GraphHolder { private: diff --git a/libnd4j/include/graph/GraphState.h b/libnd4j/include/graph/GraphState.h index 6fc553a09..89343997f 100644 --- a/libnd4j/include/graph/GraphState.h +++ b/libnd4j/include/graph/GraphState.h @@ -21,21 +21,21 @@ #ifndef LIBND4J_GRAPHSTATE_H #define LIBND4J_GRAPHSTATE_H -#include -#include -#include +#include +#include +#include #include #include #include #include -#include +#include #include #include #include #include #include -namespace nd4j { +namespace sd { namespace 
graph { class ND4J_EXPORT GraphState { @@ -94,7 +94,7 @@ namespace graph { * @param op * @return */ - Nd4jStatus attachOpToScope(int scopeId, int nodeId, nd4j::ops::DeclarableOp *op, ArgumentsList inputs); + Nd4jStatus attachOpToScope(int scopeId, int nodeId, sd::ops::DeclarableOp *op, ArgumentsList inputs); /** * This method returns pointer to the scope with given id diff --git a/libnd4j/include/graph/GraphUtils.h b/libnd4j/include/graph/GraphUtils.h index 6938bbea8..3aaf820ae 100644 --- a/libnd4j/include/graph/GraphUtils.h +++ b/libnd4j/include/graph/GraphUtils.h @@ -25,12 +25,12 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT GraphUtils { public: - typedef std::vector OpList; + typedef std::vector OpList; public: static bool filterOperations(OpList& ops); diff --git a/libnd4j/include/graph/InferenceRequest.h b/libnd4j/include/graph/InferenceRequest.h index 9449d7bc1..b445fa0e1 100644 --- a/libnd4j/include/graph/InferenceRequest.h +++ b/libnd4j/include/graph/InferenceRequest.h @@ -20,13 +20,13 @@ #ifndef DEV_TESTS_INFERENCEREQUEST_H #define DEV_TESTS_INFERENCEREQUEST_H -#include -#include -#include +#include +#include +#include #include #include "ExecutorConfiguration.h" -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT InferenceRequest { private: diff --git a/libnd4j/include/graph/Intervals.h b/libnd4j/include/graph/Intervals.h index 74cc2cf3c..3a7964076 100644 --- a/libnd4j/include/graph/Intervals.h +++ b/libnd4j/include/graph/Intervals.h @@ -21,12 +21,12 @@ #ifndef LIBND4J_INTERVALS_H #define LIBND4J_INTERVALS_H -#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT Intervals { diff --git a/libnd4j/include/graph/Node.h b/libnd4j/include/graph/Node.h index f07bfac18..5fde65f3c 100644 --- a/libnd4j/include/graph/Node.h +++ b/libnd4j/include/graph/Node.h @@ -22,15 +22,15 @@ #define LIBND4J_GNODE_H #include -#include +#include #include -#include 
+#include #include "Context.h" #include #include -namespace nd4j { +namespace sd { namespace graph { @@ -39,7 +39,7 @@ namespace nd4j { class ND4J_EXPORT Node { protected: // TODO: this field must be removed - nd4j::DataType _dataType; + sd::DataType _dataType; OpType _opType; ContextPrototype* _protoContext = nullptr; @@ -80,8 +80,8 @@ namespace nd4j { OpClass _opClass; // these fields are used to store embedded CustomOps and Graph in case of Graph-in-Graph scenario - nd4j::graph::Graph * _graph= nullptr; - nd4j::ops::DeclarableOp *_customOp = nullptr; + sd::graph::Graph * _graph= nullptr; + sd::ops::DeclarableOp *_customOp = nullptr; // each node can be active or inactive, if used with divergents, like IF statements bool _active = true; @@ -96,14 +96,14 @@ namespace nd4j { Nd4jLong _frameId = -1; public: - explicit Node(nd4j::ops::DeclarableOp *customOp, int id = 0, std::initializer_list input = {}, std::initializer_list output = {}, std::initializer_list dimensions = {}, float scalar = 0.0f, std::initializer_list tArgs = {}, std::initializer_list iArgs = {}); + explicit Node(sd::ops::DeclarableOp *customOp, int id = 0, std::initializer_list input = {}, std::initializer_list output = {}, std::initializer_list dimensions = {}, float scalar = 0.0f, std::initializer_list tArgs = {}, std::initializer_list iArgs = {}); explicit Node(OpType opType = OpType_TRANSFORM_SAME, int opNum = 0, int id = 0, std::initializer_list input = {}, std::initializer_list output = {}, std::initializer_list dimensions = {}, float scalar = 0.0f, std::initializer_list tArgs = {}, std::initializer_list iArgs = {}); - explicit Node(const nd4j::graph::FlatNode *node); + explicit Node(const sd::graph::FlatNode *node); ~Node(); bool equals(Node *other); - nd4j::DataType dataType(); + sd::DataType dataType(); ContextPrototype *protoContext(); OpType opType(); Nd4jLong opNum(); @@ -168,12 +168,12 @@ namespace nd4j { ContextPrototype* getContextPrototype(); bool hasBlockAttached(); - void 
setCustomOp(nd4j::ops::DeclarableOp *customOp = nullptr); - nd4j::ops::DeclarableOp* getCustomOp(); + void setCustomOp(sd::ops::DeclarableOp *customOp = nullptr); + sd::ops::DeclarableOp* getCustomOp(); bool hasCustomOp(); - void setGraph(nd4j::graph::Graph* graph = nullptr); - nd4j::graph::Graph* getGraph(); + void setGraph(sd::graph::Graph* graph = nullptr); + sd::graph::Graph* getGraph(); bool hasGraphEmbedded(); bool isInplace(); @@ -234,7 +234,7 @@ namespace nd4j { } - static nd4j::ops::DeclarableOp* buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar); + static sd::ops::DeclarableOp* buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar); static void deleteOpByType(OpType opType, void *op); }; } diff --git a/libnd4j/include/graph/NodeState.h b/libnd4j/include/graph/NodeState.h index ece1e317e..5e0a7a6d2 100644 --- a/libnd4j/include/graph/NodeState.h +++ b/libnd4j/include/graph/NodeState.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_NODESTATE_H #define LIBND4J_NODESTATE_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT NodeState { private: diff --git a/libnd4j/include/graph/RandomGenerator.h b/libnd4j/include/graph/RandomGenerator.h index de475b8f8..ef06c345d 100644 --- a/libnd4j/include/graph/RandomGenerator.h +++ b/libnd4j/include/graph/RandomGenerator.h @@ -22,9 +22,9 @@ #define LIBND4J_GRAPH_RNG_H #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -35,7 +35,7 @@ #include #endif -namespace nd4j { +namespace sd { namespace graph { #ifdef __CUDACC__ class ND4J_EXPORT CudaManagedRandomGenerator { diff --git a/libnd4j/include/graph/RandomGenerator.hpp b/libnd4j/include/graph/RandomGenerator.hpp index 9efeaefdc..fbbc8bad1 100644 --- a/libnd4j/include/graph/RandomGenerator.hpp +++ b/libnd4j/include/graph/RandomGenerator.hpp @@ -19,14 +19,14 @@ // // relies on xoroshiro64** 
and xoroshiro128 implementations -#include -#include +#include +#include #include #include #include #include -namespace nd4j { +namespace sd { namespace graph { diff --git a/libnd4j/include/graph/ResultWrapper.h b/libnd4j/include/graph/ResultWrapper.h index a19a6bd5b..fe5193097 100644 --- a/libnd4j/include/graph/ResultWrapper.h +++ b/libnd4j/include/graph/ResultWrapper.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_RESULTWRAPPER_H #define LIBND4J_RESULTWRAPPER_H -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT ResultWrapper { private: diff --git a/libnd4j/include/graph/Scope.h b/libnd4j/include/graph/Scope.h index 5cbbf8bc0..42b99c18e 100644 --- a/libnd4j/include/graph/Scope.h +++ b/libnd4j/include/graph/Scope.h @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { /** diff --git a/libnd4j/include/graph/SessionLocalStorage.h b/libnd4j/include/graph/SessionLocalStorage.h index 5d6299938..3cb77ec3a 100644 --- a/libnd4j/include/graph/SessionLocalStorage.h +++ b/libnd4j/include/graph/SessionLocalStorage.h @@ -29,7 +29,7 @@ #include "Stash.h" #include -namespace nd4j{ +namespace sd{ namespace graph { class ND4J_EXPORT SessionLocalStorage { protected: diff --git a/libnd4j/include/graph/Stash.h b/libnd4j/include/graph/Stash.h index b44396819..ba431d057 100644 --- a/libnd4j/include/graph/Stash.h +++ b/libnd4j/include/graph/Stash.h @@ -22,15 +22,15 @@ #define LIBND4J_STASH_H //#include -#include +#include #include #include #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT KeyPair { int _node; @@ -54,33 +54,33 @@ namespace nd4j { namespace std { template <> - class ND4J_EXPORT hash { + class ND4J_EXPORT hash { public: - size_t operator()(const nd4j::graph::KeyPair& k) const; + size_t operator()(const sd::graph::KeyPair& k) const; }; }; #endif -namespace nd4j { +namespace sd { namespace graph { class 
ND4J_EXPORT Stash { protected: - std::map _stash; - std::vector _handles; + std::map _stash; + std::vector _handles; public: Stash(); ~Stash(); - //void storeArray(nd4j::graph::Block& block, const char *name, nd4j::NDArray *array); - void storeArray(int nodeId, const char *name, nd4j::NDArray *array); + //void storeArray(sd::graph::Block& block, const char *name, sd::NDArray *array); + void storeArray(int nodeId, const char *name, sd::NDArray *array); - //bool checkStash(nd4j::graph::Block& block, const char *name); + //bool checkStash(sd::graph::Block& block, const char *name); bool checkStash(int nodeId, const char *name); - //nd4j::NDArray* extractArray(nd4j::graph::Block& block, const char *name); - nd4j::NDArray* extractArray(int nodeId, const char *name); + //sd::NDArray* extractArray(sd::graph::Block& block, const char *name); + sd::NDArray* extractArray(int nodeId, const char *name); void clear(); }; diff --git a/libnd4j/include/Status.h b/libnd4j/include/graph/Status.h similarity index 93% rename from libnd4j/include/Status.h rename to libnd4j/include/graph/Status.h index 5baa7cca2..42794488d 100644 --- a/libnd4j/include/Status.h +++ b/libnd4j/include/graph/Status.h @@ -21,12 +21,12 @@ #ifndef ND4J_STATUS_H #define ND4J_STATUS_H -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT Status { public: static FORCEINLINE Nd4jStatus OK() { diff --git a/libnd4j/include/graph/TimeHolder.h b/libnd4j/include/graph/TimeHolder.h index d3509e8ba..191a75bac 100644 --- a/libnd4j/include/graph/TimeHolder.h +++ b/libnd4j/include/graph/TimeHolder.h @@ -22,10 +22,10 @@ #define LIBND4J_TIMEHOLDER_H #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT TimeHolder { private: diff --git a/libnd4j/include/graph/Variable.h b/libnd4j/include/graph/Variable.h index 76ce62fcf..b3ac74533 100644 --- a/libnd4j/include/graph/Variable.h +++ 
b/libnd4j/include/graph/Variable.h @@ -22,7 +22,7 @@ #define LIBND4J_VARIABLE_H #include -#include +#include #include #include #include @@ -54,13 +54,13 @@ namespace std { #endif -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT Variable { protected: int _id = 0; int _index = 0; - nd4j::NDArray *_ndarray = nullptr; + sd::NDArray *_ndarray = nullptr; std::string _name; std::vector _shape; @@ -75,15 +75,19 @@ namespace nd4j { //InputType _variableType = InputType_UNDEFINED; //DataType _dataType = INHERIT; - nd4j::NDArrayList *_list = nullptr; + sd::NDArrayList *_list = nullptr; VariableType _variableType = VariableType::NDARRAY; public: Variable(bool placeHolder); - Variable(nd4j::NDArray *arrayw, const char *name, int id, int idx = 0); - Variable(nd4j::NDArray *array = nullptr, const char *name = nullptr); - Variable(const nd4j::graph::FlatVariable *flatVariable); + Variable(sd::NDArray *arrayw, const char *name, int id, int idx = 0); + Variable(sd::NDArray *array = nullptr, const char *name = nullptr); + +#ifndef __JAVACPP_HACK__ + Variable(const sd::graph::FlatVariable *flatVariable); +#endif + ~Variable(); Variable* clone(); @@ -92,12 +96,12 @@ namespace nd4j { ND4J_EXPORT Variable* asT(); bool hasNDArray(); - nd4j::NDArray* getNDArray(); - void setNDArray(nd4j::NDArray *array); + sd::NDArray* getNDArray(); + void setNDArray(sd::NDArray *array); bool hasNDArrayList(); - nd4j::NDArrayList* getNDArrayList(); - void setNDArrayList(nd4j::NDArrayList* list); + sd::NDArrayList* getNDArrayList(); + void setNDArrayList(sd::NDArrayList* list); bool isExternal(); bool isReadOnly(); diff --git a/libnd4j/include/graph/VariableProxy.h b/libnd4j/include/graph/VariableProxy.h index 63027f237..1569b477d 100644 --- a/libnd4j/include/graph/VariableProxy.h +++ b/libnd4j/include/graph/VariableProxy.h @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT VariableProxy: public VariableSpace { protected: @@ -35,7 +35,7 @@ 
namespace nd4j { virtual int numberOfPlaceholders(); virtual std::vector* getPlaceholders(); - virtual nd4j::memory::Workspace *workspace(); + virtual sd::memory::Workspace *workspace(); virtual bool hasExternalVariable(int it); virtual bool hasExternalVariable(std::pair& pair); @@ -46,10 +46,10 @@ namespace nd4j { virtual bool hasVariable(std::pair& pair); virtual bool hasVariable(std::string *symbol); - virtual nd4j::graph::Variable *getVariable(int id); - virtual nd4j::graph::Variable *getVariable(int id, int idx); - virtual nd4j::graph::Variable *getVariable(std::pair& pair); - virtual nd4j::graph::Variable *getVariable(std::string *symbol); + virtual sd::graph::Variable *getVariable(int id); + virtual sd::graph::Variable *getVariable(int id, int idx); + virtual sd::graph::Variable *getVariable(std::pair& pair); + virtual sd::graph::Variable *getVariable(std::string *symbol); virtual std::vector getVariables(); @@ -68,7 +68,7 @@ namespace nd4j { virtual void putOutputVariable(Variable *variable); - virtual void trackList(nd4j::NDArrayList *list); + virtual void trackList(sd::NDArrayList *list); // memory-related statistics virtual Nd4jLong externalMemory(); @@ -79,9 +79,9 @@ namespace nd4j { virtual int internalEntries(); virtual int totalEntries(); - virtual nd4j::graph::VariableSpace *clone(); + virtual sd::graph::VariableSpace *clone(); - virtual nd4j::graph::Stash* getStash(); + virtual sd::graph::Stash* getStash(); virtual void setFlowPath(FlowPath* timers); virtual FlowPath* flowPath(); }; diff --git a/libnd4j/include/graph/VariableSpace.h b/libnd4j/include/graph/VariableSpace.h index 6ae0339ab..ea3c6370d 100644 --- a/libnd4j/include/graph/VariableSpace.h +++ b/libnd4j/include/graph/VariableSpace.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -36,14 +36,14 @@ #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT VariableSpace { protected: - nd4j::memory::Workspace *_workspace; + 
sd::memory::Workspace *_workspace; // stash is NOT cloned - nd4j::graph::Stash _stash; + sd::graph::Stash _stash; MAP_IMPL, Variable*> _paired; MAP_IMPL _symbolic; @@ -51,9 +51,9 @@ namespace nd4j { std::vector _external; std::vector _internal; - std::vector _lists; + std::vector _lists; - std::vector _placeholders; + std::vector _placeholders; void silentPutVariable(std::pair& pair, Variable *variable); @@ -61,9 +61,9 @@ namespace nd4j { std::mutex _varmap; - MAP_IMPL _temporary; + MAP_IMPL _temporary; - std::vector *_handles; + std::vector *_handles; FlowPath* _flow = nullptr; @@ -75,7 +75,7 @@ namespace nd4j { virtual int numberOfPlaceholders(); virtual std::vector* getPlaceholders(); - virtual void setWorkspace(nd4j::memory::Workspace *workspace); + virtual void setWorkspace(sd::memory::Workspace *workspace); virtual LaunchContext* launchContext(); @@ -88,10 +88,10 @@ namespace nd4j { virtual bool hasVariable(std::pair& pair); virtual bool hasVariable(std::string *symbol); - virtual nd4j::graph::Variable* getVariable(int id); - virtual nd4j::graph::Variable* getVariable(int id, int idx); - virtual nd4j::graph::Variable* getVariable(std::pair& pair); - virtual nd4j::graph::Variable* getVariable(std::string *symbol); + virtual sd::graph::Variable* getVariable(int id); + virtual sd::graph::Variable* getVariable(int id, int idx); + virtual sd::graph::Variable* getVariable(std::pair& pair); + virtual sd::graph::Variable* getVariable(std::string *symbol); virtual std::vector getVariables(); @@ -106,7 +106,7 @@ namespace nd4j { virtual void dropVariable(std::pair &pair); virtual void dropVariable(int id, int idx); - virtual void trackList(nd4j::NDArrayList *list); + virtual void trackList(sd::NDArrayList *list); virtual void putOutputVariable(Variable *variable); @@ -121,17 +121,17 @@ namespace nd4j { virtual int internalEntries(); virtual int totalEntries(); - virtual nd4j::graph::VariableSpace* clone(); + virtual sd::graph::VariableSpace* clone(); std::vector 
*handles(); - nd4j::graph::VariableSpace* asT(); + sd::graph::VariableSpace* asT(); void injectVariable(std::pair &pair, Variable* variable); - virtual nd4j::graph::Stash* getStash(); + virtual sd::graph::Stash* getStash(); - virtual std::vector * getExternalVariables(); + virtual std::vector * getExternalVariables(); virtual void setFlowPath(FlowPath* timers); virtual FlowPath* flowPath(); diff --git a/libnd4j/include/graph/VariableType.h b/libnd4j/include/graph/VariableType.h index 5c3ca6057..28883f9b1 100644 --- a/libnd4j/include/graph/VariableType.h +++ b/libnd4j/include/graph/VariableType.h @@ -21,7 +21,7 @@ #ifndef ND4J_VARIABLE_TYPE_H #define ND4J_VARIABLE_TYPE_H -namespace nd4j { +namespace sd { namespace graph { enum VariableType { NDARRAY = 0, diff --git a/libnd4j/include/graph/VariablesSet.h b/libnd4j/include/graph/VariablesSet.h index 070ac4442..682b7fce4 100644 --- a/libnd4j/include/graph/VariablesSet.h +++ b/libnd4j/include/graph/VariablesSet.h @@ -23,16 +23,16 @@ #include #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT VariablesSet { protected: - std::vector _holder; + std::vector _holder; Nd4jStatus _status; public: VariablesSet(Nd4jStatus status = ND4J_STATUS_OK); diff --git a/libnd4j/include/graph/exceptions/impl/unresolved_input_exception.cpp b/libnd4j/include/graph/exceptions/impl/unresolved_input_exception.cpp index ac8e511c7..fe6e45875 100644 --- a/libnd4j/include/graph/exceptions/impl/unresolved_input_exception.cpp +++ b/libnd4j/include/graph/exceptions/impl/unresolved_input_exception.cpp @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { unresolved_input_exception::unresolved_input_exception(std::string message) : std::runtime_error(message) { // diff --git a/libnd4j/include/graph/exceptions/impl/unresolved_output_exception.cpp 
b/libnd4j/include/graph/exceptions/impl/unresolved_output_exception.cpp index 4ccc10c71..df8b5eb00 100644 --- a/libnd4j/include/graph/exceptions/impl/unresolved_output_exception.cpp +++ b/libnd4j/include/graph/exceptions/impl/unresolved_output_exception.cpp @@ -18,11 +18,11 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace graph { unresolved_output_exception::unresolved_output_exception(std::string message) : std::runtime_error(message) { // diff --git a/libnd4j/include/graph/exceptions/unresolved_input_exception.h b/libnd4j/include/graph/exceptions/unresolved_input_exception.h index e779ecff2..5e38977a9 100644 --- a/libnd4j/include/graph/exceptions/unresolved_input_exception.h +++ b/libnd4j/include/graph/exceptions/unresolved_input_exception.h @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { class unresolved_input_exception : public std::runtime_error { public: diff --git a/libnd4j/include/graph/exceptions/unresolved_output_exception.h b/libnd4j/include/graph/exceptions/unresolved_output_exception.h index 6040d3278..05d39c514 100644 --- a/libnd4j/include/graph/exceptions/unresolved_output_exception.h +++ b/libnd4j/include/graph/exceptions/unresolved_output_exception.h @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { class unresolved_output_exception : public std::runtime_error { public: diff --git a/libnd4j/include/graph/execution/LogicConditional.h b/libnd4j/include/graph/execution/LogicConditional.h index 0a39ef05f..ffaf6f098 100644 --- a/libnd4j/include/graph/execution/LogicConditional.h +++ b/libnd4j/include/graph/execution/LogicConditional.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_LOGICCONDITIONAL_H #define LIBND4J_LOGICCONDITIONAL_H -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { /** * This class is responsible for execution logic of Conditional 
logical abstraction diff --git a/libnd4j/include/graph/execution/LogicEnter.h b/libnd4j/include/graph/execution/LogicEnter.h index a4f64c83d..d770ff10a 100644 --- a/libnd4j/include/graph/execution/LogicEnter.h +++ b/libnd4j/include/graph/execution/LogicEnter.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_LOGICENTER_H #define LIBND4J_LOGICENTER_H -#include +#include #include -namespace nd4j { +namespace sd { namespace graph { class LogicEnter { public: diff --git a/libnd4j/include/graph/execution/LogicExecutor.h b/libnd4j/include/graph/execution/LogicExecutor.h index 7b8e5e89e..541b3fc84 100644 --- a/libnd4j/include/graph/execution/LogicExecutor.h +++ b/libnd4j/include/graph/execution/LogicExecutor.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_LOGICEXECUTOR_H #define LIBND4J_LOGICEXECUTOR_H -#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { /** * This class acts as switch for picking logic execution based on opNum, unique for each logical op diff --git a/libnd4j/include/graph/execution/LogicExit.h b/libnd4j/include/graph/execution/LogicExit.h index 617338e14..d182e26fb 100644 --- a/libnd4j/include/graph/execution/LogicExit.h +++ b/libnd4j/include/graph/execution/LogicExit.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_LOGICEXIT_H #define LIBND4J_LOGICEXIT_H -#include +#include #include -namespace nd4j { +namespace sd { namespace graph { class LogicExit { public: diff --git a/libnd4j/include/graph/execution/LogicExpose.h b/libnd4j/include/graph/execution/LogicExpose.h index 3b86270db..046f3e64e 100644 --- a/libnd4j/include/graph/execution/LogicExpose.h +++ b/libnd4j/include/graph/execution/LogicExpose.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_LOGICEXPOSE_H #define LIBND4J_LOGICEXPOSE_H -#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { class LogicExpose { public: diff --git a/libnd4j/include/graph/execution/LogicLoopCond.h b/libnd4j/include/graph/execution/LogicLoopCond.h index 8e8d89c55..36693232b 100644 --- 
a/libnd4j/include/graph/execution/LogicLoopCond.h +++ b/libnd4j/include/graph/execution/LogicLoopCond.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_LOGICLOOPCOND_H #define LIBND4J_LOGICLOOPCOND_H -#include +#include #include -namespace nd4j { +namespace sd { namespace graph { class LogicLoopCond { public: diff --git a/libnd4j/include/graph/execution/LogicMerge.h b/libnd4j/include/graph/execution/LogicMerge.h index eebb7fea8..fe20c9d66 100644 --- a/libnd4j/include/graph/execution/LogicMerge.h +++ b/libnd4j/include/graph/execution/LogicMerge.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_LOGICMERGE_H #define LIBND4J_LOGICMERGE_H -#include +#include #include -namespace nd4j { +namespace sd { namespace graph { class LogicMerge { public: diff --git a/libnd4j/include/graph/execution/LogicNextIteration.h b/libnd4j/include/graph/execution/LogicNextIteration.h index b4cfff800..5b9600909 100644 --- a/libnd4j/include/graph/execution/LogicNextIteration.h +++ b/libnd4j/include/graph/execution/LogicNextIteration.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_LOGICNEXTITERATION_H #define LIBND4J_LOGICNEXTITERATION_H -#include +#include #include -namespace nd4j { +namespace sd { namespace graph { class LogicNextIeration { public: diff --git a/libnd4j/include/graph/execution/LogicReturn.h b/libnd4j/include/graph/execution/LogicReturn.h index 072b6c2a2..2cc6107c5 100644 --- a/libnd4j/include/graph/execution/LogicReturn.h +++ b/libnd4j/include/graph/execution/LogicReturn.h @@ -22,11 +22,11 @@ #define LIBND4J_LOGICRETURN_H -#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { /** * This class is responsible for execution logic of Return logical abstraction diff --git a/libnd4j/include/graph/execution/LogicScope.h b/libnd4j/include/graph/execution/LogicScope.h index ab2924a92..a7a8d6b7a 100644 --- a/libnd4j/include/graph/execution/LogicScope.h +++ b/libnd4j/include/graph/execution/LogicScope.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_LOGICSCOPE_H #define LIBND4J_LOGICSCOPE_H 
-#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { /** * This class is responsible for execution logic of Scope logical abstraction diff --git a/libnd4j/include/graph/execution/LogicSwitch.h b/libnd4j/include/graph/execution/LogicSwitch.h index e61c6b9cc..d91959d91 100644 --- a/libnd4j/include/graph/execution/LogicSwitch.h +++ b/libnd4j/include/graph/execution/LogicSwitch.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_LOGICSWITCH_H #define LIBND4J_LOGICSWITCH_H -#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { /** * This class is responsible for execution logic of Switch logical abstraction diff --git a/libnd4j/include/graph/execution/LogicWhile.h b/libnd4j/include/graph/execution/LogicWhile.h index b0bbcef05..6e4b2ea3a 100644 --- a/libnd4j/include/graph/execution/LogicWhile.h +++ b/libnd4j/include/graph/execution/LogicWhile.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_LOGICWHILE_H #define LIBND4J_LOGICWHILE_H -#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { /** * This class is responsible for execution logic of While logical abstraction diff --git a/libnd4j/include/graph/execution/impl/LogicConditional.cpp b/libnd4j/include/graph/execution/impl/LogicConditional.cpp index fb1f0fa1e..25627df45 100644 --- a/libnd4j/include/graph/execution/impl/LogicConditional.cpp +++ b/libnd4j/include/graph/execution/impl/LogicConditional.cpp @@ -19,12 +19,12 @@ // #include -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicConditional::processNode(Graph *graph, Node *node) { auto __variableSpace = graph->getVariableSpace(); @@ -130,7 +130,7 @@ namespace nd4j { } } - return nd4j::Status::OK(); + return sd::Status::OK(); } } } \ No newline at end of file diff --git a/libnd4j/include/graph/execution/impl/LogicEnter.cpp b/libnd4j/include/graph/execution/impl/LogicEnter.cpp index aa69e97ac..f10ff792f 100644 --- 
a/libnd4j/include/graph/execution/impl/LogicEnter.cpp +++ b/libnd4j/include/graph/execution/impl/LogicEnter.cpp @@ -19,10 +19,10 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicEnter::processNode(Graph *graph, Node *node) { // this op replicates input variable into the frame. basically happens once for single loop. @@ -68,7 +68,7 @@ namespace nd4j { } } - return nd4j::Status::OK(); + return sd::Status::OK(); } } } \ No newline at end of file diff --git a/libnd4j/include/graph/execution/impl/LogicExecutor.cpp b/libnd4j/include/graph/execution/impl/LogicExecutor.cpp index 5826e6745..fd7ce3e85 100644 --- a/libnd4j/include/graph/execution/impl/LogicExecutor.cpp +++ b/libnd4j/include/graph/execution/impl/LogicExecutor.cpp @@ -32,31 +32,31 @@ #include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicExecutor::processNode(Graph *graph, Node *node) { switch (node->opNum()) { - case nd4j::logic::While: + case sd::logic::While: return LogicWhile::processNode(graph, node); - case nd4j::logic::Scope: + case sd::logic::Scope: return LogicScope::processNode(graph, node); - case nd4j::logic::Conditional: + case sd::logic::Conditional: return LogicConditional::processNode(graph, node); - case nd4j::logic::Switch: + case sd::logic::Switch: return LogicSwitch::processNode(graph, node); - case nd4j::logic::Return: + case sd::logic::Return: return LogicReturn::processNode(graph, node); - case nd4j::logic::Expose: + case sd::logic::Expose: return LogicExpose::processNode(graph, node); - case nd4j::logic::Merge: + case sd::logic::Merge: return LogicMerge::processNode(graph, node); - case nd4j::logic::LoopCond: + case sd::logic::LoopCond: return LogicLoopCond::processNode(graph, node); - case nd4j::logic::NextIteration: + case sd::logic::NextIteration: return LogicNextIeration::processNode(graph, node); - case nd4j::logic::Exit: + case sd::logic::Exit: return LogicExit::processNode(graph, node); - case 
nd4j::logic::Enter: + case sd::logic::Enter: return LogicEnter::processNode(graph, node); } diff --git a/libnd4j/include/graph/execution/impl/LogicExit.cpp b/libnd4j/include/graph/execution/impl/LogicExit.cpp index b5e5c0e60..9a0e21793 100644 --- a/libnd4j/include/graph/execution/impl/LogicExit.cpp +++ b/libnd4j/include/graph/execution/impl/LogicExit.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicExit::processNode(Graph *graph, Node *node) { // this op is basically no-op diff --git a/libnd4j/include/graph/execution/impl/LogicExpose.cpp b/libnd4j/include/graph/execution/impl/LogicExpose.cpp index b2a454fcf..b19e1df55 100644 --- a/libnd4j/include/graph/execution/impl/LogicExpose.cpp +++ b/libnd4j/include/graph/execution/impl/LogicExpose.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicExpose::processNode(Graph *graph, Node *node) { // do we really want this? diff --git a/libnd4j/include/graph/execution/impl/LogicLoopCond.cpp b/libnd4j/include/graph/execution/impl/LogicLoopCond.cpp index 3b035f212..292452719 100644 --- a/libnd4j/include/graph/execution/impl/LogicLoopCond.cpp +++ b/libnd4j/include/graph/execution/impl/LogicLoopCond.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicLoopCond::processNode(Graph *graph, Node *node) { auto __variableSpace = graph->getVariableSpace(); diff --git a/libnd4j/include/graph/execution/impl/LogicMerge.cpp b/libnd4j/include/graph/execution/impl/LogicMerge.cpp index aa0d49a20..9d032a93f 100644 --- a/libnd4j/include/graph/execution/impl/LogicMerge.cpp +++ b/libnd4j/include/graph/execution/impl/LogicMerge.cpp @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicMerge::processNode(Graph *graph, Node *node) { // at merge node only one of inputs exist if that's just switch and other node isn't LogicNextItration diff --git 
a/libnd4j/include/graph/execution/impl/LogicNextIteration.cpp b/libnd4j/include/graph/execution/impl/LogicNextIteration.cpp index 439cc6bc1..fb7eaa513 100644 --- a/libnd4j/include/graph/execution/impl/LogicNextIteration.cpp +++ b/libnd4j/include/graph/execution/impl/LogicNextIteration.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicNextIeration::processNode(Graph *graph, Node *node) { auto __variableSpace = graph->getVariableSpace(); diff --git a/libnd4j/include/graph/execution/impl/LogicReturn.cpp b/libnd4j/include/graph/execution/impl/LogicReturn.cpp index b5f39c16e..c9dbafd6d 100644 --- a/libnd4j/include/graph/execution/impl/LogicReturn.cpp +++ b/libnd4j/include/graph/execution/impl/LogicReturn.cpp @@ -20,9 +20,9 @@ #include "graph/execution/LogicReturn.h" #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicReturn::processNode(Graph *graph, Node *node) { auto __variableSpace = graph->getVariableSpace(); @@ -49,7 +49,7 @@ namespace nd4j { nd4j_debug("In after: [%f]; Out after: [%f]\n", varIn->getNDArray()->meanNumber().e(0), varOut->getNDArray()->meanNumber().e(0)); } - return nd4j::Status::OK(); + return sd::Status::OK(); } } } diff --git a/libnd4j/include/graph/execution/impl/LogicScope.cpp b/libnd4j/include/graph/execution/impl/LogicScope.cpp index bfd58e1c7..1773aa6ea 100644 --- a/libnd4j/include/graph/execution/impl/LogicScope.cpp +++ b/libnd4j/include/graph/execution/impl/LogicScope.cpp @@ -19,15 +19,15 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicScope::processNode(Graph *graph, Node *node) { // this op is basically no-op // we just know it exists - return nd4j::Status::OK(); + return sd::Status::OK(); } } } \ No newline at end of file diff --git a/libnd4j/include/graph/execution/impl/LogicSwitch.cpp b/libnd4j/include/graph/execution/impl/LogicSwitch.cpp index c19d339dc..1089046a3 100644 --- 
a/libnd4j/include/graph/execution/impl/LogicSwitch.cpp +++ b/libnd4j/include/graph/execution/impl/LogicSwitch.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 21.10.17. // -#include +#include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicSwitch::processNode(Graph* graph, Node* node) { auto __variableSpace = graph->getVariableSpace(); @@ -102,7 +102,7 @@ namespace nd4j { } } - return nd4j::Status::OK(); + return sd::Status::OK(); }; } } diff --git a/libnd4j/include/graph/execution/impl/LogicWhile.cpp b/libnd4j/include/graph/execution/impl/LogicWhile.cpp index 147c35248..1dfd3aaf2 100644 --- a/libnd4j/include/graph/execution/impl/LogicWhile.cpp +++ b/libnd4j/include/graph/execution/impl/LogicWhile.cpp @@ -20,12 +20,12 @@ #include #include -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus LogicWhile::processNode(Graph *graph, Node *node) { auto __variableSpace = graph->getVariableSpace(); @@ -138,7 +138,7 @@ namespace nd4j { return ND4J_STATUS_KERNEL_FAILURE; } - return nd4j::Status::OK(); + return sd::Status::OK(); } } } diff --git a/libnd4j/include/graph/generated/array_generated.h b/libnd4j/include/graph/generated/array_generated.h index e3b3bbe60..5c4c0d7af 100644 --- a/libnd4j/include/graph/generated/array_generated.h +++ b/libnd4j/include/graph/generated/array_generated.h @@ -6,7 +6,7 @@ #include "flatbuffers/flatbuffers.h" -namespace nd4j { +namespace sd { namespace graph { struct FlatArray; @@ -236,7 +236,7 @@ inline flatbuffers::Offset CreateFlatArrayDirect( const std::vector *buffer = nullptr, DType dtype = DType_INHERIT, ByteOrder byteOrder = ByteOrder_LE) { - return nd4j::graph::CreateFlatArray( + return sd::graph::CreateFlatArray( _fbb, shape ? _fbb.CreateVector(*shape) : 0, buffer ? 
_fbb.CreateVector(*buffer) : 0, @@ -244,37 +244,37 @@ inline flatbuffers::Offset CreateFlatArrayDirect( byteOrder); } -inline const nd4j::graph::FlatArray *GetFlatArray(const void *buf) { - return flatbuffers::GetRoot(buf); +inline const sd::graph::FlatArray *GetFlatArray(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatArray *GetSizePrefixedFlatArray(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatArray *GetSizePrefixedFlatArray(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatArrayBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatArrayBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatArrayBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatArrayBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_ARRAY_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/config_generated.h b/libnd4j/include/graph/generated/config_generated.h index 686d3dc2f..2c12027a2 100644 --- a/libnd4j/include/graph/generated/config_generated.h +++ b/libnd4j/include/graph/generated/config_generated.h @@ -6,7 +6,7 @@ #include "flatbuffers/flatbuffers.h" -namespace nd4j { +namespace sd { namespace graph { struct FlatConfiguration; @@ -258,37 +258,37 @@ inline flatbuffers::Offset CreateFlatConfiguration( return builder_.Finish(); } -inline const nd4j::graph::FlatConfiguration *GetFlatConfiguration(const void *buf) { - return flatbuffers::GetRoot(buf); 
+inline const sd::graph::FlatConfiguration *GetFlatConfiguration(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatConfiguration *GetSizePrefixedFlatConfiguration(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatConfiguration *GetSizePrefixedFlatConfiguration(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatConfigurationBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatConfigurationBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatConfigurationBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatConfigurationBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_CONFIG_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/graph.grpc.fb.cc b/libnd4j/include/graph/generated/graph.grpc.fb.cc index 52d585abb..5146f49f2 100644 --- a/libnd4j/include/graph/generated/graph.grpc.fb.cc +++ b/libnd4j/include/graph/generated/graph.grpc.fb.cc @@ -13,7 +13,7 @@ #include #include #include -namespace nd4j { +namespace sd { namespace graph { static const char* GraphInferenceServer_method_names[] = { @@ -138,6 +138,6 @@ GraphInferenceServer::Service::~Service() { } -} // namespace nd4j +} // namespace sd } // namespace graph diff --git a/libnd4j/include/graph/generated/graph.grpc.fb.h b/libnd4j/include/graph/generated/graph.grpc.fb.h index 28a1e9275..0167f48d5 100644 --- a/libnd4j/include/graph/generated/graph.grpc.fb.h +++ 
b/libnd4j/include/graph/generated/graph.grpc.fb.h @@ -24,7 +24,7 @@ class ServerCompletionQueue; class ServerContext; } // namespace grpc -namespace nd4j { +namespace sd { namespace graph { class GraphInferenceServer final { @@ -366,7 +366,7 @@ class GraphInferenceServer final { }; } // namespace graph -} // namespace nd4j +} // namespace sd #endif // GRPC_graph__INCLUDED diff --git a/libnd4j/include/graph/generated/graph_generated.h b/libnd4j/include/graph/generated/graph_generated.h index 518e5d2cf..1285e4607 100644 --- a/libnd4j/include/graph/generated/graph_generated.h +++ b/libnd4j/include/graph/generated/graph_generated.h @@ -15,7 +15,7 @@ #include "utils_generated.h" #include "variable_generated.h" -namespace nd4j { +namespace sd { namespace graph { struct UpdaterState; @@ -96,7 +96,7 @@ inline flatbuffers::Offset CreateUpdaterStateDirect( const char *paramName = nullptr, const std::vector> *updaterStateKeys = nullptr, const std::vector> *updaterStateValues = nullptr) { - return nd4j::graph::CreateUpdaterState( + return sd::graph::CreateUpdaterState( _fbb, paramName ? _fbb.CreateString(paramName) : 0, updaterStateKeys ? _fbb.CreateVector>(*updaterStateKeys) : 0, @@ -248,7 +248,7 @@ inline flatbuffers::Offset CreateFlatGraphDirect( const std::vector> *lossVariables = nullptr, const char *trainingConfig = nullptr, const std::vector> *updaterState = nullptr) { - return nd4j::graph::CreateFlatGraph( + return sd::graph::CreateFlatGraph( _fbb, id, variables ? 
_fbb.CreateVector>(*variables) : 0, @@ -341,37 +341,37 @@ inline flatbuffers::Offset CreateFlatResponse( return builder_.Finish(); } -inline const nd4j::graph::FlatGraph *GetFlatGraph(const void *buf) { - return flatbuffers::GetRoot(buf); +inline const sd::graph::FlatGraph *GetFlatGraph(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatGraph *GetSizePrefixedFlatGraph(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatGraph *GetSizePrefixedFlatGraph(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatGraphBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatGraphBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatGraphBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatGraphBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_GRAPH_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/nd4j/graph/ByteOrder.cs b/libnd4j/include/graph/generated/nd4j/graph/ByteOrder.cs index 463de04af..5b20d143f 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/ByteOrder.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/ByteOrder.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum ByteOrder : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/DType.cs b/libnd4j/include/graph/generated/nd4j/graph/DType.cs index 9062dc881..f6fd2778c 100644 --- 
a/libnd4j/include/graph/generated/nd4j/graph/DType.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/DType.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum DType : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/Direction.cs b/libnd4j/include/graph/generated/nd4j/graph/Direction.cs index 9929b12e6..d93c1e947 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/Direction.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/Direction.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum Direction : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/ExecutionMode.cs b/libnd4j/include/graph/generated/nd4j/graph/ExecutionMode.cs index 826037783..77f54df5b 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/ExecutionMode.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/ExecutionMode.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum ExecutionMode : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatArray.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatArray.cs index 60d836aeb..6a4aad3f2 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatArray.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatArray.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatArrayList.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatArrayList.cs index 0720ed410..dd2e17d32 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatArrayList.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatArrayList.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, 
do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatConfiguration.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatConfiguration.cs index ca50e9973..5c2edbf9b 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatConfiguration.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatConfiguration.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatDropRequest.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatDropRequest.cs index 3aba26796..6d2f5f68b 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatDropRequest.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatDropRequest.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatGraph.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatGraph.cs index 67eef75b8..15978e301 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatGraph.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatGraph.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatInferenceRequest.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatInferenceRequest.cs index c4a5e7283..88e36dad5 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatInferenceRequest.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatInferenceRequest.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git 
a/libnd4j/include/graph/generated/nd4j/graph/FlatNode.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatNode.cs index c94e0fcc4..3f951c0e9 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatNode.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatNode.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatProperties.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatProperties.cs index ae116fcb0..471dd9f3d 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatProperties.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatProperties.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatResponse.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatResponse.cs index a69698370..2acd7728d 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatResponse.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatResponse.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatResult.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatResult.cs index c35c6cb1d..20147a93e 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatResult.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatResult.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatTiming.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatTiming.cs index 739c0741e..16aa8895c 100644 --- 
a/libnd4j/include/graph/generated/nd4j/graph/FlatTiming.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatTiming.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FlatVariable.cs b/libnd4j/include/graph/generated/nd4j/graph/FlatVariable.cs index 325094654..9331965be 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FlatVariable.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FlatVariable.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/FrameIteration.cs b/libnd4j/include/graph/generated/nd4j/graph/FrameIteration.cs index b3ded5375..6b73ff261 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/FrameIteration.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/FrameIteration.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/InputType.cs b/libnd4j/include/graph/generated/nd4j/graph/InputType.cs index 997dd62b2..0172b846d 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/InputType.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/InputType.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum InputType : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/IntPair.cs b/libnd4j/include/graph/generated/nd4j/graph/IntPair.cs index 665842db7..38dad82ed 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/IntPair.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/IntPair.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify 
// -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/IntTriple.cs b/libnd4j/include/graph/generated/nd4j/graph/IntTriple.cs index 2d3d2a03e..e1c5ac1a4 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/IntTriple.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/IntTriple.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/LongPair.cs b/libnd4j/include/graph/generated/nd4j/graph/LongPair.cs index 248c83ea4..b28c9bd0b 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/LongPair.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/LongPair.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/LongTriple.cs b/libnd4j/include/graph/generated/nd4j/graph/LongTriple.cs index fbb5fb7cd..dc49143c3 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/LongTriple.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/LongTriple.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/OpClass.cs b/libnd4j/include/graph/generated/nd4j/graph/OpClass.cs index 4ff4c2f3b..45ac68a3a 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/OpClass.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/OpClass.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum OpClass : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/OpType.cs b/libnd4j/include/graph/generated/nd4j/graph/OpType.cs index f9a5c7c18..2b9d27bce 100644 --- 
a/libnd4j/include/graph/generated/nd4j/graph/OpType.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/OpType.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum OpType : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/OutputMode.cs b/libnd4j/include/graph/generated/nd4j/graph/OutputMode.cs index 8e88b51d9..4cd916dbe 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/OutputMode.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/OutputMode.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum OutputMode : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/ProfilingMode.cs b/libnd4j/include/graph/generated/nd4j/graph/ProfilingMode.cs index 837deeffd..1d9b90ca3 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/ProfilingMode.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/ProfilingMode.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum ProfilingMode : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIAddName.cs b/libnd4j/include/graph/generated/nd4j/graph/UIAddName.cs index bf15cee55..cc2f6fb85 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIAddName.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIAddName.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIEvent.cs b/libnd4j/include/graph/generated/nd4j/graph/UIEvent.cs index 1843a48bd..a8775d67d 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIEvent.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIEvent.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify 
// -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIEventSubtype.cs b/libnd4j/include/graph/generated/nd4j/graph/UIEventSubtype.cs index 99bed5876..d4da48a25 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIEventSubtype.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIEventSubtype.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum UIEventSubtype : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIEventType.cs b/libnd4j/include/graph/generated/nd4j/graph/UIEventType.cs index 6d167e0ac..32427dac2 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIEventType.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIEventType.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum UIEventType : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIGraphStructure.cs b/libnd4j/include/graph/generated/nd4j/graph/UIGraphStructure.cs index 97a491bb8..ad8e16ab9 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIGraphStructure.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIGraphStructure.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIHardwareState.cs b/libnd4j/include/graph/generated/nd4j/graph/UIHardwareState.cs index 68ea3ecf8..a3444a2ea 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIHardwareState.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIHardwareState.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git 
a/libnd4j/include/graph/generated/nd4j/graph/UIHistogram.cs b/libnd4j/include/graph/generated/nd4j/graph/UIHistogram.cs index 81390e3c9..1801bef68 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIHistogram.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIHistogram.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIHistogramType.cs b/libnd4j/include/graph/generated/nd4j/graph/UIHistogramType.cs index 18a46f293..3b8c3218b 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIHistogramType.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIHistogramType.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum UIHistogramType : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIInfoType.cs b/libnd4j/include/graph/generated/nd4j/graph/UIInfoType.cs index a122bdda3..676d60530 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIInfoType.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIInfoType.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum UIInfoType : sbyte diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIOp.cs b/libnd4j/include/graph/generated/nd4j/graph/UIOp.cs index a60715762..f8b3b4fbd 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIOp.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIOp.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIStaticInfoRecord.cs b/libnd4j/include/graph/generated/nd4j/graph/UIStaticInfoRecord.cs index f2a28fb5e..410a3c37b 100644 --- 
a/libnd4j/include/graph/generated/nd4j/graph/UIStaticInfoRecord.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIStaticInfoRecord.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UISummaryStatistics.cs b/libnd4j/include/graph/generated/nd4j/graph/UISummaryStatistics.cs index 7bc069e5c..0f63d2d7c 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UISummaryStatistics.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UISummaryStatistics.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UISystemInfo.cs b/libnd4j/include/graph/generated/nd4j/graph/UISystemInfo.cs index bd81fde6c..6adbcf98e 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UISystemInfo.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UISystemInfo.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UIVariable.cs b/libnd4j/include/graph/generated/nd4j/graph/UIVariable.cs index 4b646b3bb..1f8d14971 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UIVariable.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UIVariable.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/UpdaterState.cs b/libnd4j/include/graph/generated/nd4j/graph/UpdaterState.cs index 02f8c419b..8cb6e07f0 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/UpdaterState.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/UpdaterState.cs @@ -2,7 +2,7 @@ // automatically 
generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { using global::System; diff --git a/libnd4j/include/graph/generated/nd4j/graph/VarType.cs b/libnd4j/include/graph/generated/nd4j/graph/VarType.cs index 462dfa7e7..4649bfef6 100644 --- a/libnd4j/include/graph/generated/nd4j/graph/VarType.cs +++ b/libnd4j/include/graph/generated/nd4j/graph/VarType.cs @@ -2,7 +2,7 @@ // automatically generated by the FlatBuffers compiler, do not modify // -namespace nd4j.graph +namespace sd.graph { public enum VarType : sbyte diff --git a/libnd4j/include/graph/generated/node_generated.h b/libnd4j/include/graph/generated/node_generated.h index 92f4ab126..a39f2490c 100644 --- a/libnd4j/include/graph/generated/node_generated.h +++ b/libnd4j/include/graph/generated/node_generated.h @@ -10,7 +10,7 @@ #include "properties_generated.h" #include "utils_generated.h" -namespace nd4j { +namespace sd { namespace graph { struct FlatNode; @@ -324,7 +324,7 @@ inline flatbuffers::Offset CreateFlatNodeDirect( const std::vector> *varControlDeps = nullptr, const std::vector> *controlDepFor = nullptr, const std::vector *extraTypes = nullptr) { - return nd4j::graph::CreateFlatNode( + return sd::graph::CreateFlatNode( _fbb, id, name ? _fbb.CreateString(name) : 0, @@ -351,37 +351,37 @@ inline flatbuffers::Offset CreateFlatNodeDirect( extraTypes ? 
_fbb.CreateVector(*extraTypes) : 0); } -inline const nd4j::graph::FlatNode *GetFlatNode(const void *buf) { - return flatbuffers::GetRoot(buf); +inline const sd::graph::FlatNode *GetFlatNode(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatNode *GetSizePrefixedFlatNode(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatNode *GetSizePrefixedFlatNode(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatNodeBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatNodeBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatNodeBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatNodeBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_NODE_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/properties_generated.h b/libnd4j/include/graph/generated/properties_generated.h index 4f88b6249..34138fe86 100644 --- a/libnd4j/include/graph/generated/properties_generated.h +++ b/libnd4j/include/graph/generated/properties_generated.h @@ -8,7 +8,7 @@ #include "array_generated.h" -namespace nd4j { +namespace sd { namespace graph { struct FlatProperties; @@ -143,7 +143,7 @@ inline flatbuffers::Offset CreateFlatPropertiesDirect( const std::vector *b = nullptr, const std::vector> *s = nullptr, const std::vector *shape = nullptr) { - return nd4j::graph::CreateFlatProperties( + return sd::graph::CreateFlatProperties( _fbb, name ? _fbb.CreateString(name) : 0, i ? 
_fbb.CreateVector(*i) : 0, @@ -155,37 +155,37 @@ inline flatbuffers::Offset CreateFlatPropertiesDirect( shape ? _fbb.CreateVector(*shape) : 0); } -inline const nd4j::graph::FlatProperties *GetFlatProperties(const void *buf) { - return flatbuffers::GetRoot(buf); +inline const sd::graph::FlatProperties *GetFlatProperties(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatProperties *GetSizePrefixedFlatProperties(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatProperties *GetSizePrefixedFlatProperties(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatPropertiesBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatPropertiesBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatPropertiesBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatPropertiesBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_PROPERTIES_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/request_generated.h b/libnd4j/include/graph/generated/request_generated.h index 2084e2066..00c782311 100644 --- a/libnd4j/include/graph/generated/request_generated.h +++ b/libnd4j/include/graph/generated/request_generated.h @@ -11,7 +11,7 @@ #include "utils_generated.h" #include "variable_generated.h" -namespace nd4j { +namespace sd { namespace graph { struct FlatInferenceRequest; @@ -84,44 +84,44 @@ inline flatbuffers::Offset CreateFlatInferenceRequestDirec int64_t id = 0, const 
std::vector> *variables = nullptr, flatbuffers::Offset configuration = 0) { - return nd4j::graph::CreateFlatInferenceRequest( + return sd::graph::CreateFlatInferenceRequest( _fbb, id, variables ? _fbb.CreateVector>(*variables) : 0, configuration); } -inline const nd4j::graph::FlatInferenceRequest *GetFlatInferenceRequest(const void *buf) { - return flatbuffers::GetRoot(buf); +inline const sd::graph::FlatInferenceRequest *GetFlatInferenceRequest(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatInferenceRequest *GetSizePrefixedFlatInferenceRequest(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatInferenceRequest *GetSizePrefixedFlatInferenceRequest(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatInferenceRequestBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatInferenceRequestBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatInferenceRequestBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatInferenceRequestBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_REQUEST_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/result_generated.h b/libnd4j/include/graph/generated/result_generated.h index aa41959eb..04c458a9f 100644 --- a/libnd4j/include/graph/generated/result_generated.h +++ b/libnd4j/include/graph/generated/result_generated.h @@ -12,7 +12,7 @@ #include "utils_generated.h" #include "variable_generated.h" -namespace 
nd4j { +namespace sd { namespace graph { struct FlatTiming; @@ -86,7 +86,7 @@ inline flatbuffers::Offset CreateFlatTimingDirect( int32_t id = 0, const char *name = nullptr, flatbuffers::Offset timing = 0) { - return nd4j::graph::CreateFlatTiming( + return sd::graph::CreateFlatTiming( _fbb, id, name ? _fbb.CreateString(name) : 0, @@ -184,7 +184,7 @@ inline flatbuffers::Offset CreateFlatResultDirect( const std::vector> *timing = nullptr, int64_t footprintForward = 0, int64_t footprintBackward = 0) { - return nd4j::graph::CreateFlatResult( + return sd::graph::CreateFlatResult( _fbb, id, variables ? _fbb.CreateVector>(*variables) : 0, @@ -193,37 +193,37 @@ inline flatbuffers::Offset CreateFlatResultDirect( footprintBackward); } -inline const nd4j::graph::FlatResult *GetFlatResult(const void *buf) { - return flatbuffers::GetRoot(buf); +inline const sd::graph::FlatResult *GetFlatResult(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatResult *GetSizePrefixedFlatResult(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatResult *GetSizePrefixedFlatResult(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatResultBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatResultBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatResultBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatResultBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // 
FLATBUFFERS_GENERATED_RESULT_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/uigraphevents_generated.h b/libnd4j/include/graph/generated/uigraphevents_generated.h index ca62c1a60..b3430a5c7 100644 --- a/libnd4j/include/graph/generated/uigraphevents_generated.h +++ b/libnd4j/include/graph/generated/uigraphevents_generated.h @@ -8,7 +8,7 @@ #include "array_generated.h" -namespace nd4j { +namespace sd { namespace graph { struct UIEvent; @@ -336,7 +336,7 @@ inline flatbuffers::Offset CreateFrameIterationDirect( flatbuffers::FlatBufferBuilder &_fbb, const char *frame = nullptr, uint16_t iteration = 0) { - return nd4j::graph::CreateFrameIteration( + return sd::graph::CreateFrameIteration( _fbb, frame ? _fbb.CreateString(frame) : 0, iteration); @@ -397,7 +397,7 @@ inline flatbuffers::Offset CreateUIAddNameDirect( flatbuffers::FlatBufferBuilder &_fbb, int32_t nameIdx = 0, const char *name = nullptr) { - return nd4j::graph::CreateUIAddName( + return sd::graph::CreateUIAddName( _fbb, nameIdx, name ? _fbb.CreateString(name) : 0); @@ -448,7 +448,7 @@ inline flatbuffers::Offset CreateFlatArrayList( inline flatbuffers::Offset CreateFlatArrayListDirect( flatbuffers::FlatBufferBuilder &_fbb, const std::vector> *list = nullptr) { - return nd4j::graph::CreateFlatArrayList( + return sd::graph::CreateFlatArrayList( _fbb, list ? _fbb.CreateVector>(*list) : 0); } @@ -544,7 +544,7 @@ inline flatbuffers::Offset CreateUIHistogramDirect( flatbuffers::Offset binranges = 0, flatbuffers::Offset y = 0, const std::vector> *binlabels = nullptr) { - return nd4j::graph::CreateUIHistogram( + return sd::graph::CreateUIHistogram( _fbb, type, numbins, @@ -740,13 +740,13 @@ inline flatbuffers::Offset CreateUIHardwareStateDirect( flatbuffers::FlatBufferBuilder &_fbb, const std::vector *gpuMemory = nullptr, int64_t hostMemory = 0) { - return nd4j::graph::CreateUIHardwareState( + return sd::graph::CreateUIHardwareState( _fbb, gpuMemory ? 
_fbb.CreateVector(*gpuMemory) : 0, hostMemory); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_UIGRAPHEVENTS_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/uigraphstatic_generated.h b/libnd4j/include/graph/generated/uigraphstatic_generated.h index 8536a18ba..b6545f53a 100644 --- a/libnd4j/include/graph/generated/uigraphstatic_generated.h +++ b/libnd4j/include/graph/generated/uigraphstatic_generated.h @@ -10,7 +10,7 @@ #include "utils_generated.h" #include "variable_generated.h" -namespace nd4j { +namespace sd { namespace graph { struct UIStaticInfoRecord; @@ -232,7 +232,7 @@ inline flatbuffers::Offset CreateUIGraphStructureDirect( const std::vector> *outputs = nullptr, const std::vector> *variables = nullptr, const std::vector> *ops = nullptr) { - return nd4j::graph::CreateUIGraphStructure( + return sd::graph::CreateUIGraphStructure( _fbb, inputs ? _fbb.CreateVector>(*inputs) : 0, inputsPair ? _fbb.CreateVector>(*inputsPair) : 0, @@ -431,7 +431,7 @@ inline flatbuffers::Offset CreateUIVariableDirect( const char *gradientVariable = nullptr, const char *uiLabelExtra = nullptr, flatbuffers::Offset constantValue = 0) { - return nd4j::graph::CreateUIVariable( + return sd::graph::CreateUIVariable( _fbb, id, name ? _fbb.CreateString(name) : 0, @@ -555,7 +555,7 @@ inline flatbuffers::Offset CreateUIOpDirect( const std::vector> *outputs = nullptr, const std::vector> *controlDeps = nullptr, const char *uiLabelExtra = nullptr) { - return nd4j::graph::CreateUIOp( + return sd::graph::CreateUIOp( _fbb, name ? _fbb.CreateString(name) : 0, opName ? 
_fbb.CreateString(opName) : 0, @@ -566,6 +566,6 @@ inline flatbuffers::Offset CreateUIOpDirect( } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_UIGRAPHSTATIC_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/utils_generated.h b/libnd4j/include/graph/generated/utils_generated.h index 9b885ce80..8e7896bb4 100644 --- a/libnd4j/include/graph/generated/utils_generated.h +++ b/libnd4j/include/graph/generated/utils_generated.h @@ -6,7 +6,7 @@ #include "flatbuffers/flatbuffers.h" -namespace nd4j { +namespace sd { namespace graph { struct LongPair; @@ -512,6 +512,6 @@ inline flatbuffers::Offset CreateIntTriple( } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_UTILS_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/generated/variable_generated.h b/libnd4j/include/graph/generated/variable_generated.h index 465490722..a0e43a5af 100644 --- a/libnd4j/include/graph/generated/variable_generated.h +++ b/libnd4j/include/graph/generated/variable_generated.h @@ -9,7 +9,7 @@ #include "array_generated.h" #include "utils_generated.h" -namespace nd4j { +namespace sd { namespace graph { struct FlatVariable; @@ -201,7 +201,7 @@ inline flatbuffers::Offset CreateFlatVariableDirect( const std::vector> *controlDeps = nullptr, const std::vector> *controlDepForOp = nullptr, const std::vector> *controlDepsForVar = nullptr) { - return nd4j::graph::CreateFlatVariable( + return sd::graph::CreateFlatVariable( _fbb, id, name ? _fbb.CreateString(name) : 0, @@ -215,37 +215,37 @@ inline flatbuffers::Offset CreateFlatVariableDirect( controlDepsForVar ? 
_fbb.CreateVector>(*controlDepsForVar) : 0); } -inline const nd4j::graph::FlatVariable *GetFlatVariable(const void *buf) { - return flatbuffers::GetRoot(buf); +inline const sd::graph::FlatVariable *GetFlatVariable(const void *buf) { + return flatbuffers::GetRoot(buf); } -inline const nd4j::graph::FlatVariable *GetSizePrefixedFlatVariable(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); +inline const sd::graph::FlatVariable *GetSizePrefixedFlatVariable(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); } inline bool VerifyFlatVariableBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifyBuffer(nullptr); + return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedFlatVariableBuffer( flatbuffers::Verifier &verifier) { - return verifier.VerifySizePrefixedBuffer(nullptr); + return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishFlatVariableBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedFlatVariableBuffer( flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } } // namespace graph -} // namespace nd4j +} // namespace sd #endif // FLATBUFFERS_GENERATED_VARIABLE_ND4J_GRAPH_H_ diff --git a/libnd4j/include/graph/impl/ArgumentsList.cpp b/libnd4j/include/graph/impl/ArgumentsList.cpp index 217c8b8e2..71ba8c479 100644 --- a/libnd4j/include/graph/impl/ArgumentsList.cpp +++ b/libnd4j/include/graph/impl/ArgumentsList.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { ArgumentsList::ArgumentsList(std::initializer_list arguments) { _arguments = arguments; diff --git a/libnd4j/include/graph/impl/Context.cpp b/libnd4j/include/graph/impl/Context.cpp index 02955a9ca..954329f42 100644 --- a/libnd4j/include/graph/impl/Context.cpp +++ b/libnd4j/include/graph/impl/Context.cpp @@ -18,13 +18,13 @@ // @author 
raver119@gmail.com // -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace graph { Context::Context(ContextPrototype* prototype, VariableSpace* variableSpace) { _variableSpace = variableSpace; @@ -61,16 +61,16 @@ namespace nd4j { if (variableSpace != nullptr && variableSpace->launchContext()->getWorkspace() != nullptr) this->_workspace = variableSpace->launchContext()->getWorkspace(); } - nd4j::DataType Context::dataType(int index) { + sd::DataType Context::dataType(int index) { return _dataType; } - nd4j::DataType Context::dataType() { + sd::DataType Context::dataType() { return dataType(0); } - void Context::setDataType(int index, nd4j::DataType type) { + void Context::setDataType(int index, sd::DataType type) { if (this->_dataTypes.size() > (size_t)index) _dataTypes[index] = type; _dataType = type; @@ -115,7 +115,7 @@ namespace nd4j { return this->_workspace != nullptr; } - void Context::attachWorkspace(nd4j::memory::Workspace* workspace) { + void Context::attachWorkspace(sd::memory::Workspace* workspace) { this->_workspace = workspace; } @@ -155,19 +155,19 @@ namespace nd4j { return _variableSpace; } - nd4j::memory::Workspace* Context::getWorkspace() { + sd::memory::Workspace* Context::getWorkspace() { return _workspace; } - nd4j::memory::Workspace* Context::workspace() { + sd::memory::Workspace* Context::workspace() { return _workspace; } - nd4j::random::RandomBuffer* Context::getRNG() { + sd::random::RandomBuffer* Context::getRNG() { return _rng; } - void Context::setRNG(nd4j::random::RandomBuffer* rng) { + void Context::setRNG(sd::random::RandomBuffer* rng) { _rng = rng; } @@ -209,19 +209,19 @@ namespace nd4j { _variableSpace->flowPath()->markBranch(this->nodeId(), branch); } - Nd4jLong nd4j::graph::Context::getOuterTime(){ + Nd4jLong sd::graph::Context::getOuterTime(){ return this->_executionTime.first; } - Nd4jLong nd4j::graph::Context::getInnerTime(){ + Nd4jLong sd::graph::Context::getInnerTime(){ return 
this->_executionTime.second; } - void nd4j::graph::Context::setOuterTime(Nd4jLong time){ + void sd::graph::Context::setOuterTime(Nd4jLong time){ this->_executionTime.first = time; } - void nd4j::graph::Context::setInnerTime(Nd4jLong time){ + void sd::graph::Context::setInnerTime(Nd4jLong time){ this->_executionTime.second = time; } @@ -370,15 +370,15 @@ namespace nd4j { return getVariable(idx)->getNDArray(); } - nd4j::memory::Workspace *Context::fWorkspace() { + sd::memory::Workspace *Context::fWorkspace() { return workspace(); } - nd4j::memory::Workspace *Context::tWorkspace() { + sd::memory::Workspace *Context::tWorkspace() { return nullptr; } - nd4j::memory::Workspace *Context::oWorkspace() { + sd::memory::Workspace *Context::oWorkspace() { return nullptr; } @@ -451,7 +451,7 @@ namespace nd4j { NDArray *array; if (dataBuffer != nullptr) - array = new NDArray(dataBuffer->dataBuffer(), reinterpret_cast(shapeInfo), nd4j::LaunchContext::defaultContext(), dataBuffer->offset() / DataTypeUtils::sizeOf(ArrayOptions::dataType(reinterpret_cast(shapeInfo)))); + array = new NDArray(dataBuffer->dataBuffer(), reinterpret_cast(shapeInfo), sd::LaunchContext::defaultContext(), dataBuffer->offset() / DataTypeUtils::sizeOf(ArrayOptions::dataType(reinterpret_cast(shapeInfo)))); else array = new NDArray(nullptr, nullptr, reinterpret_cast(shapeInfo)); @@ -470,7 +470,7 @@ namespace nd4j { NDArray *array; if (dataBuffer != nullptr) - array = new NDArray(dataBuffer->dataBuffer(), reinterpret_cast(shapeInfo), nd4j::LaunchContext::defaultContext(), dataBuffer->offset() / DataTypeUtils::sizeOf(ArrayOptions::dataType(reinterpret_cast(shapeInfo)))); + array = new NDArray(dataBuffer->dataBuffer(), reinterpret_cast(shapeInfo), sd::LaunchContext::defaultContext(), dataBuffer->offset() / DataTypeUtils::sizeOf(ArrayOptions::dataType(reinterpret_cast(shapeInfo)))); else array = new NDArray(nullptr, nullptr, reinterpret_cast(shapeInfo)); @@ -564,13 +564,13 @@ namespace nd4j { return _execMode == 
samediff::ExecutionMode::MODE_INFERENCE; } - void Context::setDArguments(nd4j::DataType *arguments, int numberOfArguments) { + void Context::setDArguments(sd::DataType *arguments, int numberOfArguments) { _dArgs.clear(); for (int e = 0; e < numberOfArguments; e++) _dArgs.emplace_back(arguments[e]); } - void Context::setDArguments(const std::vector &dArgs) { + void Context::setDArguments(const std::vector &dArgs) { _dArgs.clear(); for (auto d:dArgs) _dArgs.emplace_back(d); diff --git a/libnd4j/include/graph/impl/ContextPrototype.cpp b/libnd4j/include/graph/impl/ContextPrototype.cpp index e8432aea0..417c46b3a 100644 --- a/libnd4j/include/graph/impl/ContextPrototype.cpp +++ b/libnd4j/include/graph/impl/ContextPrototype.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { - ContextPrototype::ContextPrototype(nd4j::ops::OpDescriptor* opDescriptor, int nodeId, bool inPlace) { + ContextPrototype::ContextPrototype(sd::ops::OpDescriptor* opDescriptor, int nodeId, bool inPlace) { _nodeId = nodeId; _isInplace = inPlace; _opDescriptor = opDescriptor; @@ -106,15 +106,15 @@ namespace nd4j { return getNodeId(); } - nd4j::DataType ContextPrototype::dataType() { + sd::DataType ContextPrototype::dataType() { return dataType(0); } - nd4j::DataType ContextPrototype::dataType(int index) { + sd::DataType ContextPrototype::dataType(int index) { return _dataType; } - void ContextPrototype::setDataType(int index, nd4j::DataType type) { + void ContextPrototype::setDataType(int index, sd::DataType type) { // if (_outputs->size() == 0) _dataType = type; } @@ -154,7 +154,7 @@ namespace nd4j { return clone; } - void ContextPrototype::setOpDescriptor(nd4j::ops::OpDescriptor* opDescriptor) { + void ContextPrototype::setOpDescriptor(sd::ops::OpDescriptor* opDescriptor) { _opDescriptor = opDescriptor; } @@ -174,7 +174,7 @@ namespace nd4j { return clone; } - std::vector 
*ContextPrototype::getDArguments() { + std::vector *ContextPrototype::getDArguments() { return &_dArgs; } diff --git a/libnd4j/include/graph/impl/ExecutionResult.cpp b/libnd4j/include/graph/impl/ExecutionResult.cpp index e887efb55..fd2bed054 100644 --- a/libnd4j/include/graph/impl/ExecutionResult.cpp +++ b/libnd4j/include/graph/impl/ExecutionResult.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace graph { ExecutionResult::ExecutionResult(const FlatResult* flatResult) { if (flatResult->variables() != nullptr) { diff --git a/libnd4j/include/graph/impl/ExecutorConfiguration.cpp b/libnd4j/include/graph/impl/ExecutorConfiguration.cpp index 0d702a62d..f296ef3cd 100644 --- a/libnd4j/include/graph/impl/ExecutorConfiguration.cpp +++ b/libnd4j/include/graph/impl/ExecutorConfiguration.cpp @@ -20,9 +20,9 @@ #include -namespace nd4j { +namespace sd { namespace graph { - ExecutorConfiguration::ExecutorConfiguration(const nd4j::graph::FlatConfiguration *conf) { + ExecutorConfiguration::ExecutorConfiguration(const sd::graph::FlatConfiguration *conf) { if (conf != nullptr) { _profilingMode = conf->profilingMode(); _executionMode = conf->executionMode(); diff --git a/libnd4j/include/graph/impl/FlatUtils.cpp b/libnd4j/include/graph/impl/FlatUtils.cpp index f582220da..e6984bb97 100644 --- a/libnd4j/include/graph/impl/FlatUtils.cpp +++ b/libnd4j/include/graph/impl/FlatUtils.cpp @@ -23,10 +23,10 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { std::pair FlatUtils::fromIntPair(IntPair *pair) { return std::pair(pair->first(), pair->second()); @@ -36,7 +36,7 @@ namespace nd4j { return std::pair(pair->first(), pair->second()); } - NDArray* FlatUtils::fromFlatArray(const nd4j::graph::FlatArray *flatArray) { + NDArray* FlatUtils::fromFlatArray(const sd::graph::FlatArray *flatArray) { auto rank = 
static_cast(flatArray->shape()->Get(0)); auto newShape = new Nd4jLong[shape::shapeInfoLength(rank)]; memcpy(newShape, flatArray->shape()->data(), shape::shapeInfoByteLength(rank)); @@ -52,7 +52,7 @@ namespace nd4j { // TODO fix UTF16 and UTF32 if (dtype == UTF8) { bool isBe = BitwiseUtils::isBE(); - bool canKeep = (isBe && flatArray->byteOrder() == nd4j::graph::ByteOrder_BE) || (!isBe && flatArray->byteOrder() == nd4j::graph::ByteOrder_LE); + bool canKeep = (isBe && flatArray->byteOrder() == sd::graph::ByteOrder_BE) || (!isBe && flatArray->byteOrder() == sd::graph::ByteOrder_LE); std::vector substrings(length); std::vector shapeVector(rank); @@ -96,7 +96,7 @@ namespace nd4j { BUILD_SINGLE_SELECTOR(dtype, DataTypeConversions, ::convertType(newBuffer, (void *)flatArray->buffer()->data(), dtype, ByteOrderUtils::fromFlatByteOrder(flatArray->byteOrder()), length), LIBND4J_TYPES); - auto array = new NDArray(newBuffer, newShape, nd4j::LaunchContext::defaultContext(), true); + auto array = new NDArray(newBuffer, newShape, sd::LaunchContext::defaultContext(), true); delete[] newShape; return array; @@ -108,9 +108,9 @@ namespace nd4j { auto fBuffer = builder.CreateVector(byteVector); auto fShape = builder.CreateVector(array.getShapeInfoAsFlatVector()); - auto bo = static_cast(BitwiseUtils::asByteOrder()); + auto bo = static_cast(BitwiseUtils::asByteOrder()); - return CreateFlatArray(builder, fShape, fBuffer, static_cast(array.dataType()), bo); + return CreateFlatArray(builder, fShape, fBuffer, static_cast(array.dataType()), bo); } } } \ No newline at end of file diff --git a/libnd4j/include/graph/impl/FlowPath.cpp b/libnd4j/include/graph/impl/FlowPath.cpp index e80167a28..79fe67b30 100644 --- a/libnd4j/include/graph/impl/FlowPath.cpp +++ b/libnd4j/include/graph/impl/FlowPath.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { void FlowPath::ensureNode(int nodeId) { diff --git a/libnd4j/include/graph/impl/FrameState.cpp 
b/libnd4j/include/graph/impl/FrameState.cpp index db79c3f08..d312a4f39 100644 --- a/libnd4j/include/graph/impl/FrameState.cpp +++ b/libnd4j/include/graph/impl/FrameState.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { FrameState::FrameState(Nd4jLong id) { this->_id = id; diff --git a/libnd4j/include/graph/impl/Graph.cpp b/libnd4j/include/graph/impl/Graph.cpp index 3950d5f8a..15db128a8 100644 --- a/libnd4j/include/graph/impl/Graph.cpp +++ b/libnd4j/include/graph/impl/Graph.cpp @@ -22,16 +22,16 @@ #include #include #include -#include +#include #include #include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace graph { std::vector* Graph::getAllNodes() { return &_handles; @@ -155,7 +155,7 @@ namespace nd4j { Nd4jLong *newShape = nullptr; // if that's scalar output - we don't care about previous node - if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == nd4j::DataTypeUtils::max())) { + if (node->getDimensions()->size() == 0 || (node->getDimensions()->size() == 1 && node->getDimensions()->at(0) == sd::DataTypeUtils::max())) { newShape = new Nd4jLong[8]; newShape[0] = 2; @@ -702,7 +702,7 @@ namespace nd4j { prepareOutputs(); - return nd4j::Status::OK(); + return sd::Status::OK(); } void Graph::tagInplaceNodes() { @@ -883,7 +883,7 @@ namespace nd4j { // if memory reqs were set - initialize workspace if (_configuration->_footprintForward > 0) { - nd4j::memory::Workspace *workspace = this->_variableSpace->launchContext()->getWorkspace(); + sd::memory::Workspace *workspace = this->_variableSpace->launchContext()->getWorkspace(); workspace->expandBy(_configuration->_footprintForward); } @@ -1175,10 +1175,10 @@ namespace nd4j { return ND4J_STATUS_OK; } - std::vector Graph::getOperations() { + std::vector Graph::getOperations() { buildGraph(); // nd4j_printf("\nRetrieving ops from the Graph and collect them...\n", ""); - 
std::vector res; + std::vector res; int opCnt = 0; for (int l = 0; l < _onion->size(); l++) { @@ -1187,7 +1187,7 @@ namespace nd4j { for (int n = 0; n < layerSize; n++) { Node* node = _onion->at(l)->at(n); if (node->name() == nullptr) continue; - nd4j::ops::OpDescriptor* pOpDescriptor = nullptr; + sd::ops::OpDescriptor* pOpDescriptor = nullptr; std::string opNameStr; //node->name(); int numInputs = 0; int numOutputs = 0; @@ -1221,7 +1221,7 @@ namespace nd4j { if (pOpDescriptor) res.emplace_back(*pOpDescriptor); else - res.emplace_back(nd4j::ops::OpDescriptor(numInputs, numOutputs, opNameStr, inplace)); + res.emplace_back(sd::ops::OpDescriptor(numInputs, numOutputs, opNameStr, inplace)); } } @@ -1236,7 +1236,7 @@ namespace nd4j { //printOutNode(node); if (node->name() == nullptr) continue; std::string opNameStr; //node->name(); - nd4j::ops::OpDescriptor* pOpDescriptor = nullptr; + sd::ops::OpDescriptor* pOpDescriptor = nullptr; int numInputs = 0; int numOutputs = 0; @@ -1264,7 +1264,7 @@ namespace nd4j { if (pOpDescriptor != nullptr) res.emplace_back(*pOpDescriptor); else - res.emplace_back(nd4j::ops::OpDescriptor(numInputs, numOutputs, opNameStr, inplace)); + res.emplace_back(sd::ops::OpDescriptor(numInputs, numOutputs, opNameStr, inplace)); } } diff --git a/libnd4j/blas/cpu/GraphExecutioner.cpp b/libnd4j/include/graph/impl/GraphExecutioner.cpp similarity index 96% rename from libnd4j/blas/cpu/GraphExecutioner.cpp rename to libnd4j/include/graph/impl/GraphExecutioner.cpp index 98b3204cd..38399242c 100644 --- a/libnd4j/blas/cpu/GraphExecutioner.cpp +++ b/libnd4j/include/graph/impl/GraphExecutioner.cpp @@ -24,12 +24,12 @@ //#include -#include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -46,9 +46,9 @@ #include #include #include -#include +#include #include -#include +#include #include #include #include @@ -56,7 +56,7 @@ #include #include -namespace nd4j{ +namespace sd{ namespace graph 
{ /** @@ -88,7 +88,7 @@ namespace graph { Context context(node->getContextPrototype(), variableSpace); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) { + if (sd::Environment::getInstance()->isDebugAndVerbose()) { //nd4j_debug("Input variables: %i\n", node->input()->size()); printf(" Inputs: {"); for (int e = 0; e < node->input()->size(); e++) { @@ -218,7 +218,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) Nd4jLong tb0 = Environment::getInstance()->isProfiling() ? GraphProfile::currentTime() : 0L; graph->buildGraph(); - auto footprintForward = nd4j::memory::MemoryRegistrator::getInstance()->getGraphMemoryFootprint(graph->hashCode()); + auto footprintForward = sd::memory::MemoryRegistrator::getInstance()->getGraphMemoryFootprint(graph->hashCode()); if (footprintForward > 0) { if (__variableSpace->launchContext()->getWorkspace() != nullptr) { // this method will work only if current workspace size is smaller then proposed value @@ -273,7 +273,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) nd4j_debug("Step: %lld; Node: %i <%s>\n", exec_counter, node->id(), node->name()->c_str()); // on first non-Exit node after loop we can rewind (if planned) - if (!(node->opType() == OpType_LOGIC && node->opNum() == nd4j::logic::Exit)) { + if (!(node->opType() == OpType_LOGIC && node->opNum() == sd::logic::Exit)) { // VALIDATED // if we're out of frame - let's remove it from queue @@ -289,7 +289,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) // TODO: move inactivity check right here bool shouldSkip = false; - if (node->opType() == OpType_LOGIC && node->opNum() == nd4j::logic::Merge) { + if (node->opType() == OpType_LOGIC && node->opNum() == sd::logic::Merge) { // Merge node has own checkout logic auto inputId0 = node->input()->at(0); @@ -344,7 +344,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) 
flowPath->markNodeActive(node->id(), true); - if (node->opType() == OpType_LOGIC && node->opNum() == nd4j::logic::Enter) { + if (node->opType() == OpType_LOGIC && node->opNum() == sd::logic::Enter) { // Enter operation // VALIDATED @@ -363,7 +363,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) if (status != Status::OK()) return status; - } else if (node->opType() == OpType_LOGIC && node->opNum() == nd4j::logic::NextIteration) { + } else if (node->opType() == OpType_LOGIC && node->opNum() == sd::logic::NextIteration) { /** * NextIteration is special case: after successful execution of this op - we're changing execution position */ @@ -391,7 +391,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) } - } else if (node->opType() == OpType_LOGIC && node->opNum() == nd4j::logic::Exit) { + } else if (node->opType() == OpType_LOGIC && node->opNum() == sd::logic::Exit) { // Exit node is another special case: it can rewind executioner to specific point in graph // VALIDATED @@ -458,7 +458,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) // now we skip all branches except of this active one } - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) { + if (sd::Environment::getInstance()->isDebugAndVerbose()) { if (__variableSpace->getVariable(node->id())->hasNDArray()) { auto array = __variableSpace->getVariable(node->id())->getNDArray(); @@ -491,7 +491,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) if (__variableSpace->launchContext()->getWorkspace() != nullptr) { auto m = __variableSpace->launchContext()->getWorkspace()->getAllocatedSize(); auto h = graph->hashCode(); - nd4j::memory::MemoryRegistrator::getInstance()->setGraphMemoryFootprintIfGreater(h, m); + sd::memory::MemoryRegistrator::getInstance()->setGraphMemoryFootprintIfGreater(h, m); } if (tempFlow) { @@ -511,7 +511,7 @@ Nd4jStatus GraphExecutioner::execute(Graph 
*graph, VariableSpace* variableSpace) * 5) Returns pointer to FlatBuffer results buffer * */ - nd4j::graph::ResultWrapper* GraphExecutioner::executeFlatBuffer(Nd4jPointer pointer) { + sd::graph::ResultWrapper* GraphExecutioner::executeFlatBuffer(Nd4jPointer pointer) { uint8_t *buffer = reinterpret_cast(pointer); // nd4j_debug("Trying to restore graph\n", 0); @@ -585,7 +585,7 @@ Nd4jStatus GraphExecutioner::execute(Graph *graph, VariableSpace* variableSpace) auto fName = builder.CreateString(*(var->getName())); auto id = CreateIntPair(builder, var->id(), var->index()); - auto fv = CreateFlatVariable(builder, id, fName, static_cast(array->dataType()), 0, fArray); + auto fv = CreateFlatVariable(builder, id, fName, static_cast(array->dataType()), 0, fArray); variables_vector.push_back(fv); arrays++; @@ -742,7 +742,7 @@ Graph* GraphExecutioner::importFromTensorFlow(const char *fileName) { nd4j_verbose("Node id: [%i]; name: [%s]; opName: [%s]\n", n + 1, node.name().c_str(), node.op().c_str()); - nd4j::ops::DeclarableOp *op = nd4j::ops::OpRegistrator::getInstance()->getOperationFloat(node.op().c_str()); + sd::ops::DeclarableOp *op = sd::ops::OpRegistrator::getInstance()->getOperationFloat(node.op().c_str()); if (op == nullptr) { nd4j_verbose("Op wasn't found: %s\n", node.op().c_str()); @@ -863,7 +863,7 @@ flatbuffers::Offset GraphExecutioner::execute(Graph *graph, flatbuff graph->printOut(); auto status = GraphExecutioner::execute(graph); - if (status != nd4j::Status::OK()) + if (status != sd::Status::OK()) throw graph_execution_exception(request->id()); auto outputs = graph->fetchOutputs(); diff --git a/libnd4j/include/graph/impl/GraphHolder.cpp b/libnd4j/include/graph/impl/GraphHolder.cpp index e14f5d6ee..c480508f5 100644 --- a/libnd4j/include/graph/impl/GraphHolder.cpp +++ b/libnd4j/include/graph/impl/GraphHolder.cpp @@ -19,11 +19,11 @@ // #include -#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { GraphHolder* 
GraphHolder::getInstance() { if (_INSTANCE == 0) @@ -38,7 +38,7 @@ namespace nd4j { _graphF[graphId] = graph; - nd4j::SimpleReadWriteLock lock; + sd::SimpleReadWriteLock lock; _locks[graphId] = lock; } diff --git a/libnd4j/include/graph/impl/GraphState.cpp b/libnd4j/include/graph/impl/GraphState.cpp index 52691d801..a8b25603a 100644 --- a/libnd4j/include/graph/impl/GraphState.cpp +++ b/libnd4j/include/graph/impl/GraphState.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { GraphState::GraphState(Nd4jLong id) { _id = id; diff --git a/libnd4j/include/graph/impl/GraphUtils.cpp b/libnd4j/include/graph/impl/GraphUtils.cpp index 4af7df1a2..15f674ce1 100644 --- a/libnd4j/include/graph/impl/GraphUtils.cpp +++ b/libnd4j/include/graph/impl/GraphUtils.cpp @@ -35,7 +35,7 @@ //#include //#include #endif -namespace nd4j { +namespace sd { namespace graph { bool GraphUtils::filterOperations(GraphUtils::OpList& ops) { @@ -69,7 +69,7 @@ std::string GraphUtils::makeCommandLine(GraphUtils::OpList& ops) { std::string res; if (!ops.empty()) { - res += std::string(" -g \"-DLIBND4J_OPS_LIST='"); + res += std::string(" -g \"-DSD_OPS_LIST='"); //res += *(ops[0].getOpName()); for (int i = 0; i < ops.size(); i++) { res += std::string("-DOP_"); diff --git a/libnd4j/include/graph/impl/InferenceRequest.cpp b/libnd4j/include/graph/impl/InferenceRequest.cpp index c60d0a08c..29fde1eb1 100644 --- a/libnd4j/include/graph/impl/InferenceRequest.cpp +++ b/libnd4j/include/graph/impl/InferenceRequest.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { InferenceRequest::InferenceRequest(Nd4jLong graphId, ExecutorConfiguration *configuration) { this->_id = graphId; diff --git a/libnd4j/include/graph/impl/Intervals.cpp b/libnd4j/include/graph/impl/Intervals.cpp index d5c879eaa..1a89c797f 100644 --- a/libnd4j/include/graph/impl/Intervals.cpp +++ b/libnd4j/include/graph/impl/Intervals.cpp @@ -19,7 +19,7 @@ // #include -namespace nd4j { +namespace 
sd { // default constructor Intervals::Intervals(): _content({{}}) {} diff --git a/libnd4j/include/graph/impl/Node.cpp b/libnd4j/include/graph/impl/Node.cpp index 4c79ccb3e..e3ea75ef9 100644 --- a/libnd4j/include/graph/impl/Node.cpp +++ b/libnd4j/include/graph/impl/Node.cpp @@ -40,52 +40,52 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { - void nd4j::graph::Node::setOuterTime(Nd4jLong time){ + void sd::graph::Node::setOuterTime(Nd4jLong time){ // if (hasBlockAttached()) // _block->setOuterTime(time); } - void nd4j::graph::Node::setInnerTime(Nd4jLong time){ + void sd::graph::Node::setInnerTime(Nd4jLong time){ // if (hasBlockAttached()) // _block->setInnerTime(time); } - void nd4j::graph::Node::setGraph(nd4j::graph::Graph* graph) { + void sd::graph::Node::setGraph(sd::graph::Graph* graph) { _graph = graph; } - nd4j::graph::Graph* nd4j::graph::Node::getGraph() { + sd::graph::Graph* sd::graph::Node::getGraph() { return _graph; } - bool nd4j::graph::Node::hasGraphEmbedded() { + bool sd::graph::Node::hasGraphEmbedded() { return _graph != nullptr; } - void nd4j::graph::Node::markInplace(bool reallyInplace) { + void sd::graph::Node::markInplace(bool reallyInplace) { _isInplace = reallyInplace; if (_protoContext != nullptr) { _protoContext->markInplace(reallyInplace); } } - OpClass nd4j::graph::Node::getOpClass() { + OpClass sd::graph::Node::getOpClass() { return _opClass; } - bool nd4j::graph::Node::hasBlockAttached() { + bool sd::graph::Node::hasBlockAttached() { return _protoContext != nullptr; } - bool nd4j::graph::Node::isInplace() { + bool sd::graph::Node::isInplace() { return _isInplace; } - bool nd4j::graph::Node::isDivergencePoint() { + bool sd::graph::Node::isDivergencePoint() { if (hasCustomOp()) { return _customOp->getOpDescriptor()->isDivergent(); } else if (opType() == OpType_LOGIC && opNum() == 30) @@ -94,11 +94,11 @@ namespace nd4j { return false; } - void nd4j::graph::Node::setActive(bool reallyActive) { 
+ void sd::graph::Node::setActive(bool reallyActive) { _active = reallyActive; } - bool nd4j::graph::Node::isActive() { + bool sd::graph::Node::isActive() { return _active; } @@ -110,7 +110,7 @@ namespace nd4j { _frameId = frameId; } - ContextPrototype * nd4j::graph::Node::getContextPrototype() { + ContextPrototype * sd::graph::Node::getContextPrototype() { if (_protoContext == nullptr) _protoContext = new ContextPrototype(this->getCustomOp() != nullptr ? this->getCustomOp()->getOpDescriptor() : nullptr, this->id()); if (_protoContext->inputs()->empty()) { @@ -121,22 +121,22 @@ namespace nd4j { return _protoContext; } - void nd4j::graph::Node::setContextPrototype(ContextPrototype *block) { + void sd::graph::Node::setContextPrototype(ContextPrototype *block) { if (_protoContext != nullptr) throw std::runtime_error("Block already exists"); _protoContext = block; } - void nd4j::graph::Node::setId(int id) { + void sd::graph::Node::setId(int id) { _id = id; } - nd4j::ops::DeclarableOp* nd4j::graph::Node::getCustomOp() { + sd::ops::DeclarableOp* sd::graph::Node::getCustomOp() { return _customOp; } - void nd4j::graph::Node::setCustomOp(nd4j::ops::DeclarableOp *customOp) { + void sd::graph::Node::setCustomOp(sd::ops::DeclarableOp *customOp) { _customOp = customOp; // divergent ops (Switch etc) are always inplace, they don't allocate anything @@ -144,40 +144,40 @@ namespace nd4j { _isInplace = true; } - bool nd4j::graph::Node::hasCustomOp() { + bool sd::graph::Node::hasCustomOp() { return _customOp != nullptr; } - std::string * nd4j::graph::Node::name() { + std::string * sd::graph::Node::name() { return this->getName(); } - std::string * nd4j::graph::Node::getName() { + std::string * sd::graph::Node::getName() { return &_name; } - void nd4j::graph::Node::setName(const std::string& name) { + void sd::graph::Node::setName(const std::string& name) { _name = name.c_str(); } - void nd4j::graph::Node::setName(std::string *name) { + void sd::graph::Node::setName(std::string *name) 
{ _name = *name; } - double nd4j::graph::Node::scalar() { + double sd::graph::Node::scalar() { return _scalar.e(0); }; - void nd4j::graph::Node::pickInput(std::pair& pair) { + void sd::graph::Node::pickInput(std::pair& pair) { _input.push_back(pair); } - void nd4j::graph::Node::pickInput(int inputId, int outputId) { + void sd::graph::Node::pickInput(int inputId, int outputId) { std::pair p(inputId,outputId); pickInput(p); } - void nd4j::graph::Node::pickInput(int inputId) { + void sd::graph::Node::pickInput(int inputId) { pickInput(inputId, 0); if (inputId < 0) @@ -186,25 +186,25 @@ namespace nd4j { _hasInternalInputs = true; } - void nd4j::graph::Node::pickExternalOutput(int outputId) { + void sd::graph::Node::pickExternalOutput(int outputId) { std::pair pair(outputId, 0); _output.push_back(pair); _hasExternalOutputs = true; } - void nd4j::graph::Node::pickOutputOnce(int outputId) { + void sd::graph::Node::pickOutputOnce(int outputId) { std::pair pair(outputId, 0); if (std::find(_output.begin(), _output.end(), pair) == _output.end()) pickOutput(outputId); } - void nd4j::graph::Node::pickOutput(int nodeId, int outputId) { + void sd::graph::Node::pickOutput(int nodeId, int outputId) { std::pair pair(nodeId, outputId); _output.emplace_back(pair); } - void nd4j::graph::Node::pickOutput(int outputId) { + void sd::graph::Node::pickOutput(int outputId) { std::pair pair(outputId, 0); _output.emplace_back(pair); @@ -214,47 +214,47 @@ namespace nd4j { _hasInternalOutputs = true; } - int * nd4j::graph::Node::getDimensionsPtr() { + int * sd::graph::Node::getDimensionsPtr() { return _dim; } - std::vector * nd4j::graph::Node::getDimensions() { + std::vector * sd::graph::Node::getDimensions() { return &_dimensions; } - int nd4j::graph::Node::getLayer() { + int sd::graph::Node::getLayer() { return _layer; } - void nd4j::graph::Node::setLayer(int layer) { + void sd::graph::Node::setLayer(int layer) { _layer = layer; } - bool nd4j::graph::Node::hasExternalOutputs() { + bool 
sd::graph::Node::hasExternalOutputs() { return _hasExternalOutputs; } - bool nd4j::graph::Node::hasExternalInputs() { + bool sd::graph::Node::hasExternalInputs() { return _hasExternalInputs; } - bool nd4j::graph::Node::hasInternalOutputs() { + bool sd::graph::Node::hasInternalOutputs() { return _hasInternalOutputs; } - bool nd4j::graph::Node::hasInternalInputs() { + bool sd::graph::Node::hasInternalInputs() { return _hasInternalInputs; } - bool nd4j::graph::Node::isMultiInput() { + bool sd::graph::Node::isMultiInput() { return _input.size() > 1; } - bool nd4j::graph::Node::isMultiOutput() { + bool sd::graph::Node::isMultiOutput() { return _output.size() > 1; } - double * nd4j::graph::Node::extraParams() { + double * sd::graph::Node::extraParams() { return _extraParams; } @@ -266,23 +266,23 @@ namespace nd4j { _referencedBy.emplace_back(nodeId); } - nd4j::graph::OpType nd4j::graph::Node::opType() { + sd::graph::OpType sd::graph::Node::opType() { return _opType; } - int nd4j::graph::Node::id() { + int sd::graph::Node::id() { return _id; } - Nd4jLong nd4j::graph::Node::opNum() { + Nd4jLong sd::graph::Node::opNum() { return _opNum; } - std::vector> *nd4j::graph::Node::input() { + std::vector> *sd::graph::Node::input() { return &_input; } - std::vector> *nd4j::graph::Node::output() { + std::vector> *sd::graph::Node::output() { return &_output; } @@ -313,12 +313,12 @@ namespace nd4j { } BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT Node* Node::asT, (), LIBND4J_TYPES); - nd4j::graph::Node::Node(nd4j::ops::DeclarableOp *customOp, int id, std::initializer_list input, std::initializer_list output, std::initializer_list dimensions, float scalar, std::initializer_list tArgs, std::initializer_list iArgs) { + sd::graph::Node::Node(sd::ops::DeclarableOp *customOp, int id, std::initializer_list input, std::initializer_list output, std::initializer_list dimensions, float scalar, std::initializer_list tArgs, std::initializer_list iArgs) { this->_opType = OpType_CUSTOM; this->_id = id; 
this->_opNum = customOp->getOpHash(); this->_extraParams = nullptr; - this->_dataType = nd4j::DataType::FLOAT32; // float as default + this->_dataType = sd::DataType::FLOAT32; // float as default this->_dim = nullptr; this->_customOp = customOp; @@ -358,16 +358,16 @@ namespace nd4j { this->setContextPrototype(block); } - void nd4j::graph::Node::setOpType(OpType opType) { + void sd::graph::Node::setOpType(OpType opType) { this->_opType = opType; } - nd4j::graph::Node::Node(OpType opType, int opNum, int id, std::initializer_list input, std::initializer_list output, std::initializer_list dimensions, float scalar, std::initializer_list tArgs, std::initializer_list iArgs) { + sd::graph::Node::Node(OpType opType, int opNum, int id, std::initializer_list input, std::initializer_list output, std::initializer_list dimensions, float scalar, std::initializer_list tArgs, std::initializer_list iArgs) { this->_opType = opType; this->_id = id; this->_opNum = opNum; this->_extraParams = nullptr; - this->_dataType = nd4j::DataType::FLOAT32; // float as default + this->_dataType = sd::DataType::FLOAT32; // float as default this->_dim = nullptr; _hasExternalInputs = false; @@ -455,14 +455,14 @@ namespace nd4j { } }; - nd4j::graph::Node::Node(const nd4j::graph::FlatNode *node) { + sd::graph::Node::Node(const sd::graph::FlatNode *node) { _hasExternalInputs = false; _hasExternalOutputs = false; _hasInternalInputs = false; _hasInternalOutputs = false; _extraParams = nullptr; _dim = nullptr; - _dataType = nd4j::DataType::FLOAT32; // float as default + _dataType = sd::DataType::FLOAT32; // float as default if (node->scope_id() != 0) this->_scope_id = node->scope_id(); @@ -470,7 +470,7 @@ namespace nd4j { this->_scope_name = node->scope_name()->str(); if (node->scalar() != nullptr) { - auto scalar = nd4j::graph::FlatUtils::fromFlatArray(node->scalar()); + auto scalar = sd::graph::FlatUtils::fromFlatArray(node->scalar()); _scalar = *scalar; delete scalar; } @@ -589,7 +589,7 @@ namespace nd4j 
{ if (node->extraTypes() != nullptr && node->extraTypes()->size() > 0) { for (int e = 0; e < (int) node->extraTypes()->size(); e++) { - block->getDArguments()->emplace_back((nd4j::DataType) node->extraTypes()->Get(e)); + block->getDArguments()->emplace_back((sd::DataType) node->extraTypes()->Get(e)); } } @@ -626,7 +626,7 @@ namespace nd4j { if (node->extraTypes() != nullptr && node->extraTypes()->size() > 0) { for (int e = 0; e < (int) node->extraTypes()->size(); e++) { - block->getDArguments()->emplace_back((nd4j::DataType) node->extraTypes()->Get(e)); + block->getDArguments()->emplace_back((sd::DataType) node->extraTypes()->Get(e)); } } @@ -636,7 +636,7 @@ namespace nd4j { block->setOpDescriptor(this->getCustomOp()->getOpDescriptor()); } } else if (this->_opType == OpType_CUSTOM) { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(this->opNum()); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(this->opNum()); if (op == nullptr) { nd4j_verbose("Can't find operation: %lld\n", this->opNum()); throw std::runtime_error("Can't find requested operation"); @@ -666,7 +666,7 @@ namespace nd4j { if (node->extraTypes() != nullptr && node->extraTypes()->size() > 0) { for (int e = 0; e < (int) node->extraTypes()->size(); e++) { - block->getDArguments()->emplace_back((nd4j::DataType) node->extraTypes()->Get(e)); + block->getDArguments()->emplace_back((sd::DataType) node->extraTypes()->Get(e)); } } @@ -682,7 +682,7 @@ namespace nd4j { } } - nd4j::DataType Node::dataType() { + sd::DataType Node::dataType() { return _dataType; } @@ -690,7 +690,7 @@ namespace nd4j { return _protoContext; } - nd4j::graph::Node::~Node() { + sd::graph::Node::~Node() { if (_extraParams != nullptr) delete[] _extraParams; @@ -705,132 +705,132 @@ namespace nd4j { } } - int nd4j::graph::Node::getRewindNode() { + int sd::graph::Node::getRewindNode() { return _rewindNode; } - void nd4j::graph::Node::setRewindNode(int nodeId) { + void sd::graph::Node::setRewindNode(int nodeId) 
{ _rewindNode = nodeId; } - std::pair& nd4j::graph::Node::getRewindLayer() { + std::pair& sd::graph::Node::getRewindLayer() { return _rewindLayer; }; - void nd4j::graph::Node::setRewindLayer(int layerId, int stepId) { + void sd::graph::Node::setRewindLayer(int layerId, int stepId) { _rewindLayer.first = layerId; _rewindLayer.second = stepId; } - bool nd4j::graph::Node::equals(Node *other) { + bool sd::graph::Node::equals(Node *other) { if (_opType == other->_opType && _dataType == other->_dataType && _opNum == other->_opNum) return true; return false; } - void nd4j::graph::Node::deleteOpByType(OpType opType, void *op) { + void sd::graph::Node::deleteOpByType(OpType opType, void *op) { switch (opType) { case OpType_PAIRWISE: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_PAIRWISE_BOOL: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_TRANSFORM_STRICT: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_TRANSFORM_SAME: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_TRANSFORM_FLOAT: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_TRANSFORM_BOOL: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_SCALAR: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_SCALAR_BOOL: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_REDUCE_3: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_REDUCE_SAME: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_REDUCE_FLOAT: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_REDUCE_LONG: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_REDUCE_BOOL: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_INDEX_REDUCE: - delete reinterpret_cast(op); + 
delete reinterpret_cast(op); break; case OpType_SUMMARYSTATS: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_RANDOM: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_BROADCAST: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_BROADCAST_BOOL: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; case OpType_CUSTOM: - delete reinterpret_cast(op); + delete reinterpret_cast(op); break; default: throw std::runtime_error("Bad opType passed in"); } } - nd4j::ops::DeclarableOp* nd4j::graph::Node::buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar) { + sd::ops::DeclarableOp* sd::graph::Node::buildOpByType(OpType opType, int numInputs, int numIArgs, int numTArgs, int opNum, NDArray *scalar) { switch (opType) { case OpType_PAIRWISE: - return new nd4j::ops::LegacyPairwiseTransformOp(opNum); + return new sd::ops::LegacyPairwiseTransformOp(opNum); case OpType_PAIRWISE_BOOL: - return new nd4j::ops::LegacyPairwiseTransformBoolOp(opNum); + return new sd::ops::LegacyPairwiseTransformBoolOp(opNum); case OpType_TRANSFORM_STRICT: - return new nd4j::ops::LegacyTransformStrictOp(opNum); + return new sd::ops::LegacyTransformStrictOp(opNum); case OpType_TRANSFORM_SAME: - return new nd4j::ops::LegacyTransformSameOp(opNum); + return new sd::ops::LegacyTransformSameOp(opNum); case OpType_TRANSFORM_FLOAT: - return new nd4j::ops::LegacyTransformFloatOp(opNum); + return new sd::ops::LegacyTransformFloatOp(opNum); case OpType_TRANSFORM_BOOL: - return new nd4j::ops::LegacyTransformBoolOp(opNum); + return new sd::ops::LegacyTransformBoolOp(opNum); case OpType_SCALAR: - return scalar == nullptr ? new nd4j::ops::LegacyScalarOp(opNum) : new nd4j::ops::LegacyScalarOp(opNum, *scalar); + return scalar == nullptr ? 
new sd::ops::LegacyScalarOp(opNum) : new sd::ops::LegacyScalarOp(opNum, *scalar); case OpType_SCALAR_BOOL: - return scalar == nullptr ? new nd4j::ops::LegacyScalarBoolOp(opNum) : new nd4j::ops::LegacyScalarBoolOp(opNum, *scalar); + return scalar == nullptr ? new sd::ops::LegacyScalarBoolOp(opNum) : new sd::ops::LegacyScalarBoolOp(opNum, *scalar); case OpType_REDUCE_3: - return new nd4j::ops::LegacyReduce3Op(opNum); + return new sd::ops::LegacyReduce3Op(opNum); case OpType_REDUCE_SAME: - return new nd4j::ops::LegacyReduceSameOp(opNum); + return new sd::ops::LegacyReduceSameOp(opNum); case OpType_REDUCE_FLOAT: - return new nd4j::ops::LegacyReduceFloatOp(opNum); + return new sd::ops::LegacyReduceFloatOp(opNum); case OpType_REDUCE_LONG: - return new nd4j::ops::LegacyReduceLongOp(opNum); + return new sd::ops::LegacyReduceLongOp(opNum); case OpType_REDUCE_BOOL: - return new nd4j::ops::LegacyReduceBoolOp(opNum); + return new sd::ops::LegacyReduceBoolOp(opNum); case OpType_INDEX_REDUCE: - return new nd4j::ops::LegacyIndexReduceOp(opNum); + return new sd::ops::LegacyIndexReduceOp(opNum); case OpType_SUMMARYSTATS: - return new nd4j::ops::LegacyStatsOp(opNum); + return new sd::ops::LegacyStatsOp(opNum); case OpType_RANDOM: - return new nd4j::ops::LegacyRandomOp(opNum); + return new sd::ops::LegacyRandomOp(opNum); case OpType_BROADCAST: - return new nd4j::ops::LegacyBroadcastOp(opNum); + return new sd::ops::LegacyBroadcastOp(opNum); case OpType_BROADCAST_BOOL: - return new nd4j::ops::LegacyBroadcastBoolOp(opNum); + return new sd::ops::LegacyBroadcastBoolOp(opNum); default: throw std::runtime_error("Bad opType passed in"); } @@ -846,7 +846,7 @@ namespace nd4j { Node* Node::clone() { - if (this->_customOp && this->_opType == nd4j::graph::OpType_CUSTOM) { + if (this->_customOp && this->_opType == sd::graph::OpType_CUSTOM) { auto clone = new Node(this->_customOp, _id); clone->pullValues(this); return clone; @@ -860,7 +860,7 @@ namespace nd4j { if (!_isDeductable) clone->_customOp 
= _customOp; else { - auto c = dynamic_cast(_customOp); + auto c = dynamic_cast(_customOp); clone->_customOp = c->clone(); } diff --git a/libnd4j/include/graph/impl/NodeState.cpp b/libnd4j/include/graph/impl/NodeState.cpp index 1e463622b..d09de9c57 100644 --- a/libnd4j/include/graph/impl/NodeState.cpp +++ b/libnd4j/include/graph/impl/NodeState.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { NodeState::NodeState(int id) { _id = id; diff --git a/libnd4j/include/graph/impl/ResultWrapper.cpp b/libnd4j/include/graph/impl/ResultWrapper.cpp index 00fad0acb..277644acf 100644 --- a/libnd4j/include/graph/impl/ResultWrapper.cpp +++ b/libnd4j/include/graph/impl/ResultWrapper.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { ResultWrapper::ResultWrapper(Nd4jLong size, Nd4jPointer ptr) { if (size <= 0) diff --git a/libnd4j/include/graph/impl/Scope.cpp b/libnd4j/include/graph/impl/Scope.cpp index 60e22a83d..84a8f2f0d 100644 --- a/libnd4j/include/graph/impl/Scope.cpp +++ b/libnd4j/include/graph/impl/Scope.cpp @@ -18,9 +18,9 @@ // Created by raver119 on 14.10.2017. 
// -#include "Scope.h" +#include -namespace nd4j { +namespace sd { namespace graph { Scope::Scope(int id, const char *name) { _id = id; diff --git a/libnd4j/include/graph/impl/SessionLocalStorage.cpp b/libnd4j/include/graph/impl/SessionLocalStorage.cpp index 59ad21c90..9c512b0b6 100644 --- a/libnd4j/include/graph/impl/SessionLocalStorage.cpp +++ b/libnd4j/include/graph/impl/SessionLocalStorage.cpp @@ -18,11 +18,11 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace graph { SessionLocalStorage::SessionLocalStorage(VariableSpace* variableSpace, Stash* stash) { // we start from 1, since key 0 holds original VariableSpace @@ -104,7 +104,7 @@ namespace nd4j { return ntid; } - Nd4jLong nd4j::graph::SessionLocalStorage::startSession() { + Nd4jLong sd::graph::SessionLocalStorage::startSession() { auto tid = getThreadId(); nd4j_debug("Adding ThreadId: %i;\n", (int) tid); diff --git a/libnd4j/include/graph/impl/Stash.cpp b/libnd4j/include/graph/impl/Stash.cpp index c6a573605..618e01c87 100644 --- a/libnd4j/include/graph/impl/Stash.cpp +++ b/libnd4j/include/graph/impl/Stash.cpp @@ -22,7 +22,7 @@ namespace std { - size_t hash::operator()(const nd4j::graph::KeyPair& k) const { + size_t hash::operator()(const sd::graph::KeyPair& k) const { using std::hash; auto res = std::hash()(k.name()); res ^= std::hash()(k.key()) + 0x9e3779b9 + (res << 6) + (res >> 2); @@ -30,14 +30,14 @@ namespace std { } } -namespace nd4j { +namespace sd { namespace graph { - nd4j::graph::KeyPair::KeyPair(int node, const char * name) { + sd::graph::KeyPair::KeyPair(int node, const char * name) { _node = node; _name = std::string(name); } - bool nd4j::graph::KeyPair::operator<(const KeyPair& other) const { + bool sd::graph::KeyPair::operator<(const KeyPair& other) const { if (_node < other._node) return true; else if (_node > other._node) @@ -46,42 +46,42 @@ namespace nd4j { return _name < other._name; } - 
nd4j::graph::Stash::Stash() { + sd::graph::Stash::Stash() { // } - nd4j::graph::Stash::~Stash() { + sd::graph::Stash::~Stash() { if (_handles.size() > 0) this->clear(); } /* -bool nd4j::graph::Stash::checkStash(nd4j::graph::Block& block, const char *name) { +bool sd::graph::Stash::checkStash(sd::graph::Block& block, const char *name) { return checkStash(block.getNodeId(), name); } */ - bool nd4j::graph::Stash::checkStash(int nodeId, const char *name) { + bool sd::graph::Stash::checkStash(int nodeId, const char *name) { KeyPair kp(nodeId, name); return _stash.count(kp) > 0; } /* -nd4j::NDArray* nd4j::graph::Stash::extractArray(nd4j::graph::Block& block, const char *name) { +sd::NDArray* sd::graph::Stash::extractArray(sd::graph::Block& block, const char *name) { return extractArray(block.getNodeId(), name); } */ - nd4j::NDArray* nd4j::graph::Stash::extractArray(int nodeId, const char *name) { + sd::NDArray* sd::graph::Stash::extractArray(int nodeId, const char *name) { KeyPair kp(nodeId, name); return _stash[kp]; } /* -void nd4j::graph::Stash::storeArray(nd4j::graph::Block& block, const char *name, nd4j::NDArray *array) { +void sd::graph::Stash::storeArray(sd::graph::Block& block, const char *name, sd::NDArray *array) { storeArray(block.getNodeId(), name, array); } */ - void nd4j::graph::Stash::storeArray(int nodeId, const char *name, nd4j::NDArray *array) { + void sd::graph::Stash::storeArray(int nodeId, const char *name, sd::NDArray *array) { KeyPair kp(nodeId, name); _stash[kp] = array; @@ -89,7 +89,7 @@ void nd4j::graph::Stash::storeArray(nd4j::graph::Block& block, const char *name, _handles.push_back(array); } - void nd4j::graph::Stash::clear() { + void sd::graph::Stash::clear() { for (auto v: _handles) delete v; diff --git a/libnd4j/include/graph/impl/TimeHolder.cpp b/libnd4j/include/graph/impl/TimeHolder.cpp index b22740943..a292a8997 100644 --- a/libnd4j/include/graph/impl/TimeHolder.cpp +++ b/libnd4j/include/graph/impl/TimeHolder.cpp @@ -20,7 +20,7 @@ 
#include -namespace nd4j { +namespace sd { namespace graph { void TimeHolder::setOuterTime(int nodeId, Nd4jLong time) { diff --git a/libnd4j/include/graph/impl/Variable.cpp b/libnd4j/include/graph/impl/Variable.cpp index 9ff7fbf37..e87c51897 100644 --- a/libnd4j/include/graph/impl/Variable.cpp +++ b/libnd4j/include/graph/impl/Variable.cpp @@ -26,7 +26,7 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { template @@ -52,7 +52,7 @@ namespace nd4j { } BUILD_SINGLE_TEMPLATE(template ND4J_EXPORT Variable* Variable::asT, (), LIBND4J_TYPES); - nd4j::graph::Variable* nd4j::graph::Variable::clone() { + sd::graph::Variable* sd::graph::Variable::clone() { auto result = new Variable(this->isPlaceholder()); result->_external = this->_external; result->_id = this->_id; @@ -72,47 +72,47 @@ namespace nd4j { return result; } - void nd4j::graph::Variable::setIndex(int index) { + void sd::graph::Variable::setIndex(int index) { _index = index; } - bool nd4j::graph::Variable::hasNDArray() { + bool sd::graph::Variable::hasNDArray() { return _ndarray != nullptr; } - void nd4j::graph::Variable::setVariableType(VariableType variableType) { + void sd::graph::Variable::setVariableType(VariableType variableType) { _variableType = variableType; } - bool nd4j::graph::Variable::hasNDArrayList() { + bool sd::graph::Variable::hasNDArrayList() { return _list != nullptr; } - bool nd4j::graph::Variable::isPlaceholder() { + bool sd::graph::Variable::isPlaceholder() { return _placeholder; } - std::string * nd4j::graph::Variable::getName() { + std::string * sd::graph::Variable::getName() { return &_name; } - void nd4j::graph::Variable::setName(std::string *name) { + void sd::graph::Variable::setName(std::string *name) { _name = *name; } - int nd4j::graph::Variable::id() { + int sd::graph::Variable::id() { return _id; } - int nd4j::graph::Variable::index() { + int sd::graph::Variable::index() { return _index; } - void nd4j::graph::Variable::setId(int id) { + void 
sd::graph::Variable::setId(int id) { _id = id; } - bool nd4j::graph::Variable::isEmpty() { + bool sd::graph::Variable::isEmpty() { if (_variableType == VariableType::NDARRAY) return _ndarray == nullptr || !_ndarray->nonNull(); else if (_variableType == VariableType::ARRAY_LIST) @@ -121,29 +121,29 @@ namespace nd4j { return false; } - bool nd4j::graph::Variable::isExternal() { + bool sd::graph::Variable::isExternal() { return _external; } - bool nd4j::graph::Variable::isReadOnly() { + bool sd::graph::Variable::isReadOnly() { return _readOnly; } - void nd4j::graph::Variable::markExternal(bool reallyExternal) { + void sd::graph::Variable::markExternal(bool reallyExternal) { this->_external = reallyExternal; } - void nd4j::graph::Variable::markRemovable(bool reallyRemovable) { + void sd::graph::Variable::markRemovable(bool reallyRemovable) { if (!reallyRemovable) nd4j_debug("",""); this->_removable = reallyRemovable; } - void nd4j::graph::Variable::markReadOnly(bool reallyReadOnly) { + void sd::graph::Variable::markReadOnly(bool reallyReadOnly) { this->_readOnly = reallyReadOnly; } - nd4j::NDArray * nd4j::graph::Variable::getNDArray() { + sd::NDArray * sd::graph::Variable::getNDArray() { if (_variableType != VariableType::NDARRAY) { nd4j_printf("Variable[%i:%i/<%s>] is has [%s] type, but NDArray was requested\n", this->_id, this->_index, this->_name.c_str(), EnumUtils::_VariableTypeToString(_variableType)); } @@ -162,7 +162,7 @@ namespace nd4j { return this->_ndarray; } - nd4j::NDArrayList * nd4j::graph::Variable::getNDArrayList() { + sd::NDArrayList * sd::graph::Variable::getNDArrayList() { if (_variableType != VariableType::ARRAY_LIST) { nd4j_debug("Variable[%i:%i/<%s>] is has [%s] type, but NDArrayList was requested\n", this->_id, this->_index, this->_name.c_str(), EnumUtils::_VariableTypeToString(_variableType)); } @@ -175,24 +175,24 @@ namespace nd4j { } - void nd4j::graph::Variable::setNDArrayList(nd4j::NDArrayList * list) { + void 
sd::graph::Variable::setNDArrayList(sd::NDArrayList * list) { this->_variableType = VariableType::ARRAY_LIST; this->_list = list; } - void nd4j::graph::Variable::setNDArray(nd4j::NDArray * array) { + void sd::graph::Variable::setNDArray(sd::NDArray * array) { this->_variableType = VariableType::NDARRAY; this->_ndarray = array; } - VariableType nd4j::graph::Variable::variableType() { + VariableType sd::graph::Variable::variableType() { return _variableType; } - nd4j::graph::Variable::Variable(const nd4j::graph::FlatVariable *flatVariable) { + sd::graph::Variable::Variable(const sd::graph::FlatVariable *flatVariable) { auto vid = flatVariable->id(); this->_id = vid->first(); this->_index = vid->second(); @@ -211,7 +211,7 @@ namespace nd4j { // ????? if (flatVariable->ndarray() != nullptr) { auto ar = flatVariable->ndarray(); - _ndarray = nd4j::graph::FlatUtils::fromFlatArray(ar); + _ndarray = sd::graph::FlatUtils::fromFlatArray(ar); } _variableType = VariableType::NDARRAY; @@ -223,9 +223,9 @@ namespace nd4j { auto ar = flatVariable->ndarray(); if (ar->dtype() == DType_UTF8) { - _ndarray = nd4j::graph::FlatUtils::fromFlatArray(ar); + _ndarray = sd::graph::FlatUtils::fromFlatArray(ar); } else { - _ndarray = nd4j::graph::FlatUtils::fromFlatArray(ar); + _ndarray = sd::graph::FlatUtils::fromFlatArray(ar); } _variableType = VariableType::NDARRAY; @@ -236,7 +236,7 @@ namespace nd4j { // ????? 
if (flatVariable->ndarray() != nullptr) { auto ar = flatVariable->ndarray(); - _ndarray = nd4j::graph::FlatUtils::fromFlatArray(ar); + _ndarray = sd::graph::FlatUtils::fromFlatArray(ar); // _ndarray->triggerAllocationFlag(true); } @@ -249,7 +249,7 @@ namespace nd4j { if (flatVariable->ndarray() != nullptr) { auto ar = flatVariable->ndarray(); - _ndarray = nd4j::graph::FlatUtils::fromFlatArray(ar); + _ndarray = sd::graph::FlatUtils::fromFlatArray(ar); // _ndarray->triggerAllocationFlag(true); _variableType = VariableType::NDARRAY; @@ -270,16 +270,16 @@ namespace nd4j { } } - std::vector& nd4j::graph::Variable::shape() { + std::vector& sd::graph::Variable::shape() { return _shape; } - nd4j::graph::Variable::Variable(bool placeholder) { + sd::graph::Variable::Variable(bool placeholder) { _placeholder = placeholder; } - nd4j::graph::Variable::Variable(NDArray *array, const char *name ) { + sd::graph::Variable::Variable(NDArray *array, const char *name ) { _ndarray = array; _external = false; @@ -293,13 +293,13 @@ namespace nd4j { } - nd4j::graph::Variable::Variable(NDArray *array, const char *name, int id, int idx) : Variable(array, name) { + sd::graph::Variable::Variable(NDArray *array, const char *name, int id, int idx) : Variable(array, name) { _id = id; _index = idx; } - nd4j::graph::Variable::~Variable() { + sd::graph::Variable::~Variable() { //nd4j_printf("Removing variable [%i:%i]\n", _id, _index); if (_variableType == VariableType::NDARRAY) { nd4j_debug("Removing variable <%i:%i>\n", _id, _index); @@ -323,7 +323,7 @@ namespace nd4j { auto fBuffer = builder.CreateVector(array->asByteVector()); // packing array - auto fArray = CreateFlatArray(builder, fShape, fBuffer, (nd4j::graph::DType) array->dataType()); + auto fArray = CreateFlatArray(builder, fShape, fBuffer, (sd::graph::DType) array->dataType()); // packing id/index of this var auto fVid = CreateIntPair(builder, this->_id, this->_index); @@ -334,7 +334,7 @@ namespace nd4j { stringId = 
builder.CreateString(this->_name); // returning array - return CreateFlatVariable(builder, fVid, stringId, static_cast(array->dataType()), 0, fArray); + return CreateFlatVariable(builder, fVid, stringId, static_cast(array->dataType()), 0, fArray); } else { throw std::runtime_error("Variable::asFlatVariable isn't possible for NDArrayList"); } diff --git a/libnd4j/include/graph/impl/VariableProxy.cpp b/libnd4j/include/graph/impl/VariableProxy.cpp index 5dee4b261..2736e2a9e 100644 --- a/libnd4j/include/graph/impl/VariableProxy.cpp +++ b/libnd4j/include/graph/impl/VariableProxy.cpp @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include -namespace nd4j { +namespace sd { namespace graph { VariableProxy::VariableProxy(VariableSpace* ref) { @@ -110,7 +110,7 @@ namespace nd4j { } - nd4j::graph::Variable *VariableProxy::getVariable(int id) { + sd::graph::Variable *VariableProxy::getVariable(int id) { if (_current->hasVariable(id)) return _current->getVariable(id); @@ -122,7 +122,7 @@ namespace nd4j { } - nd4j::graph::Variable *VariableProxy::getVariable(int id, int idx) { + sd::graph::Variable *VariableProxy::getVariable(int id, int idx) { if (_current->hasVariable(id, idx)) return _current->getVariable(id, idx); @@ -134,7 +134,7 @@ namespace nd4j { } - nd4j::graph::Variable *VariableProxy::getVariable(std::pair& pair) { + sd::graph::Variable *VariableProxy::getVariable(std::pair& pair) { if (_current->hasVariable(pair)) return _current->getVariable(pair); @@ -146,7 +146,7 @@ namespace nd4j { } - nd4j::graph::Variable *VariableProxy::getVariable(std::string *symbol) { + sd::graph::Variable *VariableProxy::getVariable(std::string *symbol) { if (_current->hasVariable(symbol)) return _current->getVariable(symbol); @@ -191,7 +191,7 @@ namespace nd4j { _current->putVariable(id, array); } - void nd4j::graph::VariableProxy::putVariable(int id, int idx, NDArray &array) { + void sd::graph::VariableProxy::putVariable(int id, int idx, NDArray &array) { 
_current->putVariable(id, idx, array); } @@ -205,12 +205,12 @@ namespace nd4j { } - void VariableProxy::trackList(nd4j::NDArrayList* list) { + void VariableProxy::trackList(sd::NDArrayList* list) { _current->trackList(list); } - nd4j::graph::Stash* VariableProxy::getStash() { + sd::graph::Stash* VariableProxy::getStash() { return _current->getStash(); } @@ -260,7 +260,7 @@ namespace nd4j { } - nd4j::graph::VariableSpace* VariableProxy::clone() { + sd::graph::VariableSpace* VariableProxy::clone() { auto clone = new VariableProxy(_backed); delete clone->_current; @@ -279,7 +279,7 @@ namespace nd4j { } - nd4j::memory::Workspace * nd4j::graph::VariableProxy::workspace() { + sd::memory::Workspace * sd::graph::VariableProxy::workspace() { return _workspace; } } diff --git a/libnd4j/include/graph/impl/VariableSpace.cpp b/libnd4j/include/graph/impl/VariableSpace.cpp index de6000630..0e8634d07 100644 --- a/libnd4j/include/graph/impl/VariableSpace.cpp +++ b/libnd4j/include/graph/impl/VariableSpace.cpp @@ -19,19 +19,19 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { - std::vector * nd4j::graph::VariableSpace::getExternalVariables() { + std::vector * sd::graph::VariableSpace::getExternalVariables() { return &_external; } - nd4j::graph::Stash* nd4j::graph::VariableSpace::getStash() { + sd::graph::Stash* sd::graph::VariableSpace::getStash() { return &_stash; } - nd4j::graph::VariableSpace* nd4j::graph::VariableSpace::clone() { + sd::graph::VariableSpace* sd::graph::VariableSpace::clone() { auto result = new VariableSpace(); for (auto const& x : _paired) { @@ -45,12 +45,12 @@ namespace nd4j { return result; } - void VariableSpace::setWorkspace(nd4j::memory::Workspace *workspace) { + void VariableSpace::setWorkspace(sd::memory::Workspace *workspace) { //_workspace = *workspace; } - nd4j::graph::VariableSpace* nd4j::graph::VariableSpace::asT() { + sd::graph::VariableSpace* sd::graph::VariableSpace::asT() { auto result = new VariableSpace(); 
for (auto const& x : _paired) { @@ -65,7 +65,7 @@ namespace nd4j { } - void nd4j::graph::VariableSpace::injectVariable(std::pair &pair, Variable* variable) { + void sd::graph::VariableSpace::injectVariable(std::pair &pair, Variable* variable) { if (pair.second == 0) { if (pair.first < 0) this->_variables[pair.first] = variable; @@ -81,23 +81,23 @@ namespace nd4j { this->_handles->push_back(variable); } - std::vector * nd4j::graph::VariableSpace::getPlaceholders() { + std::vector * sd::graph::VariableSpace::getPlaceholders() { return &_placeholders; } - int nd4j::graph::VariableSpace ::numberOfPlaceholders() { + int sd::graph::VariableSpace ::numberOfPlaceholders() { return _placeholders.size(); } - bool nd4j::graph::VariableSpace::hasVariable(std::string *symbol) { + bool sd::graph::VariableSpace::hasVariable(std::string *symbol) { return _symbolic.count(*symbol) == 1; } - nd4j::graph::Variable * nd4j::graph::VariableSpace::getVariable(std::string *symbol) { + sd::graph::Variable * sd::graph::VariableSpace::getVariable(std::string *symbol) { return _symbolic.at(*symbol); } - bool nd4j::graph::VariableSpace::hasVariable(int id, int index) { + bool sd::graph::VariableSpace::hasVariable(int id, int index) { std::pair pair(id, index); return hasVariable(pair); } @@ -126,12 +126,12 @@ namespace nd4j { return var->isExternal(); } - nd4j::graph::Variable * nd4j::graph::VariableSpace::getVariable(int id, int index) { + sd::graph::Variable * sd::graph::VariableSpace::getVariable(int id, int index) { std::pair pair(id, index); return getVariable(pair); } - nd4j::graph::Variable * nd4j::graph::VariableSpace::getVariable(std::pair& pair) { + sd::graph::Variable * sd::graph::VariableSpace::getVariable(std::pair& pair) { if (pair.first < 0) return getVariable(pair.first); else @@ -141,32 +141,32 @@ namespace nd4j { throw std::runtime_error("Unknown variable requested"); } - bool nd4j::graph::VariableSpace::hasVariable(int id) { + bool sd::graph::VariableSpace::hasVariable(int 
id) { return _variables.count(id) == 1 || _temporary.count(id) == 1; } - bool nd4j::graph::VariableSpace::hasVariable(std::pair& id) { + bool sd::graph::VariableSpace::hasVariable(std::pair& id) { return _paired.count(id) > 0; } - void nd4j::graph::VariableSpace::putOutputVariable(Variable *variable) { + void sd::graph::VariableSpace::putOutputVariable(Variable *variable) { //putVariable(_auto_counter--, variable); putVariable(variable->id(), variable); } - int nd4j::graph::VariableSpace::externalEntries() { + int sd::graph::VariableSpace::externalEntries() { return _external.size(); } - int nd4j::graph::VariableSpace::internalEntries() { + int sd::graph::VariableSpace::internalEntries() { return _internal.size(); } - int nd4j::graph::VariableSpace::totalEntries() { + int sd::graph::VariableSpace::totalEntries() { return externalEntries() + internalEntries(); } - Nd4jLong nd4j::graph::VariableSpace::externalMemory() { + Nd4jLong sd::graph::VariableSpace::externalMemory() { Nd4jLong size = 0; for (auto n: _external) { size += n->getNDArray()->memoryFootprint(); @@ -187,7 +187,7 @@ namespace nd4j { return result; } - Nd4jLong nd4j::graph::VariableSpace::internalMemory() { + Nd4jLong sd::graph::VariableSpace::internalMemory() { Nd4jLong size = 0; for (auto n: _internal) { size += n->getNDArray()->memoryFootprint(); @@ -196,36 +196,36 @@ namespace nd4j { return size; } - Nd4jLong nd4j::graph::VariableSpace::totalMemory() { + Nd4jLong sd::graph::VariableSpace::totalMemory() { return externalMemory() + internalMemory(); } - Variable* nd4j::graph::VariableSpace::putVariable(std::pair& pair, NDArray *array) { + Variable* sd::graph::VariableSpace::putVariable(std::pair& pair, NDArray *array) { auto variable = new Variable(array, nullptr, pair.first, pair.second); this->putVariable(pair, variable); return variable; } - Variable* nd4j::graph::VariableSpace::putVariable(int node, int idx, NDArray *array) { + Variable* sd::graph::VariableSpace::putVariable(int node, int idx, 
NDArray *array) { std::pair pair(node, idx); return this->putVariable(pair, array); } - void nd4j::graph::VariableSpace::putVariable(int node, int idx, Variable *variable) { + void sd::graph::VariableSpace::putVariable(int node, int idx, Variable *variable) { std::pair pair(node, idx); this->putVariable(pair, variable); } - void nd4j::graph::VariableSpace::silentPutVariable(std::pair& pair, Variable *variable) { + void sd::graph::VariableSpace::silentPutVariable(std::pair& pair, Variable *variable) { _varmap.lock(); - //std::pair, nd4j::graph::Variable *> p(pair, variable); + //std::pair, sd::graph::Variable *> p(pair, variable); _paired[pair] = variable; _varmap.unlock(); } - void nd4j::graph::VariableSpace::putVariable(std::pair& pair, Variable *variable) { + void sd::graph::VariableSpace::putVariable(std::pair& pair, Variable *variable) { silentPutVariable(pair, variable); if (variable->isPlaceholder()) @@ -247,11 +247,11 @@ namespace nd4j { } } - void VariableSpace::trackList(nd4j::NDArrayList* list) { + void VariableSpace::trackList(sd::NDArrayList* list) { _lists.emplace_back(list); } - void nd4j::graph::VariableSpace::putVariable(int id, Variable *variable) { + void sd::graph::VariableSpace::putVariable(int id, Variable *variable) { // we don't want to add variables more then once if (_variables.count(id) > 0 || _temporary.count(id) > 0) { auto local = id < 0 ? 
_variables.at(id) : _temporary.at(id); @@ -277,7 +277,7 @@ namespace nd4j { variable->setId(id); if (variable->getName() != nullptr && variable->getName()->length() != 0) { - //std::pair pair(*(variable->getName()), variable); + //std::pair pair(*(variable->getName()), variable); _symbolic[*(variable->getName())] = variable; } @@ -305,8 +305,8 @@ namespace nd4j { } } - void nd4j::graph::VariableSpace::putVariable(int id, int idx, NDArray &array) { - auto *var = new nd4j::graph::Variable(&array, "", id, idx); + void sd::graph::VariableSpace::putVariable(int id, int idx, NDArray &array) { + auto *var = new sd::graph::Variable(&array, "", id, idx); var->markRemovable(false); var->markReadOnly(true); @@ -320,12 +320,12 @@ namespace nd4j { delete var; } - void nd4j::graph::VariableSpace::putVariable(int id, NDArray *array) { - auto *var = new nd4j::graph::Variable(array); + void sd::graph::VariableSpace::putVariable(int id, NDArray *array) { + auto *var = new sd::graph::Variable(array); this->putVariable(id, var); } - nd4j::graph::Variable * nd4j::graph::VariableSpace::getVariable(int id) { + sd::graph::Variable * sd::graph::VariableSpace::getVariable(int id) { if (id < 0) { return _variables.at(id); } else { @@ -333,18 +333,18 @@ namespace nd4j { } } - LaunchContext* nd4j::graph::VariableSpace::launchContext() { + LaunchContext* sd::graph::VariableSpace::launchContext() { return LaunchContext::defaultContext(); } - std::vector* nd4j::graph::VariableSpace::handles() { + std::vector* sd::graph::VariableSpace::handles() { return _handles; } /* * FIXME: this thing have nice chances to become backend-specific! 
*/ - nd4j::graph::VariableSpace::~VariableSpace() { + sd::graph::VariableSpace::~VariableSpace() { // loop through variables and release them for (auto p: *_handles) { delete p; diff --git a/libnd4j/include/graph/impl/VariablesSet.cpp b/libnd4j/include/graph/impl/VariablesSet.cpp index 9b854da30..80f8e3728 100644 --- a/libnd4j/include/graph/impl/VariablesSet.cpp +++ b/libnd4j/include/graph/impl/VariablesSet.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace graph { Nd4jStatus VariablesSet::status() { return _status; diff --git a/libnd4j/include/graph/profiling/GraphProfile.h b/libnd4j/include/graph/profiling/GraphProfile.h index 28daf3ccf..f0ada4f90 100644 --- a/libnd4j/include/graph/profiling/GraphProfile.h +++ b/libnd4j/include/graph/profiling/GraphProfile.h @@ -22,14 +22,14 @@ #define ND4J_GRAPH_PROFILE_H #include "NodeProfile.h" -#include -#include +#include +#include #include #include #include #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT GraphProfile { private: diff --git a/libnd4j/include/graph/profiling/GraphProfilingHelper.h b/libnd4j/include/graph/profiling/GraphProfilingHelper.h index af4107e42..d32d99374 100644 --- a/libnd4j/include/graph/profiling/GraphProfilingHelper.h +++ b/libnd4j/include/graph/profiling/GraphProfilingHelper.h @@ -25,7 +25,7 @@ #include #include "GraphProfile.h" -namespace nd4j { +namespace sd { namespace graph { class GraphProfilingHelper { public: diff --git a/libnd4j/include/graph/profiling/NodeProfile.h b/libnd4j/include/graph/profiling/NodeProfile.h index 62df0c34a..871eb5748 100644 --- a/libnd4j/include/graph/profiling/NodeProfile.h +++ b/libnd4j/include/graph/profiling/NodeProfile.h @@ -21,12 +21,12 @@ #ifndef LIBND4J_NODE_PROFILE_H #define LIBND4J_NODE_PROFILE_H -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace graph { class ND4J_EXPORT NodeProfile { private: diff --git 
a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp index ea8e7bc49..6c9edac15 100644 --- a/libnd4j/include/graph/profiling/impl/GraphProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfile.cpp @@ -21,10 +21,10 @@ #include #include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace graph { GraphProfile::GraphProfile() { updateLast(); @@ -200,7 +200,7 @@ namespace nd4j { }); nd4j_printf("\nTop 30 reports by EXEC:\n", ""); - auto limit = nd4j::math::nd4j_min(30, sorted.size()); + auto limit = sd::math::nd4j_min(30, sorted.size()); for (int e = 0; e < limit; e++) { sorted[e]->printOut(); } diff --git a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp index cbea09616..03c2411e2 100644 --- a/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp +++ b/libnd4j/include/graph/profiling/impl/GraphProfilingHelper.cpp @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace graph { GraphProfile *GraphProfilingHelper::profile(Graph *graph, int iterations) { diff --git a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp index a6a990eb8..bd48fbd28 100644 --- a/libnd4j/include/graph/profiling/impl/NodeProfile.cpp +++ b/libnd4j/include/graph/profiling/impl/NodeProfile.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace graph { NodeProfile::NodeProfile(int id, const char *name) { _id = id; diff --git a/libnd4j/include/graph/scheme/array.fbs b/libnd4j/include/graph/scheme/array.fbs index 2ffce58bd..bb8118aad 100644 --- a/libnd4j/include/graph/scheme/array.fbs +++ b/libnd4j/include/graph/scheme/array.fbs @@ -14,7 +14,7 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -namespace nd4j.graph; +namespace 
sd.graph; // byte order for arrays/buffers enum ByteOrder:byte { diff --git a/libnd4j/include/graph/scheme/config.fbs b/libnd4j/include/graph/scheme/config.fbs index 6bd8e688c..0aac9c05d 100644 --- a/libnd4j/include/graph/scheme/config.fbs +++ b/libnd4j/include/graph/scheme/config.fbs @@ -14,7 +14,7 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -namespace nd4j.graph; +namespace sd.graph; enum ProfilingMode:byte { NONE, // no checks for Z values diff --git a/libnd4j/include/graph/scheme/graph.fbs b/libnd4j/include/graph/scheme/graph.fbs index 0d5383dff..040af1642 100644 --- a/libnd4j/include/graph/scheme/graph.fbs +++ b/libnd4j/include/graph/scheme/graph.fbs @@ -22,7 +22,7 @@ include "result.fbs"; include "request.fbs"; include "array.fbs"; -namespace nd4j.graph; +namespace sd.graph; table UpdaterState { paramName:string; //Name of the parameter the updater state is for diff --git a/libnd4j/include/graph/scheme/node.fbs b/libnd4j/include/graph/scheme/node.fbs index 8e63186f5..e3ad32b76 100644 --- a/libnd4j/include/graph/scheme/node.fbs +++ b/libnd4j/include/graph/scheme/node.fbs @@ -18,7 +18,7 @@ include "array.fbs"; include "utils.fbs"; include "properties.fbs"; -namespace nd4j.graph; +namespace sd.graph; // this structure describes single operation within graph diff --git a/libnd4j/include/graph/scheme/properties.fbs b/libnd4j/include/graph/scheme/properties.fbs index 326c9450a..57ce5f7f3 100644 --- a/libnd4j/include/graph/scheme/properties.fbs +++ b/libnd4j/include/graph/scheme/properties.fbs @@ -16,7 +16,7 @@ include "array.fbs"; -namespace nd4j.graph; +namespace sd.graph; table FlatProperties { diff --git a/libnd4j/include/graph/scheme/request.fbs b/libnd4j/include/graph/scheme/request.fbs index 1a4b1ec0a..a55b5ce82 100644 --- a/libnd4j/include/graph/scheme/request.fbs +++ b/libnd4j/include/graph/scheme/request.fbs @@ -17,7 +17,7 @@ include "variable.fbs"; include "config.fbs"; 
-namespace nd4j.graph; +namespace sd.graph; table FlatInferenceRequest { id:long; // id of the graph to be executed diff --git a/libnd4j/include/graph/scheme/result.fbs b/libnd4j/include/graph/scheme/result.fbs index 3f01c649b..f479a209a 100644 --- a/libnd4j/include/graph/scheme/result.fbs +++ b/libnd4j/include/graph/scheme/result.fbs @@ -18,7 +18,7 @@ include "node.fbs"; include "utils.fbs"; include "variable.fbs"; -namespace nd4j.graph; +namespace sd.graph; table FlatTiming { id:int; // ID of the node for this time report diff --git a/libnd4j/include/graph/scheme/uigraphevents.fbs b/libnd4j/include/graph/scheme/uigraphevents.fbs index 9e24c6535..eb9fa13d6 100644 --- a/libnd4j/include/graph/scheme/uigraphevents.fbs +++ b/libnd4j/include/graph/scheme/uigraphevents.fbs @@ -16,7 +16,7 @@ include "array.fbs"; //For FlatArray -namespace nd4j.graph; +namespace sd.graph; /* An "Event" is any value that may occur multiple times (score vs. iteration, or accuracy for example) diff --git a/libnd4j/include/graph/scheme/uigraphstatic.fbs b/libnd4j/include/graph/scheme/uigraphstatic.fbs index 814c28fa5..b0b19ce17 100644 --- a/libnd4j/include/graph/scheme/uigraphstatic.fbs +++ b/libnd4j/include/graph/scheme/uigraphstatic.fbs @@ -18,7 +18,7 @@ include "utils.fbs"; //For: IntPair include "variable.fbs"; //For: VarType include "array.fbs"; //For: DataType -namespace nd4j.graph; +namespace sd.graph; enum UIInfoType:byte { GRAPH_STRUCTURE, diff --git a/libnd4j/include/graph/scheme/utils.fbs b/libnd4j/include/graph/scheme/utils.fbs index 67ce2e461..f2186869b 100644 --- a/libnd4j/include/graph/scheme/utils.fbs +++ b/libnd4j/include/graph/scheme/utils.fbs @@ -14,7 +14,7 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -namespace nd4j.graph; +namespace sd.graph; table LongPair { first:long; // first diff --git a/libnd4j/include/graph/scheme/variable.fbs b/libnd4j/include/graph/scheme/variable.fbs index 
1e8010d43..da5c3fb11 100644 --- a/libnd4j/include/graph/scheme/variable.fbs +++ b/libnd4j/include/graph/scheme/variable.fbs @@ -17,7 +17,7 @@ include "array.fbs"; include "utils.fbs"; -namespace nd4j.graph; +namespace sd.graph; // Variable type for variables enum VarType:byte { diff --git a/libnd4j/include/helpers/ArrayUtils.h b/libnd4j/include/helpers/ArrayUtils.h index 54e1a4f19..2ecebeb4a 100644 --- a/libnd4j/include/helpers/ArrayUtils.h +++ b/libnd4j/include/helpers/ArrayUtils.h @@ -24,9 +24,9 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ArrayUtils { void toIntPtr(std::initializer_list list, int* target); void toIntPtr(std::vector& list, int* target); diff --git a/libnd4j/include/helpers/AttentionHelper.h b/libnd4j/include/helpers/AttentionHelper.h index 186f959fd..02d9da995 100644 --- a/libnd4j/include/helpers/AttentionHelper.h +++ b/libnd4j/include/helpers/AttentionHelper.h @@ -21,14 +21,14 @@ #ifndef LIBND4J_ATTENTIONHELPER_H #define LIBND4J_ATTENTIONHELPER_H -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { class ND4J_EXPORT AttentionHelper { public: - static nd4j::NDArray multiHeadProject(const nd4j::NDArray* input, const nd4j::NDArray* projectionMatrix, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); - static void multiHeadProjectBp(const nd4j::NDArray* input, const nd4j::NDArray* projectionMatrix, const nd4j::NDArray* eps, nd4j::NDArray* dLdInput, nd4j::NDArray* dLdProjectionMatrix, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); + static sd::NDArray multiHeadProject(const sd::NDArray* input, const sd::NDArray* projectionMatrix, sd::LaunchContext * context = sd::LaunchContext ::defaultContext()); + static void multiHeadProjectBp(const sd::NDArray* input, const sd::NDArray* projectionMatrix, const sd::NDArray* eps, sd::NDArray* dLdInput, sd::NDArray* dLdProjectionMatrix, sd::LaunchContext * context = sd::LaunchContext 
::defaultContext()); }; } diff --git a/libnd4j/include/helpers/BenchmarkHelper.h b/libnd4j/include/helpers/BenchmarkHelper.h index 8dc946a2a..f76f787d8 100644 --- a/libnd4j/include/helpers/BenchmarkHelper.h +++ b/libnd4j/include/helpers/BenchmarkHelper.h @@ -32,17 +32,17 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT BenchmarkHelper { private: @@ -54,7 +54,7 @@ namespace nd4j { void benchmarkScalarOperation(scalar::Ops op, std::string testName, double value, NDArray &x, NDArray &z); - void benchmarkDeclarableOp(nd4j::ops::DeclarableOp &op, std::string testName, Context &context); + void benchmarkDeclarableOp(sd::ops::DeclarableOp &op, std::string testName, Context &context); void benchmarkGEMM(char orderA, std::initializer_list shapeA, char orderB, std::initializer_list shapeB, char orderC, std::initializer_list shapeC); diff --git a/libnd4j/include/helpers/BitwiseUtils.h b/libnd4j/include/helpers/BitwiseUtils.h index 6defc4c49..6b7e5c231 100644 --- a/libnd4j/include/helpers/BitwiseUtils.h +++ b/libnd4j/include/helpers/BitwiseUtils.h @@ -23,11 +23,11 @@ #include #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT BitwiseUtils { public: @@ -55,7 +55,7 @@ namespace nd4j { * This method returns enum * @return */ - static nd4j::ByteOrder asByteOrder(); + static sd::ByteOrder asByteOrder(); /** * This method swaps bytes: LE vs BE diff --git a/libnd4j/include/helpers/BlasHelper.h b/libnd4j/include/helpers/BlasHelper.h index 3b6179b68..b2fe7b60c 100644 --- a/libnd4j/include/helpers/BlasHelper.h +++ b/libnd4j/include/helpers/BlasHelper.h @@ -21,7 +21,7 @@ #ifndef LIBND4J_BLAS_HELPER_H #define LIBND4J_BLAS_HELPER_H -#include +#include #include #include #include @@ -34,7 +34,7 @@ #define CUSOLVERAPI #endif -namespace 
nd4j { +namespace sd { typedef enum{ CUBLAS_STATUS_SUCCESS =0, CUBLAS_STATUS_NOT_INITIALIZED =1, @@ -415,8 +415,8 @@ namespace nd4j { template bool hasGEMM(); - bool hasGEMM(const nd4j::DataType dtype); - bool hasGEMV(const nd4j::DataType dtype); + bool hasGEMM(const sd::DataType dtype); + bool hasGEMV(const sd::DataType dtype); template bool hasBatchedGEMM(); diff --git a/libnd4j/include/helpers/ConstantHelper.h b/libnd4j/include/helpers/ConstantHelper.h index 07ae6d156..3e5681fb6 100644 --- a/libnd4j/include/helpers/ConstantHelper.h +++ b/libnd4j/include/helpers/ConstantHelper.h @@ -21,9 +21,9 @@ #ifndef DEV_TESTS_CONSTANTHELPER_H #define DEV_TESTS_CONSTANTHELPER_H -#include -#include -#include +#include +#include +#include #include #include #include @@ -32,7 +32,7 @@ #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT ConstantHelper { private: static ConstantHelper* _INSTANCE; @@ -55,7 +55,7 @@ namespace nd4j { static int getNumberOfDevices(); void* replicatePointer(void *src, size_t numBytes, memory::Workspace *workspace = nullptr); - ConstantDataBuffer* constantBuffer(const ConstantDescriptor &descriptor, nd4j::DataType dataType); + ConstantDataBuffer* constantBuffer(const ConstantDescriptor &descriptor, sd::DataType dataType); Nd4jLong getCachedAmount(int deviceId); }; diff --git a/libnd4j/include/helpers/ConstantShapeHelper.h b/libnd4j/include/helpers/ConstantShapeHelper.h index 3184a3675..56f9d6aeb 100644 --- a/libnd4j/include/helpers/ConstantShapeHelper.h +++ b/libnd4j/include/helpers/ConstantShapeHelper.h @@ -21,17 +21,17 @@ #ifndef DEV_TESTS_CONSTANTSHAPEHELPER_H #define DEV_TESTS_CONSTANTSHAPEHELPER_H -#include -#include +#include +#include #include #include #include -#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT ConstantShapeHelper { private: @@ -48,21 +48,21 @@ namespace nd4j { static ConstantShapeHelper* getInstance(); - ConstantDataBuffer 
bufferForShapeInfo(nd4j::DataType dataType, char order, const std::vector &shape); + ConstantDataBuffer bufferForShapeInfo(sd::DataType dataType, char order, const std::vector &shape); ConstantDataBuffer bufferForShapeInfo(const ShapeDescriptor &descriptor); ConstantDataBuffer bufferForShapeInfo(const Nd4jLong *shapeInfo); - ConstantDataBuffer bufferForShapeInfo(const nd4j::DataType dataType, const char order, const int rank, const Nd4jLong* shape); + ConstantDataBuffer bufferForShapeInfo(const sd::DataType dataType, const char order, const int rank, const Nd4jLong* shape); - Nd4jLong* emptyShapeInfo(const nd4j::DataType dataType); - Nd4jLong* scalarShapeInfo(const nd4j::DataType dataType); - Nd4jLong* vectorShapeInfo(const Nd4jLong length, const nd4j::DataType dataType); + Nd4jLong* emptyShapeInfo(const sd::DataType dataType); + Nd4jLong* scalarShapeInfo(const sd::DataType dataType); + Nd4jLong* vectorShapeInfo(const Nd4jLong length, const sd::DataType dataType); Nd4jLong* createShapeInfo(const ShapeDescriptor &descriptor); - Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const std::vector &shape); - Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const int rank, const Nd4jLong* shape); - Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const Nd4jLong* shapeInfo); + Nd4jLong* createShapeInfo(const sd::DataType dataType, const char order, const std::vector &shape); + Nd4jLong* createShapeInfo(const sd::DataType dataType, const char order, const int rank, const Nd4jLong* shape); + Nd4jLong* createShapeInfo(const sd::DataType dataType, const Nd4jLong* shapeInfo); - Nd4jLong* createFromExisting(Nd4jLong *shapeInfo, nd4j::memory::Workspace *workspace); + Nd4jLong* createFromExisting(Nd4jLong *shapeInfo, sd::memory::Workspace *workspace); Nd4jLong* createFromExisting(Nd4jLong *shapeInfo, bool destroyOriginal = true); bool checkBufferExistenceForShapeInfo(ShapeDescriptor &descriptor); diff --git 
a/libnd4j/include/helpers/ConstantTadHelper.h b/libnd4j/include/helpers/ConstantTadHelper.h index 3a79a74e3..80efaa86f 100644 --- a/libnd4j/include/helpers/ConstantTadHelper.h +++ b/libnd4j/include/helpers/ConstantTadHelper.h @@ -22,9 +22,9 @@ #ifndef DEV_TESTS_CONSTANTTADHELPER_H #define DEV_TESTS_CONSTANTTADHELPER_H -#include -#include -#include +#include +#include +#include #include #include #include @@ -32,7 +32,7 @@ #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT ConstantTadHelper { private: static ConstantTadHelper *_INSTANCE; diff --git a/libnd4j/include/helpers/CudaLaunchHelper.h b/libnd4j/include/helpers/CudaLaunchHelper.h index 9fec14764..6bf44317f 100644 --- a/libnd4j/include/helpers/CudaLaunchHelper.h +++ b/libnd4j/include/helpers/CudaLaunchHelper.h @@ -22,12 +22,12 @@ #define LIBND4J_CUDALAUNCHHELPER_H -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT CudaLaunchHelper { public: static Triple getFlatLaunchParams(Nd4jLong length, int SM, int CORES, int SHARED_MEMORY); diff --git a/libnd4j/include/helpers/DebugHelper.h b/libnd4j/include/helpers/DebugHelper.h index 3c3fe1d58..b0387dd8c 100644 --- a/libnd4j/include/helpers/DebugHelper.h +++ b/libnd4j/include/helpers/DebugHelper.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_DEBUGHELPER_H #define LIBND4J_DEBUGHELPER_H -#include -#include -#include -#include +#include +#include +#include +#include #include @@ -35,8 +35,8 @@ #include #endif -#include -namespace nd4j { +#include +namespace sd { class NDArray; class ND4J_EXPORT DebugHelper { public: diff --git a/libnd4j/include/helpers/DebugInfo.h b/libnd4j/include/helpers/DebugInfo.h index 345ce0b61..c2efb00fe 100644 --- a/libnd4j/include/helpers/DebugInfo.h +++ b/libnd4j/include/helpers/DebugInfo.h @@ -21,13 +21,13 @@ #ifndef LIBND4J__DEBUG_INFO_HELPER__H #define LIBND4J__DEBUG_INFO_HELPER__H -#include -#include -#include -#include +#include +#include +#include +#include 
#include -#include -#include +#include +#include #ifdef __CUDACC__ @@ -37,7 +37,7 @@ #endif -namespace nd4j { +namespace sd { struct ND4J_EXPORT DebugInfo { double _minValue; double _maxValue; @@ -51,10 +51,10 @@ namespace nd4j { }; FORCEINLINE bool operator==(DebugInfo const& first, DebugInfo const& second) { - return nd4j::math::nd4j_abs(first._minValue - second._minValue) < 0.000001 && - nd4j::math::nd4j_abs(first._maxValue - second._maxValue) < 0.000001 && - nd4j::math::nd4j_abs(first._meanValue - second._meanValue) < 0.000001 && - nd4j::math::nd4j_abs(first._stdDevValue - second._stdDevValue) < 0.000001 && + return sd::math::nd4j_abs(first._minValue - second._minValue) < 0.000001 && + sd::math::nd4j_abs(first._maxValue - second._maxValue) < 0.000001 && + sd::math::nd4j_abs(first._meanValue - second._meanValue) < 0.000001 && + sd::math::nd4j_abs(first._stdDevValue - second._stdDevValue) < 0.000001 && first._zeroCount == second._zeroCount && first._positiveCount == second._positiveCount && first._negativeCount == second._negativeCount && diff --git a/libnd4j/include/helpers/EnumUtils.h b/libnd4j/include/helpers/EnumUtils.h index 28b80b50e..6138117c7 100644 --- a/libnd4j/include/helpers/EnumUtils.h +++ b/libnd4j/include/helpers/EnumUtils.h @@ -24,11 +24,11 @@ #include #include -namespace nd4j { +namespace sd { class EnumUtils { public: - static const char * _VariableTypeToString(nd4j::graph::VariableType variableType); - static const char * _OpTypeToString(nd4j::graph::OpType opType); + static const char * _VariableTypeToString(sd::graph::VariableType variableType); + static const char * _OpTypeToString(sd::graph::OpType opType); static const char * _LogicOpToString(int opNum); }; } diff --git a/libnd4j/include/helpers/GradCheck.h b/libnd4j/include/helpers/GradCheck.h index 32f66109a..0d184a5a1 100644 --- a/libnd4j/include/helpers/GradCheck.h +++ b/libnd4j/include/helpers/GradCheck.h @@ -22,10 +22,10 @@ #define LIBND4J_GRADCHECK_H -#include +#include #include 
-namespace nd4j { +namespace sd { class ND4J_EXPORT GradCheck { diff --git a/libnd4j/include/helpers/LoopKind.h b/libnd4j/include/helpers/LoopKind.h index 541e6e5a7..95e9238ad 100644 --- a/libnd4j/include/helpers/LoopKind.h +++ b/libnd4j/include/helpers/LoopKind.h @@ -23,15 +23,15 @@ // #include -#include -// #include -// #include +#include +// #include +// #include // #include // #include // #include // #include -namespace nd4j { +namespace sd { class ND4J_EXPORT LoopKind { @@ -110,17 +110,17 @@ LoopKind::Kind LoopKind::deduceKindOfLoopBroadcast(const Nd4jLong* xShapeInfo, c if (bNDLoopsRanks && bNotCommonVectorCase) { // case x[3,4,5] * y[1,4,5] = z[3,4,5] or reverse x[1,4,5] + y[3,4,5] = z[3,4,5] - if (nd4j::LoopKind::EWS1 == deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo) + if (sd::LoopKind::EWS1 == deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo) && (1 == shape::sizeAt(yShapeInfo, 0) || 1 == shape::sizeAt(xShapeInfo, 0))) { return EWS1; } if (3 == xRank) - return nd4j::LoopKind::BROADCAST_3D; + return sd::LoopKind::BROADCAST_3D; if (4 == xRank) - return nd4j::LoopKind::BROADCAST_4D; + return sd::LoopKind::BROADCAST_4D; if (5 == xRank) - return nd4j::LoopKind::BROADCAST_5D; + return sd::LoopKind::BROADCAST_5D; } @@ -136,12 +136,12 @@ LoopKind::Kind LoopKind::deduceKindOfLoopBroadcast(const Nd4jLong* xShapeInfo, c auto detect = xShapeInfo[xRank] == 1 ? -1 : (yShapeInfo[xRank] == 1) ? 
1 : 0; if (detect == 1) - return nd4j::LoopKind::BROADCAST_SCALAR_Y; + return sd::LoopKind::BROADCAST_SCALAR_Y; else if (detect == -1) - return nd4j::LoopKind::BROADCAST_SCALAR_X; + return sd::LoopKind::BROADCAST_SCALAR_X; } - return nd4j::LoopKind::COMMON; + return sd::LoopKind::COMMON; } ////////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h index 680a5f0aa..508b84f20 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -22,18 +22,18 @@ #define LIBND4J_LOOPS_H #include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { template class ND4J_EXPORT ReductionLoops { @@ -263,7 +263,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////////// template template - void nd4j::ReductionLoops::loopReduce(X* x, Nd4jLong* xShapeInfo, + void sd::ReductionLoops::loopReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, E* extraParams, int64_t start, int64_t stop) { @@ -429,7 +429,7 @@ namespace nd4j { //*********************************************// case LoopKind::X_EWSNONZERO: { uint castZShapeInfo[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; @@ -447,7 +447,7 @@ namespace nd4j { //*********************************************// case LoopKind::Z_EWSNONZERO: { uint castTadShapeInfo[MAX_RANK]; - const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); + const bool canCastTad = sd::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); for (auto i = 
start; i < stop; i++) { auto tad = x + tadOffsets[i]; @@ -469,7 +469,7 @@ namespace nd4j { shape::calcOffsets(tadShapeInfo, innertadOffsets); uint castZShapeInfo[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); for (auto i = start; i < stop; i++) { auto tad = x + tadOffsets[i]; @@ -492,7 +492,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////////// template template - void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, + void sd::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, E* extraParams, uint64_t threadId, uint64_t numThreads) { @@ -536,7 +536,7 @@ namespace nd4j { case LoopKind::Z_EWSNONZERO: { const uint zEws = shape::elementWiseStride(zShapeInfo); uint castXShapeInfo[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); int64_t start = span.startX(), stop = span.stopX(); @@ -682,7 +682,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////////// template template - void nd4j::Reduction3Loops::loopReduce3(X* x, Nd4jLong* xShapeInfo, + void sd::Reduction3Loops::loopReduce3(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, int* dims, int dimsLen, @@ -700,21 +700,21 @@ namespace nd4j { std::vector zeroOffsets; if (xLen == yLen) { - tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); - tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); + tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); + tadPackY = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); xTadShapeInfo = tadPackX.primaryShapeInfo(); yTadShapeInfo = tadPackY.primaryShapeInfo(); xTadOffsets = tadPackX.primaryOffsets(); yTadOffsets = tadPackY.primaryOffsets(); } else if (yLen > xLen) { - tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); + tadPackY = sd::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); xTadShapeInfo = xShapeInfo; yTadShapeInfo = tadPackY.primaryShapeInfo(); yTadOffsets = tadPackY.primaryOffsets(); } else { - tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); + tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); yTadShapeInfo = yShapeInfo; xTadShapeInfo = tadPackX.primaryShapeInfo(); xTadOffsets = tadPackX.primaryOffsets(); @@ -912,7 +912,7 @@ namespace nd4j { //*********************************************// default: { uint castXTadShapeInfo[MAX_RANK]; - const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + const bool canCastXTad = sd::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { Z extraParams[3]; @@ -935,7 +935,7 @@ namespace nd4j { } else { uint castYTadShapeInfo[MAX_RANK]; - const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); + const bool canCastYTad = sd::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); Z extraParams[3]; for (auto i = start; i < stop; i++) { @@ -962,7 +962,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////////// template template - void nd4j::Reduction3Loops::loopReduce3All(X* x, Nd4jLong* xShapeInfo, + void sd::Reduction3Loops::loopReduce3All(X* x, Nd4jLong* xShapeInfo, X* y, Nd4jLong* yShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* 
xTadShapeInfo, Nd4jLong* xTadOffsets, @@ -1188,7 +1188,7 @@ namespace nd4j { //*********************************************// default: { uint castXTadShapeInfo[MAX_RANK]; - const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + const bool canCastXTad = sd::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { Z extraParams[3]; @@ -1213,7 +1213,7 @@ namespace nd4j { } else { uint castYTadShapeInfo[MAX_RANK]; - const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); + const bool canCastYTad = sd::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); Z extraParams[3]; for (Nd4jLong ix = 0; ix < numXTads; ix++) { diff --git a/libnd4j/include/helpers/Loops.hpp b/libnd4j/include/helpers/Loops.hpp index 95b844340..852ef4808 100644 --- a/libnd4j/include/helpers/Loops.hpp +++ b/libnd4j/include/helpers/Loops.hpp @@ -22,12 +22,12 @@ //#define LIBND4J_LOOPS_CPP #include -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { } diff --git a/libnd4j/include/helpers/LoopsCoordsHelper.h b/libnd4j/include/helpers/LoopsCoordsHelper.h index 35f9d2063..cd578b62a 100644 --- a/libnd4j/include/helpers/LoopsCoordsHelper.h +++ b/libnd4j/include/helpers/LoopsCoordsHelper.h @@ -23,9 +23,9 @@ #include #include #include -#include -#include -namespace nd4j { +#include +#include +namespace sd { #if defined(__GNUC__) #define likely(x) __builtin_expect( (x), 1) @@ -78,16 +78,16 @@ namespace nd4j { ZipCoordsState() {} }; -#define COORDS(x,index) ((x).::nd4j::CoordsState<(index)>::coord) -#define STRIDE(x,index) ((x).::nd4j::CoordsState<(index)>::stride) -#define LAST_NUM(x,index) ((x).::nd4j::CoordsState<(index)>::last_num) -#define OF_ADJUST(x,index) ((x).::nd4j::CoordsState<(index)>::adjust) -#define ZIP_LAST_NUM(x,index) ((x).::nd4j::ZipCoordsState<(index)>::last_num) -#define 
ZIP_COORDS(x,index) ((x).::nd4j::ZipCoordsState<(index)>::coord) -#define ZIP_STRIDE1(x,index) ((x).::nd4j::ZipCoordsState<(index)>::stride1) -#define ZIP_STRIDE2(x,index) ((x).::nd4j::ZipCoordsState<(index)>::stride2) -#define ZIP_OF_ADJUST1(x,index) ((x).::nd4j::ZipCoordsState<(index)>::adjust1) -#define ZIP_OF_ADJUST2(x,index) ((x).::nd4j::ZipCoordsState<(index)>::adjust2) +#define COORDS(x,index) ((x).::sd::CoordsState<(index)>::coord) +#define STRIDE(x,index) ((x).::sd::CoordsState<(index)>::stride) +#define LAST_NUM(x,index) ((x).::sd::CoordsState<(index)>::last_num) +#define OF_ADJUST(x,index) ((x).::sd::CoordsState<(index)>::adjust) +#define ZIP_LAST_NUM(x,index) ((x).::sd::ZipCoordsState<(index)>::last_num) +#define ZIP_COORDS(x,index) ((x).::sd::ZipCoordsState<(index)>::coord) +#define ZIP_STRIDE1(x,index) ((x).::sd::ZipCoordsState<(index)>::stride1) +#define ZIP_STRIDE2(x,index) ((x).::sd::ZipCoordsState<(index)>::stride2) +#define ZIP_OF_ADJUST1(x,index) ((x).::sd::ZipCoordsState<(index)>::adjust1) +#define ZIP_OF_ADJUST2(x,index) ((x).::sd::ZipCoordsState<(index)>::adjust2) FORCEINLINE void index2coords_C(Nd4jLong index, const Nd4jLong rank, const Nd4jLong* bases, Nd4jLong* coords) { diff --git a/libnd4j/include/helpers/MKLDNNStream.h b/libnd4j/include/helpers/MKLDNNStream.h index f88ec0e62..f575c48d9 100644 --- a/libnd4j/include/helpers/MKLDNNStream.h +++ b/libnd4j/include/helpers/MKLDNNStream.h @@ -27,7 +27,7 @@ #if defined(HAVE_MKLDNN) -namespace nd4j { +namespace sd { class MKLDNNStream { protected: std::string _opName; @@ -47,7 +47,7 @@ namespace nd4j { static bool isSupported(const std::vector &arrays) { // FIXME: strict float support doesn't work anymore for (auto v:arrays) { - if (v != nullptr && v->dataType() != nd4j::DataType::FLOAT32) { + if (v != nullptr && v->dataType() != sd::DataType::FLOAT32) { return false; } } diff --git a/libnd4j/include/helpers/MmulHelper.h b/libnd4j/include/helpers/MmulHelper.h index 76244d050..6e38be5c1 100644 --- 
a/libnd4j/include/helpers/MmulHelper.h +++ b/libnd4j/include/helpers/MmulHelper.h @@ -22,45 +22,45 @@ #ifndef LIBND4J_MMULHELPER_H #define LIBND4J_MMULHELPER_H -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { class ND4J_EXPORT MmulHelper { private: // multiptication N-dimensions tensor on other N-dimensions one - static nd4j::NDArray* mmulNxN(const nd4j::NDArray* A, const nd4j::NDArray* B, nd4j::NDArray* C, const double alpha = 1.0, const double beta = 0.0, const char outOrder = 'f'); + static sd::NDArray* mmulNxN(const sd::NDArray* A, const sd::NDArray* B, sd::NDArray* C, const double alpha = 1.0, const double beta = 0.0, const char outOrder = 'f'); // dot product of vectors (X * Y) = Z[0] - static nd4j::NDArray* dot(const nd4j::NDArray* X, const nd4j::NDArray* Y, nd4j::NDArray* Z, const double alpha = 1.0, const double beta = 0.0); + static sd::NDArray* dot(const sd::NDArray* X, const sd::NDArray* Y, sd::NDArray* Z, const double alpha = 1.0, const double beta = 0.0); // multiptication Matrix to Matrix - static nd4j::NDArray* mmulMxM(const nd4j::NDArray* A, const nd4j::NDArray* B, nd4j::NDArray* C, double alpha = 1.0, double beta = 0.0, const char outOrder = 'f'); + static sd::NDArray* mmulMxM(const sd::NDArray* A, const sd::NDArray* B, sd::NDArray* C, double alpha = 1.0, double beta = 0.0, const char outOrder = 'f'); // multiptication Matrix to vector - static nd4j::NDArray* mmulMxV(const nd4j::NDArray* A, const nd4j::NDArray* B, nd4j::NDArray* C, double alpha = 1.0, double beta = 0.0, const char outOrder = 'f'); + static sd::NDArray* mmulMxV(const sd::NDArray* A, const sd::NDArray* B, sd::NDArray* C, double alpha = 1.0, double beta = 0.0, const char outOrder = 'f'); public: - static nd4j::NDArray* mmul(const nd4j::NDArray* A, const nd4j::NDArray* B, nd4j::NDArray* C = nullptr, const double alpha = 1.0, const double beta = 0.0, const char outOrder = 'f'); + static sd::NDArray* mmul(const sd::NDArray* A, const sd::NDArray* B, 
sd::NDArray* C = nullptr, const double alpha = 1.0, const double beta = 0.0, const char outOrder = 'f'); - static nd4j::NDArray* tensorDot(const nd4j::NDArray* A, const nd4j::NDArray* B, const std::initializer_list& axesA, const std::initializer_list& axesB = {}); + static sd::NDArray* tensorDot(const sd::NDArray* A, const sd::NDArray* B, const std::initializer_list& axesA, const std::initializer_list& axesB = {}); - static nd4j::NDArray* tensorDot(const nd4j::NDArray* A, const nd4j::NDArray* B, const std::vector& axesA, const std::vector& axesB); + static sd::NDArray* tensorDot(const sd::NDArray* A, const sd::NDArray* B, const std::vector& axesA, const std::vector& axesB); - static void tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, nd4j::NDArray* c, const std::vector& axes_a, const std::vector& axes_b, const std::vector& permutForC = {}); + static void tensorDot(const sd::NDArray* a, const sd::NDArray* b, sd::NDArray* c, const std::vector& axes_a, const std::vector& axes_b, const std::vector& permutForC = {}); #ifndef __JAVACPP_HACK__ /** * modif - (can be empty) vector containing a subsequence of permutation/reshaping arrays (in any order), user must take care of correctness of such arrays by himself */ - static void tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, nd4j::NDArray* c, const std::vector>& modifA, const std::vector>& modifB, const std::vector>& modifC); - static nd4j::NDArray* tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, const std::vector>& modifA, const std::vector>& modifB); + static void tensorDot(const sd::NDArray* a, const sd::NDArray* b, sd::NDArray* c, const std::vector>& modifA, const std::vector>& modifB, const std::vector>& modifC); + static sd::NDArray* tensorDot(const sd::NDArray* a, const sd::NDArray* b, const std::vector>& modifA, const std::vector>& modifB); #endif - static void matmul(const nd4j::NDArray* x, const nd4j::NDArray* y, nd4j::NDArray* z, const bool transX, const bool transY); + static void 
matmul(const sd::NDArray* x, const sd::NDArray* y, sd::NDArray* z, const bool transX, const bool transY); }; } diff --git a/libnd4j/include/helpers/OmpLaunchHelper.h b/libnd4j/include/helpers/OmpLaunchHelper.h index dac93cbe2..3e0e50391 100644 --- a/libnd4j/include/helpers/OmpLaunchHelper.h +++ b/libnd4j/include/helpers/OmpLaunchHelper.h @@ -23,10 +23,10 @@ #define LIBND4J_OMPLAUNCHHELPER_H #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT OmpLaunchHelper { diff --git a/libnd4j/include/helpers/OpArgsHolder.h b/libnd4j/include/helpers/OpArgsHolder.h index 5d792105c..a9432f134 100644 --- a/libnd4j/include/helpers/OpArgsHolder.h +++ b/libnd4j/include/helpers/OpArgsHolder.h @@ -22,10 +22,10 @@ #define LIBND4J_OPARGSHOLDER_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT OpArgsHolder { diff --git a/libnd4j/include/helpers/OpBenchmark.h b/libnd4j/include/helpers/OpBenchmark.h index 0a4b32824..328b20dce 100644 --- a/libnd4j/include/helpers/OpBenchmark.h +++ b/libnd4j/include/helpers/OpBenchmark.h @@ -21,13 +21,13 @@ #ifndef DEV_TESTS_OPEXECUTIONER_H #define DEV_TESTS_OPEXECUTIONER_H -#include -#include +#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT OpBenchmark { protected: int _opNum = 0; diff --git a/libnd4j/include/helpers/OpTracker.h b/libnd4j/include/helpers/OpTracker.h index dd03f154f..122f4f32b 100644 --- a/libnd4j/include/helpers/OpTracker.h +++ b/libnd4j/include/helpers/OpTracker.h @@ -24,12 +24,12 @@ #include #include #include -#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT OpTracker { private: static OpTracker* _INSTANCE; @@ -37,7 +37,7 @@ namespace nd4j { std::string _export; int _operations = 0; - std::map> _map; + std::map> _map; OpTracker() = default; ~OpTracker() = default; @@ -50,8 +50,8 @@ namespace nd4j { int totalGroups(); int 
totalOperations(); - void storeOperation(nd4j::graph::OpType opType, const nd4j::ops::OpDescriptor& descriptor); - void storeOperation(nd4j::graph::OpType opType, const char* opName, const Nd4jLong opNum); + void storeOperation(sd::graph::OpType opType, const sd::ops::OpDescriptor& descriptor); + void storeOperation(sd::graph::OpType opType, const char* opName, const Nd4jLong opNum); const char* exportOperations(); }; diff --git a/libnd4j/include/helpers/PointersManager.h b/libnd4j/include/helpers/PointersManager.h index 50fdbccf9..4f7af9409 100644 --- a/libnd4j/include/helpers/PointersManager.h +++ b/libnd4j/include/helpers/PointersManager.h @@ -26,21 +26,21 @@ #include #include -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT PointersManager { private: - nd4j::LaunchContext *_context; + sd::LaunchContext *_context; std::vector _pOnGlobMem; std::string _funcName; public: - PointersManager(const nd4j::LaunchContext* context, const std::string& funcName = ""); + PointersManager(const sd::LaunchContext* context, const std::string& funcName = ""); ~PointersManager(); diff --git a/libnd4j/include/helpers/RandomLauncher.h b/libnd4j/include/helpers/RandomLauncher.h index 2e477e079..49e961062 100644 --- a/libnd4j/include/helpers/RandomLauncher.h +++ b/libnd4j/include/helpers/RandomLauncher.h @@ -18,30 +18,30 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT RandomLauncher { public: - static void applyDropOut(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z = nullptr); - static void applyInvertedDropOut(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z = nullptr); - static void applyAlphaDropOut(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray *array, double retainProb, double alpha, double beta, double alphaPrime, 
NDArray* z = nullptr); + static void applyDropOut(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z = nullptr); + static void applyInvertedDropOut(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z = nullptr); + static void applyAlphaDropOut(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray *array, double retainProb, double alpha, double beta, double alphaPrime, NDArray* z = nullptr); - static void fillUniform(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double from, double to); + static void fillUniform(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double from, double to); - static void fillGaussian(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev); + static void fillGaussian(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev); - static void fillExponential(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double lambda); + static void fillExponential(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double lambda); - static void fillLogNormal(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev); + static void fillLogNormal(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev); - static void fillTruncatedNormal(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev); + static void fillTruncatedNormal(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev); - static void fillBinomial(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, int trials, double prob); + static void 
fillBinomial(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, int trials, double prob); - static void fillBernoulli(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double prob); + static void fillBernoulli(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double prob); }; } \ No newline at end of file diff --git a/libnd4j/include/helpers/ShapeBuilders.h b/libnd4j/include/helpers/ShapeBuilders.h index 2d71c7ab2..e2c29a280 100644 --- a/libnd4j/include/helpers/ShapeBuilders.h +++ b/libnd4j/include/helpers/ShapeBuilders.h @@ -23,25 +23,25 @@ #include #include -#include +#include #include #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT ShapeBuilders { public: - static Nd4jLong* createScalarShapeInfo(nd4j::DataType dataType, nd4j::memory::Workspace* workspace = nullptr); + static Nd4jLong* createScalarShapeInfo(sd::DataType dataType, sd::memory::Workspace* workspace = nullptr); - static Nd4jLong* createVectorShapeInfo(const nd4j::DataType dataType, const Nd4jLong length, nd4j::memory::Workspace* workspace = nullptr); + static Nd4jLong* createVectorShapeInfo(const sd::DataType dataType, const Nd4jLong length, sd::memory::Workspace* workspace = nullptr); /** * create shapeInfo for given order basing on shape stored in shapeOnly vector * memory allocation for shapeInfo is on given workspace */ - static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, int rank, const Nd4jLong* shapeOnly, memory::Workspace* workspace = nullptr); - static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const std::vector& shapeOnly, memory::Workspace* workspace = nullptr); - static Nd4jLong* createShapeInfo(const nd4j::DataType dataType, const char order, const std::initializer_list& shapeOnly, memory::Workspace* workspace = nullptr); + static Nd4jLong* createShapeInfo(const sd::DataType dataType, const char order, int rank, const 
Nd4jLong* shapeOnly, memory::Workspace* workspace = nullptr); + static Nd4jLong* createShapeInfo(const sd::DataType dataType, const char order, const std::vector& shapeOnly, memory::Workspace* workspace = nullptr); + static Nd4jLong* createShapeInfo(const sd::DataType dataType, const char order, const std::initializer_list& shapeOnly, memory::Workspace* workspace = nullptr); /** * allocates memory for new shapeInfo and copy all information from inShapeInfo to new shapeInfo @@ -58,9 +58,9 @@ namespace nd4j { */ static Nd4jLong* copyShapeInfoWithoutUnites(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, memory::Workspace* workspace = nullptr); - static Nd4jLong* emptyShapeInfo(const nd4j::DataType dataType, memory::Workspace* workspace = nullptr); + static Nd4jLong* emptyShapeInfo(const sd::DataType dataType, memory::Workspace* workspace = nullptr); - static Nd4jLong* emptyShapeInfo(const nd4j::DataType dataType, const char order, const std::vector &shape, memory::Workspace* workspace = nullptr); + static Nd4jLong* emptyShapeInfo(const sd::DataType dataType, const char order, const std::vector &shape, memory::Workspace* workspace = nullptr); }; } diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index 39ea3edaa..3b641a09b 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -22,9 +22,9 @@ #define LIBND4J_SHAPEUTILS_H #include -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT ShapeUtils { @@ -35,28 +35,28 @@ namespace nd4j { static std::vector evalShapeForTensorDot(const NDArray* a, const NDArray* b, const std::vector& axesA, const std::vector& axesB, std::vector& permutAt, std::vector& permutBt, std::vector& shapeAt, std::vector& shapeBt); // evaluate resulting shape after reduce operation - static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const NDArray& arr, const nd4j::DataType dataType, const bool keepDims 
= false, const bool supportOldShapes = false, nd4j::memory::Workspace* workspace = nullptr); - static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const Nd4jLong* shapeInfo, const nd4j::DataType dataType, const bool keepDims = false, const bool supportOldShapes = false, nd4j::memory::Workspace* workspace = nullptr); - static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const NDArray& arr, const bool keepDims = false, const bool supportOldShapes = false, nd4j::memory::Workspace* workspace = nullptr); - static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const Nd4jLong* shapeInfo, const bool keepDims = false, const bool supportOldShapes = false, nd4j::memory::Workspace* workspace = nullptr); + static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const NDArray& arr, const sd::DataType dataType, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr); + static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const Nd4jLong* shapeInfo, const sd::DataType dataType, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr); + static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const NDArray& arr, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr); + static Nd4jLong* evalReduceShapeInfo(const char order, std::vector& dimensions, const Nd4jLong* shapeInfo, const bool keepDims = false, const bool supportOldShapes = false, sd::memory::Workspace* workspace = nullptr); /** * evaluate output shape for reduce operation when input shape is empty * behavior is analogous to tf */ - static Nd4jLong* evalReduceShapeInfoEmpty(const char order, std::vector& dimensions, const Nd4jLong *shapeInfo, const nd4j::DataType dataType, const bool keepDims, 
nd4j::memory::Workspace* workspace); + static Nd4jLong* evalReduceShapeInfoEmpty(const char order, std::vector& dimensions, const Nd4jLong *shapeInfo, const sd::DataType dataType, const bool keepDims, sd::memory::Workspace* workspace); // evaluate shape for array which is result of repeat operation applied to arr static std::vector evalRepeatShape(int axis, const std::vector& repeats, const NDArray& arr); // evaluate shapeInfo of permuted array // if setContigStrides = true, then set contiguous strides in output shapeInfo in accordance with arr order - static Nd4jLong* evalPermShapeInfo(const int* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace, const bool setContigStrides = false); - static Nd4jLong* evalPermShapeInfo(const Nd4jLong* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace); + static Nd4jLong* evalPermShapeInfo(const int* dimensions, const int rank, const NDArray& arr, sd::memory::Workspace* workspace, const bool setContigStrides = false); + static Nd4jLong* evalPermShapeInfo(const Nd4jLong* dimensions, const int rank, const NDArray& arr, sd::memory::Workspace* workspace); // evaluate shapeInfo of transposed array // if setContigStrides = true, then set contiguous strides in output shapeInfo in accordance with arr order - static Nd4jLong* evalTranspShapeInfo(const NDArray& arr, nd4j::memory::Workspace* workspace, const bool setContigStrides = false); + static Nd4jLong* evalTranspShapeInfo(const NDArray& arr, sd::memory::Workspace* workspace, const bool setContigStrides = false); static bool copyVectorPart(std::vector& target, std::vector& source, int rank, int offset); @@ -72,8 +72,8 @@ namespace nd4j { // check the possibility of broadcast operation, if true then return shapeInfo of resulting array // if evalMinMax == false then array with larger rank has to be passed as first argument - static bool evalBroadcastShapeInfo(const NDArray& max, const NDArray& min, const bool evalMinMax, 
Nd4jLong*& resultShapeInfo, nd4j::memory::Workspace* workspace); - static bool evalBroadcastShapeInfo(Nd4jLong *max, Nd4jLong *min, const bool evalMinMax, Nd4jLong*& resultShapeInfo, nd4j::memory::Workspace* workspace); + static bool evalBroadcastShapeInfo(const NDArray& max, const NDArray& min, const bool evalMinMax, Nd4jLong*& resultShapeInfo, sd::memory::Workspace* workspace); + static bool evalBroadcastShapeInfo(Nd4jLong *max, Nd4jLong *min, const bool evalMinMax, Nd4jLong*& resultShapeInfo, sd::memory::Workspace* workspace); // evaluate sorted vector of max axes to create tads along in case of simple broadcast operation // if simple broadcast is not possible then empty vector is returned @@ -88,7 +88,7 @@ namespace nd4j { static std::vector getDimsWithSameShape(const NDArray& max, const NDArray& min); // evaluate shapeInfo for resulting array of tile operation - static Nd4jLong* evalTileShapeInfo(const NDArray& arr, const std::vector& reps, nd4j::memory::Workspace* workspace); + static Nd4jLong* evalTileShapeInfo(const NDArray& arr, const std::vector& reps, sd::memory::Workspace* workspace); // returns shape part of shapeInfo as std::vector static std::vector pullShapeFromShapeInfo(Nd4jLong *shapeInfo); @@ -104,13 +104,13 @@ namespace nd4j { static std::vector shapeAsVector(const Nd4jLong* shapeInfo); // evaluate shapeInfo for diagonal array which is made using input arr elements as diagonal - static Nd4jLong* evalDiagShapeInfo(const Nd4jLong* shapeInfo, nd4j::memory::Workspace* workspace); + static Nd4jLong* evalDiagShapeInfo(const Nd4jLong* shapeInfo, sd::memory::Workspace* workspace); static std::vector evalBroadcastBackwardAxis(const Nd4jLong *operand, const Nd4jLong *result); // utility to calculate matrix product shape with give source shapes and additional params // returns ShapeList pointer with result shape - static Nd4jLong* matrixProductShape(Nd4jLong* theFirstShape, Nd4jLong* theSecondShape, bool shouldTranspondFirst, bool shouldTranspondSecond, 
nd4j::DataType dtype, nd4j::memory::Workspace* workspace); + static Nd4jLong* matrixProductShape(Nd4jLong* theFirstShape, Nd4jLong* theSecondShape, bool shouldTranspondFirst, bool shouldTranspondSecond, sd::DataType dtype, sd::memory::Workspace* workspace); /** * This method evaluates permutation vector necessary for reducing of shapeFrom to shapeTo diff --git a/libnd4j/include/helpers/SimpleReadWriteLock.h b/libnd4j/include/helpers/SimpleReadWriteLock.h index b7637f355..5d1fce711 100644 --- a/libnd4j/include/helpers/SimpleReadWriteLock.h +++ b/libnd4j/include/helpers/SimpleReadWriteLock.h @@ -23,7 +23,7 @@ #include #include -#include +#include /** * This class provides PRIMITIVE read-write lock, and should NOT be used outside of GraphServer due to its inefficiency. @@ -31,7 +31,7 @@ * * Basic idea: write lock won't be obtained before all read requests served */ -namespace nd4j { +namespace sd { class ND4J_EXPORT SimpleReadWriteLock { private: std::atomic _read_locks; diff --git a/libnd4j/include/helpers/StringUtils.h b/libnd4j/include/helpers/StringUtils.h index 7a0e2a960..ef9586637 100644 --- a/libnd4j/include/helpers/StringUtils.h +++ b/libnd4j/include/helpers/StringUtils.h @@ -23,15 +23,15 @@ #ifndef LIBND4J_STRINGUTILS_H #define LIBND4J_STRINGUTILS_H -#include -#include +#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT StringUtils { public: template diff --git a/libnd4j/include/helpers/TAD.h b/libnd4j/include/helpers/TAD.h index fb52e639c..6df5a05a2 100644 --- a/libnd4j/include/helpers/TAD.h +++ b/libnd4j/include/helpers/TAD.h @@ -23,7 +23,7 @@ #include -#include +#include namespace shape { @@ -443,7 +443,7 @@ namespace shape { INLINEDEF void TAD::createTadOnlyShapeInfo() { this->tadOnlyShapeInfo = this->shapeInfoOnlyShapeAndStride(); - nd4j::ArrayOptions::setDataType(this->tadOnlyShapeInfo, nd4j::ArrayOptions::dataType(this->originalShapeInfo)); + 
sd::ArrayOptions::setDataType(this->tadOnlyShapeInfo, sd::ArrayOptions::dataType(this->originalShapeInfo)); // possible optimization goes here if (shape::order(this->originalShapeInfo) == 'c' diff --git a/libnd4j/include/helpers/benchmark/BasicSuit.h b/libnd4j/include/helpers/benchmark/BasicSuit.h index 4c06d4e66..1e4c156fb 100644 --- a/libnd4j/include/helpers/benchmark/BasicSuit.h +++ b/libnd4j/include/helpers/benchmark/BasicSuit.h @@ -21,7 +21,7 @@ #ifndef DEV_TESTS_BASICSUIT_H #define DEV_TESTS_BASICSUIT_H -namespace nd4j { +namespace sd { class BasicSuit { protected: diff --git a/libnd4j/include/helpers/benchmark/BoolParameters.h b/libnd4j/include/helpers/benchmark/BoolParameters.h index d6a6cbadb..bac8a0c5c 100644 --- a/libnd4j/include/helpers/benchmark/BoolParameters.h +++ b/libnd4j/include/helpers/benchmark/BoolParameters.h @@ -27,7 +27,7 @@ #include "Parameters.h" #include "ParametersSpace.h" -namespace nd4j { +namespace sd { class BoolParameters : public ParametersSpace { protected: diff --git a/libnd4j/include/helpers/benchmark/BroadcastBenchmark.h b/libnd4j/include/helpers/benchmark/BroadcastBenchmark.h index ac2a643e9..3a043be59 100644 --- a/libnd4j/include/helpers/benchmark/BroadcastBenchmark.h +++ b/libnd4j/include/helpers/benchmark/BroadcastBenchmark.h @@ -23,7 +23,7 @@ #ifndef DEV_TESTS_BROADCASTBENCHMARK_H #define DEV_TESTS_BROADCASTBENCHMARK_H -namespace nd4j { +namespace sd { class ND4J_EXPORT BroadcastBenchmark : public OpBenchmark { public: BroadcastBenchmark() : OpBenchmark() { diff --git a/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h b/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h index 0aa8c35a6..f9347eb05 100644 --- a/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h +++ b/libnd4j/include/helpers/benchmark/DeclarableBenchmark.h @@ -22,25 +22,25 @@ #ifndef DEV_TESTS_DECLARABLEBENCHMARK_H #define DEV_TESTS_DECLARABLEBENCHMARK_H -#include -#include -#include -#include -#include -#include +#include +#include +#include 
+#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT DeclarableBenchmark : public OpBenchmark { protected: - nd4j::ops::DeclarableOp *_op = nullptr; - nd4j::graph::Context *_context = nullptr; + sd::ops::DeclarableOp *_op = nullptr; + sd::graph::Context *_context = nullptr; public: - DeclarableBenchmark(nd4j::ops::DeclarableOp &op, std::string name = 0) : OpBenchmark() { + DeclarableBenchmark(sd::ops::DeclarableOp &op, std::string name = 0) : OpBenchmark() { _op = &op; //ops::OpRegistrator::getInstance()->getOperation(op.getOpHash()); _testName = name; } - void setContext(nd4j::graph::Context *ctx) { + void setContext(sd::graph::Context *ctx) { _context = ctx; } diff --git a/libnd4j/include/helpers/benchmark/IntParameters.h b/libnd4j/include/helpers/benchmark/IntParameters.h index 1615fcaa7..10a1763e4 100644 --- a/libnd4j/include/helpers/benchmark/IntParameters.h +++ b/libnd4j/include/helpers/benchmark/IntParameters.h @@ -27,7 +27,7 @@ #include "Parameters.h" #include "ParametersSpace.h" -namespace nd4j { +namespace sd { class IntParameters : public ParametersSpace { protected: int _start; diff --git a/libnd4j/include/helpers/benchmark/IntPowerParameters.h b/libnd4j/include/helpers/benchmark/IntPowerParameters.h index 81dabfb86..82c58bb23 100644 --- a/libnd4j/include/helpers/benchmark/IntPowerParameters.h +++ b/libnd4j/include/helpers/benchmark/IntPowerParameters.h @@ -27,7 +27,7 @@ #include "Parameters.h" #include "ParametersSpace.h" -namespace nd4j { +namespace sd { class IntPowerParameters : public ParametersSpace { protected: int _base; @@ -47,7 +47,7 @@ namespace nd4j { std::vector evaluate() override { std::vector result; for (int e = _start; e <= _stop; e += _step) { - result.emplace_back(nd4j::math::nd4j_pow(_base, e)); + result.emplace_back(sd::math::nd4j_pow(_base, e)); } return result; } diff --git a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h index 
7c1330648..eb8fd2619 100644 --- a/libnd4j/include/helpers/benchmark/MatrixBenchmark.h +++ b/libnd4j/include/helpers/benchmark/MatrixBenchmark.h @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include "../OpBenchmark.h" -#include +#include +#include #ifndef DEV_TESTS_MATRIXBENCHMARK_H #define DEV_TESTS_MATRIXBENCHMARK_H -namespace nd4j { +namespace sd { class ND4J_EXPORT MatrixBenchmark : public OpBenchmark { private: float _alpha = 1.0f; diff --git a/libnd4j/include/helpers/benchmark/PairwiseBenchmark.h b/libnd4j/include/helpers/benchmark/PairwiseBenchmark.h index 68229b583..ca92e96b3 100644 --- a/libnd4j/include/helpers/benchmark/PairwiseBenchmark.h +++ b/libnd4j/include/helpers/benchmark/PairwiseBenchmark.h @@ -23,9 +23,9 @@ #ifndef DEV_TESTS_PAIRWISEBENCHMARK_H #define DEV_TESTS_PAIRWISEBENCHMARK_H -using namespace nd4j::graph; +using namespace sd::graph; -namespace nd4j { +namespace sd { class ND4J_EXPORT PairwiseBenchmark : public OpBenchmark { public: PairwiseBenchmark() : OpBenchmark() { diff --git a/libnd4j/include/helpers/benchmark/Parameters.h b/libnd4j/include/helpers/benchmark/Parameters.h index 934837e8e..eee443574 100644 --- a/libnd4j/include/helpers/benchmark/Parameters.h +++ b/libnd4j/include/helpers/benchmark/Parameters.h @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { class Parameters { private: std::map _intParams; diff --git a/libnd4j/include/helpers/benchmark/ParametersBatch.h b/libnd4j/include/helpers/benchmark/ParametersBatch.h index 4a7119937..5a045d5cd 100644 --- a/libnd4j/include/helpers/benchmark/ParametersBatch.h +++ b/libnd4j/include/helpers/benchmark/ParametersBatch.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_PARAMETERSBATCH_H #define DEV_TESTS_PARAMETERSBATCH_H -#include "ParametersSpace.h" +#include #include -#include +#include -namespace nd4j { +namespace sd { class ParametersBatch { protected: std::vector _spaces; diff --git a/libnd4j/include/helpers/benchmark/ParametersSpace.h 
b/libnd4j/include/helpers/benchmark/ParametersSpace.h index d245e2319..a7c59f9a6 100644 --- a/libnd4j/include/helpers/benchmark/ParametersSpace.h +++ b/libnd4j/include/helpers/benchmark/ParametersSpace.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { class ParametersSpace { protected: std::string _name; diff --git a/libnd4j/include/helpers/benchmark/PredefinedParameters.h b/libnd4j/include/helpers/benchmark/PredefinedParameters.h index 5f683857d..f2a7fc347 100644 --- a/libnd4j/include/helpers/benchmark/PredefinedParameters.h +++ b/libnd4j/include/helpers/benchmark/PredefinedParameters.h @@ -23,7 +23,7 @@ #include "ParametersSpace.h" -namespace nd4j { +namespace sd { class PredefinedParameters : public ParametersSpace{ std::vector _params; public: diff --git a/libnd4j/include/helpers/benchmark/ReductionBenchmark.h b/libnd4j/include/helpers/benchmark/ReductionBenchmark.h index 460f2e272..a1dc0126f 100644 --- a/libnd4j/include/helpers/benchmark/ReductionBenchmark.h +++ b/libnd4j/include/helpers/benchmark/ReductionBenchmark.h @@ -25,9 +25,9 @@ #ifndef DEV_TESTS_REDUCEBENCHMARK_H #define DEV_TESTS_REDUCEBENCHMARK_H -using namespace nd4j::graph; +using namespace sd::graph; -namespace nd4j { +namespace sd { class ND4J_EXPORT ReductionBenchmark : public OpBenchmark { protected: int _opType; //0=Float, 1=Same diff --git a/libnd4j/include/helpers/benchmark/ScalarBenchmark.h b/libnd4j/include/helpers/benchmark/ScalarBenchmark.h index d24c31b84..3b0cdecf5 100644 --- a/libnd4j/include/helpers/benchmark/ScalarBenchmark.h +++ b/libnd4j/include/helpers/benchmark/ScalarBenchmark.h @@ -22,9 +22,9 @@ #ifndef DEV_TESTS_SCALARBENCHMARK_H #define DEV_TESTS_SCALARBENCHMARK_H -using namespace nd4j::graph; +using namespace sd::graph; -namespace nd4j { +namespace sd { class ND4J_EXPORT ScalarBenchmark : public OpBenchmark { public: ScalarBenchmark() : OpBenchmark() { diff --git a/libnd4j/include/helpers/benchmark/TransformBenchmark.h 
b/libnd4j/include/helpers/benchmark/TransformBenchmark.h index 02f19a7ed..024857633 100644 --- a/libnd4j/include/helpers/benchmark/TransformBenchmark.h +++ b/libnd4j/include/helpers/benchmark/TransformBenchmark.h @@ -22,9 +22,9 @@ #ifndef DEV_TESTS_TRANSFORMBENCHMARK_H #define DEV_TESTS_TRANSFORMBENCHMARK_H -using namespace nd4j::graph; +using namespace sd::graph; -namespace nd4j { +namespace sd { class ND4J_EXPORT TransformBenchmark : public OpBenchmark { protected: diff --git a/libnd4j/include/helpers/biDiagonalUp.h b/libnd4j/include/helpers/biDiagonalUp.h index 279b2a63b..aaf64d41d 100644 --- a/libnd4j/include/helpers/biDiagonalUp.h +++ b/libnd4j/include/helpers/biDiagonalUp.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_BIDIAGONALUP_H #define LIBND4J_BIDIAGONALUP_H -#include -#include "NDArray.h" +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/cpu/ConstantHelper.cpp b/libnd4j/include/helpers/cpu/ConstantHelper.cpp index f6981d582..10b8a52c3 100644 --- a/libnd4j/include/helpers/cpu/ConstantHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantHelper.cpp @@ -21,14 +21,14 @@ #ifndef __CUDABLAS__ -#include +#include #include #include #include -#include +#include #include -namespace nd4j { +namespace sd { ConstantHelper::ConstantHelper() { int numDevices = getNumberOfDevices(); @@ -44,7 +44,7 @@ namespace nd4j { ConstantHelper* ConstantHelper::getInstance() { if (!_INSTANCE) - _INSTANCE = new nd4j::ConstantHelper(); + _INSTANCE = new sd::ConstantHelper(); return _INSTANCE; } @@ -70,7 +70,7 @@ namespace nd4j { return AffinityManager::numberOfDevices(); } - ConstantDataBuffer* ConstantHelper::constantBuffer(const ConstantDescriptor &descriptor, nd4j::DataType dataType) { + ConstantDataBuffer* ConstantHelper::constantBuffer(const ConstantDescriptor &descriptor, sd::DataType dataType) { const auto deviceId = getCurrentDevice(); // we're locking away cache modification @@ -100,9 +100,9 @@ namespace nd4j 
{ // create buffer with this dtype if (descriptor.isFloat()) { - BUILD_DOUBLE_SELECTOR(nd4j::DataType::DOUBLE, dataType, nd4j::TypeCast::convertGeneric, (nullptr, const_cast(descriptor.floatValues().data()), descriptor.length(), cbuff), (nd4j::DataType::DOUBLE, double), LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(sd::DataType::DOUBLE, dataType, sd::TypeCast::convertGeneric, (nullptr, const_cast(descriptor.floatValues().data()), descriptor.length(), cbuff), (sd::DataType::DOUBLE, double), LIBND4J_TYPES); } else if (descriptor.isInteger()) { - BUILD_DOUBLE_SELECTOR(nd4j::DataType::INT64, dataType, nd4j::TypeCast::convertGeneric, (nullptr, const_cast(descriptor.integerValues().data()), descriptor.length(), cbuff), (nd4j::DataType::INT64, Nd4jLong), LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(sd::DataType::INT64, dataType, sd::TypeCast::convertGeneric, (nullptr, const_cast(descriptor.integerValues().data()), descriptor.length(), cbuff), (sd::DataType::INT64, Nd4jLong), LIBND4J_TYPES); } ConstantDataBuffer dataBuffer(cbuff, nullptr, descriptor.length(), DataTypeUtils::sizeOf(dataType)); @@ -123,7 +123,7 @@ namespace nd4j { return _counters[deviceId]; } - nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0; + sd::ConstantHelper* sd::ConstantHelper::_INSTANCE = 0; } #endif \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp b/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp index 5ab1e91f7..b7ffa15f5 100644 --- a/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantShapeHelper.cpp @@ -20,12 +20,12 @@ #ifndef __CUDABLAS__ -#include "../ConstantShapeHelper.h" -#include -#include -#include +#include +#include +#include +#include -namespace nd4j { +namespace sd { ConstantShapeHelper::ConstantShapeHelper() { _cache.resize(32); for (int e = 0; e < 32; e++) { @@ -41,12 +41,12 @@ namespace nd4j { return _INSTANCE; } - ConstantDataBuffer ConstantShapeHelper::bufferForShapeInfo(nd4j::DataType 
dataType, char order, const std::vector &shape) { + ConstantDataBuffer ConstantShapeHelper::bufferForShapeInfo(sd::DataType dataType, char order, const std::vector &shape) { ShapeDescriptor descriptor(dataType, order, shape); return bufferForShapeInfo(descriptor); } - ConstantDataBuffer ConstantShapeHelper::bufferForShapeInfo(const nd4j::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { + ConstantDataBuffer ConstantShapeHelper::bufferForShapeInfo(const sd::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { ShapeDescriptor descriptor(dataType, order, shape, rank); return bufferForShapeInfo(descriptor); } @@ -94,31 +94,31 @@ namespace nd4j { return result; } - Nd4jLong* ConstantShapeHelper::createShapeInfo(const nd4j::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { + Nd4jLong* ConstantShapeHelper::createShapeInfo(const sd::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { ShapeDescriptor descriptor(dataType, order, shape, rank); return bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::createShapeInfo(const nd4j::DataType dataType, const Nd4jLong* shapeInfo) { + Nd4jLong* ConstantShapeHelper::createShapeInfo(const sd::DataType dataType, const Nd4jLong* shapeInfo) { return ConstantShapeHelper::createShapeInfo(dataType, shape::order(shapeInfo), shape::rank(shapeInfo), shape::shapeOf(const_cast(shapeInfo))); } - Nd4jLong* ConstantShapeHelper::emptyShapeInfo(const nd4j::DataType dataType) { + Nd4jLong* ConstantShapeHelper::emptyShapeInfo(const sd::DataType dataType) { auto descriptor = ShapeDescriptor::emptyDescriptor(dataType); return bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::scalarShapeInfo(const nd4j::DataType dataType) { + Nd4jLong* ConstantShapeHelper::scalarShapeInfo(const sd::DataType dataType) { auto descriptor = ShapeDescriptor::scalarDescriptor(dataType); return 
bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::vectorShapeInfo(const Nd4jLong length, const nd4j::DataType dataType) { + Nd4jLong* ConstantShapeHelper::vectorShapeInfo(const Nd4jLong length, const sd::DataType dataType) { auto descriptor = ShapeDescriptor::vectorDescriptor(length, dataType); return bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::createShapeInfo(const nd4j::DataType dataType, const char order, const std::vector &shape) { + Nd4jLong* ConstantShapeHelper::createShapeInfo(const sd::DataType dataType, const char order, const std::vector &shape) { ShapeDescriptor descriptor(dataType, order, shape); return bufferForShapeInfo(descriptor).primaryAsT(); } @@ -137,7 +137,7 @@ namespace nd4j { return result; } - Nd4jLong* ConstantShapeHelper::createFromExisting(Nd4jLong *shapeInfo, nd4j::memory::Workspace *workspace) { + Nd4jLong* ConstantShapeHelper::createFromExisting(Nd4jLong *shapeInfo, sd::memory::Workspace *workspace) { ShapeDescriptor descriptor(shapeInfo); auto result = createShapeInfo(descriptor); @@ -146,7 +146,7 @@ namespace nd4j { return result; } - nd4j::ConstantShapeHelper* nd4j::ConstantShapeHelper::_INSTANCE = 0; + sd::ConstantShapeHelper* sd::ConstantShapeHelper::_INSTANCE = 0; } #endif \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp index 9c34cc475..2d94a9e2d 100644 --- a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp @@ -19,13 +19,13 @@ // #include "../ConstantTadHelper.h" -#include -#include +#include +#include #ifndef __CUDABLAS__ -namespace nd4j { +namespace sd { ConstantTadHelper::ConstantTadHelper() { MAP_IMPL pack; @@ -113,7 +113,7 @@ namespace nd4j { } } - nd4j::ConstantTadHelper* nd4j::ConstantTadHelper::_INSTANCE = 0; + sd::ConstantTadHelper* sd::ConstantTadHelper::_INSTANCE = 0; } #endif \ No newline at end of file 
diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp index a2e5bbd34..edbc45fd4 100644 --- a/libnd4j/include/helpers/cpu/MmulHelper.cpp +++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp @@ -19,14 +19,14 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // #include "../MmulHelper.h" -#include +#include #include #include #include #include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////////// // MXK x KxN = MxN -> actual sequence of axes doesn't matter @@ -282,7 +282,7 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, con //////////////////////////////////////////////////////////////////////////// // MXN x N = M -NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, nd4j::NDArray* Y, const double alpha, const double beta, const char outOrder) { +NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, sd::NDArray* Y, const double alpha, const double beta, const char outOrder) { if (X->dataType() != A->dataType()) throw datatype_exception::build("mmulMxV expects all data types to be the same", A->dataType(), X->dataType()); @@ -362,7 +362,7 @@ NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, nd4j::NDArray* //////////////////////////////////////////////////////////////////////////// // (X * Y) = Z[0] -NDArray* MmulHelper::dot(const NDArray* X, const NDArray* Y, nd4j::NDArray* Z, const double alpha, const double beta) { +NDArray* MmulHelper::dot(const NDArray* X, const NDArray* Y, sd::NDArray* Z, const double alpha, const double beta) { if (X->dataType() != Y->dataType()) throw datatype_exception::build("Dot expects all data types to be the same", X->dataType(), Y->dataType()); diff --git a/libnd4j/include/helpers/cpu/PointersManager.cpp b/libnd4j/include/helpers/cpu/PointersManager.cpp index 8116d3703..61eb7b2ec 100644 --- a/libnd4j/include/helpers/cpu/PointersManager.cpp +++ 
b/libnd4j/include/helpers/cpu/PointersManager.cpp @@ -20,16 +20,16 @@ #ifndef __CUDABLAS__ -#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////// -PointersManager::PointersManager(const nd4j::LaunchContext *context, const std::string& funcName) { - _context = const_cast(context); +PointersManager::PointersManager(const sd::LaunchContext *context, const std::string& funcName) { + _context = const_cast(context); _funcName = funcName; } diff --git a/libnd4j/include/helpers/cpu/biDiagonalUp.cpp b/libnd4j/include/helpers/cpu/biDiagonalUp.cpp index 1e3280a82..4623a93ad 100644 --- a/libnd4j/include/helpers/cpu/biDiagonalUp.cpp +++ b/libnd4j/include/helpers/cpu/biDiagonalUp.cpp @@ -19,19 +19,19 @@ // -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// -BiDiagonalUp::BiDiagonalUp(const NDArray& matrix): _HHmatrix(nd4j::NDArrayFactory::create(matrix.ordering(), {matrix.sizeAt(0), matrix.sizeAt(1)}, matrix.dataType(), matrix.getContext())), - _HHbidiag(nd4j::NDArrayFactory::create(matrix.ordering(), {matrix.sizeAt(1), matrix.sizeAt(1)}, matrix.dataType(), matrix.getContext())) { +BiDiagonalUp::BiDiagonalUp(const NDArray& matrix): _HHmatrix(sd::NDArrayFactory::create(matrix.ordering(), {matrix.sizeAt(0), matrix.sizeAt(1)}, matrix.dataType(), matrix.getContext())), + _HHbidiag(sd::NDArrayFactory::create(matrix.ordering(), {matrix.sizeAt(1), matrix.sizeAt(1)}, matrix.dataType(), matrix.getContext())) { // input validation if(matrix.rankOf() != 2 || matrix.isScalar()) diff --git a/libnd4j/include/helpers/cpu/cublasHelper.cpp b/libnd4j/include/helpers/cpu/cublasHelper.cpp index 3dba2d31e..f6f718702 100644 --- a/libnd4j/include/helpers/cpu/cublasHelper.cpp +++ b/libnd4j/include/helpers/cpu/cublasHelper.cpp @@ -20,7 +20,7 
@@ #include "../cublasHelper.h" -namespace nd4j { +namespace sd { static void* handle_() { return nullptr; } @@ -39,7 +39,7 @@ namespace nd4j { CublasHelper* CublasHelper::getInstance() { if (!_INSTANCE) - _INSTANCE = new nd4j::CublasHelper(); + _INSTANCE = new sd::CublasHelper(); return _INSTANCE; } @@ -57,5 +57,5 @@ namespace nd4j { } - nd4j::CublasHelper* nd4j::CublasHelper::_INSTANCE = 0; + sd::CublasHelper* sd::CublasHelper::_INSTANCE = 0; } \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/hhColPivQR.cpp b/libnd4j/include/helpers/cpu/hhColPivQR.cpp index f6f4e3c6c..e118b0bf1 100644 --- a/libnd4j/include/helpers/cpu/hhColPivQR.cpp +++ b/libnd4j/include/helpers/cpu/hhColPivQR.cpp @@ -18,11 +18,11 @@ // Created by Yurii Shyrma on 11.01.2018 // -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/cpu/hhSequence.cpp b/libnd4j/include/helpers/cpu/hhSequence.cpp index 60cbf6ec5..8a2a35329 100644 --- a/libnd4j/include/helpers/cpu/hhSequence.cpp +++ b/libnd4j/include/helpers/cpu/hhSequence.cpp @@ -18,11 +18,11 @@ // Created by Yurii Shyrma on 02.01.2018 // -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -30,7 +30,7 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// HHsequence::HHsequence(const NDArray& vectors, const NDArray& coeffs, const char type): _vectors(vectors), _coeffs(coeffs) { - _diagSize = nd4j::math::nd4j_min(_vectors.sizeAt(0), _vectors.sizeAt(1)); + _diagSize = sd::math::nd4j_min(_vectors.sizeAt(0), _vectors.sizeAt(1)); _shift = 0; _type = type; } @@ -117,7 +117,7 @@ void HHsequence::_applyTo(NDArray& dest) { BUILD_SINGLE_SELECTOR(xType, _mulLeft, (matrix), FLOAT_TYPES); } - BUILD_SINGLE_TEMPLATE(template void HHsequence::_applyTo, (nd4j::NDArray &dest), FLOAT_TYPES); + 
BUILD_SINGLE_TEMPLATE(template void HHsequence::_applyTo, (sd::NDArray &dest), FLOAT_TYPES); BUILD_SINGLE_TEMPLATE(template void HHsequence::_mulLeft, (NDArray& matrix), FLOAT_TYPES); } } diff --git a/libnd4j/include/helpers/cpu/householder.cpp b/libnd4j/include/helpers/cpu/householder.cpp index 024695583..69d4ca3db 100644 --- a/libnd4j/include/helpers/cpu/householder.cpp +++ b/libnd4j/include/helpers/cpu/householder.cpp @@ -18,10 +18,10 @@ // Created by Yurii Shyrma on 18.12.2017 // -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/cpu/jacobiSVD.cpp b/libnd4j/include/helpers/cpu/jacobiSVD.cpp index 4ba2bfe0a..372a2a409 100644 --- a/libnd4j/include/helpers/cpu/jacobiSVD.cpp +++ b/libnd4j/include/helpers/cpu/jacobiSVD.cpp @@ -18,12 +18,12 @@ // Created by Yurii Shyrma on 11.01.2018 // -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp index f0e9e8fd5..fe6019b5a 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp @@ -26,14 +26,14 @@ using namespace simdOps; ////////////////////////////////////////////////////////////////////////////// template template -void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, +void sd::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams) { - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, tadShapeInfo); - if(kindOfLoop == nd4j::LoopKind::SMALLARR2DX) - kindOfLoop = nd4j::LoopKind::EWSNONZERO; + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, 
tadShapeInfo); + if(kindOfLoop == sd::LoopKind::SMALLARR2DX) + kindOfLoop = sd::LoopKind::EWSNONZERO; const Nd4jLong zLen = shape::length(zShapeInfo); const Nd4jLong tadLen = shape::length(tadShapeInfo); @@ -46,7 +46,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, switch (kindOfLoop) { //*********************************************// - case nd4j::LoopKind::EWS1: { + case sd::LoopKind::EWS1: { auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -67,7 +67,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::EWSNONZERO: { + case sd::LoopKind::EWSNONZERO: { auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -88,7 +88,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::RANK1: { + case sd::LoopKind::RANK1: { auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -109,7 +109,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::RANK2: { + case sd::LoopKind::RANK2: { Nd4jLong newStride[2]; shape::updateStrides(2, tadShape, newStride, 'c'); @@ -136,7 +136,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::RANK3: { + case sd::LoopKind::RANK3: { Nd4jLong newStride[3]; shape::updateStrides(3, tadShape, newStride, 'c'); @@ -165,7 +165,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::RANK4: { + case sd::LoopKind::RANK4: { Nd4jLong newStride[4]; shape::updateStrides(4, tadShape, newStride, 'c'); @@ -196,7 +196,7 @@ void 
nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::RANK5: { + case sd::LoopKind::RANK5: { Nd4jLong newStride[5]; shape::updateStrides(5, tadShape, newStride, 'c'); @@ -229,9 +229,9 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::X_EWSNONZERO: { + case sd::LoopKind::X_EWSNONZERO: { uint castZShapeInfo[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -253,9 +253,9 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, break; //*********************************************// - case nd4j::LoopKind::Z_EWSNONZERO: { + case sd::LoopKind::Z_EWSNONZERO: { uint castTadShapeInfo[MAX_RANK]; - const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); + const bool canCastTad = sd::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -280,8 +280,8 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, default: { uint castTadShapeInfo[MAX_RANK]; uint castZShapeInfo[MAX_RANK]; - const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); + const bool canCastTad = sd::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -305,7 +305,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, } template 
-void nd4j::IndexReductionLoops::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* vz, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams) { +void sd::IndexReductionLoops::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* vz, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams) { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_0.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_0.cpp index a694e42ca..68ae29fc9 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_0.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_0, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_0, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_1.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_1.cpp index 236428c4a..fe68715ca 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_1.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* 
xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_1, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_1, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_2.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_2.cpp index 173744c97..8627003fd 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_2.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_2, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_2, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_3.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_3.cpp index fbb2fde50..8b2f4e1a7 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_3.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, 
void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_3, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_3, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_4.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_4.cpp index 1fd8196e7..e87921565 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_4.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_4.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_4, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_4, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_5.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_5.cpp index 1378c661c..062b006fd 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_5.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_5.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, 
Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_5, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_5, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_6.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_6.cpp index 5b338bb09..4182de6fd 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_6.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_6.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_6, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_6, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_7.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_7.cpp index 7dfe7d939..53a4ed23f 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_7.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_7.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* 
zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_7, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_7, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_8.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_8.cpp index 14e91685d..2cf4b6ae7 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_8.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_8.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_8, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_8, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_9.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_9.cpp index 677802476..b6b1da4a0 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_9.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int32_9.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, 
Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_9, (nd4j::DataType::INT32, int32_t)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_9, (sd::DataType::INT32, int32_t)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_0.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_0.cpp index cb295f479..de4cf1872 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_0.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_0, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_0, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_1.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_1.cpp index 8b5914d84..71a19bab2 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_1.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* 
tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_1, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_1, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_2.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_2.cpp index 97d8f5906..22d430e9e 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_2.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_2, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_2, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_3.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_3.cpp index 6463c8584..c2434f63a 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_3.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* 
tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_3, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_3, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_4.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_4.cpp index 365e2a70f..be628bb63 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_4.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_4.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_4, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_4, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_5.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_5.cpp index 77e6bf5be..a5e8a596f 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_5.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_5.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* 
tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_5, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_5, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_6.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_6.cpp index f355d655e..3e96a0574 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_6.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_6.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_6, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_6, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_7.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_7.cpp index 542587b18..a6c02301f 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_7.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_7.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* 
tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_7, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_7, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_8.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_8.cpp index bccc40219..e461c9bcd 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_8.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_8.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_8, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_8, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_9.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_9.cpp index 85475046c..33e5ba403 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_9.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops_int64_9.cpp @@ -21,4 +21,4 @@ #include "./IndexReductionLoops.hpp" -BUILD_DOUBLE_TEMPLATE(template void nd4j::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* 
tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_9, (nd4j::DataType::INT64, Nd4jLong)); \ No newline at end of file +BUILD_DOUBLE_TEMPLATE(template void sd::IndexReductionLoops, ::wrapIndexReduce(const int opNum, void* vx, Nd4jLong* xShapeInfo, void* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, void* vextraParams), LIBND4J_TYPES_9, (sd::DataType::INT64, Nd4jLong)); \ No newline at end of file diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp index b8405553e..f721c5994 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_0.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp index 44ccea08c..19a248896 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_1.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp index ec261a7ea..e90050e4e 100644 --- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_2.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp index 3b1efadc9..d109d1013 100644 
--- a/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/Reduction3Loops_3.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp index 0709e5f3c..943da43ab 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops.hpp @@ -19,4 +19,4 @@ // #include -#include +#include diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp index 151bc6a82..31ec60d93 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_bool.cpp @@ -22,7 +22,7 @@ using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp index af8b0b451..f4243d1c9 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_0.cpp @@ -19,12 +19,12 @@ // #include "ReductionLoops.hpp" -#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp index 137ffc011..1c5b46d40 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_1.cpp @@ -19,12 +19,12 @@ // #include "ReductionLoops.hpp" -#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git 
a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp index 79b11b419..08ca08cdb 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_2.cpp @@ -19,12 +19,12 @@ // #include "ReductionLoops.hpp" -#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp index ddedd6c18..7735c2125 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_float_3.cpp @@ -19,12 +19,12 @@ // #include "ReductionLoops.hpp" -#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp index 2e7708497..e4f4ab2e0 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_long.cpp @@ -24,12 +24,12 @@ using namespace simdOps; #include "ReductionLoops.hpp" -#include +#include #include using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp index 08a67ec59..6188a90f5 100644 --- a/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp +++ b/libnd4j/include/helpers/cpu/loops/ReductionLoops_same.cpp @@ -22,7 +22,7 @@ using namespace simdOps; -namespace nd4j { +namespace sd { template template diff --git a/libnd4j/include/helpers/cpu/svd.cpp b/libnd4j/include/helpers/cpu/svd.cpp index ef129a0d0..4e257b267 100644 --- a/libnd4j/include/helpers/cpu/svd.cpp +++ b/libnd4j/include/helpers/cpu/svd.cpp @@ -18,14 
+18,14 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 03.01.2018 // -#include -#include -#include +#include +#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/cublasHelper.h b/libnd4j/include/helpers/cublasHelper.h index f07cc178c..0300f3698 100644 --- a/libnd4j/include/helpers/cublasHelper.h +++ b/libnd4j/include/helpers/cublasHelper.h @@ -21,12 +21,12 @@ #ifndef DEV_TESTS_CUBLASHELPER_H #define DEV_TESTS_CUBLASHELPER_H -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT CublasHelper { private: static CublasHelper *_INSTANCE; diff --git a/libnd4j/include/helpers/cuda/ConstantHelper.cu b/libnd4j/include/helpers/cuda/ConstantHelper.cu index 678988dd9..62d932489 100644 --- a/libnd4j/include/helpers/cuda/ConstantHelper.cu +++ b/libnd4j/include/helpers/cuda/ConstantHelper.cu @@ -20,12 +20,12 @@ // #include -#include -#include -#include +#include +#include +#include #include -#include -#include +#include +#include #include #include #include @@ -34,7 +34,7 @@ __constant__ char deviceConstantMemory[CONSTANT_LIMIT]; -namespace nd4j { +namespace sd { static void* getConstantSpace() { Nd4jPointer dConstAddr; auto dZ = cudaGetSymbolAddress(reinterpret_cast(&dConstAddr), deviceConstantMemory); @@ -86,7 +86,7 @@ namespace nd4j { ConstantHelper* ConstantHelper::getInstance() { if (!_INSTANCE) - _INSTANCE = new nd4j::ConstantHelper(); + _INSTANCE = new sd::ConstantHelper(); return _INSTANCE; } @@ -133,7 +133,7 @@ namespace nd4j { } } - ConstantDataBuffer* ConstantHelper::constantBuffer(const ConstantDescriptor &descriptor, nd4j::DataType dataType) { + ConstantDataBuffer* ConstantHelper::constantBuffer(const ConstantDescriptor &descriptor, sd::DataType dataType) { const auto deviceId = getCurrentDevice(); // all cache modifications are synchronous @@ -161,9 +161,9 @@ namespace nd4j { // create buffer with 
this dtype if (descriptor.isFloat()) { - BUILD_DOUBLE_SELECTOR(nd4j::DataType::DOUBLE, dataType, nd4j::SpecialTypeConverter::convertGeneric, (nullptr, const_cast(descriptor.floatValues().data()), descriptor.length(), cbuff), (nd4j::DataType::DOUBLE, double), LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(sd::DataType::DOUBLE, dataType, sd::SpecialTypeConverter::convertGeneric, (nullptr, const_cast(descriptor.floatValues().data()), descriptor.length(), cbuff), (sd::DataType::DOUBLE, double), LIBND4J_TYPES); } else if (descriptor.isInteger()) { - BUILD_DOUBLE_SELECTOR(nd4j::DataType::INT64, dataType, nd4j::SpecialTypeConverter::convertGeneric, (nullptr, const_cast(descriptor.integerValues().data()), descriptor.length(), cbuff), (nd4j::DataType::INT64, Nd4jLong), LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(sd::DataType::INT64, dataType, sd::SpecialTypeConverter::convertGeneric, (nullptr, const_cast(descriptor.integerValues().data()), descriptor.length(), cbuff), (sd::DataType::INT64, Nd4jLong), LIBND4J_TYPES); } auto dbuff = replicatePointer(cbuff, descriptor.length() * DataTypeUtils::sizeOf(dataType)); @@ -185,5 +185,5 @@ namespace nd4j { return _counters[deviceId]; } - nd4j::ConstantHelper* nd4j::ConstantHelper::_INSTANCE = 0; + sd::ConstantHelper* sd::ConstantHelper::_INSTANCE = 0; } \ No newline at end of file diff --git a/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu b/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu index 96f2774cd..e4719bd74 100644 --- a/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu +++ b/libnd4j/include/helpers/cuda/ConstantShapeHelper.cu @@ -20,12 +20,12 @@ #include "../ConstantShapeHelper.h" #include -#include -#include -#include -#include +#include +#include +#include +#include -namespace nd4j { +namespace sd { ConstantShapeHelper::ConstantShapeHelper() { auto numDevices = AffinityManager::numberOfDevices(); @@ -44,12 +44,12 @@ namespace nd4j { return _INSTANCE; } - ConstantDataBuffer 
ConstantShapeHelper::bufferForShapeInfo(nd4j::DataType dataType, char order, const std::vector &shape) { + ConstantDataBuffer ConstantShapeHelper::bufferForShapeInfo(sd::DataType dataType, char order, const std::vector &shape) { ShapeDescriptor descriptor(dataType, order, shape); return bufferForShapeInfo(descriptor); } - ConstantDataBuffer ConstantShapeHelper::bufferForShapeInfo(const nd4j::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { + ConstantDataBuffer ConstantShapeHelper::bufferForShapeInfo(const sd::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { ShapeDescriptor descriptor(dataType, order, shape, rank); return bufferForShapeInfo(descriptor); } @@ -83,31 +83,31 @@ namespace nd4j { return _cache[deviceId].count(descriptor) != 0; } - Nd4jLong* ConstantShapeHelper::createShapeInfo(const nd4j::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { + Nd4jLong* ConstantShapeHelper::createShapeInfo(const sd::DataType dataType, const char order, const int rank, const Nd4jLong* shape) { ShapeDescriptor descriptor(dataType, order, shape, rank); return bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::createShapeInfo(const nd4j::DataType dataType, const Nd4jLong* shapeInfo) { + Nd4jLong* ConstantShapeHelper::createShapeInfo(const sd::DataType dataType, const Nd4jLong* shapeInfo) { return ConstantShapeHelper::createShapeInfo(dataType, shape::order(shapeInfo), shape::rank(shapeInfo), shape::shapeOf(const_cast(shapeInfo))); } - Nd4jLong* ConstantShapeHelper::emptyShapeInfo(const nd4j::DataType dataType) { + Nd4jLong* ConstantShapeHelper::emptyShapeInfo(const sd::DataType dataType) { auto descriptor = ShapeDescriptor::emptyDescriptor(dataType); return bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::scalarShapeInfo(const nd4j::DataType dataType) { + Nd4jLong* ConstantShapeHelper::scalarShapeInfo(const sd::DataType dataType) { auto 
descriptor = ShapeDescriptor::scalarDescriptor(dataType); return bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::vectorShapeInfo(const Nd4jLong length, const nd4j::DataType dataType) { + Nd4jLong* ConstantShapeHelper::vectorShapeInfo(const Nd4jLong length, const sd::DataType dataType) { auto descriptor = ShapeDescriptor::vectorDescriptor(length, dataType); return bufferForShapeInfo(descriptor).primaryAsT(); } - Nd4jLong* ConstantShapeHelper::createShapeInfo(const nd4j::DataType dataType, const char order, const std::vector &shape) { + Nd4jLong* ConstantShapeHelper::createShapeInfo(const sd::DataType dataType, const char order, const std::vector &shape) { ShapeDescriptor descriptor(dataType, order, shape); return bufferForShapeInfo(descriptor).primaryAsT(); } @@ -126,7 +126,7 @@ namespace nd4j { return result; } - Nd4jLong* ConstantShapeHelper::createFromExisting(Nd4jLong *shapeInfo, nd4j::memory::Workspace *workspace) { + Nd4jLong* ConstantShapeHelper::createFromExisting(Nd4jLong *shapeInfo, sd::memory::Workspace *workspace) { ShapeDescriptor descriptor(shapeInfo); auto result = createShapeInfo(descriptor); @@ -135,5 +135,5 @@ namespace nd4j { return result; } - nd4j::ConstantShapeHelper* nd4j::ConstantShapeHelper::_INSTANCE = 0; + sd::ConstantShapeHelper* sd::ConstantShapeHelper::_INSTANCE = 0; } \ No newline at end of file diff --git a/libnd4j/include/helpers/cuda/ConstantTadHelper.cu b/libnd4j/include/helpers/cuda/ConstantTadHelper.cu index a1cd3e89f..b7b419070 100644 --- a/libnd4j/include/helpers/cuda/ConstantTadHelper.cu +++ b/libnd4j/include/helpers/cuda/ConstantTadHelper.cu @@ -19,14 +19,14 @@ // #include "../ConstantTadHelper.h" -#include -#include -#include +#include +#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { ConstantTadHelper::ConstantTadHelper() { auto numDevices = AffinityManager::numberOfDevices(); @@ -108,5 +108,5 @@ namespace nd4j { } } - nd4j::ConstantTadHelper* 
nd4j::ConstantTadHelper::_INSTANCE = 0; + sd::ConstantTadHelper* sd::ConstantTadHelper::_INSTANCE = 0; } \ No newline at end of file diff --git a/libnd4j/include/helpers/cuda/PointersManager.cu b/libnd4j/include/helpers/cuda/PointersManager.cu index a5e099980..dc5fe15f5 100644 --- a/libnd4j/include/helpers/cuda/PointersManager.cu +++ b/libnd4j/include/helpers/cuda/PointersManager.cu @@ -19,17 +19,17 @@ // @author raver119@gmail.com // -#include +#include #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////// -PointersManager::PointersManager(const nd4j::LaunchContext* context, const std::string& funcName) { - _context = const_cast(context); +PointersManager::PointersManager(const sd::LaunchContext* context, const std::string& funcName) { + _context = const_cast(context); _funcName = funcName; } @@ -42,7 +42,7 @@ void* PointersManager::replicatePointer(const void* src, const size_t numberOfBy if (cudaResult != 0) throw cuda_exception::build(_funcName + ": cannot allocate global memory on device!", cudaResult); } else { - dst = _context->getWorkspace()->allocateBytes(nd4j::memory::MemoryType::DEVICE, numberOfBytes); + dst = _context->getWorkspace()->allocateBytes(sd::memory::MemoryType::DEVICE, numberOfBytes); } if (_context != nullptr) @@ -84,8 +84,8 @@ static __global__ void printDevContentOnDev_(const void* pDev, const Nd4jLong le //////////////////////////////////////////////////////////////////////// template void PointersManager::printDevContentOnDevFromHost(const void* pDev, const Nd4jLong len, const int tid) { - printDevContentOnDev_<<<512, 512, 1024, *nd4j::LaunchContext ::defaultContext()->getCudaStream()>>>(pDev, len, tid); - auto res = cudaStreamSynchronize(*nd4j::LaunchContext ::defaultContext()->getCudaStream()); + printDevContentOnDev_<<<512, 512, 1024, *sd::LaunchContext ::defaultContext()->getCudaStream()>>>(pDev, len, tid); + auto res = 
cudaStreamSynchronize(*sd::LaunchContext ::defaultContext()->getCudaStream()); if (res != 0) throw std::runtime_error("PointersManager::printDevContentOnDevFromHost: cudaStreamSynchronize failed!"); } diff --git a/libnd4j/include/helpers/cuda_off/MmulHelper.cu b/libnd4j/include/helpers/cuda_off/MmulHelper.cu index bf366dc29..379a6ed72 100644 --- a/libnd4j/include/helpers/cuda_off/MmulHelper.cu +++ b/libnd4j/include/helpers/cuda_off/MmulHelper.cu @@ -22,12 +22,12 @@ #include #include #include "../MmulHelper.h" -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////////// // MXK x KxN = MxN -> actual sequence of axes doesn't matter @@ -357,7 +357,7 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou //////////////////////////////////////////////////////////////////////////// // MXN x N = M -NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, nd4j::NDArray* Y, const double alpha, const double beta, const char outOrder) { +NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, sd::NDArray* Y, const double alpha, const double beta, const char outOrder) { int xLenDim, yLenDim(0); @@ -463,7 +463,7 @@ NDArray* MmulHelper::mmulMxV(const NDArray* A, const NDArray* X, nd4j::NDArray* //////////////////////////////////////////////////////////////////////////// // (X * Y) = Z[0] -NDArray* MmulHelper::dot(const NDArray* X, const NDArray* Y, nd4j::NDArray* Z, const double alpha, const double beta) { +NDArray* MmulHelper::dot(const NDArray* X, const NDArray* Y, sd::NDArray* Z, const double alpha, const double beta) { int xLenDim(0), yLenDim(0); diff --git a/libnd4j/include/helpers/cuda_off/cublasHelper.cu b/libnd4j/include/helpers/cuda_off/cublasHelper.cu index 7204862eb..7ab2d7d63 100644 --- a/libnd4j/include/helpers/cuda_off/cublasHelper.cu +++ b/libnd4j/include/helpers/cuda_off/cublasHelper.cu @@ -33,7 
+33,7 @@ #endif -namespace nd4j { +namespace sd { std::mutex CublasHelper::_mutex; static void* handle_() { @@ -105,7 +105,7 @@ namespace nd4j { CublasHelper* CublasHelper::getInstance() { _mutex.lock(); if (!_INSTANCE) - _INSTANCE = new nd4j::CublasHelper(); + _INSTANCE = new sd::CublasHelper(); _mutex.unlock(); return _INSTANCE; @@ -140,5 +140,5 @@ namespace nd4j { } - nd4j::CublasHelper* nd4j::CublasHelper::_INSTANCE = 0; + sd::CublasHelper* sd::CublasHelper::_INSTANCE = 0; } \ No newline at end of file diff --git a/libnd4j/include/helpers/helper_generator.h b/libnd4j/include/helpers/helper_generator.h index 0e741e931..ecf87ae81 100644 --- a/libnd4j/include/helpers/helper_generator.h +++ b/libnd4j/include/helpers/helper_generator.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_HELPER_GENERATOR_H #define LIBND4J_HELPER_GENERATOR_H -#include -#include +#include +#include #include -#include +#include #ifdef _MSC_VER // include for uint64_t on MSVC @@ -48,7 +48,7 @@ #endif -namespace nd4j { +namespace sd { namespace random { #ifdef __CUDACC__ @@ -111,7 +111,7 @@ namespace nd4j { this->synchronizer = 0; this->devBuffer = devBuffer; - cudaMalloc(&devHolder, sizeof(nd4j::random::RandomBuffer)); + cudaMalloc(&devHolder, sizeof(sd::random::RandomBuffer)); } __host__ @@ -125,8 +125,8 @@ namespace nd4j { } __host__ - void propagateToDevice(nd4j::random::RandomBuffer *buffer, cudaStream_t stream) { - cudaMemcpyAsync(devHolder, buffer, sizeof(nd4j::random::RandomBuffer), cudaMemcpyHostToDevice, stream); + void propagateToDevice(sd::random::RandomBuffer *buffer, cudaStream_t stream) { + cudaMemcpyAsync(devHolder, buffer, sizeof(sd::random::RandomBuffer), cudaMemcpyHostToDevice, stream); } __host__ __device__ @@ -231,7 +231,7 @@ namespace nd4j { uint64_t _CUDA_HD next64(uint64_t shiftedSeed) { const auto s0 = static_cast(shiftedSeed); - auto s1 = static_cast(shiftedSeed) % nd4j::DataTypeUtils::max() + 11; + auto s1 = static_cast(shiftedSeed) % sd::DataTypeUtils::max() + 11; uint64_t r0, 
r1; s1 ^= s0; @@ -246,7 +246,7 @@ namespace nd4j { } uint64_t static _CUDA_HD inline safeShift(uint64_t x, uint64_t y) { - if (y != 0 && x > nd4j::DataTypeUtils::max() / y) { + if (y != 0 && x > sd::DataTypeUtils::max() / y) { return x / y + 11; } else return (x * y) + 11; } @@ -349,7 +349,7 @@ namespace nd4j { */ int _CUDA_D nextInt() { auto u = nextUInt64(); - return u <= nd4j::DataTypeUtils::max() ? static_cast(u) : static_cast(u % nd4j::DataTypeUtils::max()); + return u <= sd::DataTypeUtils::max() ? static_cast(u) : static_cast(u % sd::DataTypeUtils::max()); }; uint64_t _CUDA_D nextUInt64() { @@ -395,7 +395,7 @@ namespace nd4j { template _CUDA_D T nextT() { auto u = static_cast(nextUInt64()); - auto m = static_cast(nd4j::DataTypeUtils::max()); + auto m = static_cast(sd::DataTypeUtils::max()); return static_cast(u / m); } @@ -432,7 +432,7 @@ namespace nd4j { */ inline int _CUDA_D relativeInt(Nd4jLong index) { auto u = relativeUInt64(index); - return u <= nd4j::DataTypeUtils::max() ? static_cast(u) : static_cast(u % nd4j::DataTypeUtils::max()); + return u <= sd::DataTypeUtils::max() ? 
static_cast(u) : static_cast(u % sd::DataTypeUtils::max()); } /** @@ -476,7 +476,7 @@ namespace nd4j { * FIXME: once we add support for additional datatypes this code must be tweaked */ auto u = static_cast(relativeUInt64(index)); - auto m = static_cast (nd4j::DataTypeUtils::max()); + auto m = static_cast (sd::DataTypeUtils::max()); return static_cast(u / m); } @@ -516,11 +516,11 @@ namespace nd4j { Nd4jLong limit; Nd4jLong seed; uint64_t *buffer; - nd4j::random::RandomBuffer *realBuffer; + sd::random::RandomBuffer *realBuffer; public: - _CUDA_HD IGenerator(nd4j::random::RandomBuffer *buffer) { + _CUDA_HD IGenerator(sd::random::RandomBuffer *buffer) { this->limit = buffer->getSize(); this->buffer = reinterpret_cast(buffer->getBuffer()); this->realBuffer = buffer; @@ -600,7 +600,7 @@ namespace nd4j { } public: - _CUDA_HD Xoroshiro128(nd4j::random::RandomBuffer *buffer) : IGenerator(buffer) { + _CUDA_HD Xoroshiro128(sd::random::RandomBuffer *buffer) : IGenerator(buffer) { // } diff --git a/libnd4j/include/helpers/helper_hash.h b/libnd4j/include/helpers/helper_hash.h index 81817ce35..1b032238f 100644 --- a/libnd4j/include/helpers/helper_hash.h +++ b/libnd4j/include/helpers/helper_hash.h @@ -23,11 +23,11 @@ #define LIBND4J_HELPER_HASH_H #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT HashHelper { private: diff --git a/libnd4j/include/helpers/helper_ptrmap.h b/libnd4j/include/helpers/helper_ptrmap.h index 1eebee0ca..4f2ec128c 100644 --- a/libnd4j/include/helpers/helper_ptrmap.h +++ b/libnd4j/include/helpers/helper_ptrmap.h @@ -27,7 +27,7 @@ #define ptr_def inline #endif -namespace nd4j { +namespace sd { /** * This class is a simple wrapper to represent batch arguments as single surface of parameters. 
diff --git a/libnd4j/include/helpers/helper_random.h b/libnd4j/include/helpers/helper_random.h index f685eff40..6f2523e05 100644 --- a/libnd4j/include/helpers/helper_random.h +++ b/libnd4j/include/helpers/helper_random.h @@ -34,25 +34,25 @@ #endif -namespace nd4j { +namespace sd { namespace random { template class RandomHelper { private: - nd4j::random::IGenerator *generator; - nd4j::random::RandomBuffer *buffer; + sd::random::IGenerator *generator; + sd::random::RandomBuffer *buffer; public: - _CUDA_HD RandomHelper(nd4j::random::IGenerator *generator) { + _CUDA_HD RandomHelper(sd::random::IGenerator *generator) { this->generator = generator; this->buffer = generator->getBuffer(); } - _CUDA_HD RandomHelper(nd4j::random::RandomBuffer *buffer) { + _CUDA_HD RandomHelper(sd::random::RandomBuffer *buffer) { this->buffer = buffer; } @@ -117,7 +117,7 @@ namespace nd4j { * @return */ inline _CUDA_D T nextT() { - return (T) nextUInt() / (T) nd4j::DataTypeUtils::max(); + return (T) nextUInt() / (T) sd::DataTypeUtils::max(); } /** @@ -150,7 +150,7 @@ namespace nd4j { * relative methods are made as workaround for lock-free concurrent execution */ inline _CUDA_D int relativeInt(Nd4jLong index) { - return (int) (relativeUInt(index) % (nd4j::DataTypeUtils::max() + 1)); + return (int) (relativeUInt(index) % (sd::DataTypeUtils::max() + 1)); } /** @@ -190,8 +190,8 @@ namespace nd4j { inline _CUDA_D T relativeT(Nd4jLong index) { if (sizeof(T) < 4) { // FIXME: this is fast hack for short types, like fp16. This should be improved. 
- return (T)((float) relativeUInt(index) / (float) nd4j::DataTypeUtils::max()); - } else return (T) relativeUInt(index) / (T) nd4j::DataTypeUtils::max(); + return (T)((float) relativeUInt(index) / (float) sd::DataTypeUtils::max()); + } else return (T) relativeUInt(index) / (T) sd::DataTypeUtils::max(); } /** diff --git a/libnd4j/include/helpers/hhColPivQR.h b/libnd4j/include/helpers/hhColPivQR.h index 8c1ae4988..28dd42f64 100644 --- a/libnd4j/include/helpers/hhColPivQR.h +++ b/libnd4j/include/helpers/hhColPivQR.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_HHCOLPICQR_H #define LIBND4J_HHCOLPICQR_H -#include -#include "NDArray.h" +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/hhSequence.h b/libnd4j/include/helpers/hhSequence.h index b3104f1ae..31855a86c 100644 --- a/libnd4j/include/helpers/hhSequence.h +++ b/libnd4j/include/helpers/hhSequence.h @@ -21,9 +21,9 @@ #ifndef LIBND4J_HHSEQUENCE_H #define LIBND4J_HHSEQUENCE_H -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/householder.h b/libnd4j/include/helpers/householder.h index c8924865b..e71769901 100644 --- a/libnd4j/include/helpers/householder.h +++ b/libnd4j/include/helpers/householder.h @@ -22,9 +22,9 @@ #define LIBND4J_HOUSEHOLDER_H -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/impl/ArrayUtils.cpp b/libnd4j/include/helpers/impl/ArrayUtils.cpp index bd7107945..004cb1546 100644 --- a/libnd4j/include/helpers/impl/ArrayUtils.cpp +++ b/libnd4j/include/helpers/impl/ArrayUtils.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ArrayUtils { void toIntPtr(std::initializer_list list, int* target) { std::vector vec(list); diff --git a/libnd4j/include/helpers/impl/AttentionHelper.cpp 
b/libnd4j/include/helpers/impl/AttentionHelper.cpp index 731c9e56a..bd5d006f2 100644 --- a/libnd4j/include/helpers/impl/AttentionHelper.cpp +++ b/libnd4j/include/helpers/impl/AttentionHelper.cpp @@ -26,9 +26,9 @@ #include "../AttentionHelper.h" #include -namespace nd4j { +namespace sd { - nd4j::NDArray AttentionHelper::multiHeadProject(const nd4j::NDArray *input, const nd4j::NDArray *projectionMatrix, nd4j::LaunchContext * context) { + sd::NDArray AttentionHelper::multiHeadProject(const sd::NDArray *input, const sd::NDArray *projectionMatrix, sd::LaunchContext * context) { auto miniBatchSize = input->sizeAt(0); auto seqLength = input->sizeAt(2); auto numHeads = projectionMatrix->sizeAt(0); @@ -39,7 +39,7 @@ namespace nd4j { auto projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); //[nHeads, hS, nIn] -> [nHeads*hS, nIn] NDArray projected('c', {numHeads * projectionMatrix->sizeAt(1), (miniBatchSize * seqLength)}, input->dataType(), context); //[nHeads*hS, batch*timeSteps] - nd4j::ops::matmul mmul; + sd::ops::matmul mmul; mmul.execute({&projectionPrep, &inputPrep}, {&projected}); projected.reshapei({numHeads, projectedSize, miniBatchSize, seqLength}); @@ -48,9 +48,9 @@ namespace nd4j { return projected; } - void AttentionHelper::multiHeadProjectBp(const nd4j::NDArray *input, const nd4j::NDArray *projectionMatrix, - const nd4j::NDArray *eps, nd4j::NDArray *dLdInput, - nd4j::NDArray *dLdProjectionMatrix, nd4j::LaunchContext * context) { + void AttentionHelper::multiHeadProjectBp(const sd::NDArray *input, const sd::NDArray *projectionMatrix, + const sd::NDArray *eps, sd::NDArray *dLdInput, + sd::NDArray *dLdProjectionMatrix, sd::LaunchContext * context) { auto miniBatchSize = input->sizeAt(0); auto seqLength = input->sizeAt(2); auto numHeads = projectionMatrix->sizeAt(0); @@ -63,7 +63,7 @@ namespace nd4j { auto inputPrep = inputPerm.reshape('c', {input->sizeAt(1), (miniBatchSize * seqLength)}); auto 
projectionPrep = projectionMatrix->reshape('c', {numHeads * projectionMatrix->sizeAt(1), projectionMatrix->sizeAt(2)}); - nd4j::ops::matmul_bp mmulBp; + sd::ops::matmul_bp mmulBp; NDArray dLdProjectionPrep(projectionPrep.shapeInfo(), false, context); NDArray dLdInputPrep(inputPrep.shapeInfo(), false, context); mmulBp.execute({&projectionPrep, &inputPrep, &epsReshaped}, std::vector{&dLdProjectionPrep, &dLdInputPrep}, {}, {}, {}); diff --git a/libnd4j/include/helpers/impl/BenchmarkHelper.cpp b/libnd4j/include/helpers/impl/BenchmarkHelper.cpp index cbe0c0729..9e85cc5b7 100644 --- a/libnd4j/include/helpers/impl/BenchmarkHelper.cpp +++ b/libnd4j/include/helpers/impl/BenchmarkHelper.cpp @@ -20,11 +20,11 @@ #include "../BenchmarkHelper.h" -#include +#include #include #include -namespace nd4j { +namespace sd { BenchmarkHelper::BenchmarkHelper(unsigned int warmUpIterations, unsigned int runIterations) { _wIterations = warmUpIterations; _rIterations = runIterations; @@ -59,9 +59,9 @@ namespace nd4j { auto n = NDArrayFactory::create(timings, LaunchContext::defaultContext()); - auto stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e(0); - auto min = n.reduceNumber(nd4j::reduce::Min).e(0); - auto max = n.reduceNumber(nd4j::reduce::Max).e(0); + auto stdev = n.varianceNumber(sd::variance::SummaryStatsStandardDeviation, false).e(0); + auto min = n.reduceNumber(sd::reduce::Min).e(0); + auto max = n.reduceNumber(sd::reduce::Max).e(0); // opNum, DataType, Shape, average time, median time auto t = benchmark.dataType(); @@ -77,7 +77,7 @@ namespace nd4j { // printing out stuff snprintf(const_cast(temp.data()), temp.length(), "%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\t%s\t%lld\t%lld\t%lld\t%lld\t%.2f\n", benchmark.testName().c_str(), benchmark.opNum(), _wIterations, _rIterations, t.c_str(), inpl.c_str(), s.c_str(), strides.c_str(), a.c_str(), o.c_str(), - nd4j::math::nd4j_floor(sumT), median, min, max, stdev); + sd::math::nd4j_floor(sumT), median, min, max, 
stdev); auto pos = temp.find('\n'); return temp.substr(0, pos + 1); @@ -109,9 +109,9 @@ namespace nd4j { Nd4jLong median = timings[_rIterations / 2]; NDArray n = NDArrayFactory::create(timings, nullptr); - double stdev = n.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e(0); - Nd4jLong min = n.reduceNumber(nd4j::reduce::Min).e(0); - Nd4jLong max = n.reduceNumber(nd4j::reduce::Max).e(0); + double stdev = n.varianceNumber(sd::variance::SummaryStatsStandardDeviation, false).e(0); + Nd4jLong min = n.reduceNumber(sd::reduce::Min).e(0); + Nd4jLong max = n.reduceNumber(sd::reduce::Max).e(0); // opNum, DataType, Shape, average time, median time auto t = DataTypeUtils::asString(x.dataType()); @@ -129,7 +129,7 @@ namespace nd4j { // printing out stuff nd4j_printf("%s\t%i\t%i\t%i\t%s\t%s\t%s\t%s\t%s\tn/a\t%lld\t%lld\t%lld\t%lld\t%.2f\n", testName.c_str(), op, _wIterations, _rIterations, t.c_str(), inpl.c_str(), s.c_str(), stride.c_str(), o.c_str(), - nd4j::math::nd4j_floor(sumT), median, min, max, stdev); + sd::math::nd4j_floor(sumT), median, min, max, stdev); } std::string BenchmarkHelper::runOperationSuit(std::initializer_list benchmarks, const char *msg) { diff --git a/libnd4j/include/helpers/impl/BitwiseUtils.cpp b/libnd4j/include/helpers/impl/BitwiseUtils.cpp index 61e066d57..e3f4ce92a 100644 --- a/libnd4j/include/helpers/impl/BitwiseUtils.cpp +++ b/libnd4j/include/helpers/impl/BitwiseUtils.cpp @@ -20,10 +20,10 @@ #include #include -#include +#include #include -namespace nd4j { +namespace sd { bool BitwiseUtils::isBE() { short int word = 0x0001; @@ -76,7 +76,7 @@ namespace nd4j { return bits; } - nd4j::ByteOrder BitwiseUtils::asByteOrder() { + sd::ByteOrder BitwiseUtils::asByteOrder() { return isBE() ? 
ByteOrder::BE : ByteOrder::LE; } } diff --git a/libnd4j/include/helpers/impl/BlasHelper.cpp b/libnd4j/include/helpers/impl/BlasHelper.cpp index bf52fe2c6..0f270a97e 100644 --- a/libnd4j/include/helpers/impl/BlasHelper.cpp +++ b/libnd4j/include/helpers/impl/BlasHelper.cpp @@ -19,7 +19,7 @@ // #include -namespace nd4j { +namespace sd { BlasHelper* BlasHelper::getInstance() { if (_instance == 0) _instance = new BlasHelper(); @@ -130,7 +130,7 @@ namespace nd4j { return false; } - bool BlasHelper::hasGEMV(const nd4j::DataType dtype) { + bool BlasHelper::hasGEMV(const sd::DataType dtype) { if(dtype == DataType::FLOAT32) { #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; @@ -206,7 +206,7 @@ namespace nd4j { return false; } - bool BlasHelper:: hasGEMM(const nd4j::DataType dtype) { + bool BlasHelper:: hasGEMM(const sd::DataType dtype) { if(dtype == DataType::FLOAT32) { #if defined(__EXTERNAL_BLAS__) || defined(HAVE_OPENBLAS) return true; diff --git a/libnd4j/include/helpers/impl/CudaLaunchHelper.cpp b/libnd4j/include/helpers/impl/CudaLaunchHelper.cpp index 947895d8c..d0bcce11e 100644 --- a/libnd4j/include/helpers/impl/CudaLaunchHelper.cpp +++ b/libnd4j/include/helpers/impl/CudaLaunchHelper.cpp @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { Triple CudaLaunchHelper::getFlatLaunchParams(Nd4jLong length, int SM, int CORES, int SHARED_MEMORY) { // TODO: to be implemented Triple triple(1, 2, 3); @@ -31,11 +31,11 @@ namespace nd4j { int CudaLaunchHelper::getReductionBlocks(Nd4jLong xLength, int blockSize) { int div = xLength / blockSize; - int can = nd4j::math::nd4j_max(div, 1); + int can = sd::math::nd4j_max(div, 1); if (xLength % blockSize != 0 && xLength > blockSize) can++; // not more then 512 blocks - return nd4j::math::nd4j_min(can, 512); + return sd::math::nd4j_min(can, 512); } } diff --git a/libnd4j/include/helpers/impl/DebugHelper.cpp b/libnd4j/include/helpers/impl/DebugHelper.cpp index 704c463e6..d24068a65 100644 
--- a/libnd4j/include/helpers/impl/DebugHelper.cpp +++ b/libnd4j/include/helpers/impl/DebugHelper.cpp @@ -19,13 +19,13 @@ // #include -#include -#include +#include +#include #include #include #include -namespace nd4j { +namespace sd { DebugInfo DebugHelper::debugStatistics(NDArray const* input) { DebugInfo info; DebugHelper::retrieveDebugStatistics(&info, input); @@ -50,11 +50,11 @@ namespace nd4j { info->_maxValue = info->_minValue; info->_meanValue = info->_minValue; info->_stdDevValue = info->_minValue; - info->_zeroCount = nd4j::math::nd4j_abs(input->e(0)) > 0.00001? 0: 1; + info->_zeroCount = sd::math::nd4j_abs(input->e(0)) > 0.00001? 0: 1; info->_positiveCount = input->e(0) > 0?1:0; info->_negativeCount = input->e(0) < 0?1:0; - info->_infCount = nd4j::math::nd4j_isinf(input->e(0)); - info->_nanCount = nd4j::math::nd4j_isnan(input->e(0)); + info->_infCount = sd::math::nd4j_isinf(input->e(0)); + info->_nanCount = sd::math::nd4j_isnan(input->e(0)); } else if (input->lengthOf() > 0) { // TO DO: here processing for all elements with array @@ -62,11 +62,11 @@ namespace nd4j { auto _maxValue = input->e(0); auto _meanValue = input->e(0); auto _stdDevValue = 0.; //info->_minValue; - auto _zeroCount = nd4j::math::nd4j_abs(input->e(0)) > 0.00001? 0L : 1L; + auto _zeroCount = sd::math::nd4j_abs(input->e(0)) > 0.00001? 0L : 1L; auto _positiveCount = input->e(0) > 0? 1L : 0L; auto _negativeCount = input->e(0) < 0? 1L : 0L; - auto _infCount = nd4j::math::nd4j_isinf(input->e(0)) ? 1L : 0L; - auto _nanCount = nd4j::math::nd4j_isnan(input->e(0)) ? 1L : 0L; + auto _infCount = sd::math::nd4j_isinf(input->e(0)) ? 1L : 0L; + auto _nanCount = sd::math::nd4j_isnan(input->e(0)) ? 
1L : 0L; PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_meanValue,_zeroCount,_positiveCount,_negativeCount) reduction(min:_minValue) reduction(max:_maxValue)) for (Nd4jLong e = 1; e < input->lengthOf(); e++) { @@ -74,18 +74,18 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) reduction(+:_nanCount,_infCount,_m auto n = e + 1.; // auto delta = current - _meanValue; // auto delta2 = delta * delta; - _minValue = nd4j::math::nd4j_min(current, _minValue); - _maxValue = nd4j::math::nd4j_max(current, _maxValue); + _minValue = sd::math::nd4j_min(current, _minValue); + _maxValue = sd::math::nd4j_max(current, _maxValue); _meanValue += current; //_meanValue += delta / n; // this is a perfect formula but not working with omp in this notation //_stdDevValue += delta2 * e / n; - _zeroCount += nd4j::math::nd4j_abs(current) > 0.00001 ? 0 : 1; + _zeroCount += sd::math::nd4j_abs(current) > 0.00001 ? 0 : 1; _positiveCount += current > 0 ? 1 : 0; _negativeCount += current < 0 ? 
1 : 0; - _infCount += nd4j::math::nd4j_isinf(current); - _nanCount += nd4j::math::nd4j_isnan(current); + _infCount += sd::math::nd4j_isinf(current); + _nanCount += sd::math::nd4j_isnan(current); } *info = {_minValue, _maxValue, _meanValue / input->lengthOf(), _stdDevValue, _zeroCount, _positiveCount, _negativeCount, _infCount, _nanCount}; _stdDevValue = 0; //math::nd4j_sqrt(info->_stdDevValue / (input->lengthOf() - 1)); diff --git a/libnd4j/include/helpers/impl/EnumUtils.cpp b/libnd4j/include/helpers/impl/EnumUtils.cpp index 15fff8e43..a18592d0b 100644 --- a/libnd4j/include/helpers/impl/EnumUtils.cpp +++ b/libnd4j/include/helpers/impl/EnumUtils.cpp @@ -21,10 +21,10 @@ #include #include -using namespace nd4j::graph; +using namespace sd::graph; -namespace nd4j { - const char * EnumUtils::_VariableTypeToString(nd4j::graph::VariableType variableType) { +namespace sd { + const char * EnumUtils::_VariableTypeToString(sd::graph::VariableType variableType) { switch (variableType) { case NDARRAY: return "NDARRAY"; case ARRAY_LIST: return "ARRAY_LIST"; @@ -33,7 +33,7 @@ namespace nd4j { } } - const char * EnumUtils::_OpTypeToString(nd4j::graph::OpType opType) { + const char * EnumUtils::_OpTypeToString(sd::graph::OpType opType) { switch(opType) { case OpType_REDUCE_SAME: return "REDUCE_SAME"; case OpType_REDUCE_BOOL: return "REDUCE_BOOL"; diff --git a/libnd4j/include/helpers/impl/GradCheck.cpp b/libnd4j/include/helpers/impl/GradCheck.cpp index 8b24e5f16..7bd96687d 100644 --- a/libnd4j/include/helpers/impl/GradCheck.cpp +++ b/libnd4j/include/helpers/impl/GradCheck.cpp @@ -18,11 +18,11 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 16.07.2018 // -#include -#include +#include +#include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////// void GradCheck::fillGradArrays(const LossFunc loss, const std::vector& gradArrs) { @@ -62,7 +62,7 @@ bool GradCheck::checkGrad(ops::DeclarableOp& opFF, ops::DeclarableOp& opBP, 
cons // back prop pass ResultSet* outArrsBP = opBP.execute(argsHolderBP); // number of output arrays in back prop = numInArrsFF; - NDArray tmpScalar(nd4j::DataType::DOUBLE, inArrsFF[0]->getContext()); // scalar = 0 + NDArray tmpScalar(sd::DataType::DOUBLE, inArrsFF[0]->getContext()); // scalar = 0 for(int i = 0; i < numInArrsFF; ++i) { // loop through input array diff --git a/libnd4j/include/helpers/impl/MmulHelper.cpp b/libnd4j/include/helpers/impl/MmulHelper.cpp index abc353132..bc525622a 100644 --- a/libnd4j/include/helpers/impl/MmulHelper.cpp +++ b/libnd4j/include/helpers/impl/MmulHelper.cpp @@ -24,19 +24,19 @@ #include "../MmulHelper.h" #include #include -#include +#include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////// -nd4j::NDArray* nd4j::MmulHelper::tensorDot(const nd4j::NDArray* A, const nd4j::NDArray* B, const std::initializer_list& axesA, const std::initializer_list& axesB) { +sd::NDArray* sd::MmulHelper::tensorDot(const sd::NDArray* A, const sd::NDArray* B, const std::initializer_list& axesA, const std::initializer_list& axesB) { std::vector aA(axesA); std::vector aB(axesB); return tensorDot(A, B, aA, aB); } ////////////////////////////////////////////////////////////////////////// -nd4j::NDArray* nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, const std::vector& axes_0, const std::vector& axes_1) { +sd::NDArray* sd::MmulHelper::tensorDot(const sd::NDArray* a, const sd::NDArray* b, const std::vector& axes_0, const std::vector& axes_1) { std::vector permutAt, permutBt; std::vector shapeAt, shapeBt; @@ -68,7 +68,7 @@ nd4j::NDArray* nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::N } ////////////////////////////////////////////////////////////////////////// -void nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, nd4j::NDArray* c, const std::vector& axes_a, const std::vector& axes_b, const std::vector& permutForC) { +void 
sd::MmulHelper::tensorDot(const sd::NDArray* a, const sd::NDArray* b, sd::NDArray* c, const std::vector& axes_a, const std::vector& axes_b, const std::vector& permutForC) { std::vector permutAt, permutBt; std::vector shapeAt, shapeBt; @@ -112,7 +112,7 @@ void nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, #ifndef __JAVACPP_HACK__ ////////////////////////////////////////////////////////////////////////// -void nd4j::MmulHelper::tensorDot(const NDArray* a, const NDArray* b, NDArray* c, const std::vector>& modifA, const std::vector>& modifB, const std::vector>& modifC) { +void sd::MmulHelper::tensorDot(const NDArray* a, const NDArray* b, NDArray* c, const std::vector>& modifA, const std::vector>& modifB, const std::vector>& modifC) { NDArray *aPR(const_cast(a)), *bPR(const_cast(b)); std::string whatToDoWithA, whatToDoWithB, whatToDoWithC; // "" - nothing; "p" - permutation; "r" - reshaping; "pr" - permutation+reshaping; "rp" - reshaping/permutation, and so on; if another string is produced - throw exception @@ -163,7 +163,7 @@ void nd4j::MmulHelper::tensorDot(const NDArray* a, const NDArray* b, NDArray* c, } ////////////////////////////////////////////////////////////////////////// -NDArray* nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray* b, const std::vector>& modifA, const std::vector>& modifB) { +NDArray* sd::MmulHelper::tensorDot(const sd::NDArray* a, const sd::NDArray* b, const std::vector>& modifA, const std::vector>& modifB) { NDArray *aPR(const_cast(a)), *bPR(const_cast(b)); std::string whatToDoWithA, whatToDoWithB; // "" - nothing; "p" - permutation only; "r" - reshaping only; "pr" - permutation+reshaping; "rp" - reshaping/permutation; another string - throw exception @@ -198,7 +198,7 @@ NDArray* nd4j::MmulHelper::tensorDot(const nd4j::NDArray* a, const nd4j::NDArray ////////////////////////////////////////////////////////////////////////// -nd4j::NDArray* MmulHelper::mmul(const nd4j::NDArray* A, const 
nd4j::NDArray* B, nd4j::NDArray* C , const double alpha, const double beta, const char outOrder) { +sd::NDArray* MmulHelper::mmul(const sd::NDArray* A, const sd::NDArray* B, sd::NDArray* C , const double alpha, const double beta, const char outOrder) { int lenDim; const int aRank = A->rankOf(); @@ -239,7 +239,7 @@ nd4j::NDArray* MmulHelper::mmul(const nd4j::NDArray* A, const nd4j::NDArray* B, ////////////////////////////////////////////////////////////////////////// - void MmulHelper::matmul(const nd4j::NDArray* x, const nd4j::NDArray* y, nd4j::NDArray* z, const bool transX, const bool transY) { + void MmulHelper::matmul(const sd::NDArray* x, const sd::NDArray* y, sd::NDArray* z, const bool transX, const bool transY) { int xRank = x->rankOf(); int yRank = y->rankOf(); diff --git a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp index 80e456e29..0e409a952 100644 --- a/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp +++ b/libnd4j/include/helpers/impl/OmpLaunchHelper.cpp @@ -19,14 +19,14 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include -#include -#include +#include +#include +#include #ifdef _OPENMP #include #endif -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////////////// @@ -43,11 +43,11 @@ OmpLaunchHelper::OmpLaunchHelper(const Nd4jLong N, float desiredNumThreads) { else if(desiredNumThreads < 1) desiredNumThreads = 1; else - desiredNumThreads = nd4j::math::nd4j_min(omp_get_max_threads(), desiredNumThreads); + desiredNumThreads = sd::math::nd4j_min(omp_get_max_threads(), desiredNumThreads); #else - desiredNumThreads = nd4j::Environment::getInstance()->maxThreads(); + desiredNumThreads = sd::Environment::getInstance()->maxThreads(); #endif - _numThreads = nd4j::math::nd4j_min(N / maxItersPerThread, desiredNumThreads); + _numThreads = sd::math::nd4j_min(N / maxItersPerThread, desiredNumThreads); } _itersPerThread = N / _numThreads; @@ -75,7 +75,7 @@ 
Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP return betterThreads(N, omp_get_max_threads()); #else - return betterThreads(N, nd4j::Environment::getInstance()->maxThreads());; + return betterThreads(N, sd::Environment::getInstance()->maxThreads());; #endif } @@ -84,7 +84,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { if (N < t) return 1; else { - return static_cast(nd4j::math::nd4j_min(N / t, maxThreads)); + return static_cast(sd::math::nd4j_min(N / t, maxThreads)); } } @@ -92,7 +92,7 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { #ifdef _OPENMP auto maxThreads = omp_get_max_threads(); #else - auto maxThreads = nd4j::Environment::getInstance()->maxThreads(); + auto maxThreads = sd::Environment::getInstance()->maxThreads(); #endif // if there's only 1 thread allowed - nothing to do here @@ -106,6 +106,6 @@ Nd4jLong OmpLaunchHelper::betterSpan(Nd4jLong N) { return 1; // by default we're spawning as many threads we can, but not more than number of TADs - return nd4j::math::nd4j_min(numTads, maxThreads); + return sd::math::nd4j_min(numTads, maxThreads); } } diff --git a/libnd4j/include/helpers/impl/OpArgsHolder.cpp b/libnd4j/include/helpers/impl/OpArgsHolder.cpp index 816253bc6..7b82a85d9 100644 --- a/libnd4j/include/helpers/impl/OpArgsHolder.cpp +++ b/libnd4j/include/helpers/impl/OpArgsHolder.cpp @@ -18,10 +18,10 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 15.07.2018 // -#include +#include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// // default constructor diff --git a/libnd4j/include/helpers/impl/OpBenchmark.cpp b/libnd4j/include/helpers/impl/OpBenchmark.cpp index 304ca9723..6cb0dc08a 100644 --- a/libnd4j/include/helpers/impl/OpBenchmark.cpp +++ b/libnd4j/include/helpers/impl/OpBenchmark.cpp @@ -20,7 +20,7 @@ #include "../OpBenchmark.h" -namespace nd4j { +namespace sd { OpBenchmark::OpBenchmark(std::string name, NDArray *x, NDArray *y, NDArray *z) { 
_testName = name; _x = x; diff --git a/libnd4j/include/helpers/impl/OpTracker.cpp b/libnd4j/include/helpers/impl/OpTracker.cpp index 1fc4f330d..bb82ab0d1 100644 --- a/libnd4j/include/helpers/impl/OpTracker.cpp +++ b/libnd4j/include/helpers/impl/OpTracker.cpp @@ -21,13 +21,13 @@ #include #include #include -#include +#include -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd::ops; +using namespace sd::graph; -namespace nd4j { +namespace sd { OpTracker* OpTracker::getInstance() { if (_INSTANCE == 0) @@ -36,7 +36,7 @@ namespace nd4j { return _INSTANCE; } - void OpTracker::storeOperation(nd4j::graph::OpType opType, const OpDescriptor& descriptor) { + void OpTracker::storeOperation(sd::graph::OpType opType, const OpDescriptor& descriptor) { // check out CPU features if (!::isMinimalRequirementsMet()) { @@ -75,7 +75,7 @@ namespace nd4j { _map[opType].emplace_back(descriptor); } - void OpTracker::storeOperation(nd4j::graph::OpType opType, const char* opName, const Nd4jLong opNum) { + void OpTracker::storeOperation(sd::graph::OpType opType, const char* opName, const Nd4jLong opNum) { OpDescriptor descriptor(0, opName, false); descriptor.setOpNum((int) opNum); descriptor.setHash(-1); @@ -119,5 +119,5 @@ namespace nd4j { return _export.c_str(); } - nd4j::OpTracker* nd4j::OpTracker::_INSTANCE = 0; + sd::OpTracker* sd::OpTracker::_INSTANCE = 0; } diff --git a/libnd4j/include/helpers/impl/Parameters.cpp b/libnd4j/include/helpers/impl/Parameters.cpp index d2678832f..356ad5a5a 100644 --- a/libnd4j/include/helpers/impl/Parameters.cpp +++ b/libnd4j/include/helpers/impl/Parameters.cpp @@ -21,7 +21,7 @@ #include "../benchmark/Parameters.h" #include -namespace nd4j { +namespace sd { Parameters* Parameters::addIntParam(std::string string, int param) { _intParams[string] = param; return this; diff --git a/libnd4j/include/helpers/impl/RandomLauncher.cpp b/libnd4j/include/helpers/impl/RandomLauncher.cpp index 099040dc5..8114c2ec4 100644 --- 
a/libnd4j/include/helpers/impl/RandomLauncher.cpp +++ b/libnd4j/include/helpers/impl/RandomLauncher.cpp @@ -19,16 +19,16 @@ // #include -#include +#include #include #include //#include #include -namespace nd4j { +namespace sd { // FIXME: implement this - void RandomLauncher::applyDropOut(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z) { + void RandomLauncher::applyDropOut(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z) { if (z == nullptr) z = array; @@ -39,7 +39,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::applyInvertedDropOut(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z) { + void RandomLauncher::applyInvertedDropOut(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray *array, double retainProb, NDArray* z) { if (z == nullptr) z = array; @@ -50,7 +50,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::applyAlphaDropOut(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray *array, double retainProb, double alpha, double beta, double alphaPrime, NDArray* z) { + void RandomLauncher::applyAlphaDropOut(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray *array, double retainProb, double alpha, double beta, double alphaPrime, NDArray* z) { if (z == nullptr) z = array; @@ -61,7 +61,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::fillBernoulli(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double prob) { + void RandomLauncher::fillBernoulli(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double prob) { ExtraArguments arguments({prob}); PointersManager pm(context, "fillBernoulli"); @@ -69,7 +69,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::fillUniform(nd4j::LaunchContext *context, 
nd4j::graph::RandomGenerator& rng, NDArray* array, double from, double to) { + void RandomLauncher::fillUniform(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double from, double to) { ExtraArguments arguments({from, to}); PointersManager pm(context, "fillUniform"); @@ -77,7 +77,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::fillGaussian(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev) { + void RandomLauncher::fillGaussian(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev) { ExtraArguments arguments({mean, stdev}); PointersManager pm(context, "fillGaussian"); @@ -85,7 +85,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::fillExponential(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double lambda) { + void RandomLauncher::fillExponential(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double lambda) { ExtraArguments arguments({lambda}); PointersManager pm(context, "fillExponential"); @@ -93,7 +93,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::fillLogNormal(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev) { + void RandomLauncher::fillLogNormal(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev) { ExtraArguments arguments({mean, stdev}); PointersManager pm(context, "fillLogNormal"); @@ -101,7 +101,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::fillTruncatedNormal(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev) { + void RandomLauncher::fillTruncatedNormal(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, double mean, double stdev) { ExtraArguments arguments({mean, stdev}); PointersManager pm(context, 
"fillTruncatedNormal"); @@ -109,7 +109,7 @@ namespace nd4j { pm.synchronize(); } - void RandomLauncher::fillBinomial(nd4j::LaunchContext *context, nd4j::graph::RandomGenerator& rng, NDArray* array, int trials, double prob) { + void RandomLauncher::fillBinomial(sd::LaunchContext *context, sd::graph::RandomGenerator& rng, NDArray* array, int trials, double prob) { ExtraArguments arguments({(double) trials, prob}); PointersManager pm(context, "fillBinomial"); diff --git a/libnd4j/include/helpers/impl/ShapeBuilders.cpp b/libnd4j/include/helpers/impl/ShapeBuilders.cpp index d8443e180..7c0c7fed6 100644 --- a/libnd4j/include/helpers/impl/ShapeBuilders.cpp +++ b/libnd4j/include/helpers/impl/ShapeBuilders.cpp @@ -20,10 +20,10 @@ #include -namespace nd4j { +namespace sd { - Nd4jLong* ShapeBuilders::createScalarShapeInfo(const nd4j::DataType dataType, nd4j::memory::Workspace* workspace) { + Nd4jLong* ShapeBuilders::createScalarShapeInfo(const sd::DataType dataType, sd::memory::Workspace* workspace) { Nd4jLong *newShape; ALLOCATE(newShape, workspace, shape::shapeInfoLength(0), Nd4jLong); newShape[0] = 0; @@ -31,12 +31,12 @@ namespace nd4j { newShape[2] = 1; newShape[3] = 99; - nd4j::ArrayOptions::setDataType(newShape, dataType); + sd::ArrayOptions::setDataType(newShape, dataType); return newShape; } - Nd4jLong* ShapeBuilders::createVectorShapeInfo(const nd4j::DataType dataType, const Nd4jLong length, nd4j::memory::Workspace* workspace) { + Nd4jLong* ShapeBuilders::createVectorShapeInfo(const sd::DataType dataType, const Nd4jLong length, sd::memory::Workspace* workspace) { Nd4jLong *newShape; ALLOCATE(newShape, workspace, shape::shapeInfoLength(1), Nd4jLong); @@ -47,13 +47,13 @@ namespace nd4j { newShape[4] = 1; newShape[5] = 99; - nd4j::ArrayOptions::setDataType(newShape, dataType); + sd::ArrayOptions::setDataType(newShape, dataType); return newShape; } //////////////////////////////////////////////////////////////////////////////// - Nd4jLong* 
ShapeBuilders::createShapeInfo(const nd4j::DataType dataType, const char order, int rank, const Nd4jLong* shapeOnly, memory::Workspace* workspace) { + Nd4jLong* ShapeBuilders::createShapeInfo(const sd::DataType dataType, const char order, int rank, const Nd4jLong* shapeOnly, memory::Workspace* workspace) { Nd4jLong* shapeInfo = nullptr; if(rank == 0) { // scalar case @@ -79,19 +79,19 @@ namespace nd4j { ArrayOptions::setPropertyBit(shapeInfo, ARRAY_EMPTY); } - nd4j::ArrayOptions::setDataType(shapeInfo, dataType); + sd::ArrayOptions::setDataType(shapeInfo, dataType); } return shapeInfo; } - Nd4jLong* ShapeBuilders::emptyShapeInfo(const nd4j::DataType dataType, memory::Workspace* workspace) { + Nd4jLong* ShapeBuilders::emptyShapeInfo(const sd::DataType dataType, memory::Workspace* workspace) { auto shapeInfo = createScalarShapeInfo(dataType, workspace); ArrayOptions::setPropertyBit(shapeInfo, ARRAY_EMPTY); return shapeInfo; } - Nd4jLong* ShapeBuilders::emptyShapeInfo(const nd4j::DataType dataType, const char order, const std::vector &shape, memory::Workspace* workspace) { + Nd4jLong* ShapeBuilders::emptyShapeInfo(const sd::DataType dataType, const char order, const std::vector &shape, memory::Workspace* workspace) { auto shapeInfo = createShapeInfo(dataType, order, shape, workspace); memset(shape::stride(shapeInfo), 0, shape.size() * sizeof(Nd4jLong)); ArrayOptions::setPropertyBit(shapeInfo, ARRAY_EMPTY); @@ -99,13 +99,13 @@ namespace nd4j { } //////////////////////////////////////////////////////////////////////////////// - Nd4jLong* ShapeBuilders::createShapeInfo(const nd4j::DataType dataType, const char order, const std::vector& shapeOnly, memory::Workspace* workspace) { + Nd4jLong* ShapeBuilders::createShapeInfo(const sd::DataType dataType, const char order, const std::vector& shapeOnly, memory::Workspace* workspace) { return ShapeBuilders::createShapeInfo(dataType, order, shapeOnly.size(), shapeOnly.data(), workspace); } 
//////////////////////////////////////////////////////////////////////////////// - Nd4jLong* ShapeBuilders::createShapeInfo(const nd4j::DataType dataType, const char order, const std::initializer_list& shapeOnly, memory::Workspace* workspace) { + Nd4jLong* ShapeBuilders::createShapeInfo(const sd::DataType dataType, const char order, const std::initializer_list& shapeOnly, memory::Workspace* workspace) { return ShapeBuilders::createShapeInfo(dataType, order, std::vector(shapeOnly), workspace); } diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 10babeae1..355d93b07 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -27,7 +27,7 @@ #include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////// // evaluate shape for array resulting from tensorDot operation, also evaluate shapes and dimensions permutations for transposition of two input arrays @@ -124,7 +124,7 @@ std::vector ShapeUtils::evalShapeForTensorDot(const NDArray* a, cons ////////////////////////////////////////////////////////////////////////// // evaluate output shape for reduce operation when input shape is empty -Nd4jLong* ShapeUtils::evalReduceShapeInfoEmpty(const char order, std::vector& dimsToExclude, const Nd4jLong *shapeInfo, const nd4j::DataType dataType, const bool keepDims, nd4j::memory::Workspace* workspace) { +Nd4jLong* ShapeUtils::evalReduceShapeInfoEmpty(const char order, std::vector& dimsToExclude, const Nd4jLong *shapeInfo, const sd::DataType dataType, const bool keepDims, sd::memory::Workspace* workspace) { if (dimsToExclude.size() == 0) { // return copy of input shape Nd4jLong* outShapeInfo = ShapeBuilders::copyShapeInfoAndType(shapeInfo, dataType, true, workspace); @@ -171,22 +171,22 @@ Nd4jLong* ShapeUtils::evalReduceShapeInfoEmpty(const char order, std::vectorbufferForShapeInfo(descriptor).primaryAsT(); } -Nd4jLong* 
ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const NDArray& arr, const bool keepDims, const bool supportOldShapes, nd4j::memory::Workspace* workspace) { +Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const NDArray& arr, const bool keepDims, const bool supportOldShapes, sd::memory::Workspace* workspace) { return evalReduceShapeInfo(order, dimsToExclude, arr, arr.dataType(), keepDims, supportOldShapes, workspace); } -Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const Nd4jLong* shapeInfo, const bool keepDims, const bool supportOldShapes, nd4j::memory::Workspace* workspace) { +Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const Nd4jLong* shapeInfo, const bool keepDims, const bool supportOldShapes, sd::memory::Workspace* workspace) { return evalReduceShapeInfo(order, dimsToExclude, shapeInfo, ArrayOptions::dataType(shapeInfo), keepDims, supportOldShapes, workspace); } ////////////////////////////////////////////////////////////////////////// -Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const NDArray& arr, const nd4j::DataType dataType, const bool keepDims, const bool supportOldShapes, nd4j::memory::Workspace* workspace) { +Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const NDArray& arr, const sd::DataType dataType, const bool keepDims, const bool supportOldShapes, sd::memory::Workspace* workspace) { return evalReduceShapeInfo(order, dimsToExclude, arr.getShapeInfo(), dataType, keepDims, supportOldShapes, workspace); } ////////////////////////////////////////////////////////////////////////// // evaluate shape resulting from reduce operation -Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const Nd4jLong *shapeInfo, const nd4j::DataType dataType, const bool keepDims, const bool 
supportOldShapes, nd4j::memory::Workspace* workspace) { +Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& dimsToExclude, const Nd4jLong *shapeInfo, const sd::DataType dataType, const bool keepDims, const bool supportOldShapes, sd::memory::Workspace* workspace) { if(ArrayOptions::arrayType(shapeInfo) == ArrayType::EMPTY) return ShapeUtils::evalReduceShapeInfoEmpty(order, dimsToExclude, shapeInfo, dataType, keepDims, workspace); @@ -315,7 +315,7 @@ std::vector ShapeUtils::evalRepeatShape(int axis, const std::vector dims(dimensions, dimensions + rank); return evalPermShapeInfo(dims.data(), rank, arr, workspace); @@ -355,7 +355,7 @@ Nd4jLong* ShapeUtils::evalPermShapeInfo(const int* dimensions, const int rank, c ////////////////////////////////////////////////////////////////////////// // evaluate shapeInfo of transposed array - Nd4jLong* ShapeUtils::evalTranspShapeInfo(const NDArray& arr, nd4j::memory::Workspace* workspace, const bool setContigStrides) { + Nd4jLong* ShapeUtils::evalTranspShapeInfo(const NDArray& arr, sd::memory::Workspace* workspace, const bool setContigStrides) { int rank = arr.rankOf(); std::vector dimensions(rank); @@ -444,11 +444,11 @@ bool ShapeUtils::areShapesBroadcastable(const std::vector& shape1, con ////////////////////////////////////////////////////////////////////////// // check the possibility of broadcast operation, if true then return shapeInfo of resulting array // if evalMinMax == false the array with larger rank has to be passed as first argument -bool ShapeUtils::evalBroadcastShapeInfo(const NDArray &max, const NDArray &min, const bool evalMinMax, Nd4jLong*& resultShapeInfo, nd4j::memory::Workspace* workspace) { +bool ShapeUtils::evalBroadcastShapeInfo(const NDArray &max, const NDArray &min, const bool evalMinMax, Nd4jLong*& resultShapeInfo, sd::memory::Workspace* workspace) { return evalBroadcastShapeInfo(max.getShapeInfo(), min.getShapeInfo(), evalMinMax, resultShapeInfo, workspace); } -bool 
ShapeUtils::evalBroadcastShapeInfo(Nd4jLong *max, Nd4jLong *min, const bool evalMinMax, Nd4jLong*& resultShapeInfo, nd4j::memory::Workspace* workspace) { +bool ShapeUtils::evalBroadcastShapeInfo(Nd4jLong *max, Nd4jLong *min, const bool evalMinMax, Nd4jLong*& resultShapeInfo, sd::memory::Workspace* workspace) { // check whether broadcast operation is possible for input arrays if(!areShapesBroadcastable(max, min)) @@ -561,7 +561,7 @@ std::vector ShapeUtils::getDimsWithSameShape(const NDArray& arr1, const NDA ////////////////////////////////////////////////////////////////////////// // evaluate shapeInfo for resulting array from tile operation -Nd4jLong* ShapeUtils::evalTileShapeInfo(const NDArray& arr, const std::vector& reps, nd4j::memory::Workspace* workspace) { +Nd4jLong* ShapeUtils::evalTileShapeInfo(const NDArray& arr, const std::vector& reps, sd::memory::Workspace* workspace) { // check whether reps contains at least one zero (then throw exception) or whether all elements in reps are unities (then simply reshape or do nothing) int repsSize = reps.size(); Nd4jLong product = 1; @@ -725,7 +725,7 @@ std::vector ShapeUtils::shapeAsVector(const Nd4jLong* shapeInfo) { ////////////////////////////////////////////////////////////////////////// // evaluate shapeInfo for diagonal array which is made using input arr elements as diagonal -Nd4jLong* ShapeUtils::evalDiagShapeInfo(const Nd4jLong* shapeInfoConst, nd4j::memory::Workspace* workspace){ +Nd4jLong* ShapeUtils::evalDiagShapeInfo(const Nd4jLong* shapeInfoConst, sd::memory::Workspace* workspace){ auto shapeInfo = const_cast(shapeInfoConst); const auto rank = shape::rank(shapeInfo); @@ -766,7 +766,7 @@ std::vector ShapeUtils::evalBroadcastBackwardAxis(const Nd4jLong *operandSh } //////////////////////////////////////////////////////////////////////////////// -Nd4jLong* ShapeUtils::matrixProductShape(Nd4jLong* theFirstShape, Nd4jLong* theSecondShape, bool shouldTranspondFirst, bool shouldTranspondSecond, nd4j::DataType 
dtype, nd4j::memory::Workspace* workspace) { +Nd4jLong* ShapeUtils::matrixProductShape(Nd4jLong* theFirstShape, Nd4jLong* theSecondShape, bool shouldTranspondFirst, bool shouldTranspondSecond, sd::DataType dtype, sd::memory::Workspace* workspace) { auto inA = theFirstShape; auto inB = theSecondShape; @@ -823,7 +823,7 @@ Nd4jLong* ShapeUtils::matrixProductShape(Nd4jLong* theFirstShape, Nd4jLong* theS (shape::isScalar(tmpA) && shape::isVector(tmpB))) { // element-wise shape[0] = 1; - shape[1] = (int) nd4j::math::nd4j_max(shape::length(tmpA), shape::length(tmpB)); + shape[1] = (int) sd::math::nd4j_max(shape::length(tmpA), shape::length(tmpB)); } else if (shape::isRowVector(tmpA) && shape::isRowVector(tmpB)) { // dot case shape[0] = 1; @@ -1070,7 +1070,7 @@ void ShapeUtils::copyCertainStridesFromShapeInfo(const Nd4jLong* inShapeInfo, co } else { - auto dimEx = nd4j::ShapeUtils::evalDimsToExclude(nRank, dimsSize, dims); + auto dimEx = sd::ShapeUtils::evalDimsToExclude(nRank, dimsSize, dims); for (int i = 0, it = 0; i < nRank; ++i) { auto nCount = std::count(dimEx.cbegin(), dimEx.cend(), i); diff --git a/libnd4j/include/helpers/impl/SimpleReadWriteLock.cpp b/libnd4j/include/helpers/impl/SimpleReadWriteLock.cpp index 22be1d6ca..52682b925 100644 --- a/libnd4j/include/helpers/impl/SimpleReadWriteLock.cpp +++ b/libnd4j/include/helpers/impl/SimpleReadWriteLock.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { SimpleReadWriteLock::SimpleReadWriteLock(const SimpleReadWriteLock& other) { _read_locks.store(other._read_locks.load()); _write_locks.store(other._write_locks.load()); diff --git a/libnd4j/include/helpers/impl/StringUtils.cpp b/libnd4j/include/helpers/impl/StringUtils.cpp index 045dcea73..5ac2fd8cc 100644 --- a/libnd4j/include/helpers/impl/StringUtils.cpp +++ b/libnd4j/include/helpers/impl/StringUtils.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { static FORCEINLINE bool match(const uint8_t *haystack, const uint8_t *needle, 
uint64_t length) { for (int e = 0; e < length; e++) if (haystack[e] != needle[e]) @@ -49,7 +49,7 @@ namespace nd4j { uint64_t StringUtils::byteLength(const NDArray &array) { if (!array.isS()) - throw nd4j::datatype_exception::build("StringUtils::byteLength expects one of String types;", array.dataType()); + throw sd::datatype_exception::build("StringUtils::byteLength expects one of String types;", array.dataType()); auto buffer = array.bufferAsT(); return buffer[array.lengthOf()]; diff --git a/libnd4j/include/helpers/impl/TAD.cpp b/libnd4j/include/helpers/impl/TAD.cpp index 1c768f2a7..5d31827da 100644 --- a/libnd4j/include/helpers/impl/TAD.cpp +++ b/libnd4j/include/helpers/impl/TAD.cpp @@ -19,8 +19,8 @@ // -#include "TAD.h" -#include +#include +#include namespace shape { diff --git a/libnd4j/include/helpers/impl/helper_hash.cpp b/libnd4j/include/helpers/impl/helper_hash.cpp index 54eb1731a..b12acb273 100644 --- a/libnd4j/include/helpers/impl/helper_hash.cpp +++ b/libnd4j/include/helpers/impl/helper_hash.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { HashHelper* HashHelper::getInstance() { @@ -65,7 +65,7 @@ namespace nd4j { return h; } - nd4j::ops::HashHelper* nd4j::ops::HashHelper::_INSTANCE = 0; + sd::ops::HashHelper* sd::ops::HashHelper::_INSTANCE = 0; } } diff --git a/libnd4j/include/helpers/impl/logger.cpp b/libnd4j/include/helpers/impl/logger.cpp index 8c0f09a92..59d8f98bc 100644 --- a/libnd4j/include/helpers/impl/logger.cpp +++ b/libnd4j/include/helpers/impl/logger.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { #ifdef __CUDACC__ diff --git a/libnd4j/include/helpers/impl/unicode.cpp b/libnd4j/include/helpers/impl/unicode.cpp index 2e49faf3e..6ebbe7c1b 100644 --- a/libnd4j/include/helpers/impl/unicode.cpp +++ b/libnd4j/include/helpers/impl/unicode.cpp @@ -18,9 +18,9 @@ // @author Oleg Semeniv // -#include +#include -namespace nd4j { +namespace sd { namespace unicode { constexpr uint32_t ONEBYTEBOUND = 
0x00000080; diff --git a/libnd4j/include/helpers/jacobiSVD.h b/libnd4j/include/helpers/jacobiSVD.h index 6b22f472a..f6f161bbb 100644 --- a/libnd4j/include/helpers/jacobiSVD.h +++ b/libnd4j/include/helpers/jacobiSVD.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_JACOBISVD_H #define LIBND4J_JACOBISVD_H -#include -#include "NDArray.h" +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/logger.h b/libnd4j/include/helpers/logger.h index 193935e0d..c13785ff7 100644 --- a/libnd4j/include/helpers/logger.h +++ b/libnd4j/include/helpers/logger.h @@ -23,32 +23,32 @@ #include #include -#include +#include #include #include -#include -#include -#include +#include +#include +#include #ifndef __CUDA_ARCH__ -#define nd4j_debug(FORMAT, ...) if (nd4j::Environment::getInstance()->isDebug() && nd4j::Environment::getInstance()->isVerbose()) nd4j::Logger::info(FORMAT, __VA_ARGS__); -#define nd4j_logger(FORMAT, ...) if (nd4j::Environment::getInstance()->isDebug() && nd4j::Environment::getInstance()->isVerbose()) nd4j::Logger::info(FORMAT, __VA_ARGS__); -#define nd4j_verbose(FORMAT, ...) if (nd4j::Environment::getInstance()->isVerbose()) nd4j::Logger::info(FORMAT, __VA_ARGS__); -#define nd4j_printf(FORMAT, ...) nd4j::Logger::info(FORMAT, __VA_ARGS__); -#define nd4j_printv(FORMAT, VECTOR) nd4j::Logger::printv(FORMAT, VECTOR); +#define nd4j_debug(FORMAT, ...) if (sd::Environment::getInstance()->isDebug() && sd::Environment::getInstance()->isVerbose()) sd::Logger::info(FORMAT, __VA_ARGS__); +#define nd4j_logger(FORMAT, ...) if (sd::Environment::getInstance()->isDebug() && sd::Environment::getInstance()->isVerbose()) sd::Logger::info(FORMAT, __VA_ARGS__); +#define nd4j_verbose(FORMAT, ...) if (sd::Environment::getInstance()->isVerbose()) sd::Logger::info(FORMAT, __VA_ARGS__); +#define nd4j_printf(FORMAT, ...) 
sd::Logger::info(FORMAT, __VA_ARGS__); +#define nd4j_printv(FORMAT, VECTOR) sd::Logger::printv(FORMAT, VECTOR); #else #define nd4j_debug(FORMAT, A, ...) #define nd4j_logger(FORMAT, A, ...) #define nd4j_verbose(FORMAT, ...) -#define nd4j_printf(FORMAT, ...) nd4j::Logger::info(FORMAT, __VA_ARGS__); +#define nd4j_printf(FORMAT, ...) sd::Logger::info(FORMAT, __VA_ARGS__); #define nd4j_printv(FORMAT, VECTOR) #endif -namespace nd4j { +namespace sd { class ND4J_EXPORT Logger { public: diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index d4e95c65f..2a7785cc8 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -26,13 +26,13 @@ #include #include -#include "../dll.h" -#include "../nd4jmalloc.h" -#include "../templatemath.h" +#include "system/dll.h" +#include "system/nd4jmalloc.h" +#include "math/templatemath.h" #include "../helpers/logger.h" -#include "../pointercast.h" +#include "system/pointercast.h" #include "../cnpy/cnpy.h" -#include +#include #define MAX_DIMENSION 0x7fffffff #define MAX_NUM_THREADS 1024 @@ -52,7 +52,7 @@ #define INLINEDEF inline #endif -#include "../pairwise_util.h" +#include "system/pairwise_util.h" #include #include @@ -141,17 +141,17 @@ namespace shape { * Get the shape info buffer * for the given rank and shape. */ - ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBuffer(int rank, nd4j::DataType dtype, Nd4jLong *shape); + ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBuffer(int rank, sd::DataType dtype, Nd4jLong *shape); - ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBuffer(int rank, nd4j::DataType dtype, Nd4jLong *shape, Nd4jLong *buffer); + ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBuffer(int rank, sd::DataType dtype, Nd4jLong *shape, Nd4jLong *buffer); /** * Get the shape info buffer * for the given rank and shape. 
*/ - ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, nd4j::DataType dtype, Nd4jLong *shape); + ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, sd::DataType dtype, Nd4jLong *shape); - ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, nd4j::DataType dtype, Nd4jLong *shape, Nd4jLong *output); + ND4J_EXPORT _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, sd::DataType dtype, Nd4jLong *shape, Nd4jLong *output); #ifdef __CUDACC__ @@ -1008,7 +1008,7 @@ namespace shape { ND4J_EXPORT void calcOffsets(const Nd4jLong* shapeInfo, Nd4jLong* offsets, const char order = 'c'); // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); - ND4J_EXPORT _CUDA_HD void shapeOldScalar(nd4j::DataType dtype, Nd4jLong* const buffer, const char order); + ND4J_EXPORT _CUDA_HD void shapeOldScalar(sd::DataType dtype, Nd4jLong* const buffer, const char order); // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved @@ -1207,7 +1207,7 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { retShapeLength = 2; } - auto ret = shape::shapeBuffer(retShapeLength, nd4j::ArrayOptions::dataType(originalShapeBuffer), retShape); + auto ret = shape::shapeBuffer(retShapeLength, sd::ArrayOptions::dataType(originalShapeBuffer), retShape); delete[] retShape; return ret; @@ -1681,7 +1681,7 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { * Get the shape info buffer * for the given rank and shape. 
*/ - INLINEDEF _CUDA_HD Nd4jLong *shapeBuffer(int rank, nd4j::DataType dtype, Nd4jLong *shape) { + INLINEDEF _CUDA_HD Nd4jLong *shapeBuffer(int rank, sd::DataType dtype, Nd4jLong *shape) { Nd4jLong *stride = shape::calcStrides(shape, rank); traceNew(11); @@ -1697,7 +1697,7 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { auto shapeInfoBuffer = shape::toShapeBuffer(shapeInfo); delete[] stride; delete shapeInfo; - nd4j::ArrayOptions::setDataType(shapeInfoBuffer, dtype); + sd::ArrayOptions::setDataType(shapeInfoBuffer, dtype); return shapeInfoBuffer; } @@ -1706,7 +1706,7 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { * * This method is used only for SoftMax */ - INLINEDEF _CUDA_HD Nd4jLong *shapeBuffer(int rank, nd4j::DataType dtype, Nd4jLong *shape, Nd4jLong *buffer) { + INLINEDEF _CUDA_HD Nd4jLong *shapeBuffer(int rank, sd::DataType dtype, Nd4jLong *shape, Nd4jLong *buffer) { Nd4jLong stride[MAX_RANK]; shape::calcStrides(shape,rank, stride); @@ -1721,7 +1721,7 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { shapeInfo.order = 'c'; shapeInfo.elementWiseStride = elementWiseStride; shape::toShapeBuffer(&shapeInfo, buffer); - nd4j::ArrayOptions::setDataType(buffer, dtype); + sd::ArrayOptions::setDataType(buffer, dtype); return buffer; } @@ -1729,7 +1729,7 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { * Get the shape info buffer * for the given rank and shape. 
*/ - INLINEDEF _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, nd4j::DataType dtype, Nd4jLong *shape) { + INLINEDEF _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, sd::DataType dtype, Nd4jLong *shape) { auto stride = shape::calcStridesFortran(shape,rank); traceNew(12); @@ -1746,11 +1746,11 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { auto shapeInfoBuffer = shape::toShapeBuffer(shapeInfo); delete[] stride; delete shapeInfo; - nd4j::ArrayOptions::setDataType(shapeInfoBuffer, dtype); + sd::ArrayOptions::setDataType(shapeInfoBuffer, dtype); return shapeInfoBuffer; } - INLINEDEF _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, nd4j::DataType dtype, Nd4jLong *shape, Nd4jLong *output) { + INLINEDEF _CUDA_HD Nd4jLong *shapeBufferFortran(int rank, sd::DataType dtype, Nd4jLong *shape, Nd4jLong *output) { Nd4jLong stride[MAX_RANK]; shape::calcStridesFortran(shape,rank, stride); @@ -1765,7 +1765,7 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { shapeInfo.order = 'f'; shapeInfo.elementWiseStride = elementWiseStride; shape::toShapeBuffer(&shapeInfo, output); - nd4j::ArrayOptions::setDataType(output, dtype); + sd::ArrayOptions::setDataType(output, dtype); return output; } @@ -3006,7 +3006,7 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons */ template INLINEDEF _CUDA_HD T* range(int from, int to, int increment) { - int diff = nd4j::math::nd4j_abs(from - to); + int diff = sd::math::nd4j_abs(from - to); int retLength = diff / increment; T *ret; @@ -3186,7 +3186,7 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons } else if(rank == dimensionLength) return shape::prodLong(shape,rank); - int absSelta = nd4j::math::nd4j_abs(rank - dimensionLength); + int absSelta = sd::math::nd4j_abs(rank - dimensionLength); traceNew(27); auto ret2 = shape::removeIndex(shape, dimension, rank, dimensionLength); auto ret = prodLong(ret2, absSelta); @@ -3337,7 +3337,7 @@ INLINEDEF 
_CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coo * */ INLINEDEF _CUDA_HD int tadsPerBlock(int blockSize, int tads) { - return nd4j::math::nd4j_ceil(tads / (double) blockSize); + return sd::math::nd4j_ceil(tads / (double) blockSize); } /** @@ -3748,7 +3748,7 @@ INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coo INLINEDEF _CUDA_HD Nd4jLong *shapeBufferOfNpy(int rank, unsigned int* shape,bool fortranOrder) { if(fortranOrder) { - Nd4jLong *shapeBufferRet = shape::shapeBufferFortran(rank, nd4j::FLOAT32,(Nd4jLong *) shape); + Nd4jLong *shapeBufferRet = shape::shapeBufferFortran(rank, sd::FLOAT32,(Nd4jLong *) shape); return shapeBufferRet; } else { @@ -3757,7 +3757,7 @@ INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coo newShape[i] = shape[i]; } - Nd4jLong *shapeBufferRet = shape::shapeBuffer(rank, nd4j::FLOAT32, newShape); + Nd4jLong *shapeBufferRet = shape::shapeBuffer(rank, sd::FLOAT32, newShape); delete[] newShape; return shapeBufferRet; @@ -3948,7 +3948,7 @@ INLINEDEF _CUDA_HD bool areStridesDefault(const Nd4jLong* shapeInfo) { // target[shape::shapeInfoLength(newRank) - 3] = 0; // target[shape::shapeInfoLength(newRank) - 2] = 0; // target[shape::shapeInfoLength(newRank) - 1] = isFOrder ? 
102 : 99; -// nd4j::ArrayOptions::setDataType(target, nd4j::ArrayOptions::dataType(oldShape)); +// sd::ArrayOptions::setDataType(target, sd::ArrayOptions::dataType(oldShape)); // delete[] olddims; // delete[] oldstrides; @@ -4503,7 +4503,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con return N; } - INLINEDEF _CUDA_HD void shapeOldScalar(nd4j::DataType dataType, Nd4jLong* const buffer, const char order) { + INLINEDEF _CUDA_HD void shapeOldScalar(sd::DataType dataType, Nd4jLong* const buffer, const char order) { buffer[0] = 2; buffer[1] = 1; @@ -4513,7 +4513,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con buffer[6] = 1; buffer[7] = (int)order; - nd4j::ArrayOptions::setDataType(buffer, dataType); + sd::ArrayOptions::setDataType(buffer, dataType); } template diff --git a/libnd4j/include/helpers/svd.h b/libnd4j/include/helpers/svd.h index f02732ce7..58007bf37 100644 --- a/libnd4j/include/helpers/svd.h +++ b/libnd4j/include/helpers/svd.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_SVD_H #define LIBND4J_SVD_H -#include -#include "NDArray.h" +#include +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/helpers/unicode.h b/libnd4j/include/helpers/unicode.h index 239b71201..6db4841db 100644 --- a/libnd4j/include/helpers/unicode.h +++ b/libnd4j/include/helpers/unicode.h @@ -21,9 +21,9 @@ #ifndef LIBND4J_UNICODE_H #define LIBND4J_UNICODE_H -#include +#include -namespace nd4j { +namespace sd { namespace unicode { /** diff --git a/libnd4j/include/indexing/IndicesList.h b/libnd4j/include/indexing/IndicesList.h index 2094be9a5..a652615d5 100644 --- a/libnd4j/include/indexing/IndicesList.h +++ b/libnd4j/include/indexing/IndicesList.h @@ -24,7 +24,7 @@ #include #include "NDIndex.h" -namespace nd4j { +namespace sd { class ND4J_EXPORT IndicesList { protected: std::vector _indices; diff --git a/libnd4j/include/indexing/NDIndex.h 
b/libnd4j/include/indexing/NDIndex.h index baa67104d..799da4e6c 100644 --- a/libnd4j/include/indexing/NDIndex.h +++ b/libnd4j/include/indexing/NDIndex.h @@ -21,11 +21,11 @@ #ifndef LIBND4J_NDINDEX_H #define LIBND4J_NDINDEX_H -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT NDIndex { protected: std::vector _indices; diff --git a/libnd4j/include/indexing/impl/IndicesList.cpp b/libnd4j/include/indexing/impl/IndicesList.cpp index d9d02c3ca..5acbf57d5 100644 --- a/libnd4j/include/indexing/impl/IndicesList.cpp +++ b/libnd4j/include/indexing/impl/IndicesList.cpp @@ -20,23 +20,23 @@ #include -using namespace nd4j; +using namespace sd; -nd4j::IndicesList::IndicesList(std::initializer_list list) { +sd::IndicesList::IndicesList(std::initializer_list list) { for (auto v: list) _indices.emplace_back(v); } -nd4j::IndicesList::~IndicesList() { +sd::IndicesList::~IndicesList() { for(auto v: _indices) delete v; } -int nd4j::IndicesList::size() { +int sd::IndicesList::size() { return (int) _indices.size(); } -bool nd4j::IndicesList::isScalar() { +bool sd::IndicesList::isScalar() { if (_indices.size() == 1) { return _indices.at(0)->isPoint(); } @@ -44,10 +44,10 @@ bool nd4j::IndicesList::isScalar() { return false; } -nd4j::NDIndex* nd4j::IndicesList::at(int idx) { +sd::NDIndex* sd::IndicesList::at(int idx) { return _indices.at(idx); } -void nd4j::IndicesList::push_back(NDIndex* idx) { +void sd::IndicesList::push_back(NDIndex* idx) { _indices.emplace_back(idx); } \ No newline at end of file diff --git a/libnd4j/include/indexing/impl/NDIndex.cpp b/libnd4j/include/indexing/impl/NDIndex.cpp index 1bf4388bb..43aaf0914 100644 --- a/libnd4j/include/indexing/impl/NDIndex.cpp +++ b/libnd4j/include/indexing/impl/NDIndex.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { bool NDIndex::isInterval() { return false; @@ -30,11 +30,11 @@ namespace nd4j { return _stride; } - nd4j::NDIndexAll::NDIndexAll() : nd4j::NDIndex() { + 
sd::NDIndexAll::NDIndexAll() : sd::NDIndex() { _indices.push_back(-1); } - nd4j::NDIndexPoint::NDIndexPoint(Nd4jLong point) : nd4j::NDIndex() { + sd::NDIndexPoint::NDIndexPoint(Nd4jLong point) : sd::NDIndex() { this->_indices.push_back(point); } @@ -52,34 +52,34 @@ namespace nd4j { - nd4j::NDIndexInterval::NDIndexInterval(Nd4jLong start, Nd4jLong end, Nd4jLong stride) : nd4j::NDIndex() { + sd::NDIndexInterval::NDIndexInterval(Nd4jLong start, Nd4jLong end, Nd4jLong stride) : sd::NDIndex() { this->_stride = stride; for (int e = start; e < end; e+= stride) this->_indices.push_back(e); } - bool nd4j::NDIndex::isAll() { + bool sd::NDIndex::isAll() { return _indices.size() == 1 && _indices.at(0) == -1; } - bool nd4j::NDIndex::isPoint() { + bool sd::NDIndex::isPoint() { return _indices.size() == 1 && _indices.at(0) >= 0; } - std::vector &nd4j::NDIndex::getIndices() { + std::vector &sd::NDIndex::getIndices() { return _indices; } - nd4j::NDIndex *nd4j::NDIndex::all() { + sd::NDIndex *sd::NDIndex::all() { return new NDIndexAll(); } - nd4j::NDIndex *nd4j::NDIndex::point(Nd4jLong pt) { + sd::NDIndex *sd::NDIndex::point(Nd4jLong pt) { return new NDIndexPoint(pt); } - nd4j::NDIndex *nd4j::NDIndex::interval(Nd4jLong start, Nd4jLong end, Nd4jLong stride) { + sd::NDIndex *sd::NDIndex::interval(Nd4jLong start, Nd4jLong end, Nd4jLong stride) { return new NDIndexInterval(start, end, stride); } } \ No newline at end of file diff --git a/libnd4j/blas/NativeOpExecutioner.h b/libnd4j/include/legacy/NativeOpExecutioner.h similarity index 88% rename from libnd4j/blas/NativeOpExecutioner.h rename to libnd4j/include/legacy/NativeOpExecutioner.h index b4a5fdea4..56fb500db 100644 --- a/libnd4j/blas/NativeOpExecutioner.h +++ b/libnd4j/include/legacy/NativeOpExecutioner.h @@ -23,7 +23,7 @@ #include -#include +#include #include #include #include @@ -45,7 +45,7 @@ public: * @param result * @param resultShapeInfo */ - static void execIndexReduceScalar(nd4j::LaunchContext *lc, + static void 
execIndexReduceScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -66,7 +66,7 @@ public: * @param dimension * @param dimensionLength */ - static void execReduce3Scalar(nd4j::LaunchContext *lc, + static void execReduce3Scalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -88,7 +88,7 @@ public: * @param result * @param resultShapeInfo */ - static void execReduce3(nd4j::LaunchContext *lc, + static void execReduce3(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -111,7 +111,7 @@ public: * @param dimension * @param dimensionLength */ - static void execReduce3(nd4j::LaunchContext *lc, + static void execReduce3(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -124,7 +124,7 @@ public: Nd4jLong *xTadOnlyShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *yTadOnlyShapeInfo, Nd4jLong *yTadOffsets); - static void execReduce3All(nd4j::LaunchContext *lc, + static void execReduce3All(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -148,7 +148,7 @@ public: * @param dimension * @param dimensionLength */ - static void execIndexReduce(nd4j::LaunchContext *lc, + static void execIndexReduce(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -169,7 +169,7 @@ public: * @param extraParams * @param n */ - static void execScalar(nd4j::LaunchContext *lc, + static void execScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -179,7 +179,7 @@ public: void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism = true); -static void execScalarBool(nd4j::LaunchContext *lc, +static void execScalarBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, 
@@ -189,7 +189,7 @@ static void execScalarBool(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism = true); -static void execScalarInt(nd4j::LaunchContext *lc, +static void execScalarInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -199,7 +199,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism = true); - static void execScalar(nd4j::LaunchContext *lc, + static void execScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -212,7 +212,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); - static void execScalarBool(nd4j::LaunchContext *lc, + static void execScalarBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -225,7 +225,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ); - static void execScalarInt(nd4j::LaunchContext *lc, + static void execScalarInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -251,7 +251,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, * @param dimension * @param dimensionLength */ - static void execBroadcast(nd4j::LaunchContext *lc, + static void execBroadcast(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -263,7 +263,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ); - static void execInverseBroadcast(nd4j::LaunchContext *lc, + static void execInverseBroadcast(sd::LaunchContext *lc, int opNum, void *x, Nd4jLong 
*xShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -276,7 +276,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); - static void execBroadcastBool(nd4j::LaunchContext *lc, + static void execBroadcastBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -289,7 +289,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ); - static void execInverseBroadcastBool(nd4j::LaunchContext *lc, + static void execInverseBroadcastBool(sd::LaunchContext *lc, int opNum, void *x, Nd4jLong *xShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -302,7 +302,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ); - static void execBroadcastInt(nd4j::LaunchContext *lc, + static void execBroadcastInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -314,7 +314,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ); - static void execInverseBroadcastInt(nd4j::LaunchContext *lc, + static void execInverseBroadcastInt(sd::LaunchContext *lc, int opNum, void *x, Nd4jLong *xShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -338,7 +338,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, * @param extraParams * @param n */ - static void execPairwiseTransform(nd4j::LaunchContext *lc, + static void execPairwiseTransform(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -348,7 +348,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams); - static void execPairwiseBoolTransform(nd4j::LaunchContext *lc, + static void 
execPairwiseBoolTransform(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -358,7 +358,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams); - static void execPairwiseIntTransform(nd4j::LaunchContext *lc, + static void execPairwiseIntTransform(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -378,7 +378,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, * @param extraParams * @param n */ - static void execTransformFloat(nd4j::LaunchContext *lc, + static void execTransformFloat(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -387,7 +387,7 @@ static void execScalarInt(nd4j::LaunchContext *lc, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -static void execTransformAny(nd4j::LaunchContext *lc, +static void execTransformAny(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -396,7 +396,7 @@ static void execTransformAny(nd4j::LaunchContext *lc, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism = true); -static void execTransformStrict(nd4j::LaunchContext *lc, +static void execTransformStrict(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -405,7 +405,7 @@ static void execTransformStrict(nd4j::LaunchContext *lc, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -static void execTransformSame(nd4j::LaunchContext *lc, +static void execTransformSame(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -414,7 +414,7 @@ static void execTransformSame(nd4j::LaunchContext *lc, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); -static void execTransformBool(nd4j::LaunchContext *lc, +static void 
execTransformBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -431,7 +431,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, * @param result * @param resultShapeInfo */ - static void execReduceFloat(nd4j::LaunchContext *lc, + static void execReduceFloat(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -441,7 +441,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); - static void execReduceSame(nd4j::LaunchContext *lc, + static void execReduceSame(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -451,7 +451,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); - static void execReduceBool(nd4j::LaunchContext *lc, + static void execReduceBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -461,7 +461,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets); - static void execReduceLong(nd4j::LaunchContext *lc, + static void execReduceLong(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -479,7 +479,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, * @param extraParams * @return */ - static void execReduceFloatScalar(nd4j::LaunchContext *lc, + static void execReduceFloatScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -487,7 +487,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo); - static void execReduceBoolScalar(nd4j::LaunchContext *lc, + static void 
execReduceBoolScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -495,7 +495,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo); - static void execReduceSameScalar(nd4j::LaunchContext *lc, + static void execReduceSameScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -503,7 +503,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo); - static void execReduceLongScalar(nd4j::LaunchContext *lc, + static void execReduceLongScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -511,7 +511,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo); - static void execReduce3TAD(nd4j::LaunchContext *lc, + static void execReduce3TAD(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -535,7 +535,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, * @param dimension * @param dimensionLength */ - static void execSummaryStats(nd4j::LaunchContext *lc, + static void execSummaryStats(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -555,7 +555,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, * @param result * @param resultShapeInfo */ - static void execSummaryStats(nd4j::LaunchContext *lc, + static void execSummaryStats(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -573,7 +573,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, * @param result * @param resultShapeInfo */ - static void execSummaryStatsScalar(nd4j::LaunchContext *lc, + static void execSummaryStatsScalar(sd::LaunchContext *lc, int opNum, void *hX, 
Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -583,14 +583,14 @@ static void execTransformBool(nd4j::LaunchContext *lc, bool biasCorrected); - static void execRandom(nd4j::LaunchContext *lc, + static void execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer state, void *hZ, Nd4jLong *hZShapeBuffer, void *dZ, Nd4jLong *dZShapeBuffer, void *extraArguments); - static void execRandom(nd4j::LaunchContext *lc, + static void execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer state, void *hX, Nd4jLong *hXShapeBuffer, @@ -599,7 +599,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeBuffer, void *extraArguments); - static void execRandom(nd4j::LaunchContext *lc, + static void execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer state, void *hX, Nd4jLong *hXShapeBuffer, @@ -613,7 +613,7 @@ static void execTransformBool(nd4j::LaunchContext *lc, template - static FORCEINLINE void execAggregate(nd4j::LaunchContext *lc, + static FORCEINLINE void execAggregate(sd::LaunchContext *lc, int opNum, void **varguments, int numArguments, @@ -630,32 +630,32 @@ static void execTransformBool(nd4j::LaunchContext *lc, inline static void execSort(void *x, Nd4jLong *xShapeInfo, bool descending) { - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); - BUILD_SINGLE_SELECTOR(xType, nd4j::SpecialMethods, ::sortGeneric(x, xShapeInfo, descending), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(xType, sd::SpecialMethods, ::sortGeneric(x, xShapeInfo, descending), LIBND4J_TYPES); } static void execSort(void *x, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool descending) { - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); - BUILD_SINGLE_SELECTOR(xType, nd4j::SpecialMethods, ::sortTadGeneric(x, xShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, 
descending), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(xType, sd::SpecialMethods, ::sortTadGeneric(x, xShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, descending), LIBND4J_TYPES); } inline static void execSortCooIndices(Nd4jLong *indices, void *values, Nd4jLong length, int rank) { - nd4j::sparse::SparseUtils::sortCooIndicesGeneric(indices, reinterpret_cast(values), length, rank); + sd::sparse::SparseUtils::sortCooIndicesGeneric(indices, reinterpret_cast(values), length, rank); } inline static Nd4jLong encodeBitmap(void *dx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); - BUILD_SINGLE_SELECTOR(xType, return nd4j::SpecialMethods, ::encodeBitmapGeneric(dx, xShapeInfo, N, dz, threshold), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(xType, return sd::SpecialMethods, ::encodeBitmapGeneric(dx, xShapeInfo, N, dz, threshold), FLOAT_TYPES); } inline static void decodeBitmap(void *dx, Nd4jLong N, void *dz, Nd4jLong *zShapeInfo) { - auto zType = nd4j::ArrayOptions::dataType(zShapeInfo); + auto zType = sd::ArrayOptions::dataType(zShapeInfo); - BUILD_SINGLE_SELECTOR(zType, nd4j::SpecialMethods, ::decodeBitmapGeneric(dx, N, dz, zShapeInfo), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(zType, sd::SpecialMethods, ::decodeBitmapGeneric(dx, N, dz, zShapeInfo), FLOAT_TYPES); } }; diff --git a/libnd4j/blas/NativeOps.h b/libnd4j/include/legacy/NativeOps.h similarity index 97% rename from libnd4j/blas/NativeOps.h rename to libnd4j/include/legacy/NativeOps.h index f4d1d261b..ea8352362 100755 --- a/libnd4j/blas/NativeOps.h +++ b/libnd4j/include/legacy/NativeOps.h @@ -42,9 +42,9 @@ #endif */ -#include +#include #include -#include +#include //DO NOT REMOVE: THIS IS AN EDITOR SEMANTICS THING FOR CLION //IT DEFINES THE EXPORT MACRO FOR THE EDITOR AND THEN @@ -54,7 +54,7 @@ #else #define ND4J_EXPORT #endif -#include +#include /* int tad_threshold = 1; @@ -75,10 +75,10 
@@ bool verbose = false; #include #include #include -#include +#include #include -typedef nd4j::InteropDataBuffer OpaqueDataBuffer; +typedef sd::InteropDataBuffer OpaqueDataBuffer; extern "C" { @@ -782,7 +782,7 @@ ND4J_EXPORT void enableVerboseMode(bool reallyEnable); */ ND4J_EXPORT void setGridLimit(int gridSize); -typedef nd4j::TadPack OpaqueTadPack; +typedef sd::TadPack OpaqueTadPack; /** * @@ -961,7 +961,7 @@ ND4J_EXPORT void execAggregate(Nd4jPointer *extraPointers, int numIntArrays, void *realArguments, int numRealArguments, - nd4j::DataType dtype); + sd::DataType dtype); ND4J_EXPORT void batchExecutor(Nd4jPointer *extraPointers, @@ -974,7 +974,7 @@ ND4J_EXPORT void batchExecutor(Nd4jPointer *extraPointers, int maxIdx, int maxReals, void *ptrToArguments, - nd4j::DataType dtype); + sd::DataType dtype); ND4J_EXPORT void execAggregateBatch(Nd4jPointer *extraPointers, int numAggregates, @@ -986,7 +986,7 @@ ND4J_EXPORT void execAggregateBatch(Nd4jPointer *extraPointers, int maxIdx, int maxReals, void *ptrToArguments, - nd4j::DataType dtype); + sd::DataType dtype); /** * Random operations @@ -1127,7 +1127,7 @@ extern "C" { static Nd4jPointer numpyHeaderForNd4j(Nd4jPointer data,Nd4jPointer shapeBuffer,Nd4jLong wordSize,Nd4jLong *headerSize) { auto shapeBufferCast = reinterpret_cast(shapeBuffer); - auto type = nd4j::ArrayOptions::dataType(shapeBufferCast); + auto type = sd::ArrayOptions::dataType(shapeBufferCast); BUILD_SINGLE_SELECTOR(type, return _numpyHeaderForNd4j, (data, shapeBuffer, wordSize, headerSize), LIBND4J_TYPES); } @@ -1192,7 +1192,7 @@ extern "C" { static Nd4jPointer numpyFromNd4j(Nd4jPointer data,Nd4jPointer shapeBuffer,Nd4jLong wordSize) { auto shapeBufferCast = reinterpret_cast(shapeBuffer); - auto type = nd4j::ArrayOptions::dataType(shapeBufferCast); + auto type = sd::ArrayOptions::dataType(shapeBufferCast); BUILD_SINGLE_SELECTOR(type, return _numpyFromNd4j, (data, shapeBuffer, wordSize), LIBND4J_TYPES); } @@ -1499,7 +1499,7 @@ ND4J_EXPORT 
Nd4jLong* mmapFile(Nd4jPointer *extraPointers, const char *fileName, ND4J_EXPORT void munmapFile(Nd4jPointer *extraPointers, Nd4jLong* ptrMap, Nd4jLong length); -typedef nd4j::graph::ResultWrapper OpaqueResultWrapper; +typedef sd::graph::ResultWrapper OpaqueResultWrapper; // flatbuffers execution ND4J_EXPORT OpaqueResultWrapper* executeFlatGraph(Nd4jPointer *extraPointers, Nd4jPointer flatBufferPointer); @@ -1515,7 +1515,7 @@ ND4J_EXPORT const char* getAllOperations(); ND4J_EXPORT int execCustomOp(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputs, Nd4jPointer* outputBuffers, Nd4jPointer* outputShapes, int numOutputs, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool* bArgs, int numBArgs, bool isInplace); ND4J_EXPORT int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opContext); -typedef nd4j::ShapeList OpaqueShapeList; +typedef sd::ShapeList OpaqueShapeList; ND4J_EXPORT OpaqueShapeList* calculateOutputShapes(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs); ND4J_EXPORT OpaqueShapeList* calculateOutputShapes2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs); @@ -1527,8 +1527,8 @@ ND4J_EXPORT void deleteShapeList(Nd4jPointer shapeList); ND4J_EXPORT int registerGraph(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer flatBufferPointer); -typedef nd4j::graph::VariablesSet OpaqueVariablesSet; -typedef nd4j::graph::Variable OpaqueVariable; +typedef sd::graph::VariablesSet OpaqueVariablesSet; +typedef sd::graph::Variable OpaqueVariable; ND4J_EXPORT OpaqueVariablesSet *executeStoredGraph(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int* 
inputIndices, int numInputs); @@ -1578,13 +1578,13 @@ ND4J_EXPORT void scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOf ND4J_EXPORT void inspectArray(Nd4jPointer *extraPointers, Nd4jPointer buffer, Nd4jLong *shapeInfo, Nd4jPointer specialBuffer, Nd4jLong *specialShapeInfo, Nd4jPointer debugInfo); -typedef nd4j::ConstantDataBuffer OpaqueConstantDataBuffer; +typedef sd::ConstantDataBuffer OpaqueConstantDataBuffer; -ND4J_EXPORT OpaqueConstantDataBuffer* shapeBuffer(int rank, Nd4jLong *shape, Nd4jLong *strides, nd4j::DataType dtype, char order, Nd4jLong ews, bool empty); +ND4J_EXPORT OpaqueConstantDataBuffer* shapeBuffer(int rank, Nd4jLong *shape, Nd4jLong *strides, sd::DataType dtype, char order, Nd4jLong ews, bool empty); -ND4J_EXPORT OpaqueConstantDataBuffer* constantBufferLong(nd4j::DataType dtype, Nd4jLong *data, int length); -ND4J_EXPORT OpaqueConstantDataBuffer* constantBufferDouble(nd4j::DataType dtype, double *data, int length); -ND4J_EXPORT OpaqueConstantDataBuffer* constantBuffer(nd4j::DataType dtype, nd4j::ConstantDescriptor *descriptor); +ND4J_EXPORT OpaqueConstantDataBuffer* constantBufferLong(sd::DataType dtype, Nd4jLong *data, int length); +ND4J_EXPORT OpaqueConstantDataBuffer* constantBufferDouble(sd::DataType dtype, double *data, int length); +ND4J_EXPORT OpaqueConstantDataBuffer* constantBuffer(sd::DataType dtype, sd::ConstantDescriptor *descriptor); ND4J_EXPORT Nd4jPointer getConstantDataBufferPrimary(OpaqueConstantDataBuffer* dbf); ND4J_EXPORT Nd4jPointer getConstantDataBufferSpecial(OpaqueConstantDataBuffer* dbf); @@ -1593,8 +1593,8 @@ ND4J_EXPORT Nd4jLong getConstantDataBufferSizeOf(OpaqueConstantDataBuffer* dbf); ND4J_EXPORT void deleteShapeBuffer(OpaqueConstantDataBuffer* ptr); -typedef nd4j::graph::Context OpaqueContext; -typedef nd4j::graph::RandomGenerator OpaqueRandomGenerator; +typedef sd::graph::Context OpaqueContext; +typedef sd::graph::RandomGenerator OpaqueRandomGenerator; ND4J_EXPORT OpaqueContext* 
createGraphContext(int nodeId); ND4J_EXPORT OpaqueRandomGenerator* getGraphContextRandomGenerator(OpaqueContext* ptr); @@ -1625,7 +1625,7 @@ ND4J_EXPORT void deleteRandomGenerator(OpaqueRandomGenerator* ptr); ND4J_EXPORT const char* runLightBenchmarkSuit(bool printOut); ND4J_EXPORT const char* runFullBenchmarkSuit(bool printOut); -typedef nd4j::LaunchContext OpaqueLaunchContext; +typedef sd::LaunchContext OpaqueLaunchContext; ND4J_EXPORT OpaqueLaunchContext* defaultLaunchContext(); ND4J_EXPORT Nd4jPointer lcScalarPointer(OpaqueLaunchContext* lc); diff --git a/libnd4j/blas/cpu/NativeOpExecutioner.cpp b/libnd4j/include/legacy/cpu/NativeOpExecutioner.cpp similarity index 74% rename from libnd4j/blas/cpu/NativeOpExecutioner.cpp rename to libnd4j/include/legacy/cpu/NativeOpExecutioner.cpp index 1fedb0241..b3f1a1745 100644 --- a/libnd4j/blas/cpu/NativeOpExecutioner.cpp +++ b/libnd4j/include/legacy/cpu/NativeOpExecutioner.cpp @@ -16,19 +16,19 @@ #include -#include -#include "NativeOpExecutioner.h" +#include +#include "legacy/NativeOpExecutioner.h" #include -#include +#include -#include -#include -#include +#include +#include +#include -#include -#include -#include +#include +#include +#include #include #include @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include #include @@ -76,7 +76,7 @@ * @param hZ * @param hZShapeInfo */ -void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, int opNum, +void NativeOpExecutioner::execIndexReduceScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, void *extraParams, @@ -85,8 +85,8 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, int op - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); auto hz = reinterpret_cast(hZ); 
BUILD_DOUBLE_SELECTOR(xType, zType, hz[0] = functions::indexreduce::IndexReduce, ::execScalar(opNum,hX,hXShapeInfo,extraParams), LIBND4J_TYPES, INDEXING_TYPES); @@ -105,7 +105,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, int op * @param dimensionLength */ -void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execIndexReduce(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -116,8 +116,8 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); Nd4jLong* hz = reinterpret_cast(hZ); BUILD_DOUBLE_SELECTOR(xType, zType, functions::indexreduce::IndexReduce, ::exec(opNum, hX, hXShapeInfo, extraParams, hz, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets), LIBND4J_TYPES, INDEXING_TYPES); @@ -138,7 +138,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, * @param dimensionLength */ -void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execBroadcast(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -152,9 +152,9 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -163,7 +163,7 @@ void 
NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else - auto loopKind = nd4j::LoopKind::deduceKindOfLoopBroadcast(hXShapeInfo, hYShapeInfo, hZShapeInfo); + auto loopKind = sd::LoopKind::deduceKindOfLoopBroadcast(hXShapeInfo, hYShapeInfo, hZShapeInfo); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR_THRICE(xType, functions::broadcast::Broadcast, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, loopKind, start, stop), LIBND4J_TYPES); @@ -172,23 +172,23 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, Nd4jLong numTads = 0; switch (loopKind) { - case nd4j::LoopKind::BROADCAST_SCALAR_X: { + case sd::LoopKind::BROADCAST_SCALAR_X: { numTads = shape::length(hXShapeInfo); } break; - case nd4j::LoopKind::BROADCAST_SCALAR_Y: { + case sd::LoopKind::BROADCAST_SCALAR_Y: { numTads = shape::length(hYShapeInfo); } break; - case nd4j::LoopKind::BROADCAST_3D: { + case sd::LoopKind::BROADCAST_3D: { numTads = shape::sizeAt(hZShapeInfo, 0); } break; - case nd4j::LoopKind::BROADCAST_4D: { + case sd::LoopKind::BROADCAST_4D: { numTads = shape::sizeAt(hZShapeInfo, 0) * shape::sizeAt(hZShapeInfo, 1); } break; - case nd4j::LoopKind::BROADCAST_5D: { + case sd::LoopKind::BROADCAST_5D: { numTads = shape::sizeAt(hZShapeInfo, 0) * shape::sizeAt(hZShapeInfo, 1); } break; @@ -204,7 +204,7 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, #endif } -void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execInverseBroadcast(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -215,16 
+215,16 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; - if (!nd4j::Environment::getInstance()->isExperimentalBuild()) - if ((yType != xType && yType != nd4j::DataType::BOOL) || xType != zType) - throw nd4j::datatype_exception::build("NativeOps::execBroadcast both operands must have same data type", xType, yType); + if (!sd::Environment::getInstance()->isExperimentalBuild()) + if ((yType != xType && yType != sd::DataType::BOOL) || xType != zType) + throw sd::datatype_exception::build("NativeOps::execBroadcast both operands must have same data type", xType, yType); #ifdef __ND4J_EXPERIMENTAL__ BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::broadcast::Broadcast, ::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); @@ -244,7 +244,7 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execBroadcastBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -258,9 +258,9 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { - auto xType = 
nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -276,7 +276,7 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, samediff::Threads::parallel_tad(func, 0, numTads); } -void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execInverseBroadcastBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -289,16 +289,16 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; - if (!nd4j::Environment::getInstance()->isExperimentalBuild()) - if (yType != xType || nd4j::DataType::BOOL != zType) - throw nd4j::datatype_exception::build("NativeOps::execInverseBroadcastBool both operands must have same data type", xType, yType); + if (!sd::Environment::getInstance()->isExperimentalBuild()) + if (yType != xType || sd::DataType::BOOL != zType) + throw sd::datatype_exception::build("NativeOps::execInverseBroadcastBool both operands must have same data type", xType, yType); auto func = PRAGMA_THREADS_FOR { BUILD_DOUBLE_SELECTOR(xType, zType, functions::broadcast::BroadcastBool, 
::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, BOOL_TYPES); @@ -314,7 +314,7 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execBroadcastInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -327,18 +327,18 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt", zType, xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execBroadcastInt", zType, xType, yType); - if (!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); + if (!sd::DataTypeUtils::isZ(zType)) + throw sd::datatype_exception::build("NativeOpExecutioner::execBroadcastInt requires integer data type", zType); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); @@ -351,7 
+351,7 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, samediff::Threads::parallel_tad(func, 0, numTads); } -void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execInverseBroadcastInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -363,18 +363,18 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt", zType, xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt", zType, xType, yType); - if (!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt requires integer data type", zType); + if (!sd::DataTypeUtils::isZ(zType)) + throw sd::datatype_exception::build("NativeOpExecutioner::execInverseBroadcastInt requires integer data type", zType); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR(xType, functions::broadcast::BroadcastInt,::execInverse(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); @@ -400,7 +400,7 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, * @param extraParams * @param n */ -void 
NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execPairwiseTransform(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -410,9 +410,9 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -427,13 +427,13 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, }; auto zLen = shape::length(hZShapeInfo); - samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_for(func, 0, zLen, 1, sd::math::nd4j_max(1, sd::math::nd4j_min(zLen / 1024, sd::Environment::getInstance()->maxMasterThreads()))); #endif } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execPairwiseBoolTransform(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -444,30 +444,30 @@ void NativeOpExecutioner::execPairwiseBoolTransform(nd4j::LaunchContext *lc, void *extraParams) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = 
sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; if (xType != yType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform", xType, yType); - if (zType != nd4j::DataType::BOOL) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform", nd4j::DataType::BOOL, zType); + if (zType != sd::DataType::BOOL) + throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform", sd::DataType::BOOL, zType); auto func = PRAGMA_THREADS_FOR { BUILD_DOUBLE_SELECTOR(xType, zType, functions::pairwise_transforms::PairWiseBoolTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); }; auto zLen = shape::length(hZShapeInfo); - samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_for(func, 0, zLen, 1, sd::math::nd4j_max(1, sd::math::nd4j_min(zLen / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execPairwiseIntTransform(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -477,25 +477,25 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = 
sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform", zType, xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform", zType, xType, yType); - if (!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execSPairwiseInt requires integer data type", zType); + if (!sd::DataTypeUtils::isZ(zType)) + throw sd::datatype_exception::build("NativeOpExecutioner::execSPairwiseInt requires integer data type", zType); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR(xType, functions::pairwise_transforms::PairWiseIntTransform, ::exec(opNum, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraParams, start, stop), INTEGER_TYPES); }; auto zLen = shape::length(hZShapeInfo); - samediff::Threads::parallel_for(func, 0, zLen, 1, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_for(func, 0, zLen, 1, sd::math::nd4j_max(1, sd::math::nd4j_min(zLen / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } @@ -509,7 +509,7 @@ void NativeOpExecutioner::execPairwiseIntTransform(nd4j::LaunchContext *lc, * @param hZ * @param hZShapeInfo */ -void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -521,8 +521,8 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); // 
nothing to do here if result is empty if (shape::isEmpty(hZShapeInfo)) @@ -532,13 +532,13 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, FLOAT_TYPES); }; - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); - samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxMasterThreads()); + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance()->maxMasterThreads()); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -549,8 +549,8 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); // nothing to do here if result is empty if (shape::isEmpty(hZShapeInfo)) @@ -560,13 +560,13 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, 
stop), LIBND4J_TYPES); }; - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); - samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxMasterThreads()); + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance()->maxMasterThreads()); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -577,8 +577,8 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); // nothing to do here if result is empty if (shape::isEmpty(hZShapeInfo)) @@ -588,13 +588,13 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, BOOL_TYPES); }; - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); - samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == 
nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxMasterThreads()); + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 1 : sd::Environment::getInstance()->maxMasterThreads()); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -605,8 +605,8 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); // nothing to do here if result is empty if (shape::isEmpty(hZShapeInfo)) @@ -616,9 +616,9 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::exec(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, start, stop), LIBND4J_TYPES, LONG_TYPES); }; - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopTadXZ(hXShapeInfo, hZShapeInfo, tadShapeInfo); - samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == nd4j::LoopKind::Kind::SMALLARR2DX ? 1 : nd4j::Environment::getInstance()->maxMasterThreads()); + samediff::Threads::parallel_tad(func, 0, shape::length(hZShapeInfo), 1, kindOfLoop == sd::LoopKind::Kind::SMALLARR2DX ? 
1 : sd::Environment::getInstance()->maxMasterThreads()); } //////////////////////////////////////////////////////////////////////// @@ -630,7 +630,7 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, * @param extraParams * @return */ -void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceFloatScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -639,14 +639,14 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceFloatFunction, ::execScalar(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo), LIBND4J_TYPES, FLOAT_TYPES); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceSameScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -655,13 +655,13 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); BUILD_SINGLE_SELECTOR(xType, functions::reduce::ReduceSameFunction, ::execScalar(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo), LIBND4J_TYPES); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceBoolScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ 
-671,14 +671,14 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceBoolFunction, ::execScalar(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo), LIBND4J_TYPES, BOOL_TYPES); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceLongScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -687,8 +687,8 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce::ReduceLongFunction, ::execScalar(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo), LIBND4J_TYPES, LONG_TYPES); } @@ -708,7 +708,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, * @param dimension * @param dimensionLength */ -void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3Scalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -717,8 +717,8 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, void *dY, Nd4jLong *dYShapeInfo, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = 
sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execScalar(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo), LIBND4J_TYPES, FLOAT_TYPES); @@ -736,7 +736,7 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, * @param hZ * @param hZShapeInfo */ -void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -747,15 +747,15 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); //BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::exec(opNum, hX, hXShapeInfo, extraParamsVals, hY, hYShapeInfo, hZ, hZShapeInfo, nullptr, 0), LIBND4J_TYPES, FLOAT_TYPES); NativeOpExecutioner::execReduce3Scalar(lc, opNum, hX, hXShapeInfo, dX, dXShapeInfo, extraParamsVals, hY, hYShapeInfo, dY, dYShapeInfo, hZ, hZShapeInfo, dZ, dZShapeInfo); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -769,22 +769,22 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, Nd4jLong *yTadOnlyShapeInfo, Nd4jLong *yTadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); const auto xLen = 
shape::length(hXShapeInfo); const auto yLen = shape::length(hYShapeInfo); - nd4j::TadPack tadPack; + sd::TadPack tadPack; if(xLen == yLen) { - tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); } else if(yLen > xLen) { - tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); } else { - tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); } auto func = PRAGMA_THREADS_FOR { @@ -796,7 +796,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3All(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -810,10 +810,10 @@ void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, Nd4jLong *yTadShapeInfo, Nd4jLong *yOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); // TODO: make it 2d auto func = PRAGMA_THREADS_FOR { @@ -824,7 +824,7 @@ void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, } 
//////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3TAD(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -839,22 +839,22 @@ void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); const auto xLen = shape::length(hXShapeInfo); const auto yLen = shape::length(hYShapeInfo); - nd4j::TadPack tadPack; + sd::TadPack tadPack; if(xLen == yLen) { - tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); } else if(yLen > xLen) { - tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); + tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hYShapeInfo, dimension, dimensionLength); } else { - tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); } auto func = PRAGMA_THREADS_FOR { @@ -877,7 +877,7 @@ void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, * @param extraParams * @param n */ -void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -887,9 +887,9 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dScalarShapeInfo, void *extraParams, bool allowParallelism) { - auto xType = 
nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -898,20 +898,20 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams), LIBND4J_TYPES, LIBND4J_TYPES); #else if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform,::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES); }; auto zLen = shape::length(hZShapeInfo); - samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
1 : sd::math::nd4j_max(1, sd::math::nd4j_min(zLen / 1024, sd::Environment::getInstance()->maxMasterThreads()))); #endif } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -924,9 +924,9 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -935,20 +935,20 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, BUILD_PAIRWISE_SELECTOR(xType, yType, zType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ), LIBND4J_TYPES, LIBND4J_TYPES); #else if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execScalar", zType, xType, yType); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR_THRICE(xType, functions::scalar::ScalarTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES); }; auto yLen = shape::length(hScalarShapeInfo); - samediff::Threads::parallel_tad(func, 0, yLen, 1, 
nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxMasterThreads())); + samediff::Threads::parallel_tad(func, 0, yLen, 1, sd::math::nd4j_min(yLen, sd::Environment::getInstance()->maxMasterThreads())); #endif } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -958,30 +958,30 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hSscalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hSscalarShapeInfo)) return; if (xType != yType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarBool", xType, yType); - if (zType != nd4j::DataType::BOOL) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); + if (zType != sd::DataType::BOOL) + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarBool", sd::DataType::BOOL, zType); auto func = PRAGMA_THREADS_FOR { BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), LIBND4J_TYPES, BOOL_TYPES); }; auto zLen = shape::length(hZShapeInfo); - samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 
1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 1 : sd::math::nd4j_max(1, sd::math::nd4j_min(zLen / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -994,29 +994,29 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; if (xType != yType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarBool", xType, yType); - if (zType != nd4j::DataType::BOOL) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarBool", nd4j::DataType::BOOL, zType); + if (zType != sd::DataType::BOOL) + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarBool", sd::DataType::BOOL, zType); auto func = PRAGMA_THREADS_FOR { BUILD_DOUBLE_SELECTOR(xType, zType, functions::scalar::ScalarBoolTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), LIBND4J_TYPES, 
BOOL_TYPES); }; auto yLen = shape::length(hScalarShapeInfo); - samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxMasterThreads())); + samediff::Threads::parallel_tad(func, 0, yLen, 1, sd::math::nd4j_min(yLen, sd::Environment::getInstance()->maxMasterThreads())); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1026,30 +1026,30 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, void *dScalar, Nd4jLong *dSscalarShapeInfo, void *extraParams, bool allowParallelism) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hSscalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hSscalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hSscalarShapeInfo)) return; if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarInt", xType, yType); - if (!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt", nd4j::DataType::INT32, zType); + if (!sd::DataTypeUtils::isZ(zType)) + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarInt", sd::DataType::INT32, zType); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, hScalar, extraParams, start, stop), INTEGER_TYPES); }; auto zLen = shape::length(hZShapeInfo); - 
samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 1 : nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(zLen / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_for(func, 0, zLen, 1, !allowParallelism ? 1 : sd::math::nd4j_max(1, sd::math::nd4j_min(zLen / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1062,25 +1062,25 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; if (xType != yType || xType != zType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarInt", xType, yType); - if (!nd4j::DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execScalarInt requires integer data type", zType); + if (!sd::DataTypeUtils::isZ(zType)) + throw sd::datatype_exception::build("NativeOpExecutioner::execScalarInt requires integer data type", zType); auto func = PRAGMA_THREADS_FOR { BUILD_SINGLE_SELECTOR(xType, functions::scalar::ScalarIntTransform, ::transform(opNum, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, hScalars, dimension, dimensionLength, 
tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ, start, stop), INTEGER_TYPES); }; auto yLen = shape::length(hScalarShapeInfo); - samediff::Threads::parallel_tad(func, 0, yLen, 1, nd4j::math::nd4j_min(yLen, nd4j::Environment::getInstance()->maxMasterThreads())); + samediff::Threads::parallel_tad(func, 0, yLen, 1, sd::math::nd4j_min(yLen, sd::Environment::getInstance()->maxMasterThreads())); } //////////////////////////////////////////////////////////////////////// @@ -1093,7 +1093,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, * @param hZ * @param hZShapeInfo */ -void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1103,8 +1103,8 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, bool biasCorrected) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::summarystats::SummaryStatsReduce, ::exec(opNum, biasCorrected, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, nullptr, 1), LIBND4J_TYPES, FLOAT_TYPES); } @@ -1119,7 +1119,7 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, * @param hZ * @param hZShapeInfo */ -void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execSummaryStatsScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1129,8 +1129,8 @@ void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, bool biasCorrected) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto 
zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::summarystats::SummaryStatsReduce, ::execScalar(opNum, biasCorrected, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo), LIBND4J_TYPES, FLOAT_TYPES); } @@ -1147,7 +1147,7 @@ void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, * @param dimension * @param dimensionLength */ -void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1157,8 +1157,8 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::summarystats::SummaryStatsReduce, ::exec(opNum, biasCorrected, hX, hXShapeInfo, extraParams, hZ, hZShapeInfo, dimension, dimensionLength), LIBND4J_TYPES, FLOAT_TYPES); } @@ -1175,7 +1175,7 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, * @param extraParams * @param n */ -void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformFloat(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1183,8 +1183,8 @@ void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = 
sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo)) return; @@ -1193,11 +1193,11 @@ void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformFloat, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, FLOAT_TYPES); }; - samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_do(func, sd::math::nd4j_max(1, sd::math::nd4j_min(shape::length(hZShapeInfo) / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1205,8 +1205,8 @@ void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo)) return; @@ -1215,11 +1215,11 @@ void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformBool, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, BOOL_TYPES); }; - samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_do(func, sd::math::nd4j_max(1, 
sd::math::nd4j_min(shape::length(hZShapeInfo) / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformAny(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1227,8 +1227,8 @@ void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool allowParallelism) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo)) return; @@ -1237,11 +1237,11 @@ void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, BUILD_DOUBLE_SELECTOR(xType, zType, functions::transform::TransformAny, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES, LIBND4J_TYPES); }; - samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_do(func, sd::math::nd4j_max(1, sd::math::nd4j_min(shape::length(hZShapeInfo) / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformSame(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1249,8 +1249,8 @@ void NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto 
xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo)) return; @@ -1259,11 +1259,11 @@ void NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformSame, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), LIBND4J_TYPES); }; - samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_do(func, sd::math::nd4j_max(1, sd::math::nd4j_min(shape::length(hZShapeInfo) / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformStrict(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1271,8 +1271,8 @@ void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, void *dZ, Nd4jLong *dZShapeInfo, void *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo)) return; @@ -1281,11 +1281,11 @@ void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, BUILD_SINGLE_SELECTOR(xType, functions::transform::TransformStrict, ::exec(opNum, hX, hXShapeInfo, hZ, hZShapeInfo, extraParams, thread_id, numThreads), FLOAT_TYPES); }; - samediff::Threads::parallel_do(func, nd4j::math::nd4j_max(1, 
nd4j::math::nd4j_min(shape::length(hZShapeInfo) / 1024, nd4j::Environment::getInstance()->maxMasterThreads()))); + samediff::Threads::parallel_do(func, sd::math::nd4j_max(1, sd::math::nd4j_min(shape::length(hZShapeInfo) / 1024, sd::Environment::getInstance()->maxMasterThreads()))); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer state, void *hZ, Nd4jLong *hZShapeInfo, @@ -1293,16 +1293,16 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *extraArguments) { - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_SINGLE_SELECTOR(zType, functions::random::RandomFunction, ::execTransform(opNum, state, hZ, hZShapeInfo, extraArguments), FLOAT_TYPES); - auto rng = reinterpret_cast(state); + auto rng = reinterpret_cast(state); rng->rewindH(shape::length(hZShapeInfo)); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer state, void *hX, Nd4jLong *hXShapeInfo, @@ -1312,16 +1312,16 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *extraArguments) { - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_SINGLE_SELECTOR(zType, functions::random::RandomFunction, ::execTransform(opNum, state, hX, hXShapeInfo, hZ, hZShapeInfo, extraArguments), FLOAT_TYPES); - auto rng = reinterpret_cast(state); + auto rng = reinterpret_cast(state); rng->rewindH(shape::length(hZShapeInfo)); } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execRandom(sd::LaunchContext *lc, int 
opNum, Nd4jPointer state, void *hX, Nd4jLong *hXShapeInfo, @@ -1333,11 +1333,11 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *extraArguments) { - auto xType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_SINGLE_SELECTOR(xType, functions::random::RandomFunction, ::execTransform(opNum, state, hX, hXShapeInfo, hY, hYShapeInfo, hZ, hZShapeInfo, extraArguments), FLOAT_TYPES); - auto rng = reinterpret_cast(state); + auto rng = reinterpret_cast(state); rng->rewindH(shape::length(hZShapeInfo)); } diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/include/legacy/cpu/NativeOps.cpp similarity index 76% rename from libnd4j/blas/cpu/NativeOps.cpp rename to libnd4j/include/legacy/cpu/NativeOps.cpp index 0b9534511..cf04acbe7 100644 --- a/libnd4j/blas/cpu/NativeOps.cpp +++ b/libnd4j/include/legacy/cpu/NativeOps.cpp @@ -20,18 +20,18 @@ #define __STDC_CONSTANT_MACROS -#include "../NativeOps.h" -#include "NativeOpExecutioner.h" -#include "../NDArray.h" -#include "../GraphExecutioner.h" +#include +#include "legacy/NativeOpExecutioner.h" +#include +#include #include -#include +#include #include #include #include #include -#include -#include +#include +#include #include #include #include @@ -65,8 +65,8 @@ bool experimentalSupport = false; #endif #include -#include "../Environment.h" -#include +#include +#include #include #include #include @@ -81,16 +81,16 @@ bool experimentalSupport = false; #include #endif -using namespace nd4j; +using namespace sd; void setElementThreshold(int num) { if (num > 0) - nd4j::Environment::getInstance()->setElementwiseThreshold(num); + sd::Environment::getInstance()->setElementwiseThreshold(num); } void setTADThreshold(int num) { if (num > 0) - nd4j::Environment::getInstance()->setTadThreshold(num); + sd::Environment::getInstance()->setTadThreshold(num); } /** @@ -108,8 +108,8 @@ void execIndexReduceScalar(Nd4jPointer *extraPointers, try { 
NativeOpExecutioner::execIndexReduceScalar(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -133,7 +133,7 @@ void execIndexReduce(Nd4jPointer *extraPointers,int opNum, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPack.primaryShapeInfo(); @@ -156,8 +156,8 @@ void execIndexReduce(Nd4jPointer *extraPointers,int opNum, hTADShapeInfo, hTADOffsets); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -184,9 +184,9 @@ void execBroadcast(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); - auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hZShapeInfo, dimension, + auto tadPackZ = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(hZShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPackX.primaryShapeInfo(); @@ -209,8 +209,8 @@ void execBroadcast(Nd4jPointer *extraPointers, dimension, dimensionLength, hTADShapeInfo, hTADOffsets, hTADShapeInfoZ, hTADOffsetsZ); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -225,9 +225,9 @@ void execBroadcastBool(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); - auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hZShapeInfo, dimension, + auto tadPackZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(hZShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPackX.primaryShapeInfo(); @@ -252,8 +252,8 @@ void execBroadcastBool(Nd4jPointer *extraPointers, dimensionLength, hTADShapeInfo, hTADOffsets, hTADShapeInfoZ, hTADOffsetsZ); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -293,8 +293,8 @@ void execPairwiseTransform( dZShapeInfo, extraParams); } catch (std::exception &e) { - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -323,8 +323,8 @@ void execPairwiseTransformBool( dZShapeInfo, extraParams); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -357,8 +357,8 @@ void execReduceFloat( dbZ->special(), dZShapeInfo); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -382,8 +382,8 @@ void execReduceSame( dbZ->special(), dZShapeInfo); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -406,8 +406,8 @@ void execReduceBool( dbZ->special(), dZShapeInfo); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -430,8 +430,8 @@ void execReduceLong( dbZ->special(), dZShapeInfo); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -454,7 +454,7 @@ void execReduceFloat2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPackX.primaryShapeInfo(); @@ -475,8 +475,8 @@ void execReduceFloat2(Nd4jPointer *extraPointers, hTADShapeInfo, hTADOffsets); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -490,7 +490,7 @@ void execReduceBool2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPack.primaryShapeInfo(); @@ -511,8 +511,8 @@ void execReduceBool2(Nd4jPointer *extraPointers, hTADShapeInfo, hTADOffsets); } catch 
(std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -526,7 +526,7 @@ void execReduceSame2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPack.primaryShapeInfo(); @@ -547,8 +547,8 @@ void execReduceSame2(Nd4jPointer *extraPointers, hTADShapeInfo, hTADOffsets); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -562,7 +562,7 @@ void execReduceLong2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPack.primaryShapeInfo(); @@ -583,8 +583,8 @@ void execReduceLong2(Nd4jPointer *extraPointers, hTADShapeInfo, hTADOffsets); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); 
+ sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -609,8 +609,8 @@ void execReduce3(Nd4jPointer *extraPointers, NativeOpExecutioner::execReduce3(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbY->primary(), hYShapeInfo, dbY->special(), dYShapeInfo, dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -632,8 +632,8 @@ void execReduce3Scalar(Nd4jPointer *extraPointers,int opNum, NativeOpExecutioner::execReduce3Scalar(nullptr, opNum, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, extraParams, dbY->primary(), hYShapeInfo, dbY->special(), dYShapeInfo, dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } /** @@ -669,7 +669,7 @@ void execReduce3Tad(Nd4jPointer *extraPointers, yTadOnlyShapeInfo, yTadOffsets); } else { // going tad-way - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); auto hTADShapeInfo = tadPack.primaryShapeInfo(); @@ -681,8 +681,8 @@ void execReduce3Tad(Nd4jPointer *extraPointers, hTADOffsets, nullptr, nullptr); } } catch 
(std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -725,8 +725,8 @@ void execScalar( dScalarShapeInfo, extraParams); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -754,8 +754,8 @@ void execScalarBool( dScalarShapeInfo, extraParams); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -786,8 +786,8 @@ void execSummaryStatsScalar(Nd4jPointer *extraPointers, dZShapeInfo, biasCorrected); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } /** @@ -819,8 +819,8 @@ void execSummaryStats(Nd4jPointer *extraPointers, dZShapeInfo, biasCorrected); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } /** @@ -864,8 +864,8 @@ void execSummaryStatsTad(Nd4jPointer *extraPointers, tadOffsets, biasCorrected); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -900,8 +900,8 @@ void execTransformFloat( nullptr, nullptr); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -926,8 +926,8 @@ void execTransformSame( nullptr, nullptr); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -952,8 +952,8 @@ void execTransformBool( nullptr, nullptr); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -978,8 +978,8 @@ void execTransformAny( nullptr, nullptr); } catch (std::exception &e) { - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1004,8 +1004,8 @@ void execTransformStrict( nullptr, nullptr); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1030,8 +1030,8 @@ void execReduce3All(Nd4jPointer *extraPointers, hYShapeInfo, dbY->special(), dYShapeInfo, dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo, dimension, dimensionLength, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1050,12 +1050,12 @@ void specialConcat( Nd4jPointer *tadPointers, Nd4jPointer *offsetPointers) { try { - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - BUILD_SINGLE_SELECTOR(zType, nd4j::SpecialMethods,::concatCpuGeneric(dimension, numArrays, data, inputShapeInfo, hZ, hZShapeInfo), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(zType, sd::SpecialMethods,::concatCpuGeneric(dimension, numArrays, data, inputShapeInfo, hZ, hZShapeInfo), LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1068,7 +1068,7 @@ void initializeDevicesAndFunctions() { } void initializeFunctions(Nd4jPointer *functions) { - nd4j::BlasHelper::getInstance()->initializeFunctions(functions); + sd::BlasHelper::getInstance()->initializeFunctions(functions); } /** @@ -1216,45 +1216,45 @@ int getAvailableDevices() { } void enableDebugMode(bool reallyEnable) { - nd4j::Environment::getInstance()->setDebug(reallyEnable); + sd::Environment::getInstance()->setDebug(reallyEnable); } void enableVerboseMode(bool reallyEnable) { - nd4j::Environment::getInstance()->setVerbose(reallyEnable); + sd::Environment::getInstance()->setVerbose(reallyEnable); } void setGridLimit(int gridSize) { // no-op } -nd4j::TadPack* tadOnlyShapeInfo(Nd4jLong *hXShapeInfo, int *dimension, int dimensionLength) { +sd::TadPack* tadOnlyShapeInfo(Nd4jLong *hXShapeInfo, int *dimension, int dimensionLength) { auto pack = new TadPack(); try { - *pack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); + *pack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, dimensionLength); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } return pack; } -Nd4jLong* getPrimaryShapeInfo(nd4j::TadPack* pack) { +Nd4jLong* getPrimaryShapeInfo(sd::TadPack* pack) { return pack->primaryShapeInfo(); } -Nd4jLong* getPrimaryOffsets(nd4j::TadPack* pack) { +Nd4jLong* getPrimaryOffsets(sd::TadPack* pack) { return 
pack->primaryOffsets(); } -Nd4jLong* getSpecialShapeInfo(nd4j::TadPack* pack) { +Nd4jLong* getSpecialShapeInfo(sd::TadPack* pack) { return pack->specialShapeInfo(); } -Nd4jLong* getSpecialOffsets(nd4j::TadPack* pack) { +Nd4jLong* getSpecialOffsets(sd::TadPack* pack) { return pack->specialOffsets(); } -Nd4jLong getNumberOfTads(nd4j::TadPack* pack) { +Nd4jLong getNumberOfTads(sd::TadPack* pack) { return pack->numberOfTads(); } -int getShapeInfoLength(nd4j::TadPack* pack) { +int getShapeInfoLength(sd::TadPack* pack) { return pack->shapeInfoLength(); } @@ -1287,8 +1287,8 @@ void pullRowsGeneric(void *vx, const auto tadLength = shape::length(tadShapeInfo); int elementsPerThread = n / TAD_THRESHOLD; - int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); + int _threads = sd::math::nd4j_max(1, elementsPerThread); + _threads = sd::math::nd4j_min(_threads, sd::Environment::getInstance()->maxThreads()); auto func = PRAGMA_THREADS_FOR { for (auto idx = start; idx < stop; idx++) { @@ -1331,12 +1331,12 @@ void pullRows(Nd4jPointer *extraPointers, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { try { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); BUILD_SINGLE_SELECTOR(xType, pullRowsGeneric, (dbX->primary(), hXShapeInfo, dbZ->primary(), hZShapeInfo, n, indexes, tadShapeInfo, tadOffsets, zTadShapeInfo, zTadOffsets), LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1387,12 +1387,12 @@ void tear(Nd4jPointer *extraPointers, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { try { - auto xType = 
nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); BUILD_SINGLE_SELECTOR(xType, tearGeneric, (dbX->primary(), hXShapeInfo, targets, hZShapeInfo, tadShapeInfo, tadOffsets), LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1406,12 +1406,12 @@ void average(Nd4jPointer *extras, Nd4jLong length, bool propagate) { try { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); - BUILD_SINGLE_SELECTOR(xType, nd4j::SpecialMethods, ::averageGeneric(hX, z, hZShapeInfo, n, length, propagate), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(xType, sd::SpecialMethods, ::averageGeneric(hX, z, hZShapeInfo, n, length, propagate), LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1423,12 +1423,12 @@ void accumulate(Nd4jPointer *extras, int n, Nd4jLong length) { try { - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); - BUILD_SINGLE_SELECTOR(xType, nd4j::SpecialMethods, ::accumulateGeneric(hX, hz, hZShapeInfo, n, length), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(xType, sd::SpecialMethods, ::accumulateGeneric(hX, hz, hZShapeInfo, n, length), LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1502,7 +1502,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS if (swapIdx < 0) continue; - nd4j::math::nd4j_swap(hX[r * ews], hX[swapIdx * ews]); + sd::math::nd4j_swap(hX[r * ews], hX[swapIdx * ews]); } } else { for (Nd4jLong r = 0; r < numTads; r++) { @@ -1517,12 +1517,12 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS if (tadEWS == 1) { for (Nd4jLong i = 0; i < tadLength; i++) { - nd4j::math::nd4j_swap(rX[i], rY[i]); + sd::math::nd4j_swap(rX[i], rY[i]); } } else { for (Nd4jLong i = 0; i < tadLength; i++) { auto offset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); - nd4j::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); + sd::math::nd4j_swap(hX[offset + oldOffset], hX[offset + newOffset]); } } } @@ -1548,19 +1548,19 @@ void shuffle(Nd4jPointer *extras, auto tadOnlyShapeInfo = reinterpret_cast(tadShapeInfo); auto tadOffset = reinterpret_cast(tadOffsets); - auto xType = nd4j::ArrayOptions::dataType(xShape[0]); + auto xType = sd::ArrayOptions::dataType(xShape[0]); BUILD_SINGLE_SELECTOR(xType, shuffleGeneric, (hX, xShape, hz, zShape, N, shuffleMap, tadOnlyShapeInfo, tadOffset), LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } bool isExperimentalEnabled() { - return nd4j::Environment::getInstance()->isExperimentalBuild(); + return sd::Environment::getInstance()->isExperimentalBuild(); } @@ -1607,8 +1607,8 @@ void 
execScalarTad(Nd4jPointer *extraPointers, tadShapeInfoZ, tadOffsetsZ); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1647,8 +1647,8 @@ void execScalarBoolTad(Nd4jPointer *extraPointers, tadShapeInfoZ, tadOffsetsZ); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1666,8 +1666,8 @@ const char * getDeviceName(int deviceId) { sprintf(name, "x86-compatible CPU"); } } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } @@ -1686,7 +1686,7 @@ void execAggregate(Nd4jPointer *extraPointers,int opNum, int numIntArrays, void *realArguments, int numRealArguments, - nd4j::DataType dtype) { + sd::DataType dtype) { } @@ -1700,7 +1700,7 @@ void batchExecutor(Nd4jPointer *extraPointers, int maxIdx, int maxReals, void *ptrToArguments, - nd4j::DataType dtype) { + sd::DataType dtype) { } @@ -1714,7 +1714,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxIdx, int maxReals, void *ptrToArguments, - nd4j::DataType dtype) { + sd::DataType dtype) { } @@ -1727,8 +1727,8 @@ void execRandom(Nd4jPointer *extraPointers, try { NativeOpExecutioner::execRandom(nullptr, opNum, state, 
dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo, extraArguments); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1742,8 +1742,8 @@ void execRandom3(Nd4jPointer *extraPointers, try { NativeOpExecutioner::execRandom(nullptr, opNum, state, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, dbY->primary(), hYShapeInfo, dbY->special(), dYShapeInfo, dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo, extraArguments); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1756,8 +1756,8 @@ void execRandom2(Nd4jPointer *extraPointers, try { NativeOpExecutioner::execRandom(nullptr, opNum, state, dbX->primary(), hXShapeInfo, dbX->special(), dXShapeInfo, dbZ->primary(), hZShapeInfo, dbZ->special(), dZShapeInfo, extraArguments); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1767,28 +1767,28 @@ Nd4jPointer initRandom(Nd4jPointer *extraPointers, long seed, long bufferSize, N return (Nd4jPointer) generator; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } void refreshBuffer(Nd4jPointer *extraPointers, long seed, Nd4jPointer ptrRandom) { - auto generator = reinterpret_cast (ptrRandom); + auto generator = reinterpret_cast (ptrRandom); generator->setStates(seed); } void reSeedBuffer(Nd4jPointer *extraPointers, long seed, Nd4jPointer ptrRandom) { - auto generator = reinterpret_cast (ptrRandom); + auto generator = reinterpret_cast (ptrRandom); generator->setStates(seed); } void destroyRandom(Nd4jPointer ptrBuffer) { - auto buffer = reinterpret_cast(ptrBuffer); + auto buffer = reinterpret_cast(ptrBuffer); delete buffer; } @@ -1825,8 +1825,8 @@ void sort(Nd4jPointer *extraPointers, try { NativeOpExecutioner::execSort(hX, hXShapeInfo, descending); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1841,8 +1841,8 @@ void sortTad(Nd4jPointer *extraPointers, try { NativeOpExecutioner::execSort(hX, hXShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, descending); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1854,8 +1854,8 @@ void sortCooIndices(Nd4jPointer *extraPointers, try { NativeOpExecutioner::execSortCooIndices(indices, values, length, rank); 
} catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1889,8 +1889,8 @@ try { return hZ; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } @@ -1906,25 +1906,25 @@ void munmapFile(Nd4jPointer *extraPointers, Nd4jLong *ptrMap, Nd4jLong length) { delete[] ptrMap; } -nd4j::graph::ResultWrapper* executeFlatGraph(Nd4jPointer *extraPointers, Nd4jPointer flatBufferPointer) { +sd::graph::ResultWrapper* executeFlatGraph(Nd4jPointer *extraPointers, Nd4jPointer flatBufferPointer) { try { - return nd4j::graph::GraphExecutioner::executeFlatBuffer(flatBufferPointer); + return sd::graph::GraphExecutioner::executeFlatBuffer(flatBufferPointer); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -Nd4jLong getResultWrapperSize(nd4j::graph::ResultWrapper* ptr) { +Nd4jLong getResultWrapperSize(sd::graph::ResultWrapper* ptr) { return ptr->size(); } -Nd4jPointer getResultWrapperPointer(nd4j::graph::ResultWrapper* ptr) { +Nd4jPointer getResultWrapperPointer(sd::graph::ResultWrapper* ptr) { return ptr->pointer(); } const char* getAllCustomOps() { - return 
nd4j::ops::OpRegistrator::getInstance()->getAllCustomOperations(); + return sd::ops::OpRegistrator::getInstance()->getAllCustomOperations(); } template @@ -1936,7 +1936,7 @@ FORCEINLINE int estimateThresholdGeneric(Nd4jPointer *extraPointers, Nd4jPointer int64_t cnt = 0; PRAGMA_OMP_SIMD for (auto e = start; e < stop; e++) { - auto v = nd4j::math::nd4j_abs(buffer[e]); + auto v = sd::math::nd4j_abs(buffer[e]); if (v >= threshold) cnt++; } @@ -1953,31 +1953,31 @@ int estimateThreshold(Nd4jPointer *extraPointers, Nd4jPointer hX, Nd4jLong *hXSh auto xType = ArrayOptions::dataType(hXShapeInfo); BUILD_SINGLE_SELECTOR(xType, return estimateThresholdGeneric, (extraPointers, hX, N, threshold), FLOAT_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 0; } } -Nd4jLong getShapeListSize(nd4j::ShapeList* list) { +Nd4jLong getShapeListSize(sd::ShapeList* list) { return list->size(); } -Nd4jLong* getShape(nd4j::ShapeList* list, Nd4jLong i) { +Nd4jLong* getShape(sd::ShapeList* list, Nd4jLong i) { return list->at(i); } void deleteShapeList(Nd4jPointer shapeList) { - auto list = reinterpret_cast(shapeList); + auto list = reinterpret_cast(shapeList); //list->destroy(); delete list; } -nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::DeclarableOp* op, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { - nd4j::graph::VariableSpace varSpace; +sd::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, sd::ops::DeclarableOp* op, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* 
tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { + sd::graph::VariableSpace varSpace; Context block(2, &varSpace); - nd4j::ShapeList inShapes; + sd::ShapeList inShapes; for (int e = 0; e < numIArgs; e++) block.getIArguments()->push_back(iArgs[e]); @@ -1989,15 +1989,15 @@ nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::D block.getBArguments()->push_back(bArgs[e]); for (int e = 0; e < numDArgs; e++) - block.getDArguments()->push_back((nd4j::DataType) dArgs[e]); + block.getDArguments()->push_back((sd::DataType) dArgs[e]); for (int e = 0; e < numInputShapes; e++) { auto shape_ = reinterpret_cast(inputShapes[e]); // we shouldn't copy buffer if that's empty array - void *buffer_ = nd4j::ArrayOptions::arrayType(shape_) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; + void *buffer_ = sd::ArrayOptions::arrayType(shape_) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; - auto array = new nd4j::NDArray(buffer_, shape_, varSpace.launchContext(), false); + auto array = new sd::NDArray(buffer_, shape_, varSpace.launchContext(), false); // block should contain references to proper variable varSpace.putVariable(1, e, array); @@ -2018,21 +2018,21 @@ nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::D return shapeList; } -nd4j::ShapeList* calculateOutputShapes2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { +sd::ShapeList* calculateOutputShapes2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = 
sd::ops::OpRegistrator::getInstance()->getOperation(hash); return _calculateOutputShapes(extraPointers, op, inputBuffers, inputShapes, numInputShapes, tArgs, numTArgs, iArgs, numIArgs, bArgs, numBArgs, dArgs, numDArgs); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::DeclarableOp *op, Nd4jPointer* inputShapes, int numInputShapes, double *tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { +sd::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, sd::ops::DeclarableOp *op, Nd4jPointer* inputShapes, int numInputShapes, double *tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { Context block(1); - nd4j::ShapeList inShapes; + sd::ShapeList inShapes; for (int e = 0; e < numIArgs; e++) block.getIArguments()->push_back(iArgs[e]); @@ -2049,39 +2049,39 @@ nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::D return shapeList; } -nd4j::ShapeList* calculateOutputShapes(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { +sd::ShapeList* calculateOutputShapes(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(hash); return _calculateOutputShapes(extraPointers, op, inputShapes, numInputShapes, tArgs, numTArgs, iArgs, numIArgs); } catch (std::exception &e) { - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opContext) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(hash); auto context = reinterpret_cast(opContext); return op->execute(context); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 20; } } -Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputs, Nd4jPointer* outputBuffers, Nd4jPointer* outputShapes, int numOutputs, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool* bArgs, int numBArgs, bool isInplace) { +Nd4jStatus realExec(sd::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputs, Nd4jPointer* outputBuffers, Nd4jPointer* outputShapes, int numOutputs, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool* bArgs, int numBArgs, bool isInplace) { if (op == nullptr) nd4j_printf("Can't find requested operation: [%lld]\n", hash); // we're using the same fake nodeId everywhere here - std::vector inputs(numInputs); - std::vector outputs(numOutputs); + std::vector inputs(numInputs); + std::vector outputs(numOutputs); std::vector ttArgs(numTArgs); std::vector 
iiArgs(numIArgs); std::vector biArgs(numBArgs); @@ -2089,9 +2089,9 @@ Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4 // filling block now with inputs for (int e = 0; e < numInputs; e++) { auto shape = reinterpret_cast(inputShapes[e]); - void *buffer = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; + void *buffer = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; - inputs[e] = new nd4j::NDArray(buffer, shape); + inputs[e] = new sd::NDArray(buffer, shape); } // if not inplace - transferring output arrays @@ -2100,12 +2100,12 @@ Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4 for (int e = 0; e < numOutputs; e++) { // we want to keep original output shape intact auto shape = shape::copyShape(reinterpret_cast(outputShapes[e])); - void *buffer = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : outputBuffers[e]; + void *buffer = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : outputBuffers[e]; // FIXME: revisit this. bool canNullify = true; for (int i = 0; i < numInputs; i++) { - void *ibuffer = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : inputBuffers[i]; + void *ibuffer = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? 
nullptr : inputBuffers[i]; if (ibuffer == buffer) { canNullify = false; break; @@ -2115,7 +2115,7 @@ Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4 if (canNullify) memset((uint8_t *) buffer, '\0', shape::length(shape) * DataTypeUtils::sizeOfElement(ArrayOptions::dataType(shape))); - auto array = new nd4j::NDArray(buffer, shape); + auto array = new sd::NDArray(buffer, shape); outputs[e] = array; // and we want to release shape copy once we're done @@ -2133,7 +2133,7 @@ Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4 biArgs[e] = bArgs[e]; // hypothetically at this point we have everything filled - auto hZ = op->execute(inputs, outputs, ttArgs, iiArgs, biArgs, std::vector(), isInplace); + auto hZ = op->execute(inputs, outputs, ttArgs, iiArgs, biArgs, std::vector(), isInplace); //auto hZ = op->execute(inputs, ttArgs, iiArgs, isInplace); @@ -2161,40 +2161,40 @@ Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4 int execCustomOp(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputs, Nd4jPointer* outputBuffers, Nd4jPointer* outputShapes, int numOutputs, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool* bArgs, int numBArgs, bool isInplace) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(hash); return realExec(op, extraPointers, hash, inputBuffers, inputShapes, numInputs, outputBuffers, outputShapes, numOutputs, tArgs, numTArgs, iArgs, numIArgs, bArgs, numBArgs, isInplace); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } int registerGraph(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer flatBufferPointer) { try { - auto graph = nd4j::graph::GraphExecutioner::importFromFlatPointer(flatBufferPointer); + auto graph = sd::graph::GraphExecutioner::importFromFlatPointer(flatBufferPointer); - nd4j::graph::GraphHolder::getInstance()->registerGraph(graphId, graph); + sd::graph::GraphHolder::getInstance()->registerGraph(graphId, graph); return ND4J_STATUS_OK; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } static VariablesSet* executeStoredGraphT(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int* inputIndices, int numInputs) { - auto graph = nd4j::graph::GraphHolder::getInstance()->cloneGraph(graphId); + auto graph = sd::graph::GraphHolder::getInstance()->cloneGraph(graphId); auto varSpace = graph->getVariableSpace(); - std::vector handles; + std::vector handles; for (int e = 0; e < numInputs; e++) { auto idx = inputIndices[e]; // we'll delete this array later, together with cloned VariableSpace - auto array = new nd4j::NDArray(inputBuffers[e], reinterpret_cast(inputShapes[e])); + auto array = new sd::NDArray(inputBuffers[e], reinterpret_cast(inputShapes[e])); handles.emplace_back(array); if (varSpace->hasVariable(idx)) { @@ -2207,8 +2207,8 @@ static VariablesSet* executeStoredGraphT(Nd4jPointer *extraPointers, Nd4jLong gr varSpace->putVariable(idx, array); } - auto hZ = nd4j::graph::GraphExecutioner::execute(graph, varSpace); - auto varSet = new nd4j::graph::VariablesSet(hZ); + auto hZ = 
sd::graph::GraphExecutioner::execute(graph, varSpace); + auto varSet = new sd::graph::VariablesSet(hZ); if (hZ == ND4J_STATUS_OK) { // pull back results, and provide them @@ -2230,47 +2230,47 @@ static VariablesSet* executeStoredGraphT(Nd4jPointer *extraPointers, Nd4jLong gr return varSet; } -nd4j::graph::VariablesSet* executeStoredGraph(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int* inputIndices, int numInputs) { +sd::graph::VariablesSet* executeStoredGraph(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int* inputIndices, int numInputs) { return nullptr; } -Nd4jLong getVariablesSetSize(nd4j::graph::VariablesSet* set) { +Nd4jLong getVariablesSetSize(sd::graph::VariablesSet* set) { return set->size(); } -Nd4jStatus getVariablesSetStatus(nd4j::graph::VariablesSet* set) { +Nd4jStatus getVariablesSetStatus(sd::graph::VariablesSet* set) { return set->status(); } -nd4j::graph::Variable* getVariable(nd4j::graph::VariablesSet* set, Nd4jLong i) { +sd::graph::Variable* getVariable(sd::graph::VariablesSet* set, Nd4jLong i) { return set->at(i); } -int getVariableId(nd4j::graph::Variable* variable) { +int getVariableId(sd::graph::Variable* variable) { return variable->id(); } -int getVariableIndex(nd4j::graph::Variable* variable) { +int getVariableIndex(sd::graph::Variable* variable) { return variable->index(); } -const char* getVariableName(nd4j::graph::Variable* variable) { +const char* getVariableName(sd::graph::Variable* variable) { return variable->getName()->c_str(); } -Nd4jLong* getVariableShape(nd4j::graph::Variable* variable) { +Nd4jLong* getVariableShape(sd::graph::Variable* variable) { return variable->getNDArray()->shapeInfo(); } -void* getVariableBuffer(nd4j::graph::Variable* variable) { +void* getVariableBuffer(sd::graph::Variable* variable) { return variable->getNDArray()->buffer(); } int unregisterGraph(Nd4jPointer *extraPointers, Nd4jLong graphId) { - 
nd4j::graph::GraphHolder::getInstance()->dropGraphAny(graphId); + sd::graph::GraphHolder::getInstance()->dropGraphAny(graphId); - return nd4j::Status::OK(); + return sd::Status::OK(); } void deletePointerArray(Nd4jPointer pointer) { @@ -2293,25 +2293,25 @@ void deleteLongArray(Nd4jPointer pointer) { delete[] ptr; } -void deleteVariablesSet(nd4j::graph::VariablesSet* pointer) { +void deleteVariablesSet(sd::graph::VariablesSet* pointer) { delete pointer; } const char* getAllOperations() { - return nd4j::OpTracker::getInstance()->exportOperations(); + return sd::OpTracker::getInstance()->exportOperations(); } Nd4jPointer getGraphState(Nd4jLong id) { - return (Nd4jPointer) new nd4j::graph::GraphState(id); + return (Nd4jPointer) new sd::graph::GraphState(id); } void deleteGraphState(Nd4jPointer state) { - auto stateP = reinterpret_cast(state); + auto stateP = reinterpret_cast(state); delete stateP; } -Nd4jStatus execCustomOpWithScope_(Nd4jPointer *extraPointers, nd4j::graph::GraphState *state, Nd4jLong opHash, Nd4jLong *scopes, int numScopes, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int numInputs, Nd4jPointer *outputBuffers, Nd4jPointer *outputShapes, int numOutputs) { +Nd4jStatus execCustomOpWithScope_(Nd4jPointer *extraPointers, sd::graph::GraphState *state, Nd4jLong opHash, Nd4jLong *scopes, int numScopes, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int numInputs, Nd4jPointer *outputBuffers, Nd4jPointer *outputShapes, int numOutputs) { /** * That's basically exec, with VariableSpace provided in GraphState: * depending on operation (i.e. 
while of if), different logic executors could be used @@ -2329,7 +2329,7 @@ Nd4jStatus execCustomOpWithScope_(Nd4jPointer *extraPointers, nd4j::graph::Graph auto buffer = inputBuffers[e]; auto shapeInfo = reinterpret_cast(inputShapes[e]); - auto array = new nd4j::NDArray(buffer, shapeInfo, varSpace->launchContext()); + auto array = new sd::NDArray(buffer, shapeInfo, varSpace->launchContext()); // now we just put array to VarSpace varSpace->putVariable(0, e, array); @@ -2378,17 +2378,17 @@ Nd4jStatus execCustomOpWithScope_(Nd4jPointer *extraPointers, nd4j::graph::Graph Nd4jStatus execCustomOpWithScope(Nd4jPointer *extraPointers, Nd4jPointer state, Nd4jLong opHash, Nd4jLong *scopes, int numScopes, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int numInputs, Nd4jPointer *outputBuffers, Nd4jPointer *outputShapes, int numOutputs) { try { - return execCustomOpWithScope_(extraPointers, reinterpret_cast(state), opHash, scopes, numScopes, inputBuffers, inputShapes, numInputs, outputBuffers, outputShapes, numOutputs); + return execCustomOpWithScope_(extraPointers, reinterpret_cast(state), opHash, scopes, numScopes, inputBuffers, inputShapes, numInputs, outputBuffers, outputShapes, numOutputs); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } void deleteResultWrapper(Nd4jPointer ptr) { // just 0 room for compiler s@!t - auto p = reinterpret_cast(ptr); + auto p = reinterpret_cast(ptr); delete p; } @@ -2402,113 +2402,113 @@ void convertTypes(Nd4jPointer *extras, int srcType, Nd4jPointer hX, Nd4jLong N, if (srcType == ND4J_FLOAT8) { if (dstType == ND4J_FLOAT8) { - // convertGeneric(hx, N, hz); + // convertGeneric(hx, N, hz); } else if (dstType == ND4J_INT8) { - 
//nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT8) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT16) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT16) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT16) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT24) { } else if (dstType == ND4J_FLOAT32) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_DOUBLE) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else { //nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_INT8) { if (dstType == ND4J_FLOAT8) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT8) { - //convertGeneric(hx, N, hz); + //convertGeneric(hx, N, hz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT16) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + //sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == 
ND4J_FLOAT24) { // TODO: eventually we might want to add it } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_UINT8) { if (dstType == ND4J_FLOAT8) { - // nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + // sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT16) { - // nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + // sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT24) { // TODO: still might want to add } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_FLOAT16) { if (dstType == ND4J_FLOAT8) { - // nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + // sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT8) { - 
nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT16) { -// nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); +// sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT24) { // TODO: .... ^^^ } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_THRESHOLD) { - nd4j::TypeCast::convertToThreshold(nullptr, hx, N, hz); + sd::TypeCast::convertToThreshold(nullptr, hx, N, hz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_INT16) { if (dstType == ND4J_FLOAT8) { - // nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + // sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT16) { - //nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + 
//sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT16) { -// nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); +// sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT24) { // TODO... } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else { printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } @@ -2516,57 +2516,57 @@ void convertTypes(Nd4jPointer *extras, int srcType, Nd4jPointer hX, Nd4jLong N, } else if (srcType == ND4J_FLOAT32) { if (dstType == ND4J_FLOAT8) { - // nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + // sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT16) { -// nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); +// sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT24) { } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_THRESHOLD) { - nd4j::TypeCast::convertToThreshold(nullptr, hx, N, hz); + sd::TypeCast::convertToThreshold(nullptr, hx, N, hz); } else { nd4j_printf("Unsupported types 
conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_DOUBLE) { if (dstType == ND4J_FLOAT8) { - // nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + // sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_UINT16) { -// nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); +// sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT24) { } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGeneric(nullptr, hx, N, hz); + sd::TypeCast::convertGeneric(nullptr, hx, N, hz); } else if (dstType == ND4J_DOUBLE) { // } else if (dstType == ND4J_THRESHOLD) { - nd4j::TypeCast::convertToThreshold(nullptr, hx, N, hz); + sd::TypeCast::convertToThreshold(nullptr, hx, N, hz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_THRESHOLD) { if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertFromThreshold(nullptr, hx, N, hz); + sd::TypeCast::convertFromThreshold(nullptr, hx, N, hz); } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertFromThreshold(nullptr, hx, N, hz); + sd::TypeCast::convertFromThreshold(nullptr, hx, N, hz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertFromThreshold(nullptr, hx, N, hz); + sd::TypeCast::convertFromThreshold(nullptr, hx, N, hz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); 
} @@ -2577,27 +2577,27 @@ void convertTypes(Nd4jPointer *extras, int srcType, Nd4jPointer hX, Nd4jLong N, /* void fillUtf8String(Nd4jPointer *extraPointers, const char **strings, int numStrings, Nd4jPointer buffer) { - auto hZ = reinterpret_cast(buffer); + auto hZ = reinterpret_cast(buffer); for (int e = 0; e < numStrings; e++) { - hZ[e] = reinterpret_cast(createUtf8String(extraPointers, strings[e])); + hZ[e] = reinterpret_cast(createUtf8String(extraPointers, strings[e])); } } */ Nd4jPointer createUtf8String(Nd4jPointer *extraPointers, const char *string, int length) { - auto u = new nd4j::utf8string(string, length); + auto u = new sd::utf8string(string, length); return reinterpret_cast(u); } Nd4jLong getUtf8StringLength(Nd4jPointer *extraPointers, Nd4jPointer ptr) { - return reinterpret_cast(ptr)->_length; + return reinterpret_cast(ptr)->_length; } char* getUtf8StringBuffer(Nd4jPointer *extraPointers, Nd4jPointer ptr) { - return reinterpret_cast(ptr)->_buffer; + return reinterpret_cast(ptr)->_buffer; } void deleteUtf8String(Nd4jPointer *extraPointers, Nd4jPointer ptr) { - delete(reinterpret_cast(ptr)); + delete(reinterpret_cast(ptr)); } template @@ -2668,20 +2668,20 @@ void scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSubArrs, try { BUILD_SINGLE_SELECTOR(iType, _scatterUpdate, (extraPointers, opCode, numOfSubArrs, hX, hXShapeInfo, hXOffsets, dX, dXShapeInfo, dXOffsets, hY, hYShapeInfo, hYOffsets, dY, dYShapeInfo, dYOffsets, hIindexes, hIndicesShapeInfo, dIindexes, dIndicesShapeInfo), INDEXING_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } void inspectArray(Nd4jPointer *extraPointers, Nd4jPointer buffer, Nd4jLong *shapeInfo, Nd4jPointer 
specialBuffer, Nd4jLong *specialShapeInfo, Nd4jPointer debugInfo) { try { - auto p = reinterpret_cast(debugInfo); + auto p = reinterpret_cast(debugInfo); NDArray array(buffer, shapeInfo); - nd4j::DebugHelper::retrieveDebugStatistics(p, &array); + sd::DebugHelper::retrieveDebugStatistics(p, &array); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2692,85 +2692,85 @@ void tryPointer(Nd4jPointer extra, Nd4jPointer p, int len) { for (int i = 0; i < len; i++) cnt += buf[cnt]; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } -nd4j::ConstantDataBuffer* shapeBuffer(int rank, Nd4jLong *shape, Nd4jLong *strides, nd4j::DataType dtype, char order, Nd4jLong ews, bool empty) { +sd::ConstantDataBuffer* shapeBuffer(int rank, Nd4jLong *shape, Nd4jLong *strides, sd::DataType dtype, char order, Nd4jLong ews, bool empty) { try { auto buffer = new ConstantDataBuffer(); - *buffer = nd4j::ConstantShapeHelper::getInstance()->bufferForShapeInfo( + *buffer = sd::ConstantShapeHelper::getInstance()->bufferForShapeInfo( ShapeDescriptor(dtype, order, shape, strides, rank, ews, empty)); return buffer; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -void deleteShapeBuffer(nd4j::ConstantDataBuffer* ptr) { +void deleteShapeBuffer(sd::ConstantDataBuffer* ptr) { delete ptr; } -void deleteTadPack(nd4j::TadPack* ptr) { +void deleteTadPack(sd::TadPack* ptr) { delete ptr; } -nd4j::ConstantDataBuffer* constantBufferLong(nd4j::DataType dtype, Nd4jLong *data, int length) { +sd::ConstantDataBuffer* constantBufferLong(sd::DataType dtype, Nd4jLong *data, int length) { return nullptr; } -nd4j::ConstantDataBuffer* constantBufferDouble(nd4j::DataType dtype, double *data, int length) { +sd::ConstantDataBuffer* constantBufferDouble(sd::DataType dtype, double *data, int length) { return nullptr; } -nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, nd4j::ConstantDescriptor *descriptor) { +sd::ConstantDataBuffer* constantBuffer(sd::DataType dtype, sd::ConstantDescriptor *descriptor) { try { - return nd4j::ConstantHelper::getInstance()->constantBuffer(*descriptor, dtype); + return sd::ConstantHelper::getInstance()->constantBuffer(*descriptor, dtype); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -Nd4jPointer getConstantDataBufferPrimary(nd4j::ConstantDataBuffer* dbf) { +Nd4jPointer getConstantDataBufferPrimary(sd::ConstantDataBuffer* dbf) { return dbf->primary(); } -Nd4jPointer getConstantDataBufferSpecial(nd4j::ConstantDataBuffer* dbf) { +Nd4jPointer getConstantDataBufferSpecial(sd::ConstantDataBuffer* dbf) { return dbf->special(); } -Nd4jLong getConstantDataBufferLength(nd4j::ConstantDataBuffer* dbf) { +Nd4jLong getConstantDataBufferLength(sd::ConstantDataBuffer* dbf) { return dbf->length(); } 
-Nd4jLong getConstantDataBufferSizeOf(nd4j::ConstantDataBuffer* dbf) { +Nd4jLong getConstantDataBufferSizeOf(sd::ConstantDataBuffer* dbf) { return dbf->sizeOf(); } -nd4j::graph::Context* createGraphContext(int nodeId) { +sd::graph::Context* createGraphContext(int nodeId) { try { - return new nd4j::graph::Context(nodeId); + return new sd::graph::Context(nodeId); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -nd4j::graph::RandomGenerator* getGraphContextRandomGenerator(nd4j::graph::Context* ptr) { +sd::graph::RandomGenerator* getGraphContextRandomGenerator(sd::graph::Context* ptr) { return &ptr->randomGenerator(); } -void markGraphContextInplace(nd4j::graph::Context* ptr, bool reallyInplace) { +void markGraphContextInplace(sd::graph::Context* ptr, bool reallyInplace) { ptr->markInplace(reallyInplace); } -void setGraphContextCudaContext(nd4j::graph::Context* ptr, void *stream, void *reductionPointer, void *allocationPointer) { +void setGraphContextCudaContext(sd::graph::Context* ptr, void *stream, void *reductionPointer, void *allocationPointer) { } -void setGraphContextInputArray(nd4j::graph::Context* ptr, int index, void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo) { +void setGraphContextInputArray(sd::graph::Context* ptr, int index, void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo) { ptr->setInputArray(index, buffer, shapeInfo, specialBuffer, specialShapeInfo); } -void setGraphContextOutputArray(nd4j::graph::Context* ptr, int index, void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo) { +void setGraphContextOutputArray(sd::graph::Context* ptr, int index, void *buffer, void 
*shapeInfo, void *specialBuffer, void *specialShapeInfo) { ptr->setOutputArray(index, buffer, shapeInfo, specialBuffer, specialShapeInfo); } @@ -2782,25 +2782,25 @@ void setGraphContextOutputBuffer(OpaqueContext* ptr, int index, OpaqueDataBuffer ptr->setOutputArray(index, buffer, shapeInfo, specialShapeInfo); } -void setGraphContextTArguments(nd4j::graph::Context* ptr, double *arguments, int numberOfArguments) { +void setGraphContextTArguments(sd::graph::Context* ptr, double *arguments, int numberOfArguments) { ptr->setTArguments(arguments, numberOfArguments); } -void setGraphContextIArguments(nd4j::graph::Context* ptr, Nd4jLong *arguments, int numberOfArguments) { +void setGraphContextIArguments(sd::graph::Context* ptr, Nd4jLong *arguments, int numberOfArguments) { ptr->setIArguments(arguments, numberOfArguments); } -void setGraphContextBArguments(nd4j::graph::Context* ptr, bool *arguments, int numberOfArguments) { +void setGraphContextBArguments(sd::graph::Context* ptr, bool *arguments, int numberOfArguments) { ptr->setBArguments(arguments, numberOfArguments); } void setGraphContextDArguments(OpaqueContext* ptr, int *arguments, int numberOfArguments) { - std::vector dtypes(numberOfArguments); + std::vector dtypes(numberOfArguments); for (int e = 0; e < numberOfArguments; e++) - dtypes[e] = (nd4j::DataType) arguments[e]; + dtypes[e] = (sd::DataType) arguments[e]; ptr->setDArguments(dtypes); } -void deleteGraphContext(nd4j::graph::Context* ptr) { +void deleteGraphContext(sd::graph::Context* ptr) { delete ptr; } @@ -2819,31 +2819,31 @@ void ctxPurge(OpaqueContext* ptr) { ptr->clearFastPath(); } -nd4j::graph::RandomGenerator* createRandomGenerator(Nd4jLong rootSeed, Nd4jLong nodeSeed) { - return new nd4j::graph::RandomGenerator(rootSeed, nodeSeed); +sd::graph::RandomGenerator* createRandomGenerator(Nd4jLong rootSeed, Nd4jLong nodeSeed) { + return new sd::graph::RandomGenerator(rootSeed, nodeSeed); } -Nd4jLong getRandomGeneratorRootState(nd4j::graph::RandomGenerator* 
ptr) { +Nd4jLong getRandomGeneratorRootState(sd::graph::RandomGenerator* ptr) { return ptr->rootState(); } -Nd4jLong getRandomGeneratorNodeState(nd4j::graph::RandomGenerator* ptr) { +Nd4jLong getRandomGeneratorNodeState(sd::graph::RandomGenerator* ptr) { return ptr->nodeState(); } -void setRandomGeneratorStates(nd4j::graph::RandomGenerator* ptr, Nd4jLong rootSeed, Nd4jLong nodeSeed) { +void setRandomGeneratorStates(sd::graph::RandomGenerator* ptr, Nd4jLong rootSeed, Nd4jLong nodeSeed) { ptr->setStates(rootSeed, nodeSeed); } -int getRandomGeneratorRelativeInt(nd4j::graph::RandomGenerator* ptr, Nd4jLong index) { +int getRandomGeneratorRelativeInt(sd::graph::RandomGenerator* ptr, Nd4jLong index) { return ptr->relativeInt(index); } -Nd4jLong getRandomGeneratorRelativeLong(nd4j::graph::RandomGenerator* ptr, Nd4jLong index) { +Nd4jLong getRandomGeneratorRelativeLong(sd::graph::RandomGenerator* ptr, Nd4jLong index) { return ptr->relativeLong(index); } -void deleteRandomGenerator(nd4j::graph::RandomGenerator* ptr) { +void deleteRandomGenerator(sd::graph::RandomGenerator* ptr) { delete ptr; } @@ -2870,19 +2870,19 @@ Nd4jPointer shapeBufferForNumpy(Nd4jPointer npyArray) { Nd4jLong *shapeBuffer; if (shape.size() == 1 && shape[0] == 0) { // scalar case - shapeBuffer = nd4j::ShapeBuilders::createScalarShapeInfo(dtype); + shapeBuffer = sd::ShapeBuilders::createScalarShapeInfo(dtype); } else if (_empty) { if (shapeSize > 0) - shapeBuffer = nd4j::ShapeBuilders::emptyShapeInfo(dtype, arr.fortranOrder ? 'f' : 'c', shape); + shapeBuffer = sd::ShapeBuilders::emptyShapeInfo(dtype, arr.fortranOrder ? 'f' : 'c', shape); else - shapeBuffer = nd4j::ShapeBuilders::emptyShapeInfo(dtype); + shapeBuffer = sd::ShapeBuilders::emptyShapeInfo(dtype); } else { - shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(dtype, arr.fortranOrder ? 'f' : 'c', shape); + shapeBuffer = sd::ShapeBuilders::createShapeInfo(dtype, arr.fortranOrder ? 
'f' : 'c', shape); } - return reinterpret_cast(nd4j::ConstantShapeHelper::getInstance()->createFromExisting(shapeBuffer, true)); + return reinterpret_cast(sd::ConstantShapeHelper::getInstance()->createFromExisting(shapeBuffer, true)); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } @@ -2897,10 +2897,10 @@ void sortByKey(Nd4jPointer *extraPointers, auto xType = ArrayOptions::dataType(xShapeInfo); auto yType = ArrayOptions::dataType(yShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, yType, nd4j::DoubleMethods, ::sortByKey(x, xShapeInfo, y, yShapeInfo, descending), LIBND4J_TYPES, LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(xType, yType, sd::DoubleMethods, ::sortByKey(x, xShapeInfo, y, yShapeInfo, descending), LIBND4J_TYPES, LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2914,10 +2914,10 @@ void sortByValue(Nd4jPointer *extraPointers, auto xType = ArrayOptions::dataType(xShapeInfo); auto yType = ArrayOptions::dataType(yShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, yType, nd4j::DoubleMethods, ::sortByValue(x, xShapeInfo, y, yShapeInfo, descending), LIBND4J_TYPES, LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(xType, yType, sd::DoubleMethods, ::sortByValue(x, xShapeInfo, y, yShapeInfo, descending), LIBND4J_TYPES, LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2933,10 +2933,10 @@ void sortTadByKey(Nd4jPointer *extraPointers, auto xType = ArrayOptions::dataType(xShapeInfo); auto yType = ArrayOptions::dataType(yShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, yType, nd4j::DoubleMethods, ::sortTadByKey(x, xShapeInfo, y, yShapeInfo, dimension, dimensionLength, descending), LIBND4J_TYPES, LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(xType, yType, sd::DoubleMethods, ::sortTadByKey(x, xShapeInfo, y, yShapeInfo, dimension, dimensionLength, descending), LIBND4J_TYPES, LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2952,16 +2952,16 @@ void sortTadByValue(Nd4jPointer *extraPointers, auto xType = ArrayOptions::dataType(xShapeInfo); auto yType = ArrayOptions::dataType(yShapeInfo); - BUILD_DOUBLE_SELECTOR(xType, yType, nd4j::DoubleMethods, ::sortTadByValue(x, xShapeInfo, y, yShapeInfo, dimension, dimensionLength, descending), LIBND4J_TYPES, LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(xType, yType, sd::DoubleMethods, ::sortTadByValue(x, xShapeInfo, y, yShapeInfo, dimension, dimensionLength, descending), LIBND4J_TYPES, LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } 
const char* runLightBenchmarkSuit(bool printOut) { try { - nd4j::LightBenchmarkSuit suit; + sd::LightBenchmarkSuit suit; auto result = suit.runSuit(); if (printOut) @@ -2973,19 +2973,19 @@ const char* runLightBenchmarkSuit(bool printOut) { return chars; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } Nd4jLong getCachedMemory(int deviceId) { - return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId); + return sd::ConstantHelper::getInstance()->getCachedAmount(deviceId); } const char* runFullBenchmarkSuit(bool printOut) { try { - nd4j::FullBenchmarkSuit suit; + sd::FullBenchmarkSuit suit; auto result = suit.runSuit(); if (printOut) @@ -2997,13 +2997,13 @@ const char* runFullBenchmarkSuit(bool printOut) { return chars; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -nd4j::LaunchContext* defaultLaunchContext() { +sd::LaunchContext* defaultLaunchContext() { return LaunchContext::defaultContext(); } @@ -3036,11 +3036,11 @@ Nd4jPointer lcSolverHandle(OpaqueLaunchContext* lc) { } int lastErrorCode() { - return nd4j::LaunchContext::defaultContext()->errorReference()->errorCode(); + return sd::LaunchContext::defaultContext()->errorReference()->errorCode(); } const char* lastErrorMessage() { - return nd4j::LaunchContext::defaultContext()->errorReference()->errorMessage(); + return 
sd::LaunchContext::defaultContext()->errorReference()->errorMessage(); } void ctxShapeFunctionOverride(OpaqueContext* ptr, bool reallyOverride) { @@ -3118,10 +3118,10 @@ bool isOptimalRequirementsMet() { OpaqueDataBuffer* allocateDataBuffer(Nd4jLong elements, int dataType, bool allocateBoth) { try { auto dtype = DataTypeUtils::fromInt(dataType); - return new nd4j::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype) , dtype, allocateBoth); + return new sd::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype) , dtype, allocateBoth); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } @@ -3158,8 +3158,8 @@ void dbExpandBuffer(OpaqueDataBuffer *dataBuffer, Nd4jLong elements) { try { dataBuffer->dataBuffer()->expand(elements * DataTypeUtils::sizeOf(dataBuffer->dataBuffer()->getDataType())); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } diff --git a/libnd4j/blas/cuda/BlasVersionHelper.cu b/libnd4j/include/legacy/cuda/BlasVersionHelper.cu similarity index 94% rename from libnd4j/blas/cuda/BlasVersionHelper.cu rename to libnd4j/include/legacy/cuda/BlasVersionHelper.cu index 1f80a0cc0..04b0e78f1 100644 --- a/libnd4j/blas/cuda/BlasVersionHelper.cu +++ b/libnd4j/include/legacy/cuda/BlasVersionHelper.cu @@ -18,9 +18,9 @@ // @author raver119@gmail.com // -#include "../BlasVersionHelper.h" +#include -namespace nd4j { +namespace sd { 
BlasVersionHelper::BlasVersionHelper() { _blasMajorVersion = __CUDACC_VER_MAJOR__; _blasMinorVersion = __CUDACC_VER_MINOR__; diff --git a/libnd4j/blas/cuda/NativeOpExecutioner.cu b/libnd4j/include/legacy/cuda/NativeOpExecutioner.cu similarity index 84% rename from libnd4j/blas/cuda/NativeOpExecutioner.cu rename to libnd4j/include/legacy/cuda/NativeOpExecutioner.cu index 1e0685dc4..c618115de 100644 --- a/libnd4j/blas/cuda/NativeOpExecutioner.cu +++ b/libnd4j/include/legacy/cuda/NativeOpExecutioner.cu @@ -14,16 +14,16 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -#include "../NativeOpExecutioner.h" +#include #include -#include +#include #include -#include +#include #include #include #include #include -#include +#include #include #include @@ -55,12 +55,12 @@ #include #include -using namespace nd4j; +using namespace sd; /** * This is utility kernel, that updates given special buffer with proper values in device memory */ -extern "C" __global__ void prepareShapeBuffer(int *dimension, int *maxDimension, Nd4jLong *specialPointer, int rows, nd4j::DataType dataType) { +extern "C" __global__ void prepareShapeBuffer(int *dimension, int *maxDimension, Nd4jLong *specialPointer, int rows, sd::DataType dataType) { Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid > 0) return; @@ -85,7 +85,7 @@ extern "C" __global__ void prepareShapeBuffer(int *dimension, int *maxDimension, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execPairwiseTransform(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -97,9 +97,9 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = 
nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -126,7 +126,7 @@ void NativeOpExecutioner::execPairwiseTransform(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execPairwiseBoolTransform( nd4j::LaunchContext *lc, +void NativeOpExecutioner::execPairwiseBoolTransform( sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -138,18 +138,18 @@ void NativeOpExecutioner::execPairwiseBoolTransform( nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; if (!DataTypeUtils::isB(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform wrong Z operand data type", nd4j::DataType::BOOL, zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform wrong Z operand data type", sd::DataType::BOOL, zType); if (yType != xType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform both operands must have same data type", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseBoolTransform both operands must have same data type", xType, yType); dim3 launchDims(256, 1024, 16384); @@ -162,7 +162,7 @@ void 
NativeOpExecutioner::execPairwiseBoolTransform( nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execPairwiseIntTransform( nd4j::LaunchContext *lc, +void NativeOpExecutioner::execPairwiseIntTransform( sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -174,18 +174,18 @@ void NativeOpExecutioner::execPairwiseIntTransform( nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; if (!DataTypeUtils::isZ(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform wrong Z operand data type", nd4j::DataType::BOOL, zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform wrong Z operand data type", sd::DataType::BOOL, zType); if (yType != xType || zType != xType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform both operands must have same data type", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execPairwiseIntTransform both operands must have same data type", xType, yType); dim3 launchDims(256, 1024, 16384); @@ -198,7 +198,7 @@ void NativeOpExecutioner::execPairwiseIntTransform( nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execSummaryStatsScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, 
@@ -212,8 +212,8 @@ void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, dim3 launchDims = dim3(256, 256, 32768); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); BUILD_DOUBLE_SELECTOR(xType, zType, functions::summarystats::SummaryStatsReduce, ::execSummaryStatsReduceScalar(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, nullptr, biasCorrected, reductionPointer), LIBND4J_TYPES, FLOAT_TYPES); @@ -224,7 +224,7 @@ void NativeOpExecutioner::execSummaryStatsScalar(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execBroadcastBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -239,9 +239,9 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -252,7 +252,7 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, if (yType != xType) throw std::runtime_error("NativeOpExecutioner::execBroadcastBool requires both X & Y operands to have same type"); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("F3B opNum:[%i]\n", opNum); dim3 launchDims(256, 256, 1024); @@ -265,7 
+265,7 @@ void NativeOpExecutioner::execBroadcastBool(nd4j::LaunchContext *lc, throw cuda_exception::build("execBroadcastBool failed", res); } -void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execInverseBroadcastBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -279,9 +279,9 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -304,7 +304,7 @@ void NativeOpExecutioner::execInverseBroadcastBool(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execBroadcastInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -318,9 +318,9 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -341,7 +341,7 @@ void NativeOpExecutioner::execBroadcastInt(nd4j::LaunchContext *lc, throw 
cuda_exception::build("execBroadcastBool failed", res); } -void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execInverseBroadcastInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -354,9 +354,9 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, Nd4jLong *tadOnlyShapeInfoZ,Nd4jLong *tadOffsetsZ) { auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -367,7 +367,7 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, if (yType != xType || zType != xType) throw std::runtime_error("NativeOpExecutioner::execBroadcastInt requires both X & Y operands to have same type"); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("F3BI opNum:[%i]\n", opNum); dim3 launchDims(256, 256, 1024); @@ -393,7 +393,7 @@ void NativeOpExecutioner::execInverseBroadcastInt(nd4j::LaunchContext *lc, * @param dimension * @param dimensionLength */ -void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execBroadcast(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -407,9 +407,9 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = 
sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -428,7 +428,7 @@ void NativeOpExecutioner::execBroadcast(nd4j::LaunchContext *lc, throw cuda_exception::build("execBroadcast failed", res); } -void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execInverseBroadcast(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -442,9 +442,9 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hYShapeInfo)) return; @@ -464,7 +464,7 @@ void NativeOpExecutioner::execInverseBroadcast(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceSame(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -477,11 +477,11 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto reductionPointer = lc->getReductionPointer(); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("SF7 opNum:[%i]\n", opNum); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = 
sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); auto xRank = shape::rank(hXShapeInfo); if (zType != xType) @@ -499,7 +499,7 @@ void NativeOpExecutioner::execReduceSame(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceLong(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -512,14 +512,14 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto reductionPointer = lc->getReductionPointer(); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("LF7 opNum:[%i]\n", opNum); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - if (zType != nd4j::DataType::INT64) - throw datatype_exception::build("NativeOpExecutioner::execReduceLong wrong Z data type", nd4j::DataType::INT64, zType); + if (zType != sd::DataType::INT64) + throw datatype_exception::build("NativeOpExecutioner::execReduceLong wrong Z data type", sd::DataType::INT64, zType); auto xRank = shape::rank(hXShapeInfo); auto numBlocks = shape::length(hZShapeInfo); @@ -535,7 +535,7 @@ void NativeOpExecutioner::execReduceLong(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -548,13 +548,13 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto reductionPointer = 
lc->getReductionPointer(); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("BF7 opNum:[%i]\n", opNum); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - if (zType != nd4j::DataType::BOOL) + if (zType != sd::DataType::BOOL) throw std::runtime_error("NativeOpExecutioner::execReduceBool requires Z operand to have BOOL type"); auto xRank = shape::rank(hXShapeInfo); @@ -581,7 +581,7 @@ void NativeOpExecutioner::execReduceBool(nd4j::LaunchContext *lc, * @param dimension * @param dimensionLength */ -void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execIndexReduce(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -595,15 +595,15 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, auto reductionPointer = lc->getReductionPointer(); auto allocationPointer = lc->getAllocationPointer(); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("F2 opNum:[%i]\n", opNum); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); auto numBlocks = shape::length(hZShapeInfo); dim3 launchDims(numBlocks == 0 ? 
1 : numBlocks, 256, 32768); - if (zType != nd4j::DataType::INT64 && zType != nd4j::DataType::INT32) + if (zType != sd::DataType::INT64 && zType != sd::DataType::INT32) throw datatype_exception::build("NativeOpExecutioner::execIndexReduce requires Z operand to have INT32/INT64 type", zType); auto dz = reinterpret_cast(dZ); @@ -626,7 +626,7 @@ void NativeOpExecutioner::execIndexReduce(nd4j::LaunchContext *lc, * @param dZ * @param dZShapeInfo */ -void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceFloat(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -639,11 +639,11 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto reductionPointer = lc->getReductionPointer(); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("F8 opNum:[%i]\n", opNum); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); auto xRank = shape::rank(hXShapeInfo); auto numBlocks = shape::length(hZShapeInfo); @@ -666,7 +666,7 @@ void NativeOpExecutioner::execReduceFloat(nd4j::LaunchContext *lc, * @param extraParams */ //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execIndexReduceScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -674,7 +674,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, void *hZ, Nd4jLong *hZShapeInfo, void *dZ, Nd4jLong *dZShapeInfo){ - if (nd4j::Environment::getInstance()->isDebug()) + if (sd::Environment::getInstance()->isDebug()) printf("F1 opNum:[%i]\n", opNum); 
auto stream = lc->getCudaStream(); @@ -686,17 +686,17 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, auto numBlocks = CudaLaunchHelper::getReductionBlocks(xLength, blockWidth); dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, blockWidth, 32768); - if (nd4j::Environment::getInstance()->isDebugAndVerbose() && launchDims.x == 1) + if (sd::Environment::getInstance()->isDebugAndVerbose() && launchDims.x == 1) printf("AF1 opNum:[%i]\n", opNum); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); // FIXME: we want Z to be one of integer types //if (!DataTypeUtils::isZ(zType)) - // throw nd4j::datatype_exception("NativeOpExecutioner::execIndexReduceScalar requires Z operand to have one of integer types") - if (zType != nd4j::DataType::INT64 && zType != nd4j::DataType::INT32) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execIndexReduceScalar requires Z operand to have INT32/INT64 data type", zType); + // throw sd::datatype_exception("NativeOpExecutioner::execIndexReduceScalar requires Z operand to have one of integer types") + if (zType != sd::DataType::INT64 && zType != sd::DataType::INT32) + throw sd::datatype_exception::build("NativeOpExecutioner::execIndexReduceScalar requires Z operand to have INT32/INT64 data type", zType); auto dz = reinterpret_cast(dZ); @@ -717,7 +717,7 @@ void NativeOpExecutioner::execIndexReduceScalar(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceFloatScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -728,8 +728,8 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, auto stream = 
lc->getCudaStream(); auto reductionPointer = lc->getReductionPointer(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; @@ -746,7 +746,7 @@ void NativeOpExecutioner::execReduceFloatScalar(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceBoolScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -757,10 +757,10 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto reductionPointer = lc->getReductionPointer(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - if (zType != nd4j::DataType::BOOL) + if (zType != sd::DataType::BOOL) throw std::runtime_error("NativeOpExecutioner::execReduceBoolScalar requires Z operand to have BOOL type"); auto xLength = shape::length(hXShapeInfo); @@ -777,7 +777,7 @@ void NativeOpExecutioner::execReduceBoolScalar(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceSameScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -788,8 +788,8 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto reductionPointer = lc->getReductionPointer(); - auto xType = 
nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (zType != xType) throw datatype_exception::build("NativeOpExecutioner::execReduceSameScalar requires both X & Z operands to have same type", xType, zType); @@ -808,7 +808,7 @@ void NativeOpExecutioner::execReduceSameScalar(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduceLongScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -819,11 +819,11 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto reductionPointer = lc->getReductionPointer(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - if (zType != nd4j::DataType::INT64) - throw datatype_exception::build("NativeOpExecutioner::execReduceLongScalar wrong Z data type", nd4j::DataType::INT64, zType); + if (zType != sd::DataType::INT64) + throw datatype_exception::build("NativeOpExecutioner::execReduceLongScalar wrong Z data type", sd::DataType::INT64, zType); auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; @@ -839,7 +839,7 @@ void NativeOpExecutioner::execReduceLongScalar(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformSame(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -873,7 +873,7 @@ void 
NativeOpExecutioner::execTransformSame(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -907,7 +907,7 @@ void NativeOpExecutioner::execTransformBool(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformAny(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -937,7 +937,7 @@ void NativeOpExecutioner::execTransformAny(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformStrict(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -971,7 +971,7 @@ void NativeOpExecutioner::execTransformStrict(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execTransformFloat(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1005,7 +1005,7 @@ void NativeOpExecutioner::execTransformFloat(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1019,11 +1019,11 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, dim3 launchDims = dim3(256, 256, 
32768); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (!DataTypeUtils::isR(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execSummaryStats requires Z operand to have floating point data type", zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execSummaryStats requires Z operand to have floating point data type", zType); BUILD_DOUBLE_SELECTOR(xType, zType, functions::summarystats::SummaryStatsReduce, ::execSummaryStatsReduce(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, nullptr, nullptr, biasCorrected, reductionPointer), LIBND4J_TYPES, FLOAT_TYPES); @@ -1034,7 +1034,7 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execSummaryStats(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1049,11 +1049,11 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, dim3 launchDims = dim3(256, 256, 32768); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (!DataTypeUtils::isR(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execSummaryStats requires Z operand to have floating point data type", zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execSummaryStats requires Z operand to have floating point data type", zType); BUILD_DOUBLE_SELECTOR(xType, zType, functions::summarystats::SummaryStatsReduce, 
::execSummaryStatsReduce(launchDims, stream, opNum, dX, dXShapeInfo, hXShapeInfo, extraParams, dZ, dZShapeInfo, hZShapeInfo, dimension, dimensionLength, tadShapeInfo, tadOffsets, biasCorrected, reductionPointer), LIBND4J_TYPES, FLOAT_TYPES); @@ -1065,7 +1065,7 @@ void NativeOpExecutioner::execSummaryStats(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1079,19 +1079,19 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, auto reductionPointer = lc->getReductionPointer(); auto allocationPointer = lc->getAllocationPointer(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); auto blockWidth = 256; auto numBlocks = CudaLaunchHelper::getReductionBlocks(shape::length(hXShapeInfo), blockWidth); dim3 launchDims(numBlocks == 0 ? 
1 : numBlocks, blockWidth, 32768); if (xType != yType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType); if (!DataTypeUtils::isR(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Z operand to have floating point data type", zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Z operand to have floating point data type", zType); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execScalar(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, allocationPointer, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES); @@ -1102,7 +1102,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1123,15 +1123,15 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto allocationPointer = lc->getAllocationPointer(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (xType != yType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Y operand to have X type", xType, yType); 
if (!DataTypeUtils::isR(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Z operand to have floating point data type", zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3 requires Z operand to have floating point data type", zType); auto numBlocks = shape::length(hZShapeInfo); @@ -1155,7 +1155,7 @@ void NativeOpExecutioner::execReduce3(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3Scalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1170,9 +1170,9 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, auto allocationPointer = lc->getAllocationPointer(); auto reductionPointer = lc->getReductionPointer(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); auto xLength = shape::length(hXShapeInfo); auto blockWidth = 256; @@ -1180,10 +1180,10 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, dim3 launchDims(numBlocks == 0 ? 
1 : numBlocks, blockWidth, 32768); if (xType != yType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Y operand to have X type", xType, yType); if (!DataTypeUtils::isR(zType)) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Z operand to have floating point data type", zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3Scalar requires Z operand to have floating point data type", zType); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execScalar(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParams, dZ, dZShapeInfo, allocationPointer, reductionPointer, nullptr), LIBND4J_TYPES, FLOAT_TYPES); @@ -1195,7 +1195,7 @@ void NativeOpExecutioner::execReduce3Scalar(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1209,9 +1209,9 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, dim3 launchDims = dim3(256, 512, 8192); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -1231,7 +1231,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void 
NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarBool(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1248,9 +1248,9 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, dim3 launchDims(256, 512, 8192); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -1270,7 +1270,7 @@ void NativeOpExecutioner::execScalarBool(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarInt(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1284,9 +1284,9 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, dim3 launchDims = dim3(256, 512, 8192); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -1306,7 +1306,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalarInt(sd::LaunchContext *lc, int opNum, void *hX, 
Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1323,9 +1323,9 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, dim3 launchDims(256, 512, 8192); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -1345,7 +1345,7 @@ void NativeOpExecutioner::execScalarInt(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1359,9 +1359,9 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, dim3 launchDims(256, 512, 8192); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -1380,7 +1380,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execScalar(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1395,9 +1395,9 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto 
xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (shape::isEmpty(hXShapeInfo) || shape::isEmpty(hScalarShapeInfo)) return; @@ -1417,7 +1417,7 @@ void NativeOpExecutioner::execScalar(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer stateHost, void *hZ, Nd4jLong *hZShapeInfo, @@ -1425,7 +1425,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *extraArguments) { auto stream = lc->getCudaStream(); - auto sizeOf = sizeof(nd4j::graph::RandomGenerator); + auto sizeOf = sizeof(sd::graph::RandomGenerator); Nd4jPointer stateDevice; cudaError_t res = cudaMalloc(reinterpret_cast(&stateDevice), sizeOf); @@ -1433,9 +1433,9 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, checkCudaErrors(cudaMemcpyAsync(stateDevice, stateHost, sizeOf, cudaMemcpyHostToDevice, *stream)); dim3 launchDims = dim3(512, 512, 32768); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - auto rng = reinterpret_cast(stateHost); + auto rng = reinterpret_cast(stateHost); // functions::random::RandomFunction::executeCudaSingle(launchDims, extraPointers, opNum, stateHost, dZ, dZShapeInfo, extraArguments), BUILD_SINGLE_SELECTOR(zType, functions::random::RandomFunction, ::executeCudaSingle(launchDims, stream, opNum, stateDevice, dZ, dZShapeInfo, extraArguments), FLOAT_TYPES); @@ -1450,7 +1450,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, } 
//////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer stateHost, void *hX, Nd4jLong *hXShapeInfo, @@ -1461,17 +1461,17 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); - auto sizeOf = sizeof(nd4j::graph::RandomGenerator); + auto sizeOf = sizeof(sd::graph::RandomGenerator); Nd4jPointer stateDevice; cudaError_t res = cudaMalloc(reinterpret_cast(&stateDevice), sizeOf); checkCudaErrors(cudaStreamSynchronize(*stream)); checkCudaErrors(cudaMemcpyAsync(stateDevice, stateHost, sizeOf, cudaMemcpyHostToDevice, *stream)); - auto rng = reinterpret_cast(stateHost); + auto rng = reinterpret_cast(stateHost); dim3 launchDims = dim3(512, 512, 32768); - auto xType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hZShapeInfo); // functions::random::RandomFunction::executeCudaDouble(launchDims, extraPointers, opNum, stateHost, dX, dXShapeInfo, dZ, dZShapeInfo, extraArguments); BUILD_SINGLE_SELECTOR(xType, functions::random::RandomFunction, ::executeCudaDouble(launchDims, stream, opNum, stateDevice, dX, dXShapeInfo, dZ, dZShapeInfo, extraArguments), FLOAT_TYPES); @@ -1485,7 +1485,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execRandom(sd::LaunchContext *lc, int opNum, Nd4jPointer stateHost, void *hX, Nd4jLong *hXShapeInfo, @@ -1497,17 +1497,17 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, void *extraArguments) { auto stream = lc->getCudaStream(); - auto sizeOf = sizeof(nd4j::graph::RandomGenerator); + auto sizeOf = sizeof(sd::graph::RandomGenerator); Nd4jPointer stateDevice; cudaError_t res = 
cudaMalloc(reinterpret_cast(&stateDevice), sizeOf); checkCudaErrors(cudaStreamSynchronize(*stream)); checkCudaErrors(cudaMemcpyAsync(stateDevice, stateHost, sizeOf, cudaMemcpyHostToDevice, *stream)); - auto rng = reinterpret_cast(stateHost); + auto rng = reinterpret_cast(stateHost); dim3 launchDims = dim3(512, 512, 32768); - auto xType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hZShapeInfo); // functions::random::RandomFunction::executeCudaTriple(launchDims, extraPointers, opNum, stateHost, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraArguments); BUILD_SINGLE_SELECTOR(xType, functions::random::RandomFunction, ::executeCudaTriple(launchDims, stream, opNum, stateDevice, dX, dXShapeInfo, dY, dYShapeInfo, dZ, dZShapeInfo, extraArguments), FLOAT_TYPES); @@ -1521,7 +1521,7 @@ void NativeOpExecutioner::execRandom(nd4j::LaunchContext *lc, } //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3All(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1538,20 +1538,20 @@ void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, auto allocationPointer = lc->getAllocationPointer(); auto reductionPointer = lc->getReductionPointer(); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("D119 opNum:[%i]\n", opNum); dim3 launchDims(shape::length(hZShapeInfo), 256, 32768); - if (nd4j::Environment::getInstance()->isVerbose() && launchDims.x == 1) + if (sd::Environment::getInstance()->isVerbose() && launchDims.x == 1) printf("AD119 opNum:[%i]\n", opNum); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = 
sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (yType != xType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3All both operands must have same data type", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3All both operands must have same data type", xType, yType); BUILD_DOUBLE_SELECTOR(xType, zType, functions::reduce3::Reduce3, ::execAll(launchDims, stream, opNum, dX, dXShapeInfo, dY, dYShapeInfo, extraParamsVals, dZ, dZShapeInfo, dimension, dimensionLength, 1, allocationPointer, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets), LIBND4J_TYPES, FLOAT_TYPES); @@ -1563,7 +1563,7 @@ void NativeOpExecutioner::execReduce3All(nd4j::LaunchContext *lc, //////////////////////////////////////////////////////////////////////// -void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, +void NativeOpExecutioner::execReduce3TAD(sd::LaunchContext *lc, int opNum, void *hX, Nd4jLong *hXShapeInfo, void *dX, Nd4jLong *dXShapeInfo, @@ -1584,15 +1584,15 @@ void NativeOpExecutioner::execReduce3TAD(nd4j::LaunchContext *lc, auto stream = lc->getCudaStream(); auto allocationPointer = lc->getAllocationPointer(); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); if (xType != yType) - throw nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Y operand to have X type", xType, yType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Y operand to have X type", xType, yType); if (!DataTypeUtils::isR(zType)) - throw 
nd4j::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType); + throw sd::datatype_exception::build("NativeOpExecutioner::execReduce3TAD requires Z operand to have floating point data type", zType); auto numBlocks = shape::length(hZShapeInfo); dim3 launchDims(numBlocks == 0 ? 1 : numBlocks, 256, 32768); diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/include/legacy/cuda/NativeOps.cu similarity index 75% rename from libnd4j/blas/cuda/NativeOps.cu rename to libnd4j/include/legacy/cuda/NativeOps.cu index 07ce876ea..1a4de3de5 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/include/legacy/cuda/NativeOps.cu @@ -15,11 +15,11 @@ ******************************************************************************/ -#include "NativeOpExecutioner.h" -#include "../NativeOps.h" +#include +#include #include -#include +#include #include @@ -29,25 +29,25 @@ #include #include #include -#include +#include #include #include #include -#include +#include #include #include #include -#include +#include //#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; #include #include @@ -122,18 +122,18 @@ int getDeviceSharedThreshold(int deviceId) { -nd4j::buffer::Buffer * createScalarBuffer(cudaStream_t stream) { +sd::buffer::Buffer * createScalarBuffer(cudaStream_t stream) { Nd4jLong *scalarShapeInfo = shape::createScalarShapeInfo(); - nd4j::buffer::Buffer *buff = nd4j::buffer::createBuffer(scalarShapeInfo,shape::shapeInfoLength(2), stream); - nd4j::buffer::copyDataToGpu(&buff, stream); + sd::buffer::Buffer *buff = sd::buffer::createBuffer(scalarShapeInfo,shape::shapeInfoLength(2), stream); + sd::buffer::copyDataToGpu(&buff, stream); return buff; } class ScalarShapeInformation { private: - nd4j::buffer::Buffer *scalarDimension; - nd4j::buffer::Buffer *scalarShapeInfo; + sd::buffer::Buffer *scalarDimension; + sd::buffer::Buffer *scalarShapeInfo; // std::thread::id threadId; public: 
@@ -143,14 +143,14 @@ public: CHECK_ALLOC(scalarDimensionBuff, "Failed to allocate ShapeInfoBuffer", sizeof(Nd4jLong)); scalarDimensionBuff[0] = MAX_DIMENSION; - scalarDimension = nd4j::buffer::createBuffer(scalarDimensionBuff,1, stream); + scalarDimension = sd::buffer::createBuffer(scalarDimensionBuff,1, stream); scalarShapeInfo = createScalarBuffer(stream); // threadId = std::this_thread::get_id(); } ~ScalarShapeInformation() { - nd4j::buffer::freeBuffer(&scalarShapeInfo); - nd4j::buffer::freeBuffer(&scalarDimension); + sd::buffer::freeBuffer(&scalarShapeInfo); + sd::buffer::freeBuffer(&scalarDimension); } @@ -178,7 +178,7 @@ public: template class ScalarInfo { - nd4j::buffer::Buffer *scalarData; + sd::buffer::Buffer *scalarData; ScalarShapeInformation *shapeInfo; T finalResult; cudaStream_t streamRef; @@ -189,13 +189,13 @@ public: CHECK_ALLOC(scalarResult, "Failed to allocate new scalar buffer", sizeof(T)); shapeInfo = new ScalarShapeInformation(stream); - scalarData = nd4j::buffer::createBuffer(scalarResult,1, stream); + scalarData = sd::buffer::createBuffer(scalarResult,1, stream); streamRef = stream; - nd4j::buffer::copyDataToGpu(&scalarData, stream); + sd::buffer::copyDataToGpu(&scalarData, stream); } T getFinalResultFromDevice() { - nd4j::buffer::copyDataFromGpu(&scalarData, streamRef); + sd::buffer::copyDataFromGpu(&scalarData, streamRef); return scalarData->data[0]; } @@ -222,7 +222,7 @@ public: } ~ScalarInfo() { - nd4j::buffer::freeBuffer(&scalarData); + sd::buffer::freeBuffer(&scalarData); delete shapeInfo; } }; @@ -243,8 +243,8 @@ void execPairwiseTransform( Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -267,8 +267,8 @@ void execPairwiseTransformBool(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -291,8 +291,8 @@ void execSummaryStatsScalar(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -328,8 +328,8 @@ void execBroadcastBool(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -367,9 +367,9 @@ void execBroadcast( auto tadOnlyShapeInfoZ = reinterpret_cast(extraPointers[12]); auto tadOffsetsZ = reinterpret_cast(extraPointers[13]); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hYShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hYShapeInfo); + auto zType 
= sd::ArrayOptions::dataType(hZShapeInfo); LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]); NativeOpExecutioner::execBroadcast(&lc, opNum, @@ -381,8 +381,8 @@ void execBroadcast( InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -413,8 +413,8 @@ void execReduceFloat(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -435,8 +435,8 @@ void execReduceSame(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -454,7 +454,7 @@ void execReduceSame2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, shape::length(hDimensionShape)); @@ 
-468,8 +468,8 @@ void execReduceSame2(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -487,7 +487,7 @@ void execReduceLong2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, shape::length(hDimensionShape)); @@ -501,8 +501,8 @@ void execReduceLong2(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -521,11 +521,11 @@ void execReduceLong(Nd4jPointer *extraPointers, auto reductionPointer = reinterpret_cast(extraPointers[4]); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - if (zType != nd4j::DataType::INT64) - throw datatype_exception::build("execReduceLong wrong Z data type", nd4j::DataType::INT64, zType); + if (zType != sd::DataType::INT64) + throw datatype_exception::build("execReduceLong wrong Z data type", sd::DataType::INT64, zType); auto 
xLength = shape::length(hXShapeInfo); auto blockWidth = 256; @@ -539,12 +539,12 @@ void execReduceLong(Nd4jPointer *extraPointers, dbZ->special(), ConstantShapeHelper::getInstance()->bufferForShapeInfo(hZShapeInfo).specialAsT(), hXShapeInfo, nullptr, 0, reductionPointer, dTADShapeInfo), LIBND4J_TYPES, LONG_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "execReduceLong(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceLong(...) failed"); InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -562,7 +562,7 @@ void execReduceBool2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, shape::length(hDimensionShape)); @@ -576,8 +576,8 @@ void execReduceBool2(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -596,10 +596,10 @@ void execReduceBool(Nd4jPointer *extraPointers, auto reductionPointer = reinterpret_cast(extraPointers[4]); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto zType = 
nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - if (zType != nd4j::DataType::BOOL) + if (zType != sd::DataType::BOOL) throw std::runtime_error("execReduceBool requires Z operand to have BOOL type"); auto xLength = shape::length(hXShapeInfo); @@ -614,12 +614,12 @@ void execReduceBool(Nd4jPointer *extraPointers, dbZ->special(), ConstantShapeHelper::getInstance()->bufferForShapeInfo(hZShapeInfo).specialAsT(), hZShapeInfo, nullptr, 0, reductionPointer, dTADShapeInfo), LIBND4J_TYPES, BOOL_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "execReduceBool(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceBool(...) failed"); InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -648,7 +648,7 @@ void execIndexReduce(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, shape::length(hDimensionShape)); @@ -662,8 +662,8 @@ void execIndexReduce(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -690,7 +690,7 @@ void execReduceFloat2(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, shape::length(hDimensionShape)); @@ -704,8 +704,8 @@ void execReduceFloat2(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -734,8 +734,8 @@ void execIndexReduceScalar( InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -759,8 +759,8 @@ void execTransformSame(Nd4jPointer *extraPointers,int opNum, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -784,8 +784,8 @@ void execTransformBool(Nd4jPointer *extraPointers,int opNum, 
InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -810,8 +810,8 @@ void execTransformAny(Nd4jPointer *extraPointers,int opNum, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -835,8 +835,8 @@ void execTransformStrict(Nd4jPointer *extraPointers,int opNum, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -860,8 +860,8 @@ void execTransformFloat(Nd4jPointer *extraPointers,int opNum, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -939,7 +939,7 @@ void enableP2P(bool enable) { cudaDeviceDisablePeerAccess(dY); } } else { - if 
(nd4j::Environment::getInstance()->isVerbose()) printf("Peer access [%i] -> [%i] isn't possible\n", dX, dY); + if (sd::Environment::getInstance()->isVerbose()) printf("Peer access [%i] -> [%i] isn't possible\n", dX, dY); } } } @@ -977,13 +977,13 @@ void initializeDevicesAndFunctions() { if (supportedP2P && devCnt > 1) enableP2P(allowedP2P); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } void initializeFunctions(Nd4jPointer *functions) { - nd4j::BlasHelper::getInstance()->initializeDeviceFunctions(functions); + sd::BlasHelper::getInstance()->initializeDeviceFunctions(functions); /* cublasSgemv = (CublasSgemv)functions[0]; cublasDgemv = (CublasDgemv)functions[1]; @@ -1010,8 +1010,8 @@ Nd4jPointer mallocHost(Nd4jLong memorySize, int flags) { // cudaHostAllocMapped |cudaHostAllocPortable auto res = cudaHostAlloc(reinterpret_cast(&pointer), memorySize + 8, cudaHostAllocDefault); if (res != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaHostAlloc failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaHostAlloc failed"); } return reinterpret_cast(pointer); @@ -1029,8 +1029,8 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) { Nd4jPointer pointer; auto res = cudaMalloc(reinterpret_cast(&pointer), memorySize + 8); if (res != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMalloc failed"); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMalloc failed"); } return reinterpret_cast(pointer); @@ -1044,8 +1044,8 @@ Nd4jPointer mallocDevice(Nd4jLong memorySize, int deviceId, int flags) { int freeHost(Nd4jPointer pointer) { auto res = cudaFreeHost(reinterpret_cast(pointer)); if (res != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaFreeHost failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaFreeHost failed"); } return 1L; @@ -1062,8 +1062,8 @@ int freeDevice(Nd4jPointer pointer, int deviceId) { // we're intentionally skipping if (res != 0 && res != 1) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaFree failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(res); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaFree failed"); } return res == 0 ? 
1L : 0L; @@ -1079,8 +1079,8 @@ Nd4jPointer createStream() { auto stream = new cudaStream_t(); auto dZ = cudaStreamCreate(stream); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaStreamCreate failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaStreamCreate failed"); } return stream; @@ -1093,8 +1093,8 @@ Nd4jPointer createEvent() { auto dZ = cudaEventCreateWithFlags(reinterpret_cast(&nativeEvent), cudaEventDisableTiming); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventCreateWithFlags failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventCreateWithFlags failed"); } return nativeEvent; @@ -1106,8 +1106,8 @@ int registerEvent(Nd4jPointer event, Nd4jPointer stream) { auto dZ = cudaEventRecord(*pEvent, *pStream); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventRecord failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventRecord failed"); } return 1; @@ -1189,8 +1189,8 @@ int memcpySync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4jP } break; default: { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("UNDEFNED MEMCPY"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("UNDEFNED MEMCPY"); return 0; } } @@ -1200,8 +1200,8 @@ int memcpySync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4jP printf("Failed on [%p] -> [%p], size: [%i], direction: [%i], dZ: [%i]\n", src, dst, size, flags, static_cast(dZ)); fflush(stdout); fflush(stderr); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemcpy failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemcpy failed"); return 0; } @@ -1213,7 +1213,7 @@ int memcpyAsync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4j cudaMemcpyKind kind; - //nd4j::DebugHelper::checkErrorCode(pStream, "Preliminary sync failed"); + //sd::DebugHelper::checkErrorCode(pStream, "Preliminary sync failed"); switch (flags) { case 0: { @@ -1233,8 +1233,8 @@ int memcpyAsync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4j } break; default: { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("UNDEFNED MEMCPY"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("UNDEFNED MEMCPY"); return 0; } } @@ -1245,8 +1245,8 @@ int memcpyAsync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4j printf("Failed on [%p] -> [%p], size: [%i], direction: [%i], dZ: [%i]\n", src, dst, size, flags, static_cast(dZ)); fflush(stdout); fflush(stderr); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemcpyAsync failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemcpyAsync failed"); return 0; } @@ -1256,8 +1256,8 @@ int memcpyAsync(Nd4jPointer dst, Nd4jPointer src, Nd4jLong size, int flags, Nd4j int memsetSync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointer reserved) { auto dZ = cudaMemset(reinterpret_cast(dst), value, static_cast(size)); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemset failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemset failed"); } return 1; @@ -1268,8 +1268,8 @@ int memsetAsync(Nd4jPointer dst, int value, Nd4jLong size, int flags, Nd4jPointe auto dZ = cudaMemsetAsync(reinterpret_cast(dst), value, static_cast(size), *pStream); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemsetAsync failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemsetAsync failed"); } return 1; @@ -1279,8 +1279,8 @@ int destroyEvent(Nd4jPointer event) { auto pEvent = reinterpret_cast(&event); auto dZ = cudaEventDestroy(*pEvent); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventDestroy failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventDestroy failed"); } return 1; @@ -1291,8 +1291,8 @@ int streamSynchronize(Nd4jPointer stream) { auto dZ = cudaStreamSynchronize(*pStream); if (dZ != 0) { - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaStreamSynchronize failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaStreamSynchronize failed"); } return 1L; @@ -1303,8 +1303,8 @@ int eventSynchronize(Nd4jPointer event) { auto dZ = cudaEventSynchronize(*pEvent); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventSynchronize failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaEventSynchronize failed"); } return 1L; @@ -1317,7 +1317,7 @@ int getAvailableDevices() { } void enableDebugMode(bool reallyEnable) { - nd4j::Environment::getInstance()->setDebug(reallyEnable); + sd::Environment::getInstance()->setDebug(reallyEnable); } void setGridLimit(int gridSize) { @@ -1345,7 +1345,7 @@ void setOmpNumThreads(int threads) { } void enableVerboseMode(bool reallyEnable) { - nd4j::Environment::getInstance()->setVerbose(reallyEnable); + sd::Environment::getInstance()->setVerbose(reallyEnable); } int getDeviceMajor(int device) { @@ -1370,12 +1370,12 @@ void specialConcat( void *dZ, Nd4jLong *dZShapeInfo, Nd4jPointer *tadPointers, Nd4jPointer *offsetPointers) { try { - BUILD_SINGLE_SELECTOR(ArrayOptions::dataType(dZShapeInfo), nd4j::SpecialMethods, + BUILD_SINGLE_SELECTOR(ArrayOptions::dataType(dZShapeInfo), sd::SpecialMethods, ::concatCpuGeneric(dimension, numArrays, data, inputShapeInfo, dZ, dZShapeInfo), LIBND4J_TYPES); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1383,34 +1383,34 @@ void specialConcat( /** * This method saves */ -nd4j::TadPack* tadOnlyShapeInfo(Nd4jLong *dXShapeInfo, int *dimension, int dimensionLength) { +sd::TadPack* tadOnlyShapeInfo(Nd4jLong *dXShapeInfo, int *dimension, int dimensionLength) { try { auto pack = new TadPack(); - *pack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(dXShapeInfo, dimension, dimensionLength); + *pack = sd::ConstantTadHelper::getInstance()->tadForDimensions(dXShapeInfo, dimension, dimensionLength); return pack; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -Nd4jLong* getPrimaryShapeInfo(nd4j::TadPack* pack) { +Nd4jLong* getPrimaryShapeInfo(sd::TadPack* pack) { return pack->primaryShapeInfo(); } -Nd4jLong* getPrimaryOffsets(nd4j::TadPack* pack) { +Nd4jLong* getPrimaryOffsets(sd::TadPack* pack) { return pack->primaryOffsets(); } -Nd4jLong* getSpecialShapeInfo(nd4j::TadPack* pack) { +Nd4jLong* getSpecialShapeInfo(sd::TadPack* pack) { return pack->specialShapeInfo(); } -Nd4jLong* getSpecialOffsets(nd4j::TadPack* pack) { +Nd4jLong* getSpecialOffsets(sd::TadPack* pack) { return pack->specialOffsets(); } -Nd4jLong getNumberOfTads(nd4j::TadPack* pack) { +Nd4jLong getNumberOfTads(sd::TadPack* pack) { return pack->numberOfTads(); } -int getShapeInfoLength(nd4j::TadPack* pack) { +int getShapeInfoLength(sd::TadPack* pack) { return pack->shapeInfoLength(); } @@ -1440,8 +1440,8 @@ int memcpyConstantAsync(Nd4jLong dst, Nd4jPointer src, Nd4jLong size, int flags, } auto dZ = 
cudaMemcpyToSymbolAsync(deviceConstantMemory, const_cast(src), size, dst, kind, *pStream); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemcpyToSymbolAsync failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaMemcpyToSymbolAsync failed"); } return 1; @@ -1452,8 +1452,8 @@ Nd4jPointer getConstantSpace() { cudaError_t dZ = cudaGetSymbolAddress(reinterpret_cast(&dConstAddr), deviceConstantMemory); if (dZ != 0) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaGetSymbolAddress failed"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(dZ); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("cudaGetSymbolAddress failed"); } return dConstAddr; @@ -1473,7 +1473,7 @@ void pullRows(Nd4jPointer *extraPointers, cudaStream_t *stream = reinterpret_cast(extraPointers[1]); dim3 launchDims(64, 256, 1024); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); BUILD_SINGLE_SELECTOR(xType, pullRowsKernelGeneric, (launchDims, stream, dbX->special(), dbZ->special(), n, indexes, tadShapeInfo, tadOffsets, zTadShapeInfo, zTadOffsets), LIBND4J_TYPES); @@ -1482,8 +1482,8 @@ void pullRows(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1502,24 +1502,24 @@ void 
average(Nd4jPointer *extras, auto dX = reinterpret_cast(dx); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("averageFloat called\n"); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); // launching on gpu if (mode == 0) { dim3 launchDims(256, 256, 4096); BUILD_SINGLE_SELECTOR(xType, averagingKernelGeneric, (launchDims, stream, dX, dz, n, length, propagate), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "AverageFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "AverageFloat(...) failed"); } else { // launching on host memory - BUILD_SINGLE_SELECTOR(xType, nd4j::SpecialMethods, ::averageGeneric(x, z, zShapeInfo, n, length, propagate), + BUILD_SINGLE_SELECTOR(xType, sd::SpecialMethods, ::averageGeneric(x, z, zShapeInfo, n, length, propagate), LIBND4J_TYPES); } } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1536,24 +1536,24 @@ void accumulate(Nd4jPointer *extras, auto dX = reinterpret_cast(dx); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("accumulateFloat called\n"); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); // launching on gpu if (mode == 0) { dim3 launchDims(n, 256, 16384); BUILD_SINGLE_SELECTOR(xType, accumulateKernelGeneric, (launchDims, stream, dX, dz, n, length), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "AccumulateFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "AccumulateFloat(...) 
failed"); } else { // launching on host memory - BUILD_SINGLE_SELECTOR(xType, nd4j::SpecialMethods, ::accumulateGeneric(x, z, zShapeInfo, n, length), + BUILD_SINGLE_SELECTOR(xType, sd::SpecialMethods, ::accumulateGeneric(x, z, zShapeInfo, n, length), LIBND4J_TYPES); } } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1577,30 +1577,30 @@ void shuffle(Nd4jPointer *extras, auto tadOnlyShapeInfo = reinterpret_cast(tadShapeInfo); auto tadOffset = reinterpret_cast(tadOffsets); - auto xType = nd4j::ArrayOptions::dataType(xShape[0]); + auto xType = sd::ArrayOptions::dataType(xShape[0]); dim3 launchDims(256, 512, 8192); BUILD_SINGLE_SELECTOR(xType, shuffleKernelGeneric, (launchDims, stream, dX, dxShape, dZ, N, shuffleMap, tadOnlyShapeInfo, tadOffset), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "shuffle(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "shuffle(...) 
failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } bool isExperimentalEnabled() { - return nd4j::Environment::getInstance()->isExperimentalBuild(); + return sd::Environment::getInstance()->isExperimentalBuild(); } void setOmpMinThreads(int threads) { - minThreads = nd4j::math::nd4j_max(32, threads); - minThreads = nd4j::math::nd4j_min(maxThreads, minThreads); + minThreads = sd::math::nd4j_max(32, threads); + minThreads = sd::math::nd4j_min(maxThreads, minThreads); } int getDevice() { - return nd4j::AffinityManager::currentDeviceId(); + return sd::AffinityManager::currentDeviceId(); } void setElementThreshold(int num) { @@ -1630,8 +1630,8 @@ void execSummaryStats(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1662,8 +1662,8 @@ void execSummaryStatsTad(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbDimension}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1686,8 +1686,8 @@ void execReduce3(Nd4jPointer *extraPointers, 
InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1708,7 +1708,7 @@ void execReduce3Tad(Nd4jPointer *extraPointers, auto dimension = reinterpret_cast(dbDimension->primary()); int dimensionLength = static_cast(shape::length(hDimensionShape)); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(hXShapeInfo, dimension, shape::length(hDimensionShape)); auto tadLength = shape::length(tadPack.primaryShapeInfo()); @@ -1737,8 +1737,8 @@ void execReduce3Tad(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1760,8 +1760,8 @@ void execReduce3Scalar(Nd4jPointer *extraPointers,int opNum, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1784,8 +1784,8 @@ void execScalarBool(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbScalar}); } 
catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1817,8 +1817,8 @@ void execScalarBoolTad(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbScalars}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1841,8 +1841,8 @@ void execScalar(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbScalar}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1865,12 +1865,12 @@ void execScalarTad(Nd4jPointer *extraPointers, cudaStream_t *stream = reinterpret_cast(extraPointers[1]); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(hScalarShapeInfo); - auto zType = nd4j::ArrayOptions::dataType(hZShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); + auto yType = sd::ArrayOptions::dataType(hScalarShapeInfo); + auto zType = sd::ArrayOptions::dataType(hZShapeInfo); - if (yType != xType && yType != nd4j::DataType::BOOL && !isExperimentalEnabled()) - throw nd4j::datatype_exception::build("execScalar both operands must have same data type", xType, yType); 
+ if (yType != xType && yType != sd::DataType::BOOL && !isExperimentalEnabled()) + throw sd::datatype_exception::build("execScalar both operands must have same data type", xType, yType); dim3 launchDims(256, 256, 16384); @@ -1884,8 +1884,8 @@ void execScalarTad(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbScalars}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1901,7 +1901,7 @@ void execAggregate(Nd4jPointer *extraPointers, int numIntArrays, void *realArguments, int numRealArguments, - nd4j::DataType dtype) { + sd::DataType dtype) { } @@ -1915,7 +1915,7 @@ void batchExecutor(Nd4jPointer *extraPointers, int maxIdx, int maxReals, void *ptrToArguments, - nd4j::DataType dtype) { + sd::DataType dtype) { } void execAggregateBatch(Nd4jPointer *extraPointers, @@ -1923,7 +1923,7 @@ void execAggregateBatch(Nd4jPointer *extraPointers, int maxArgs, int maxShapes, int maxIntArrays, int maxIntArraySize, int maxIdx, int maxReals, - void *ptrToArguments, nd4j::DataType dtype) { + void *ptrToArguments, sd::DataType dtype) { } @@ -1943,8 +1943,8 @@ void execRandom(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1964,8 +1964,8 @@ void execRandom2(Nd4jPointer *extraPointers, int opNum, Nd4jPointer stateHost, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX}); } 
catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -1987,8 +1987,8 @@ void execRandom3(Nd4jPointer *extraPointers, int opNum, Nd4jPointer stateHost, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2002,18 +2002,18 @@ Nd4jPointer initRandom(Nd4jPointer *extraPointers, long seed, long bufferSize, N // cudaStreamSynchronize(*stream); auto ptrDev = reinterpret_cast(ptrToBuffer); - auto buffer = new nd4j::random::RandomBuffer(seed, bufferSize, reinterpret_cast(ptrHost), reinterpret_cast(ptrDev)); + auto buffer = new sd::random::RandomBuffer(seed, bufferSize, reinterpret_cast(ptrHost), reinterpret_cast(ptrDev)); buffer->propagateToDevice(buffer, *stream); - nd4j::DebugHelper::checkErrorCode(stream, "initRandom(...) failed A"); + sd::DebugHelper::checkErrorCode(stream, "initRandom(...) failed A"); // we generate sequence in the host memory - nd4j::random::Xoroshiro128 generator(buffer); + sd::random::Xoroshiro128 generator(buffer); generator.refreshBuffer(); // and copy it to gpu cudaMemcpyAsync(ptrDev, ptrHost, bufferSize * 8, cudaMemcpyHostToDevice, *stream); - nd4j::DebugHelper::checkErrorCode(stream, "initRandom(...) failed B"); + sd::DebugHelper::checkErrorCode(stream, "initRandom(...) 
failed B"); return buffer; } @@ -2021,7 +2021,7 @@ Nd4jPointer initRandom(Nd4jPointer *extraPointers, long seed, long bufferSize, N void destroyRandom(Nd4jPointer ptrBuffer) { - nd4j::random::RandomBuffer *buffer = reinterpret_cast (ptrBuffer); + sd::random::RandomBuffer *buffer = reinterpret_cast (ptrBuffer); // FIXME: it's bad thing, but we can't know in advance, which stream(s) where using this generator in practice cudaDeviceSynchronize(); @@ -2031,7 +2031,7 @@ void destroyRandom(Nd4jPointer ptrBuffer) { void refreshBuffer(Nd4jPointer *extraPointers, long seed, Nd4jPointer ptrRandom) { - nd4j::random::RandomBuffer *buffer = reinterpret_cast (ptrRandom); + sd::random::RandomBuffer *buffer = reinterpret_cast (ptrRandom); unsigned long long *ptrHost = reinterpret_cast(extraPointers[0]); cudaStream_t *stream = reinterpret_cast(extraPointers[1]); @@ -2045,7 +2045,7 @@ void refreshBuffer(Nd4jPointer *extraPointers, long seed, Nd4jPointer ptrRandom) buffer->propagateToDevice(buffer, *stream); // refresh buffer on host size - nd4j::random::Xoroshiro128 generator(buffer); + sd::random::Xoroshiro128 generator(buffer); generator.refreshBuffer(); // copy back to gpu @@ -2054,7 +2054,7 @@ void refreshBuffer(Nd4jPointer *extraPointers, long seed, Nd4jPointer ptrRandom) void reSeedBuffer(Nd4jPointer *extraPointers, long seed, Nd4jPointer ptrRandom) { - nd4j::random::RandomBuffer *buffer = reinterpret_cast (ptrRandom); + sd::random::RandomBuffer *buffer = reinterpret_cast (ptrRandom); cudaStream_t *stream = reinterpret_cast(extraPointers[1]); cudaStreamSynchronize(*stream); @@ -2101,17 +2101,17 @@ void tear(Nd4jPointer *extras, cudaStream_t *stream = reinterpret_cast(extras[1]); dim3 launchDims(512, 512, 512); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); BUILD_SINGLE_SELECTOR(xType, tearKernelGeneric, (launchDims, stream, dbX->special(), dXShapeInfo, targets, zShapeInfo, tadShapeInfo, tadOffsets), 
LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "tearFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "tearFloat(...) failed"); InteropDataBuffer::registerSpecialUse({}, {dbX}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2122,15 +2122,15 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement auto g_scanBlockSums = reinterpret_cast(extras[2]); int blockSize = 512; // max size of the thread blocks - int numBlocks = nd4j::math::nd4j_max(1, static_cast(ceil(static_cast(numElements) / (2.f * blockSize)))); + int numBlocks = sd::math::nd4j_max(1, static_cast(ceil(static_cast(numElements) / (2.f * blockSize)))); int numThreads; if (numBlocks > 1) numThreads = blockSize; - else if (nd4j::isPowerOfTwo(numElements)) + else if (sd::isPowerOfTwo(numElements)) numThreads = numElements / 2; else - numThreads = nd4j::floorPow2(numElements); + numThreads = sd::floorPow2(numElements); int numEltsPerBlock = numThreads * 2; @@ -2138,7 +2138,7 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement // compute the smallest power of 2 able to compute its scan. 
int numEltsLastBlock = numElements - (numBlocks-1) * numEltsPerBlock; - int numThreadsLastBlock = nd4j::math::nd4j_max(1, numEltsLastBlock / 2); + int numThreadsLastBlock = sd::math::nd4j_max(1, numEltsLastBlock / 2); int np2LastBlock = 0; int sharedMemLastBlock = 0; @@ -2171,9 +2171,9 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement // execute the scan if (numBlocks > 1) { - nd4j::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, g_scanBlockSums[level], numThreads * 2, 0, 0); + sd::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, g_scanBlockSums[level], numThreads * 2, 0, 0); if (np2LastBlock) { - nd4j::prescanLauncher(gridOnes, threadsOnes, sharedMemLastBlock, stream, dZ, dX, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock); + sd::prescanLauncher(gridOnes, threadsOnes, sharedMemLastBlock, stream, dZ, dX, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock); } // After scanning all the sub-blocks, we are mostly done. 
But now we @@ -2183,18 +2183,18 @@ void prescanArrayRecursive(Nd4jPointer *extras, int *dZ, int *dX, int numElement // recursive (CPU) call prescanArrayRecursive(extras, g_scanBlockSums[level], g_scanBlockSums[level], numBlocks, level+1); - nd4j::uniformAdd<<>>(dZ, g_scanBlockSums[level], numElements - numEltsLastBlock, 0, 0); + sd::uniformAdd<<>>(dZ, g_scanBlockSums[level], numElements - numEltsLastBlock, 0, 0); if (np2LastBlock) { - nd4j::uniformAdd<<<1, numThreadsLastBlock, 1024, *stream>>>(dZ, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock); + sd::uniformAdd<<<1, numThreadsLastBlock, 1024, *stream>>>(dZ, g_scanBlockSums[level], numEltsLastBlock, numBlocks - 1, numElements - numEltsLastBlock); } } else if (isPowerOfTwo(numElements)) { - nd4j::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, 0, numThreads * 2, 0, 0); + sd::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, 0, numThreads * 2, 0, 0); } else { - nd4j::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0); + sd::prescanLauncher(grid, threads, sharedMemSize, stream, dZ, dX, 0, numElements, 0, 0); } - nd4j::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "prescanArray(...) failed"); } @@ -2206,13 +2206,13 @@ void encodeThresholdP1(Nd4jPointer *extras, void *dx, Nd4jLong *hXShapeInfo, Nd4 int numBlocks = N / blockSize + (N % blockSize ? 1 : 0); dim3 launchDims(numBlocks, blockSize, 1024); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); BUILD_SINGLE_SELECTOR(xType, encoderKernelP1Generic, (launchDims, stream, dx, N, dz, threshold), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "encodeThresholdP1Float(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP1Float(...) 
failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2223,10 +2223,10 @@ void encodeThresholdP2Int(Nd4jPointer *extraPointers, int *dx, Nd4jLong N, int * cudaStream_t *stream = reinterpret_cast(extraPointers[1]); //encoderKernelP2Float<<>>(dx, N, dz); prescanArrayRecursive(extraPointers, dz, dx + 1, (int) N, 0); - nd4j::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP2Int(...) failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2238,13 +2238,13 @@ void encodeThresholdP3(Nd4jPointer *extraPointers, void *dx, Nd4jLong *hXShapeIn int numBlocks = N / blockSize + (N % blockSize ? 1 : 0); dim3 launchDims(numBlocks, blockSize, 4096); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); BUILD_SINGLE_SELECTOR(xType, encoderKernelP3Generic, (launchDims, stream, dx, offsets, N, dz), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "encodeThresholdP3Float(...) 
failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2257,13 +2257,13 @@ void decodeThreshold(Nd4jPointer *extraPointers, void *dx, Nd4jLong N, void *dz, int numBlocks = N / blockSize + (N % blockSize ? 1 : 0); dim3 launchDims(numBlocks, blockSize, 1024); - auto zType = nd4j::ArrayOptions::dataType(zShapeInfo); + auto zType = sd::ArrayOptions::dataType(zShapeInfo); BUILD_SINGLE_SELECTOR(zType, decoderKernelGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "decodeThresholdFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "decodeThresholdFloat(...) failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2295,8 +2295,8 @@ void execReduce3All(Nd4jPointer *extraPointers, InteropDataBuffer::registerSpecialUse({dbZ}, {dbX, dbY}); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2310,12 +2310,12 @@ void sort(Nd4jPointer *extraPointers, auto xLength = shape::length(xShapeInfo); auto xEWS = shape::elementWiseStride(xShapeInfo); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto 
xType = sd::ArrayOptions::dataType(xShapeInfo); // check if xLength is a power of 2, and use bitonic sort, if that's the case if ((xLength != 0) && ((xLength & (xLength - 1)) == 0) && (xLength <= 1024 * 1024 * 10)) { - int numThreads = nd4j::math::nd4j_min(512, xLength); + int numThreads = sd::math::nd4j_min(512, xLength); int numBlocks = xLength / numThreads; if (xLength % numThreads > 0 || numBlocks == 0) numBlocks++; @@ -2330,12 +2330,12 @@ void sort(Nd4jPointer *extraPointers, } } } else { - int numThreads = nd4j::math::nd4j_min(512, xLength); + int numThreads = sd::math::nd4j_min(512, xLength); int numBlocks = xLength / numThreads; if (xLength % numThreads > 0 || numBlocks == 0) numBlocks++; - numBlocks = nd4j::math::nd4j_min(512, numBlocks); + numBlocks = sd::math::nd4j_min(512, numBlocks); dim3 launchDims(numBlocks, numThreads, 32768); int max = 2, dg = 0; @@ -2359,10 +2359,10 @@ void sort(Nd4jPointer *extraPointers, } } - nd4j::DebugHelper::checkErrorCode(stream, "sort(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "sort(...) 
failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2379,8 +2379,8 @@ void sortByKey(Nd4jPointer *extraPointers, auto xLength = shape::length(xShapeInfo); auto yLength = shape::length(yShapeInfo); auto xEWS = shape::elementWiseStride(xShapeInfo); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(yShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); + auto yType = sd::ArrayOptions::dataType(yShapeInfo); if (shape::isEmpty(xShapeInfo) || shape::isEmpty(yShapeInfo)) return; @@ -2391,7 +2391,7 @@ void sortByKey(Nd4jPointer *extraPointers, // check if xLength is a power of 2, and use bitonic sort, if that's the case if ((xLength != 0) && ((xLength & (xLength - 1)) == 0) && (xLength <= 1024 * 1024 * 10)) { - int numThreads = nd4j::math::nd4j_min(512, xLength); + int numThreads = sd::math::nd4j_min(512, xLength); int numBlocks = xLength / numThreads; if (xLength % numThreads > 0 || numBlocks == 0) numBlocks++; @@ -2406,12 +2406,12 @@ void sortByKey(Nd4jPointer *extraPointers, } } } else { - int numThreads = nd4j::math::nd4j_min(512, xLength); + int numThreads = sd::math::nd4j_min(512, xLength); int numBlocks = xLength / numThreads; if (xLength % numThreads > 0 || numBlocks == 0) numBlocks++; - numBlocks = nd4j::math::nd4j_min(512, numBlocks); + numBlocks = sd::math::nd4j_min(512, numBlocks); dim3 launchDims(numBlocks, numThreads, 32768); int max = 2, dg = 0; @@ -2436,8 +2436,8 @@ void sortByKey(Nd4jPointer *extraPointers, } } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2453,8 +2453,8 @@ void sortByValue(Nd4jPointer *extraPointers, auto xLength = shape::length(xShapeInfo); auto yLength = shape::length(yShapeInfo); auto xEWS = shape::elementWiseStride(xShapeInfo); - auto xType = nd4j::ArrayOptions::dataType(yShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(yShapeInfo); + auto yType = sd::ArrayOptions::dataType(xShapeInfo); if (shape::isEmpty(xShapeInfo) || shape::isEmpty(yShapeInfo)) return; @@ -2465,7 +2465,7 @@ void sortByValue(Nd4jPointer *extraPointers, // check if xLength is a power of 2, and use bitonic sort, if that's the case if ((xLength != 0) && ((xLength & (xLength - 1)) == 0) && (xLength <= 1024 * 1024 * 10)) { - int numThreads = nd4j::math::nd4j_min(512, xLength); + int numThreads = sd::math::nd4j_min(512, xLength); int numBlocks = xLength / numThreads; if (xLength % numThreads > 0 || numBlocks == 0) numBlocks++; @@ -2480,12 +2480,12 @@ void sortByValue(Nd4jPointer *extraPointers, } } } else { - int numThreads = nd4j::math::nd4j_min(512, xLength); + int numThreads = sd::math::nd4j_min(512, xLength); int numBlocks = xLength / numThreads; if (xLength % numThreads > 0 || numBlocks == 0) numBlocks++; - numBlocks = nd4j::math::nd4j_min(512, numBlocks); + numBlocks = sd::math::nd4j_min(512, numBlocks); dim3 launchDims(numBlocks, numThreads, 32768); int max = 2, dg = 0; @@ -2509,8 +2509,8 @@ void sortByValue(Nd4jPointer *extraPointers, } } } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2528,18 +2528,18 @@ void sortTadByKey(Nd4jPointer *extraPointers, auto stream = reinterpret_cast(extraPointers[1]); auto context = extraPointers[0] == 0 ? LaunchContext::defaultContext() : reinterpret_cast(extraPointers[0]); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); dim3 launchDims((int) tadPack.numberOfTads(), 256, 2048); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(yShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); + auto yType = sd::ArrayOptions::dataType(yShapeInfo); BUILD_DOUBLE_SELECTOR(xType, yType, oesTadGenericKey, (launchDims, stream, dX, dXShapeInfo, dy, dyShapeInfo, nullptr, dimensionLength, tadPack.platformShapeInfo(), tadPack.platformOffsets(), descending), LIBND4J_TYPES, LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "sortTadKey(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "sortTadKey(...) failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2555,19 +2555,19 @@ void sortTadByValue(Nd4jPointer *extraPointers, auto stream = reinterpret_cast(extraPointers[1]); auto context = extraPointers[0] == 0 ? 
LaunchContext::defaultContext() : reinterpret_cast(extraPointers[0]); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); dim3 launchDims((int) tadPack.numberOfTads(), 256, 2048); - auto xType = nd4j::ArrayOptions::dataType(yShapeInfo); - auto yType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(yShapeInfo); + auto yType = sd::ArrayOptions::dataType(xShapeInfo); BUILD_DOUBLE_SELECTOR(xType, yType, oesTadGenericKey, (launchDims, stream, dy, dyShapeInfo, dX, dXShapeInfo, nullptr, dimensionLength, tadPack.platformShapeInfo(), tadPack.platformOffsets(), descending), LIBND4J_TYPES, LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "sortTadValue(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "sortTadValue(...) failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2585,17 +2585,17 @@ void sortTad(Nd4jPointer *extraPointers, auto stream = reinterpret_cast(extraPointers[1]); auto context = extraPointers[0] == 0 ? 
LaunchContext::defaultContext() : reinterpret_cast(extraPointers[0]); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); dim3 launchDims((int) tadPack.numberOfTads(), 512, 33768); - auto xType = nd4j::ArrayOptions::dataType(xShapeInfo); + auto xType = sd::ArrayOptions::dataType(xShapeInfo); BUILD_SINGLE_SELECTOR(xType, oesTadGeneric, (launchDims, stream, dX, dXShapeInfo, nullptr, dimensionLength, tadShapeInfo, tadOffsets, descending), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "sortTad(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "sortTad(...) failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2616,20 +2616,20 @@ Nd4jLong encodeBitmap(Nd4jPointer *extraPointers, int *reductionPointer = reinterpret_cast(extraPointers[3]); dim3 launchDims(512, 512, 32768); - auto xType = nd4j::ArrayOptions::dataType(hXShapeInfo); + auto xType = sd::ArrayOptions::dataType(hXShapeInfo); BUILD_SINGLE_SELECTOR(xType, cudaEncodeBitmapGeneric, (launchDims, stream, dx, N, dz, resultPointer, reductionPointer, threshold), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "encodeBitmapFloat(...) 
failed"); Nd4jLong dZ = (Nd4jLong) resultPointer[0]; resultPointer[0] = 0; return dZ; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 0; } } @@ -2642,13 +2642,13 @@ void decodeBitmap(Nd4jPointer *extraPointers, try { cudaStream_t *stream = reinterpret_cast(extraPointers[1]); dim3 launchDims(512, 512, 16384); - auto xType = nd4j::ArrayOptions::dataType(zShapeInfo); + auto xType = sd::ArrayOptions::dataType(zShapeInfo); BUILD_SINGLE_SELECTOR(xType, cudaDecodeBitmapGeneric, (launchDims, stream, dx, N, dz), LIBND4J_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "decodeBitmapFloat(...) failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -2661,33 +2661,33 @@ void munmapFile(Nd4jPointer *extraPointers, Nd4jLong* ptrMap, Nd4jLong length) { } -nd4j::graph::ResultWrapper* executeFlatGraph(Nd4jPointer *extraPointers, Nd4jPointer flatBufferPointer) { +sd::graph::ResultWrapper* executeFlatGraph(Nd4jPointer *extraPointers, Nd4jPointer flatBufferPointer) { try { - return nd4j::graph::GraphExecutioner::executeFlatBuffer(flatBufferPointer); + return sd::graph::GraphExecutioner::executeFlatBuffer(flatBufferPointer); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -Nd4jLong getResultWrapperSize(nd4j::graph::ResultWrapper* ptr) { +Nd4jLong getResultWrapperSize(sd::graph::ResultWrapper* ptr) { return ptr->size(); } -Nd4jPointer getResultWrapperPointer(nd4j::graph::ResultWrapper* ptr) { +Nd4jPointer getResultWrapperPointer(sd::graph::ResultWrapper* ptr) { return ptr->pointer(); } const char* getAllCustomOps() { - return nd4j::ops::OpRegistrator::getInstance()->getAllCustomOperations(); + return sd::ops::OpRegistrator::getInstance()->getAllCustomOperations(); } -nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::DeclarableOp* op, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { - nd4j::graph::VariableSpace varSpace; +sd::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, sd::ops::DeclarableOp* op, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { + sd::graph::VariableSpace varSpace; Context block(2, &varSpace); - nd4j::ShapeList inShapes; + sd::ShapeList inShapes; for (int e = 0; e < numIArgs; e++) block.getIArguments()->push_back(iArgs[e]); @@ -2699,16 +2699,16 @@ nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::D block.getBArguments()->push_back(bArgs[e]); for (int e = 0; e < numDArgs; e++) - block.getDArguments()->push_back((nd4j::DataType) dArgs[e]); + block.getDArguments()->push_back((sd::DataType) dArgs[e]); for (int e = 0; e < numInputShapes; e++) { auto shape_ = reinterpret_cast(inputShapes[e]); // we shouldn't copy buffer 
if that's empty array - void *buffer_ = nd4j::ArrayOptions::arrayType(shape_) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; - void *bufferD_ = nd4j::ArrayOptions::arrayType(shape_) == ArrayType::EMPTY ? nullptr : inputBuffers[e + numInputShapes]; + void *buffer_ = sd::ArrayOptions::arrayType(shape_) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; + void *bufferD_ = sd::ArrayOptions::arrayType(shape_) == ArrayType::EMPTY ? nullptr : inputBuffers[e + numInputShapes]; - auto array = new nd4j::NDArray(buffer_, bufferD_, shape_); + auto array = new sd::NDArray(buffer_, bufferD_, shape_); // block should contain references to proper variable varSpace.putVariable(1, e, array); @@ -2725,22 +2725,22 @@ nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::D return shapeList; } -nd4j::ShapeList* calculateOutputShapes2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { +sd::ShapeList* calculateOutputShapes2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool *bArgs, int numBArgs, int *dArgs, int numDArgs) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(hash); return _calculateOutputShapes(extraPointers, op, inputBuffers, inputShapes, numInputShapes, tArgs, numTArgs, iArgs, numIArgs, bArgs, numBArgs, dArgs, numDArgs); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::DeclarableOp* op, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { +sd::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, sd::ops::DeclarableOp* op, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { Context block(1); - nd4j::ShapeList inShapes; + sd::ShapeList inShapes; for (int e = 0; e < numIArgs; e++) block.getIArguments()->push_back(iArgs[e]); @@ -2756,34 +2756,34 @@ nd4j::ShapeList* _calculateOutputShapes(Nd4jPointer* extraPointers, nd4j::ops::D return shapeList; } -nd4j::ShapeList* calculateOutputShapes(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { +sd::ShapeList* calculateOutputShapes(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputShapes, int numInputShapes, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(hash); return _calculateOutputShapes(extraPointers, op, inputShapes, numInputShapes, tArgs, numTArgs, iArgs, numIArgs); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -Nd4jLong getShapeListSize(nd4j::ShapeList* list) { +Nd4jLong getShapeListSize(sd::ShapeList* list) { return list->size(); } -Nd4jLong* getShape(nd4j::ShapeList* list, Nd4jLong i) { +Nd4jLong* 
getShape(sd::ShapeList* list, Nd4jLong i) { return list->at(i); } -static FORCEINLINE Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputs, Nd4jPointer* outputBuffers, Nd4jPointer* outputShapes, int numOutputs, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool* bArgs, int numBArgs, bool isInplace) { +static FORCEINLINE Nd4jStatus realExec(sd::ops::DeclarableOp* op, Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputs, Nd4jPointer* outputBuffers, Nd4jPointer* outputShapes, int numOutputs, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool* bArgs, int numBArgs, bool isInplace) { if (op == nullptr) nd4j_printf("Can't find requested operation: [%lld]\n", hash); // we're using the same fake nodeId everywhere here - std::vector inputs(numInputs); - std::vector outputs(numOutputs); + std::vector inputs(numInputs); + std::vector outputs(numOutputs); std::vector ttArgs(numTArgs); std::vector bbArgs(numBArgs); std::vector iiArgs(numIArgs); @@ -2791,10 +2791,10 @@ static FORCEINLINE Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* // filling block now with inputs for (int e = 0; e < numInputs; e++) { auto shape = reinterpret_cast(inputShapes[e]); - void *buffer = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; - void *bufferD = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : inputBuffers[e + numInputs]; + void *buffer = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : inputBuffers[e]; + void *bufferD = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? 
nullptr : inputBuffers[e + numInputs]; - inputs[e] = new nd4j::NDArray(buffer, bufferD, shape); + inputs[e] = new sd::NDArray(buffer, bufferD, shape); } // if not inplace - transferring output arrays @@ -2803,13 +2803,13 @@ static FORCEINLINE Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* for (int e = 0; e < numOutputs; e++) { // we want to keep original output shape intact auto shape = shape::copyShape(reinterpret_cast(outputShapes[e])); - void *buffer = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : outputBuffers[e]; - void *bufferD = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : outputBuffers[e + numOutputs]; + void *buffer = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : outputBuffers[e]; + void *bufferD = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : outputBuffers[e + numOutputs]; // FIXME: revisit this. bool canNullify = true; for (int i = 0; i < numInputs; i++) { - void *ibuffer = nd4j::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? nullptr : inputBuffers[i]; + void *ibuffer = sd::ArrayOptions::arrayType(shape) == ArrayType::EMPTY ? 
nullptr : inputBuffers[i]; if (ibuffer == buffer) { canNullify = false; break; @@ -2819,7 +2819,7 @@ static FORCEINLINE Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* if (canNullify && buffer != nullptr) memset((uint8_t *) buffer, '\0', shape::length(shape) * DataTypeUtils::sizeOfElement(ArrayOptions::dataType(shape))); - auto array = new nd4j::NDArray(buffer, bufferD, shape); + auto array = new sd::NDArray(buffer, bufferD, shape); outputs[e] = array; } @@ -2834,7 +2834,7 @@ static FORCEINLINE Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* // hypothetically at this point we have everything filled - auto dZ = op->execute(inputs, outputs, ttArgs, iiArgs, bbArgs, std::vector(), isInplace); + auto dZ = op->execute(inputs, outputs, ttArgs, iiArgs, bbArgs, std::vector(), isInplace); //auto dZ = op->execute(inputs, ttArgs, iiArgs, isInplace); @@ -2861,27 +2861,27 @@ static FORCEINLINE Nd4jStatus realExec(nd4j::ops::DeclarableOp* op, Nd4jPointer* int execCustomOp(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer* inputBuffers, Nd4jPointer* inputShapes, int numInputs, Nd4jPointer* outputBuffers, Nd4jPointer* outputShapes, int numOutputs, double* tArgs, int numTArgs, Nd4jLong *iArgs, int numIArgs, bool* bArgs, int numBArgs, bool isInplace) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(hash); return realExec(op, extraPointers, hash, inputBuffers, inputShapes, numInputs, outputBuffers, outputShapes, numOutputs, tArgs, numTArgs, iArgs, numIArgs, bArgs, numBArgs, isInplace); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } int 
execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opContext) { try { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation(hash); auto context = reinterpret_cast(opContext); auto result = op->execute(context); auto res = cudaStreamSynchronize(*context->launchContext()->getCudaStream()); if (res != 0) - throw nd4j::cuda_exception::build("customOp execution failed", res); + throw sd::cuda_exception::build("customOp execution failed", res); for (auto v:context->fastpath_in()) { if (!v->isEmpty()) @@ -2895,38 +2895,38 @@ int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opConte return result; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } int registerGraph(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer flatBufferPointer) { try { - auto graph = nd4j::graph::GraphExecutioner::importFromFlatPointer(flatBufferPointer); + auto graph = sd::graph::GraphExecutioner::importFromFlatPointer(flatBufferPointer); - nd4j::graph::GraphHolder::getInstance()->registerGraph(graphId, graph); + sd::graph::GraphHolder::getInstance()->registerGraph(graphId, graph); return ND4J_STATUS_OK; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } static VariablesSet* executeStoredGraphT(Nd4jPointer *extraPointers, Nd4jLong graphId, Nd4jPointer 
*inputBuffers, Nd4jPointer *inputShapes, int* inputIndices, int numInputs) { - auto graph = nd4j::graph::GraphHolder::getInstance()->pullGraph(graphId); + auto graph = sd::graph::GraphHolder::getInstance()->pullGraph(graphId); auto varSpace = graph->getVariableSpace()->clone(); - std::vector handles; + std::vector handles; for (int e = 0; e < numInputs; e++) { auto idx = inputIndices[e]; // we'll delete this array later, together with cloned VariableSpace - auto array = new nd4j::NDArray(inputBuffers[e], reinterpret_cast(inputShapes[e])); + auto array = new sd::NDArray(inputBuffers[e], reinterpret_cast(inputShapes[e])); handles.emplace_back(array); if (varSpace->hasVariable(idx)) { @@ -2939,8 +2939,8 @@ static VariablesSet* executeStoredGraphT(Nd4jPointer *extraPointers, Nd4jLong gr varSpace->putVariable(idx, array); } - auto dZ = nd4j::graph::GraphExecutioner::execute(graph, varSpace); - auto varSet = new nd4j::graph::VariablesSet(dZ); + auto dZ = sd::graph::GraphExecutioner::execute(graph, varSpace); + auto varSet = new sd::graph::VariablesSet(dZ); if (dZ == ND4J_STATUS_OK) { // pull back results, and provide them @@ -2966,52 +2966,52 @@ VariablesSet* executeStoredGraph(Nd4jPointer *extraPointers, Nd4jLong graphId, N try { return executeStoredGraphT(extraPointers, graphId, inputBuffers, inputShapes, inputIndices, numInputs); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -Nd4jLong getVariablesSetSize(nd4j::graph::VariablesSet* set) { +Nd4jLong getVariablesSetSize(sd::graph::VariablesSet* set) { return set->size(); } -Nd4jStatus getVariablesSetStatus(nd4j::graph::VariablesSet* set) { +Nd4jStatus getVariablesSetStatus(sd::graph::VariablesSet* set) 
{ return set->status(); } -nd4j::graph::Variable* getVariable(nd4j::graph::VariablesSet* set, Nd4jLong i) { +sd::graph::Variable* getVariable(sd::graph::VariablesSet* set, Nd4jLong i) { return set->at(i); } -int getVariableId(nd4j::graph::Variable* variable) { +int getVariableId(sd::graph::Variable* variable) { return variable->id(); } -int getVariableIndex(nd4j::graph::Variable* variable) { +int getVariableIndex(sd::graph::Variable* variable) { return variable->index(); } -const char* getVariableName(nd4j::graph::Variable* variable) { +const char* getVariableName(sd::graph::Variable* variable) { return variable->getName()->c_str(); } -Nd4jLong* getVariableShape(nd4j::graph::Variable* variable) { +Nd4jLong* getVariableShape(sd::graph::Variable* variable) { return variable->getNDArray()->shapeInfo(); } -void* getVariableBuffer(nd4j::graph::Variable* variable) { +void* getVariableBuffer(sd::graph::Variable* variable) { return variable->getNDArray()->buffer(); } int unregisterGraph(Nd4jPointer *extraPointers, Nd4jLong graphId) { try { - nd4j::graph::GraphHolder::getInstance()->dropGraphAny(graphId); + sd::graph::GraphHolder::getInstance()->dropGraphAny(graphId); return ND4J_STATUS_OK; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } @@ -3036,33 +3036,33 @@ void deleteLongArray(Nd4jPointer pointer) { delete[] ptr; } -void deleteVariablesSet(nd4j::graph::VariablesSet* pointer) { +void deleteVariablesSet(sd::graph::VariablesSet* pointer) { delete pointer; } void deleteShapeList(Nd4jPointer shapeList) { - nd4j::ShapeList* list = reinterpret_cast(shapeList); + sd::ShapeList* list = reinterpret_cast(shapeList); //list->destroy(); delete list; } const char* 
getAllOperations() { - return nd4j::OpTracker::getInstance()->exportOperations(); + return sd::OpTracker::getInstance()->exportOperations(); } Nd4jPointer getGraphState(Nd4jLong id) { - return (Nd4jPointer) new nd4j::graph::GraphState(id); + return (Nd4jPointer) new sd::graph::GraphState(id); } void deleteGraphState(Nd4jPointer state) { - auto stateP = reinterpret_cast(state); + auto stateP = reinterpret_cast(state); delete stateP; } -Nd4jStatus execCustomOpWithScope(Nd4jPointer *extraPointers, nd4j::graph::GraphState *state, Nd4jLong opHash, Nd4jLong *scopes, int numScopes, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int numInputs, Nd4jPointer *outputBuffers, Nd4jPointer *outputShapes, int numOutputs) { +Nd4jStatus execCustomOpWithScope(Nd4jPointer *extraPointers, sd::graph::GraphState *state, Nd4jLong opHash, Nd4jLong *scopes, int numScopes, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int numInputs, Nd4jPointer *outputBuffers, Nd4jPointer *outputShapes, int numOutputs) { /** * That's basically exec, with VariableSpace provided in GraphState: * depending on operation (i.e. 
while of if), different logic executors could be used @@ -3080,7 +3080,7 @@ Nd4jStatus execCustomOpWithScope(Nd4jPointer *extraPointers, nd4j::graph::GraphS auto buffer = inputBuffers[e]; auto shapeInfo = reinterpret_cast(inputShapes[e]); - auto array = new nd4j::NDArray(buffer, shapeInfo, varSpace->launchContext()); + auto array = new sd::NDArray(buffer, shapeInfo, varSpace->launchContext()); // now we just put array to VarSpace varSpace->putVariable(0, e, array); @@ -3129,19 +3129,19 @@ Nd4jStatus execCustomOpWithScope(Nd4jPointer *extraPointers, nd4j::graph::GraphS Nd4jStatus execCustomOpWithScope(Nd4jPointer *extraPointers, Nd4jPointer state, Nd4jLong opHash, Nd4jLong *scopes, int numScopes, Nd4jPointer *inputBuffers, Nd4jPointer *inputShapes, int numInputs, Nd4jPointer *outputBuffers, Nd4jPointer *outputShapes, int numOutputs) { try { - return execCustomOpWithScope(extraPointers, reinterpret_cast(state), opHash, scopes, + return execCustomOpWithScope(extraPointers, reinterpret_cast(state), opHash, scopes, numScopes, inputBuffers, inputShapes, numInputs, outputBuffers, outputShapes, numOutputs); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return 1; } } void deleteResultWrapper(Nd4jPointer ptr) { // just 0 room for compiler s@!t - auto p = reinterpret_cast(ptr); + auto p = reinterpret_cast(ptr); delete p; } @@ -3160,113 +3160,113 @@ void convertTypes(Nd4jPointer *extras, int srcType, Nd4jPointer dX, Nd4jLong N, if (srcType == ND4J_FLOAT8) { if (dstType == ND4J_FLOAT8) { - // convertKernel(extras, dx, N, dz); + // convertKernel(extras, dx, N, dz); } else if (dstType == ND4J_INT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + 
//sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT16) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT16) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT16) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT24) { } else if (dstType == ND4J_FLOAT32) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_DOUBLE) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_INT8) { if (dstType == ND4J_FLOAT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT8) { - //convertKernel(extras, dx, N, dz); + //convertKernel(extras, dx, N, dz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if 
(dstType == ND4J_FLOAT24) { // TODO: eventually we might want to add it } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_UINT8) { if (dstType == ND4J_FLOAT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT24) { // TODO: still might want to add } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_FLOAT16) { if (dstType == ND4J_FLOAT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + 
//sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT24) { // TODO: .... ^^^ } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_THRESHOLD) { - //nd4j::convertToThreshold(nullptr, dx, N, dz); + //sd::convertToThreshold(nullptr, dx, N, dz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_INT16) { if (dstType == ND4J_FLOAT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, 
N, dz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT24) { // TODO... } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else { printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } @@ -3274,57 +3274,57 @@ void convertTypes(Nd4jPointer *extras, int srcType, Nd4jPointer dX, Nd4jLong N, } else if (srcType == ND4J_FLOAT32) { if (dstType == ND4J_FLOAT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT24) { } else if (dstType == ND4J_DOUBLE) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_THRESHOLD) { - 
//nd4j::convertToThreshold(nullptr, dx, N, dz); + //sd::convertToThreshold(nullptr, dx, N, dz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_DOUBLE) { if (dstType == ND4J_FLOAT8) { - //nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + //sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT8) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_INT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_UINT16) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_FLOAT24) { } else if (dstType == ND4J_FLOAT32) { - nd4j::TypeCast::convertGenericCuda(extras, dx, N, dz); + sd::TypeCast::convertGenericCuda(extras, dx, N, dz); } else if (dstType == ND4J_DOUBLE) { // } else if (dstType == ND4J_THRESHOLD) { - //nd4j::convertToThreshold(nullptr, dx, N, dz); + //sd::convertToThreshold(nullptr, dx, N, dz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } else if (srcType == ND4J_THRESHOLD) { if (dstType == ND4J_FLOAT16) { - //nd4j::convertFromThreshold(nullptr, dx, N, dz); + //sd::convertFromThreshold(nullptr, dx, N, dz); } else if (dstType == ND4J_FLOAT32) { - //nd4j::convertFromThreshold(nullptr, dx, N, dz); + //sd::convertFromThreshold(nullptr, dx, N, dz); } else if (dstType == ND4J_DOUBLE) { - //nd4j::convertFromThreshold(nullptr, dx, N, dz); + 
//sd::convertFromThreshold(nullptr, dx, N, dz); } else { nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } @@ -3332,25 +3332,25 @@ void convertTypes(Nd4jPointer *extras, int srcType, Nd4jPointer dX, Nd4jLong N, nd4j_printf("Unsupported types conversion: [%i] -> [%i]\n", srcType, dstType); } } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } Nd4jPointer createUtf8String(Nd4jPointer *extraPointers, const char *string, int length) { - auto u = new nd4j::utf8string(string, length); + auto u = new sd::utf8string(string, length); return reinterpret_cast(u); } Nd4jLong getUtf8StringLength(Nd4jPointer *extraPointers, Nd4jPointer ptr) { - return reinterpret_cast(ptr)->_length; + return reinterpret_cast(ptr)->_length; } char* getUtf8StringBuffer(Nd4jPointer *extraPointers, Nd4jPointer ptr) { - return reinterpret_cast(ptr)->_buffer; + return reinterpret_cast(ptr)->_buffer; } void deleteUtf8String(Nd4jPointer *extraPointers, Nd4jPointer ptr) { - delete(reinterpret_cast(ptr)); + delete(reinterpret_cast(ptr)); } /////////////////////////////////////////////////////////////////// @@ -3442,22 +3442,22 @@ void scatterUpdate(Nd4jPointer *extraPointers, int opCode, int numOfSubArrs, (stream, opCode, numOfSubArrs, dX, dXShapeInfo, dXOffsets, dY, dYShapeInfo, dYOffsets, dIindexes), LIBND4J_TYPES, INDEXING_TYPES); - nd4j::DebugHelper::checkErrorCode(stream, "scatterUpdate(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "scatterUpdate(...) 
failed"); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } void inspectArray(Nd4jPointer *extraPointers, Nd4jPointer buffer, Nd4jLong *shapeInfo, Nd4jPointer specialBuffer, Nd4jLong *specialShapeInfo, Nd4jPointer debugInfo) { try { LaunchContext lc(extraPointers[1], extraPointers[4], extraPointers[5], extraPointers[3]); - auto p = reinterpret_cast(debugInfo); + auto p = reinterpret_cast(debugInfo); NDArray array(buffer, specialBuffer, shapeInfo, &lc); - nd4j::DebugHelper::retrieveDebugStatistics(p, &array); + sd::DebugHelper::retrieveDebugStatistics(p, &array); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } @@ -3483,36 +3483,36 @@ void tryPointer(Nd4jPointer extra, Nd4jPointer p, int len) { auto e = cudaStreamSynchronize(stream); if (e != 0) - throw nd4j::cuda_exception::build("tryPointer failed", e); + throw sd::cuda_exception::build("tryPointer failed", e); cudaStreamDestroy(stream); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } int dataTypeFromNpyHeader(void *header) { return (int) cnpy::dataTypeFromHeader(reinterpret_cast(header)); } 
-nd4j::ConstantDataBuffer* shapeBuffer(int rank, Nd4jLong *shape, Nd4jLong *strides, nd4j::DataType dtype, char order, Nd4jLong ews, bool empty) { +sd::ConstantDataBuffer* shapeBuffer(int rank, Nd4jLong *shape, Nd4jLong *strides, sd::DataType dtype, char order, Nd4jLong ews, bool empty) { try { auto buffer = new ConstantDataBuffer(); - *buffer = nd4j::ConstantShapeHelper::getInstance()->bufferForShapeInfo( + *buffer = sd::ConstantShapeHelper::getInstance()->bufferForShapeInfo( ShapeDescriptor(dtype, order, shape, strides, rank, ews, empty)); return buffer; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -void deleteShapeBuffer(nd4j::ConstantDataBuffer* ptr) { +void deleteShapeBuffer(sd::ConstantDataBuffer* ptr) { delete ptr; } -void deleteTadPack(nd4j::TadPack* ptr) { +void deleteTadPack(sd::TadPack* ptr) { delete ptr; } @@ -3521,61 +3521,61 @@ bool isBlasVersionMatches(int major, int minor, int build) { if (!result) { nd4j_printf("CUDA/cuBLAS version mismatch. 
Expected: %i.%i.%i but got %i.%i.%i instead\n", Environment::getInstance()->_blasMajorVersion, Environment::getInstance()->_blasMinorVersion, Environment::getInstance()->_blasPatchVersion, major, minor, build); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(152); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage("CUDA/cuBLAS version mismatch"); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(152); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage("CUDA/cuBLAS version mismatch"); } return result; } -nd4j::ConstantDataBuffer* constantBufferLong(nd4j::DataType dtype, Nd4jLong *data, int length) { - return nd4j::ConstantHelper::getInstance()->constantBuffer(ConstantDescriptor(data, length), dtype); +sd::ConstantDataBuffer* constantBufferLong(sd::DataType dtype, Nd4jLong *data, int length) { + return sd::ConstantHelper::getInstance()->constantBuffer(ConstantDescriptor(data, length), dtype); } -nd4j::ConstantDataBuffer* constantBufferDouble(nd4j::DataType dtype, double *data, int length) { - return nd4j::ConstantHelper::getInstance()->constantBuffer(ConstantDescriptor(data, length), dtype); +sd::ConstantDataBuffer* constantBufferDouble(sd::DataType dtype, double *data, int length) { + return sd::ConstantHelper::getInstance()->constantBuffer(ConstantDescriptor(data, length), dtype); } -nd4j::ConstantDataBuffer* constantBuffer(nd4j::DataType dtype, nd4j::ConstantDescriptor *descriptor) { - return nd4j::ConstantHelper::getInstance()->constantBuffer(*descriptor, dtype); +sd::ConstantDataBuffer* constantBuffer(sd::DataType dtype, sd::ConstantDescriptor *descriptor) { + return sd::ConstantHelper::getInstance()->constantBuffer(*descriptor, dtype); } -Nd4jPointer getConstantDataBufferPrimary(nd4j::ConstantDataBuffer* dbf) { +Nd4jPointer getConstantDataBufferPrimary(sd::ConstantDataBuffer* dbf) { return dbf->primary(); } -Nd4jPointer 
getConstantDataBufferSpecial(nd4j::ConstantDataBuffer* dbf) { +Nd4jPointer getConstantDataBufferSpecial(sd::ConstantDataBuffer* dbf) { return dbf->special(); } -Nd4jLong getConstantDataBufferLength(nd4j::ConstantDataBuffer* dbf) { +Nd4jLong getConstantDataBufferLength(sd::ConstantDataBuffer* dbf) { return dbf->length(); } -Nd4jLong getConstantDataBufferSizeOf(nd4j::ConstantDataBuffer* dbf) { +Nd4jLong getConstantDataBufferSizeOf(sd::ConstantDataBuffer* dbf) { return dbf->sizeOf(); } -nd4j::graph::Context* createGraphContext(int nodeId) { - return new nd4j::graph::Context(nodeId); +sd::graph::Context* createGraphContext(int nodeId) { + return new sd::graph::Context(nodeId); } -nd4j::graph::RandomGenerator* getGraphContextRandomGenerator(nd4j::graph::Context* ptr) { +sd::graph::RandomGenerator* getGraphContextRandomGenerator(sd::graph::Context* ptr) { return &ptr->randomGenerator(); } -void markGraphContextInplace(nd4j::graph::Context* ptr, bool reallyInplace) { +void markGraphContextInplace(sd::graph::Context* ptr, bool reallyInplace) { ptr->markInplace(reallyInplace); } -void setGraphContextCudaContext(nd4j::graph::Context* ptr, void *stream, void *reductionPointer, void *allocationPointer) { +void setGraphContextCudaContext(sd::graph::Context* ptr, void *stream, void *reductionPointer, void *allocationPointer) { ptr->setCudaContext(stream, reductionPointer, allocationPointer); } -void setGraphContextInputArray(nd4j::graph::Context* ptr, int index, void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo) { +void setGraphContextInputArray(sd::graph::Context* ptr, int index, void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo) { ptr->setInputArray(index, buffer, shapeInfo, specialBuffer, specialShapeInfo); } -void setGraphContextOutputArray(nd4j::graph::Context* ptr, int index, void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo) { +void setGraphContextOutputArray(sd::graph::Context* ptr, int index, 
void *buffer, void *shapeInfo, void *specialBuffer, void *specialShapeInfo) { ptr->setOutputArray(index, buffer, shapeInfo, specialBuffer, specialShapeInfo); } @@ -3587,62 +3587,62 @@ void setGraphContextOutputBuffer(OpaqueContext* ptr, int index, OpaqueDataBuffer ptr->setOutputArray(index, buffer, shapeInfo, specialShapeInfo); } -void setGraphContextTArguments(nd4j::graph::Context* ptr, double *arguments, int numberOfArguments) { +void setGraphContextTArguments(sd::graph::Context* ptr, double *arguments, int numberOfArguments) { ptr->setTArguments(arguments, numberOfArguments); } -void setGraphContextIArguments(nd4j::graph::Context* ptr, Nd4jLong *arguments, int numberOfArguments) { +void setGraphContextIArguments(sd::graph::Context* ptr, Nd4jLong *arguments, int numberOfArguments) { ptr->setIArguments(arguments, numberOfArguments); } -void setGraphContextBArguments(nd4j::graph::Context* ptr, bool *arguments, int numberOfArguments) { +void setGraphContextBArguments(sd::graph::Context* ptr, bool *arguments, int numberOfArguments) { ptr->setBArguments(arguments, numberOfArguments); } void setGraphContextDArguments(OpaqueContext* ptr, int *arguments, int numberOfArguments) { - std::vector dtypes(numberOfArguments); + std::vector dtypes(numberOfArguments); for (int e = 0; e < numberOfArguments; e++) - dtypes[e] = (nd4j::DataType) arguments[e]; + dtypes[e] = (sd::DataType) arguments[e]; ptr->setDArguments(dtypes); } -void deleteGraphContext(nd4j::graph::Context* ptr) { +void deleteGraphContext(sd::graph::Context* ptr) { delete ptr; } -nd4j::graph::RandomGenerator* createRandomGenerator(Nd4jLong rootSeed, Nd4jLong nodeSeed) { +sd::graph::RandomGenerator* createRandomGenerator(Nd4jLong rootSeed, Nd4jLong nodeSeed) { try { - return new nd4j::graph::RandomGenerator(rootSeed, nodeSeed); + return new sd::graph::RandomGenerator(rootSeed, nodeSeed); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - 
nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } -Nd4jLong getRandomGeneratorRootState(nd4j::graph::RandomGenerator* ptr) { +Nd4jLong getRandomGeneratorRootState(sd::graph::RandomGenerator* ptr) { return ptr->rootState(); } -Nd4jLong getRandomGeneratorNodeState(nd4j::graph::RandomGenerator* ptr) { +Nd4jLong getRandomGeneratorNodeState(sd::graph::RandomGenerator* ptr) { return ptr->nodeState(); } -void setRandomGeneratorStates(nd4j::graph::RandomGenerator* ptr, Nd4jLong rootSeed, Nd4jLong nodeSeed) { +void setRandomGeneratorStates(sd::graph::RandomGenerator* ptr, Nd4jLong rootSeed, Nd4jLong nodeSeed) { ptr->setStates(rootSeed, nodeSeed); } -int getRandomGeneratorRelativeInt(nd4j::graph::RandomGenerator* ptr, Nd4jLong index) { +int getRandomGeneratorRelativeInt(sd::graph::RandomGenerator* ptr, Nd4jLong index) { return ptr->relativeInt(index); } -Nd4jLong getRandomGeneratorRelativeLong(nd4j::graph::RandomGenerator* ptr, Nd4jLong index) { +Nd4jLong getRandomGeneratorRelativeLong(sd::graph::RandomGenerator* ptr, Nd4jLong index) { return ptr->relativeLong(index); } -void deleteRandomGenerator(nd4j::graph::RandomGenerator* ptr) { +void deleteRandomGenerator(sd::graph::RandomGenerator* ptr) { delete ptr; } @@ -3665,27 +3665,27 @@ Nd4jPointer shapeBufferForNumpy(Nd4jPointer npyArray) { Nd4jLong *shapeBuffer; if (shape.size() == 1 && shape[0] == 0) { // scalar case - shapeBuffer = nd4j::ShapeBuilders::createScalarShapeInfo(dtype); + shapeBuffer = sd::ShapeBuilders::createScalarShapeInfo(dtype); } else if (_empty) { if (shapeSize > 0) - shapeBuffer = nd4j::ShapeBuilders::emptyShapeInfo(dtype, arr.fortranOrder ? 'f' : 'c', shape); + shapeBuffer = sd::ShapeBuilders::emptyShapeInfo(dtype, arr.fortranOrder ? 
'f' : 'c', shape); else - shapeBuffer = nd4j::ShapeBuilders::emptyShapeInfo(dtype); + shapeBuffer = sd::ShapeBuilders::emptyShapeInfo(dtype); } else { - shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(dtype, arr.fortranOrder ? 'f' : 'c', shape); + shapeBuffer = sd::ShapeBuilders::createShapeInfo(dtype, arr.fortranOrder ? 'f' : 'c', shape); } - return reinterpret_cast(nd4j::ConstantShapeHelper::getInstance()->createFromExisting(shapeBuffer, + return reinterpret_cast(sd::ConstantShapeHelper::getInstance()->createFromExisting(shapeBuffer, true)); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } const char* runLightBenchmarkSuit(bool printOut) { try { - nd4j::LightBenchmarkSuit suit; + sd::LightBenchmarkSuit suit; auto result = suit.runSuit(); if (printOut) @@ -3697,15 +3697,15 @@ const char* runLightBenchmarkSuit(bool printOut) { return chars; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } const char* runFullBenchmarkSuit(bool printOut) { try { - nd4j::FullBenchmarkSuit suit; + sd::FullBenchmarkSuit suit; auto result = suit.runSuit(); if (printOut) @@ -3717,17 +3717,17 @@ const char* runFullBenchmarkSuit(bool printOut) { return chars; } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + 
sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } Nd4jLong getCachedMemory(int deviceId) { - return nd4j::ConstantHelper::getInstance()->getCachedAmount(deviceId); + return sd::ConstantHelper::getInstance()->getCachedAmount(deviceId); } -nd4j::LaunchContext* defaultLaunchContext() { +sd::LaunchContext* defaultLaunchContext() { return LaunchContext::defaultContext(); } @@ -3760,11 +3760,11 @@ Nd4jPointer lcSolverHandle(OpaqueLaunchContext* lc) { } int lastErrorCode() { - return nd4j::LaunchContext::defaultContext()->errorReference()->errorCode(); + return sd::LaunchContext::defaultContext()->errorReference()->errorCode(); } const char* lastErrorMessage() { - return nd4j::LaunchContext::defaultContext()->errorReference()->errorMessage(); + return sd::LaunchContext::defaultContext()->errorReference()->errorMessage(); } void ctxShapeFunctionOverride(OpaqueContext* ptr, bool reallyOverride) { @@ -3805,10 +3805,10 @@ void ctxSetExecutionMode(OpaqueContext* ptr, int execMode) { OpaqueDataBuffer* allocateDataBuffer(Nd4jLong elements, int dataType, bool allocateBoth) { try { auto dtype = DataTypeUtils::fromInt(dataType); - return new nd4j::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype), dtype, allocateBoth); + return new sd::InteropDataBuffer(elements * DataTypeUtils::sizeOf(dtype), dtype, allocateBoth); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); return nullptr; } } @@ -3845,8 +3845,8 @@ void dbExpandBuffer(OpaqueDataBuffer *dataBuffer, Nd4jLong elements) { try { dataBuffer->dataBuffer()->expand(elements * 
DataTypeUtils::sizeOf(dataBuffer->dataBuffer()->getDataType())); } catch (std::exception &e) { - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); - nd4j::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); + sd::LaunchContext::defaultContext()->errorReference()->setErrorCode(1); + sd::LaunchContext::defaultContext()->errorReference()->setErrorMessage(e.what()); } } diff --git a/libnd4j/blas/Environment.cpp b/libnd4j/include/legacy/impl/Environment.cpp similarity index 89% rename from libnd4j/blas/Environment.cpp rename to libnd4j/include/legacy/impl/Environment.cpp index 3b9502534..491d33569 100644 --- a/libnd4j/blas/Environment.cpp +++ b/libnd4j/include/legacy/impl/Environment.cpp @@ -22,7 +22,7 @@ #include #include #include -#include "Environment.h" +#include "system/Environment.h" #include #include #include @@ -38,12 +38,12 @@ #include #include -#include "BlasVersionHelper.h" +#include #endif -namespace nd4j { +namespace sd { - nd4j::Environment::Environment() { + sd::Environment::Environment() { _tadThreshold.store(1); _elementThreshold.store(1024); _verbose.store(false); @@ -51,7 +51,7 @@ namespace nd4j { _profile.store(false); _precBoost.store(false); _leaks.store(false); - _dataType.store(nd4j::DataType::FLOAT32); + _dataType.store(sd::DataType::FLOAT32); _maxThreads = std::thread::hardware_concurrency(); _maxMasterThreads = _maxThreads.load(); @@ -189,7 +189,7 @@ namespace nd4j { #endif } - nd4j::Environment::~Environment() { + sd::Environment::~Environment() { // } @@ -220,7 +220,7 @@ namespace nd4j { return _experimental; } - nd4j::DataType Environment::defaultFloatDataType() { + sd::DataType Environment::defaultFloatDataType() { return _dataType.load(); } @@ -228,8 +228,8 @@ namespace nd4j { return _capabilities; } - void Environment::setDefaultFloatDataType(nd4j::DataType dtype) { - if (dtype != nd4j::DataType::FLOAT32 && dtype != nd4j::DataType::DOUBLE && dtype != nd4j::DataType::FLOAT8 && dtype != 
nd4j::DataType::HALF) + void Environment::setDefaultFloatDataType(sd::DataType dtype) { + if (dtype != sd::DataType::FLOAT32 && dtype != sd::DataType::DOUBLE && dtype != sd::DataType::FLOAT8 && dtype != sd::DataType::HALF) throw std::runtime_error("Default Float data type must be one of [FLOAT8, FLOAT16, FLOAT32, DOUBLE]"); _dataType.store(dtype); @@ -344,27 +344,27 @@ namespace nd4j { } void Environment::setGroupLimit(int group, Nd4jLong numBytes) { - nd4j::memory::MemoryCounter::getInstance()->setGroupLimit((nd4j::memory::MemoryType) group, numBytes); + sd::memory::MemoryCounter::getInstance()->setGroupLimit((sd::memory::MemoryType) group, numBytes); } void Environment::setDeviceLimit(int deviceId, Nd4jLong numBytes) { - nd4j::memory::MemoryCounter::getInstance()->setDeviceLimit(deviceId, numBytes); + sd::memory::MemoryCounter::getInstance()->setDeviceLimit(deviceId, numBytes); } Nd4jLong Environment::getGroupLimit(int group) { - return nd4j::memory::MemoryCounter::getInstance()->groupLimit((nd4j::memory::MemoryType) group); + return sd::memory::MemoryCounter::getInstance()->groupLimit((sd::memory::MemoryType) group); } Nd4jLong Environment::getDeviceLimit(int deviceId) { - return nd4j::memory::MemoryCounter::getInstance()->deviceLimit(deviceId); + return sd::memory::MemoryCounter::getInstance()->deviceLimit(deviceId); } Nd4jLong Environment::getGroupCounter(int group) { - return nd4j::memory::MemoryCounter::getInstance()->allocatedGroup((nd4j::memory::MemoryType) group); + return sd::memory::MemoryCounter::getInstance()->allocatedGroup((sd::memory::MemoryType) group); } Nd4jLong Environment::getDeviceCounter(int deviceId) { - return nd4j::memory::MemoryCounter::getInstance()->allocatedDevice(deviceId); + return sd::memory::MemoryCounter::getInstance()->allocatedDevice(deviceId); } uint64_t Environment::maxPrimaryMemory() { @@ -375,6 +375,6 @@ namespace nd4j { return _maxTotalSpecialMemory.load(); } - nd4j::Environment *nd4j::Environment::_instance = 0; + 
sd::Environment *sd::Environment::_instance = 0; } diff --git a/libnd4j/include/cnpy/cnpy.cpp b/libnd4j/include/legacy/impl/cnpy.cpp similarity index 96% rename from libnd4j/include/cnpy/cnpy.cpp rename to libnd4j/include/legacy/impl/cnpy.cpp index a09b38bfd..ee4fa36b0 100644 --- a/libnd4j/include/cnpy/cnpy.cpp +++ b/libnd4j/include/legacy/impl/cnpy.cpp @@ -26,9 +26,9 @@ //Released under MIT License //license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php -#include +#include #include -#include"cnpy.h" +#include #include @@ -104,7 +104,7 @@ char cnpy::mapType() { else return '?'; } -nd4j::DataType cnpy::dataTypeFromHeader(char *data) { +sd::DataType cnpy::dataTypeFromHeader(char *data) { // indices for type & data size const int st = 10; @@ -120,31 +120,31 @@ nd4j::DataType cnpy::dataTypeFromHeader(char *data) { switch (t) { case 'b': - return nd4j::DataType::BOOL; + return sd::DataType::BOOL; case 'i': switch (s) { - case '1': return nd4j::DataType::INT8; - case '2': return nd4j::DataType::INT16; - case '4': return nd4j::DataType::INT32; - case '8': return nd4j::DataType::INT64; + case '1': return sd::DataType::INT8; + case '2': return sd::DataType::INT16; + case '4': return sd::DataType::INT32; + case '8': return sd::DataType::INT64; default: throw std::runtime_error("Only data sizes of [1, 2, 4, 8] are supported for Integer data types import"); } case 'f': switch (s) { - case '1': return nd4j::DataType::FLOAT8; - case '2': return nd4j::DataType::HALF; - case '4': return nd4j::DataType::FLOAT32; - case '8': return nd4j::DataType::DOUBLE; + case '1': return sd::DataType::FLOAT8; + case '2': return sd::DataType::HALF; + case '4': return sd::DataType::FLOAT32; + case '8': return sd::DataType::DOUBLE; default: throw std::runtime_error("Only data sizes of [1, 2, 4, 8] are supported for Float data types import"); } case 'u': switch (s) { - case '1': return nd4j::DataType::UINT8; - case '2': return nd4j::DataType::UINT16; - case '4': 
return nd4j::DataType::UINT32; - case '8': return nd4j::DataType::UINT64; + case '1': return sd::DataType::UINT8; + case '2': return sd::DataType::UINT16; + case '4': return sd::DataType::UINT32; + case '8': return sd::DataType::UINT64; default: throw std::runtime_error("Only data sizes of [1, 2, 4, 8] are supported for Unsigned data types import"); } diff --git a/libnd4j/include/loops/BroadcastPairwiseConverter.h b/libnd4j/include/loops/BroadcastPairwiseConverter.h index f1fda4a9a..acb7e8d64 100644 --- a/libnd4j/include/loops/BroadcastPairwiseConverter.h +++ b/libnd4j/include/loops/BroadcastPairwiseConverter.h @@ -21,10 +21,10 @@ #ifndef DEV_TESTS_BROADCASTPAIRWISECONVERTER_H #define DEV_TESTS_BROADCASTPAIRWISECONVERTER_H -#include +#include #include -namespace nd4j { +namespace sd { ////////////////////////////////////////////////////////////////////////// inline pairwise::Ops fromBroadcastToPairwise(broadcast::Ops op) { diff --git a/libnd4j/include/loops/BroadcastScalarConverter.h b/libnd4j/include/loops/BroadcastScalarConverter.h index ee745901a..12006c293 100644 --- a/libnd4j/include/loops/BroadcastScalarConverter.h +++ b/libnd4j/include/loops/BroadcastScalarConverter.h @@ -20,10 +20,10 @@ #ifndef DEV_TESTS_BROADCASTSCALARCONVERTER_H #define DEV_TESTS_BROADCASTSCALARCONVERTER_H -#include +#include #include -namespace nd4j { +namespace sd { inline bool isConvertibleToScalar(broadcast::Ops op) { int opNum = (int) op; diff --git a/libnd4j/include/loops/TrueBroadcastHelper.h b/libnd4j/include/loops/TrueBroadcastHelper.h index 4101aa08e..71934b674 100644 --- a/libnd4j/include/loops/TrueBroadcastHelper.h +++ b/libnd4j/include/loops/TrueBroadcastHelper.h @@ -21,9 +21,9 @@ #ifndef LIBND4J_TRUEBROADCASTHELPER_H #define LIBND4J_TRUEBROADCASTHELPER_H -#include +#include -namespace nd4j { +namespace sd { namespace helpers { //////////////////////////////////////////////////////////////////////// @@ -39,7 +39,7 @@ class TrueBroadcastHelper { #endif public: - static void 
exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr); + static void exec(const sd::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr); }; template @@ -55,7 +55,7 @@ class TrueBroadcastBoolHelper { public: - static void exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr); + static void exec(const sd::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr); }; //////////////////////////////////////////////////////////////////////// @@ -72,7 +72,7 @@ class TrueBroadcastIntHelper { public: - static void exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr); + static void exec(const sd::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr); }; diff --git a/libnd4j/include/loops/broadcasting.h b/libnd4j/include/loops/broadcasting.h index ebf702004..4d53e4c73 100755 --- a/libnd4j/include/loops/broadcasting.h +++ b/libnd4j/include/loops/broadcasting.h @@ -23,12 +23,12 @@ #ifndef BROADCASTING_H_ #define BROADCASTING_H_ -#include +#include #include -#include -#include +#include +#include #include -#include +#include #include #ifdef __CUDACC__ @@ -123,7 +123,7 @@ namespace functions { Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetZ, - nd4j::LoopKind::Kind loopKind, + sd::LoopKind::Kind loopKind, uint64_t start, uint64_t stop); @@ -151,7 +151,7 @@ namespace functions { Nd4jLong *tadOffset, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetZ, - nd4j::LoopKind::Kind loopKind, + sd::LoopKind::Kind loopKind, uint64_t start, uint64_t stop); diff --git a/libnd4j/include/loops/broadcasting_bool.h b/libnd4j/include/loops/broadcasting_bool.h index 7ba5fa9eb..23ed95a21 100644 --- a/libnd4j/include/loops/broadcasting_bool.h +++ b/libnd4j/include/loops/broadcasting_bool.h @@ -23,12 +23,12 @@ #ifndef BROADCASTING_BOOL_H_ #define BROADCASTING_BOOL_H_ -#include 
+#include #include -#include -#include +#include +#include #include -#include +#include #include #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/broadcasting_int.h b/libnd4j/include/loops/broadcasting_int.h index 92e4ca7dd..2c33491b2 100644 --- a/libnd4j/include/loops/broadcasting_int.h +++ b/libnd4j/include/loops/broadcasting_int.h @@ -23,12 +23,12 @@ #ifndef BROADCASTING_INT_H_ #define BROADCASTING_INT_H_ -#include +#include #include -#include -#include +#include +#include #include -#include +#include #include #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp index f18f0c788..7edb9d90d 100644 --- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp +++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp @@ -24,7 +24,7 @@ using namespace simdOps; -namespace nd4j { +namespace sd { namespace helpers { //////////////////////////////////////////////////////////////////////// @@ -141,7 +141,7 @@ namespace nd4j { } template - void TrueBroadcastHelper::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + void TrueBroadcastHelper::exec(const sd::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_OPS); } @@ -204,7 +204,7 @@ namespace nd4j { } template - void TrueBroadcastBoolHelper::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + void TrueBroadcastBoolHelper::exec(const sd::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { DISPATCH_BY_OPNUM_TT(exec, PARAMS(xArr, yArr, zArr), BROADCAST_BOOL_OPS); } @@ -267,7 +267,7 @@ namespace nd4j { } template - void TrueBroadcastIntHelper::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { + void TrueBroadcastIntHelper::exec(const sd::broadcast::IntOps opNum, const NDArray& 
xArr, const NDArray& yArr, NDArray& zArr) { DISPATCH_BY_OPNUM_T(exec, PARAMS(xArr, yArr, zArr), BROADCAST_INT_OPS); } diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 7226d00b3..11dcb56f2 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -18,11 +18,11 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -#include +#include #include #include #include @@ -76,7 +76,7 @@ namespace functions { Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffset, - nd4j::LoopKind::Kind loopKind, + sd::LoopKind::Kind loopKind, uint64_t start, uint64_t stop) { DISPATCH_BY_OPNUM_TTT(exec, PARAMS(x, @@ -107,7 +107,7 @@ namespace functions { Nd4jLong *xTadOffset, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffset, - nd4j::LoopKind::Kind loopKind, + sd::LoopKind::Kind loopKind, uint64_t start, uint64_t stop) { @@ -123,7 +123,7 @@ namespace functions { auto tadOffsets = xTadOffset; if (xTadShapeInfo == nullptr || tadOffsets == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); xTadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); @@ -146,15 +146,15 @@ namespace functions { auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = - (loopKind == nd4j::LoopKind::BROADCAST_SCALAR_X || - loopKind == nd4j::LoopKind::BROADCAST_SCALAR_Y || - loopKind == nd4j::LoopKind::BROADCAST_3D || - loopKind == nd4j::LoopKind::BROADCAST_4D || - loopKind == nd4j::LoopKind::BROADCAST_5D) - ? 
loopKind : nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = + (loopKind == sd::LoopKind::BROADCAST_SCALAR_X || + loopKind == sd::LoopKind::BROADCAST_SCALAR_Y || + loopKind == sd::LoopKind::BROADCAST_3D || + loopKind == sd::LoopKind::BROADCAST_4D || + loopKind == sd::LoopKind::BROADCAST_5D) + ? loopKind : sd::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -164,7 +164,7 @@ namespace functions { oZ[f] = OpType::op(oX[f], y[f]); } } - else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO){ + else if(kindOfLoop == sd::LoopKind::EWSNONZERO){ for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -173,7 +173,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f * zEws] = OpType::op(oX[f * xEws], y[f * yEws]); } - } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_X){ + } else if(kindOfLoop == sd::LoopKind::BROADCAST_SCALAR_X){ // this loop effectively turns broadcast into series of scalar ops auto loopLength = yShapeInfo[shape::rank(yShapeInfo)]; @@ -187,7 +187,7 @@ namespace functions { for (Nd4jLong f = 0; f < loopLength; f++) oZ[f] = OpType::op(oX, oY[f]); } - } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){ + } else if(kindOfLoop == sd::LoopKind::BROADCAST_SCALAR_Y){ // this loop effectively turns broadcast into series of scalar ops auto loopLength = xShapeInfo[shape::rank(xShapeInfo)]; @@ -202,7 +202,7 @@ namespace functions { oZ[f] = OpType::op(oX[f], oY); } } - else if (kindOfLoop == nd4j::LoopKind::BROADCAST_3D) { + else if (kindOfLoop == sd::LoopKind::BROADCAST_3D) { int xRank = shape::rank(xShapeInfo); int yRank = shape::rank(yShapeInfo); @@ -211,7 +211,7 @@ namespace functions { auto zStrides = 
shape::stride(zShapeInfo); Nd4jLong yStrides[3] = { 0,0,0 }; - nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + sd::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1); uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2); @@ -231,7 +231,7 @@ namespace functions { } } - else if (kindOfLoop == nd4j::LoopKind::BROADCAST_4D) { + else if (kindOfLoop == sd::LoopKind::BROADCAST_4D) { int xRank = shape::rank(xShapeInfo); int yRank = shape::rank(yShapeInfo); @@ -240,7 +240,7 @@ namespace functions { auto zStrides = shape::stride(zShapeInfo); Nd4jLong yStrides[4] = { 0,0,0,0 }; - nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + sd::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1); uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2); @@ -263,7 +263,7 @@ namespace functions { } } - else if (kindOfLoop == nd4j::LoopKind::BROADCAST_5D) { + else if (kindOfLoop == sd::LoopKind::BROADCAST_5D) { int xRank = shape::rank(xShapeInfo); int yRank = shape::rank(yShapeInfo); @@ -272,7 +272,7 @@ namespace functions { auto zStrides = shape::stride(zShapeInfo); Nd4jLong yStrides[5] = { 0,0,0,0,0 }; - nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); + sd::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); @@ -301,7 +301,7 @@ namespace functions { } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; - bool canCastX = 
nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; @@ -317,8 +317,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i++) { @@ -336,8 +336,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; @@ -354,8 +354,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); for (auto i = start; i 
< stop; i++) { auto oZ = z + zTadOffset[i]; @@ -373,9 +373,9 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; @@ -423,7 +423,7 @@ namespace functions { auto tadOffsets = yTadOffset; if (yTadShapeInfo == nullptr || tadOffsets == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); yTadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); @@ -442,16 +442,16 @@ namespace functions { auto lenX = shape::length(xShapeInfo); int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); + int threads = sd::math::nd4j_max(1, tadsPerThread); + threads = sd::math::nd4j_min(threads, sd::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = 
sd::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); - if(kindOfLoop == nd4j::LoopKind::EWS1) { + if(kindOfLoop == sd::LoopKind::EWS1) { for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -461,7 +461,7 @@ namespace functions { oZ[f] = OpType::op(x[f], oY[f]); } } - else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + else if(kindOfLoop == sd::LoopKind::EWSNONZERO) { for (auto i = start; i < stop; i++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -473,7 +473,7 @@ namespace functions { } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i++) { auto oY = x + tadOffsets[i]; @@ -489,8 +489,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; @@ -507,8 +507,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, 
tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; @@ -525,8 +525,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; @@ -544,9 +544,9 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i++) { auto oZ = z + zTadOffset[i]; diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index faf6fdff6..55f681355 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -18,11 +18,11 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -#include +#include #include #include 
@@ -126,7 +126,7 @@ namespace functions { auto tadOffsets = xTadOffset; if (xTadShapeInfo == nullptr || tadOffsets == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); xTadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); @@ -145,16 +145,16 @@ namespace functions { auto lenY = shape::length(yShapeInfo); int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); + int threads = sd::math::nd4j_max(1, tadsPerThread); + threads = sd::math::nd4j_min(threads, sd::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto i = start; i < stop; i++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -164,7 +164,7 @@ namespace functions { oZ[f] = OpType::op(oX[f], y[f], extraParams); } } - else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + else if(kindOfLoop == sd::LoopKind::EWSNONZERO) { for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -176,7 +176,7 @@ namespace functions { } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; - bool canCastX = 
nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -192,8 +192,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -210,8 +210,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -229,8 +229,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, 
yShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -248,9 +248,9 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -299,7 +299,7 @@ namespace functions { auto tadOffsets = yTadOffset; if (yTadShapeInfo == nullptr || tadOffsets == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); yTadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); @@ -318,16 +318,16 @@ namespace functions { auto lenX = shape::length(xShapeInfo); int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); + int threads = sd::math::nd4j_max(1, tadsPerThread); + threads = sd::math::nd4j_min(threads, sd::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); + const sd::LoopKind::Kind 
kindOfLoop = sd::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -337,7 +337,7 @@ namespace functions { oZ[f] = OpType::op(x[f], oY[f], extraParams); } } - else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + else if(kindOfLoop == sd::LoopKind::EWSNONZERO) { for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -350,7 +350,7 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; @@ -367,8 +367,8 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -386,8 +386,8 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = 
sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -405,8 +405,8 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -425,9 +425,9 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp b/libnd4j/include/loops/cpu/broadcasting_int.cpp index 9737cb4bb..231c56946 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -18,11 +18,11 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -#include +#include #include #include @@ -120,7 +120,7 @@ namespace functions { auto tadOffsets = xTadOffset; if (xTadShapeInfo == nullptr || tadOffsets == nullptr) { - auto tadPack = 
nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); xTadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); @@ -139,16 +139,16 @@ namespace functions { auto lenY = shape::length(yShapeInfo); int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); + int threads = sd::math::nd4j_max(1, tadsPerThread); + threads = sd::math::nd4j_min(threads, sd::Environment::getInstance()->maxThreads()); auto xEws = shape::elementWiseStride(xTadShapeShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -158,7 +158,7 @@ namespace functions { oZ[f] = OpType::op(oX[f], y[f]); }; } - else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + else if(kindOfLoop == sd::LoopKind::EWSNONZERO) { for (auto i = start; i < stop; i ++) { auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -170,7 +170,7 @@ namespace functions { } else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, 
tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -186,8 +186,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, yShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -204,8 +204,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -222,8 +222,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -241,9 +241,9 @@ namespace functions { uint 
tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -290,7 +290,7 @@ namespace functions { auto tadOffsets = yTadOffset; if (yTadShapeInfo == nullptr || tadOffsets == nullptr) { - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); yTadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); @@ -309,16 +309,16 @@ namespace functions { auto lenX = shape::length(xShapeInfo); int tadsPerThread = tads / TAD_THRESHOLD; - int threads = nd4j::math::nd4j_max(1, tadsPerThread); - threads = nd4j::math::nd4j_min(threads, nd4j::Environment::getInstance()->maxThreads()); + int threads = sd::math::nd4j_max(1, tadsPerThread); + threads = sd::math::nd4j_min(threads, sd::Environment::getInstance()->maxThreads()); auto yEws = shape::elementWiseStride(yTadShapeShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); auto zEws = shape::elementWiseStride(zTadShapeInfo); - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXYZ(yTadShapeShapeInfo, xShapeInfo, zTadShapeInfo); - if (kindOfLoop == 
nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -328,7 +328,7 @@ namespace functions { oZ[f] = OpType::op(x[f], oY[f]); }; } - else if(kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + else if(kindOfLoop == sd::LoopKind::EWSNONZERO) { for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; auto oZ = z + zTadOffset[i]; @@ -340,7 +340,7 @@ namespace functions { } else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, xShapeInfo) && shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oY = y + tadOffsets[i]; @@ -357,8 +357,8 @@ namespace functions { uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -374,8 +374,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yTadShapeShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, 
tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -392,8 +392,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, zTadShapeInfo)) { uint tadShapeShapeInfoCast[MAX_RANK]; uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; @@ -411,9 +411,9 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint tadShapeShapeInfoCast[MAX_RANK]; uint tadShapeInfoZCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yTadShapeShapeInfo, tadShapeShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zTadShapeInfo, tadShapeInfoZCast); for (auto i = start; i < stop; i ++) { auto oZ = z + zTadOffset[i]; diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_0.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_0.cpp index 4685e8f5b..7bb2e8d81 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_0.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_0.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_9); } diff --git 
a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_1.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_1.cpp index 6c9eb4022..7cacbe035 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_1.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_1.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_0); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_2.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_2.cpp index 4d020fdfa..c98abca4a 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_2.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_2.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_1); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_3.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_3.cpp index 327df9a88..10d29053c 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_3.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_3.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_2); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_4.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_4.cpp index 3fb868278..ef72d78f8 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_4.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_4.cpp @@ -20,7 +20,7 @@ #include 
"../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_3); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_5.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_5.cpp index 9b281516a..c90b23f8e 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_5.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_5.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_4); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_6.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_6.cpp index dbdd65907..ba6bcb5a3 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_6.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_6.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_5); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_7.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_7.cpp index 5b3beaec6..e8c62f3b7 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_7.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_7.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_6); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_8.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_8.cpp index 51e81d32a..7771d89b8 
100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_8.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_8.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_7); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_9.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_9.cpp index 77f5e5720..1c1cf1cb0 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_9.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_9.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_PAIRWISE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastHelper, , PAIRWISE_TYPES_8); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_bool.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_bool.cpp index c2e116666..23013f8a0 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_bool.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_bool.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TrueBroadcastBoolHelper, , LIBND4J_TYPES, BOOL_TYPES); } diff --git a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_int.cpp b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_int.cpp index 04b0fc3e4..00d4b6a93 100644 --- a/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_int.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/TrueBroadcastHelper_int.cpp @@ -20,7 +20,7 @@ #include "../TrueBroadcastHelper.hpp" -namespace nd4j { +namespace sd { namespace helpers { BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT 
TrueBroadcastIntHelper, , INTEGER_TYPES); } diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_0.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_0.cpp index c3f71b5c4..89b85485a 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_0.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_0.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_0, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_0, (sd::DataType::INT32, int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_1.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_1.cpp index 2eeb1e37d..ada7844cb 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_1.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_1.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_1, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_1, (sd::DataType::INT32, int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_2.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_2.cpp index da0c55f51..47dce2d5a 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_2.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_2.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_2, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_2, (sd::DataType::INT32, 
int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_3.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_3.cpp index 3c255aa4f..c3d33e7f1 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_3.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_3.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_3, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_3, (sd::DataType::INT32, int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_4.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_4.cpp index e04ce70e2..37a81e441 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_4.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_4.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_4, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_4, (sd::DataType::INT32, int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_5.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_5.cpp index 17dd63e46..1d6555ddf 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_5.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_5.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_5, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_5, (sd::DataType::INT32, int32_t)); 
} } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_6.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_6.cpp index c84fb089a..0bb8aef4d 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_6.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_6.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_6, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_6, (sd::DataType::INT32, int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_7.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_7.cpp index 155c4e3f4..a7d3c733f 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_7.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_7.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_7, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_7, (sd::DataType::INT32, int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_8.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_8.cpp index cdcfd8c51..8c5de9653 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_8.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_8.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_8, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_8, (sd::DataType::INT32, int32_t)); } } \ No 
newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_9.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_9.cpp index 683245ef4..f61d604e2 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_9.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int32_9.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_9, (nd4j::DataType::INT32, int32_t)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_9, (sd::DataType::INT32, int32_t)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_0.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_0.cpp index 6818dea5f..d399f5e0e 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_0.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_0.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_0, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_0, (sd::DataType::INT64, Nd4jLong)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_1.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_1.cpp index c15541da5..c4df4d2e4 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_1.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_1.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_1, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_1, (sd::DataType::INT64, Nd4jLong)); } } \ No 
newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_2.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_2.cpp index a95682991..538e369eb 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_2.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_2.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_2, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_2, (sd::DataType::INT64, Nd4jLong)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_3.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_3.cpp index 22597879c..b0d082bce 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_3.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_3.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_3, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_3, (sd::DataType::INT64, Nd4jLong)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_4.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_4.cpp index a5b2afb12..98e13bb63 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_4.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_4.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_4, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_4, (sd::DataType::INT64, Nd4jLong)); } } \ No 
newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_5.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_5.cpp index 08797092a..4b7f599d9 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_5.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_5.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_5, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_5, (sd::DataType::INT64, Nd4jLong)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_6.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_6.cpp index b7ca6d81e..8d7de9822 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_6.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_6.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_6, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_6, (sd::DataType::INT64, Nd4jLong)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_7.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_7.cpp index 2eb10091a..8f9befddb 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_7.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_7.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_7, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_7, (sd::DataType::INT64, Nd4jLong)); } } \ No 
newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_8.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_8.cpp index f2a04cc8f..b38112631 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_8.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_8.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_8, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_8, (sd::DataType::INT64, Nd4jLong)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_9.cpp b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_9.cpp index 062db6187..baacdc432 100644 --- a/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_9.cpp +++ b/libnd4j/include/loops/cpu/compilation_units/indexreduce_int64_9.cpp @@ -23,6 +23,6 @@ namespace functions { namespace indexreduce { - BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_9, (nd4j::DataType::INT64, Nd4jLong)); + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT IndexReduce, , LIBND4J_TYPES_9, (sd::DataType::INT64, Nd4jLong)); } } \ No newline at end of file diff --git a/libnd4j/include/loops/cpu/indexreduce.hpp b/libnd4j/include/loops/cpu/indexreduce.hpp index f875e170b..d4abd8c82 100644 --- a/libnd4j/include/loops/cpu/indexreduce.hpp +++ b/libnd4j/include/loops/cpu/indexreduce.hpp @@ -18,13 +18,13 @@ // Created by raver on 4/9/2018. 
// -#include "../indexreduce.h" -#include -#include +#include +#include +#include #include #include #include -#include "../legacy_ops.h" +#include using namespace simdOps; @@ -60,11 +60,11 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex auto startingIndex = OpType::startingIndexValue(x); auto len = shape::length(xShapeInfo); auto xEws = shape::elementWiseStride(xShapeInfo); - nd4j::OmpLaunchHelper info(len); + sd::OmpLaunchHelper info(len); uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); IndexValue intermediatery[64]; for (int e = 0; e < maxThreads; e++) intermediatery[e].index = -1; @@ -119,8 +119,8 @@ void IndexReduce::exec(void *vx, Nd4jLong *xShapeInfo, const Nd4jLong zLen = shape::length(zShapeInfo); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto indexValue = OpType::startingIndexValue(x); @@ -142,13 +142,13 @@ void IndexReduce::exec(void *vx, Nd4jLong *xShapeInfo, if (dimensionLength < 1) return; - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); tadOnlyShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); } - nd4j::IndexReductionLoops::template loopIndexReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); + 
sd::IndexReductionLoops::template loopIndexReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams); } } diff --git a/libnd4j/include/loops/cpu/pairwise.hpp b/libnd4j/include/loops/cpu/pairwise.hpp index 1fc85e5d8..27c97efa9 100644 --- a/libnd4j/include/loops/cpu/pairwise.hpp +++ b/libnd4j/include/loops/cpu/pairwise.hpp @@ -21,11 +21,11 @@ #include #include #include -#include -#include +#include +#include #include -#include -#include +#include +#include #include using namespace simdOps; @@ -135,7 +135,7 @@ namespace functions { if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_SIMD @@ -146,7 +146,7 @@ namespace functions { } else { uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for(auto i = start; i < stop; i++) { @@ -160,20 +160,20 @@ namespace functions { - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); - if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { + if ((kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } - else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape + else if ((kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == 
sd::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -184,8 +184,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -197,8 +197,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -210,8 +210,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastX = 
sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -224,9 +224,9 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { diff --git a/libnd4j/include/loops/cpu/pairwise_bool.cpp b/libnd4j/include/loops/cpu/pairwise_bool.cpp index 2259c37b0..d77413e8c 100644 --- a/libnd4j/include/loops/cpu/pairwise_bool.cpp +++ b/libnd4j/include/loops/cpu/pairwise_bool.cpp @@ -20,8 +20,8 @@ #include #include -#include -#include +#include +#include #include using namespace simdOps; @@ -129,7 +129,7 @@ namespace functions { if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -141,7 +141,7 @@ namespace functions { } else { uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for(auto i = start; i < stop; i++) { @@ -153,19 +153,19 @@ namespace functions { return; } - const nd4j::LoopKind::Kind kindOfLoop = 
nd4j::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); - if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { + if ((kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } - else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape + else if ((kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -176,8 +176,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -189,8 +189,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - 
const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -202,8 +202,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -216,9 +216,9 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { diff --git a/libnd4j/include/loops/cpu/pairwise_int.cpp b/libnd4j/include/loops/cpu/pairwise_int.cpp index 673951d6a..9af092a0f 100644 --- a/libnd4j/include/loops/cpu/pairwise_int.cpp +++ b/libnd4j/include/loops/cpu/pairwise_int.cpp @@ -20,8 +20,8 @@ #include #include -#include -#include +#include +#include 
#include using namespace simdOps; @@ -129,7 +129,7 @@ namespace functions { if (shape::isScalar(yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_SIMD @@ -140,7 +140,7 @@ namespace functions { } else { uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for(auto i = start; i < stop; i++) { @@ -152,20 +152,20 @@ namespace functions { return; } - const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); + const sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); - if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { + if ((kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n, start, stop); } - else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape + else if ((kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo), start, stop); } else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); 
PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -176,8 +176,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -189,8 +189,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -202,8 +202,8 @@ namespace functions { else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { @@ -216,9 +216,9 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - 
const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { diff --git a/libnd4j/include/loops/cpu/random.hpp b/libnd4j/include/loops/cpu/random.hpp index 3b9b3c515..034179f07 100644 --- a/libnd4j/include/loops/cpu/random.hpp +++ b/libnd4j/include/loops/cpu/random.hpp @@ -20,9 +20,9 @@ // #include -#include +#include #include -#include +#include using namespace randomOps; @@ -53,7 +53,7 @@ namespace functions { auto length = shape::length(zShapeInfo); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -71,7 +71,7 @@ namespace functions { } else{ uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD @@ -88,8 +88,8 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD @@ -106,8 +106,8 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; 
uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD @@ -124,8 +124,8 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD @@ -143,9 +143,9 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD @@ -178,9 +178,9 @@ namespace functions { auto length = shape::length(zShapeInfo); uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + 
sd::graph::RandomGenerator* rng = reinterpret_cast(state); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -209,7 +209,7 @@ namespace functions { else { uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD @@ -234,7 +234,7 @@ namespace functions { auto length = shape::length(zShapeInfo); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); if(shape::elementWiseStride(zShapeInfo) == 1){ @@ -248,10 +248,10 @@ namespace functions { samediff::Threads::parallel_for(func, 0, length, 1); } else{ - nd4j::OmpLaunchHelper info(length); + sd::OmpLaunchHelper info(length); uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index 79eb9b209..afb441a45 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -20,10 +20,10 @@ // #include -#include +#include #include #include -#include +#include #include #include @@ -50,8 +50,8 @@ namespace functions { return; } - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -66,7 +66,7 @@ namespace functions { else { auto startingValue = OpType::startingValue(x); uint 
xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); @@ -92,7 +92,7 @@ namespace functions { else { auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); @@ -151,8 +151,8 @@ namespace functions { auto resultLength = shape::length(zShapeInfo); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -178,15 +178,15 @@ namespace functions { if (dimensionLength < 1) return; - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); tadOnlyShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); + sd::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - 
nd4j::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); + sd::ReductionBoolLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -208,7 +208,7 @@ namespace functions { Z _CUDA_H ReduceBoolFunction::execScalar(void *vx, Nd4jLong xEws, Nd4jLong length, void *vextraParams) { auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); Z intermediate[64]; PRAGMA_OMP_SIMD diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.hpp b/libnd4j/include/loops/cpu/reduce/reduce_float.hpp index 4437f52c0..40c24f4fa 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.hpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.hpp @@ -20,10 +20,10 @@ // #include -#include +#include #include #include -#include +#include #include #include @@ -47,15 +47,15 @@ namespace functions { if (shape::isEmpty(xShapeInfo)) { if (std::is_same>::value) { - z[0] = nd4j::DataTypeUtils::nanOrZero(); + z[0] = sd::DataTypeUtils::nanOrZero(); } else { z[0] = OpType::startingValue(x); } return; } - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -71,8 +71,8 @@ namespace functions { else { auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); 
+ const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); Z intermediate[64]; PRAGMA_OMP_SIMD @@ -111,7 +111,7 @@ namespace functions { else { auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); @@ -179,10 +179,10 @@ namespace functions { auto resultLength = shape::length(zShapeInfo); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; - const auto startingVal = std::is_same>::value ? nd4j::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(x)); + const auto startingVal = std::is_same>::value ? 
sd::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(x)); for (Nd4jLong i = 0; i < resultLength; i++) z[i] = startingVal; @@ -211,15 +211,15 @@ namespace functions { if (dimensionLength < 0) return; - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); tadOnlyShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); + sd::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); + sd::ReductionFloatLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -242,7 +242,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); Z intermediate[64]; PRAGMA_OMP_SIMD diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index 08664fcab..98b462ebd 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -20,10 +20,10 @@ // #include -#include +#include #include #include -#include +#include #include #include @@ -50,8 +50,8 @@ namespace functions { return; } - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == 
nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -66,8 +66,8 @@ namespace functions { else { auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); Z intermediate[64]; PRAGMA_OMP_SIMD @@ -108,7 +108,7 @@ namespace functions { else { auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); @@ -168,8 +168,8 @@ namespace functions { auto resultLength = shape::length(zShapeInfo); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -200,15 +200,15 @@ namespace functions { if (dimensionLength < 1) return; - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); tadOnlyShapeInfo = 
tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); + sd::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); + sd::ReductionLongLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -231,7 +231,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); Z intermediate[64]; PRAGMA_OMP_SIMD diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index e546f71ee..f357b7e64 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -20,10 +20,10 @@ // #include -#include +#include #include #include -#include +#include #include #include #include @@ -52,8 +52,8 @@ namespace functions { return; } - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -68,8 +68,8 @@ namespace functions { else { auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - int maxThreads = 
nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); X intermediate[64]; PRAGMA_OMP_SIMD @@ -109,7 +109,7 @@ namespace functions { } else { auto startingValue = OpType::startingValue(x); uint xShapeInfoCast[MAX_RANK]; - bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); @@ -177,8 +177,8 @@ namespace functions { auto zLength = shape::length(zShapeInfo); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -209,15 +209,15 @@ namespace functions { if (dimensionLength < 1) return; - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); tadOnlyShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); } #ifdef INLINE_LOOPS - nd4j::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); + sd::ReductionLoops::template loopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #else - nd4j::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, 
extraParams, start, stop); + sd::ReductionSameLoops::template innerloopReduce(x, xShapeInfo, z, zShapeInfo, tadOnlyShapeInfo, tadOffsets, extraParams, start, stop); #endif } @@ -240,7 +240,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); X intermediate[64]; PRAGMA_OMP_SIMD diff --git a/libnd4j/include/loops/cpu/reduce3.hpp b/libnd4j/include/loops/cpu/reduce3.hpp index 39cd63754..961c6b1c8 100644 --- a/libnd4j/include/loops/cpu/reduce3.hpp +++ b/libnd4j/include/loops/cpu/reduce3.hpp @@ -19,11 +19,11 @@ #include -#include +#include #include #include #include -#include +#include #include using namespace simdOps; @@ -48,8 +48,8 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, auto xEws = shape::elementWiseStride(xShapeInfo); auto yEws = shape::elementWiseStride(yShapeInfo); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY || nd4j::ArrayOptions::arrayType(yShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY || sd::ArrayOptions::arrayType(yShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; const auto startingVal = OpType::startingValue(x); @@ -62,10 +62,10 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, Z extraParamsVals[3] = {(Z) 0.0f, (Z) 0.0f, (Z) 0.0f}; uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); Z startingVal = OpType::startingValue(x); - int maxThreads = nd4j::math::nd4j_min(64, nd4j::Environment::getInstance()->maxThreads()); + 
int maxThreads = sd::math::nd4j_min(64, sd::Environment::getInstance()->maxThreads()); Z intermediate[64]; Z extraParamsLocal[3 * 64]; @@ -84,9 +84,9 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, } } - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, yShapeInfo); + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXZ(xShapeInfo, yShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id); @@ -107,7 +107,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, maxThreads = samediff::Threads::parallel_for(func, 0, length, 1, maxThreads); } else { uint yShapeInfoCast[MAX_RANK]; - const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + const bool canCastY = sd::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -162,9 +162,9 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, return; } #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); + sd::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); + sd::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -183,9 +183,9 @@ void Reduce3::exec(void *vx, Nd4jLong *xShapeInfo, auto z = reinterpret_cast(vz); auto extraParams = 
reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); + sd::Reduction3Loops::template loopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); + sd::Reduction3Loops::template innerloopReduce3(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, dimension, dimensionLength, extraParams, start, stop); #endif } @@ -207,9 +207,9 @@ void Reduce3:: execAll(void *vx, Nd4jLong *xShapeInfo, auto extraParams = reinterpret_cast(vextraParams); #ifdef INLINE_LOOPS - nd4j::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); + sd::Reduction3Loops::template loopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #else - nd4j::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); + sd::Reduction3Loops::template innerloopReduce3All(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, xTadShapeInfo, xOffsets, yTadShapeInfo, yOffsets, extraParams, start, stop); #endif } diff --git a/libnd4j/include/loops/cpu/scalar.hpp b/libnd4j/include/loops/cpu/scalar.hpp index 17a7e88d2..d93db7c8f 100644 --- a/libnd4j/include/loops/cpu/scalar.hpp +++ b/libnd4j/include/loops/cpu/scalar.hpp @@ -19,9 +19,9 @@ // #include "../scalar.h" -#include +#include #include -#include +#include #include #include "../legacy_ops.h" @@ -58,16 +58,16 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, const int tadLength = shape::tadLength(xShapeInfo, dimension, 
dimensionLength); const int numTads = shape::length(xShapeInfo) / tadLength; - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xTadShapeInfo, zTadShapeInfo); + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXZ(xTadShapeInfo, zTadShapeInfo); - if (kindOfLoop != nd4j::LoopKind::EWS1 && kindOfLoop != nd4j::LoopKind::EWSNONZERO) { + if (kindOfLoop != sd::LoopKind::EWS1 && kindOfLoop != sd::LoopKind::EWSNONZERO) { printf("ScalarTransform::transform: super-bad loop visited. Shouldn't ever happen\n"); return; } - int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); + int num_threads = sd::math::nd4j_min(numTads, sd::Environment::getInstance()->maxThreads()); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; @@ -147,15 +147,15 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, const auto xEws = shape::elementWiseStride(xShapeInfo); const auto zEws = shape::elementWiseStride(zShapeInfo); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + if (kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) { transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); } else { uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_SIMD @@ -166,7 +166,7 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, } else { uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = 
nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index 83e14ae66..c6f437ba8 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -19,9 +19,9 @@ // #include "../scalar_bool.h" -#include +#include #include -#include +#include #include #include "../legacy_ops.h" @@ -59,16 +59,16 @@ namespace functions { const int tadLength = shape::tadLength(xShapeInfo, dimension, dimensionLength); const int numTads = shape::length(xShapeInfo) / tadLength; - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xTadShapeInfo, zTadShapeInfo); + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXZ(xTadShapeInfo, zTadShapeInfo); - if (kindOfLoop != nd4j::LoopKind::EWS1 && kindOfLoop != nd4j::LoopKind::EWSNONZERO) { + if (kindOfLoop != sd::LoopKind::EWS1 && kindOfLoop != sd::LoopKind::EWSNONZERO) { printf("ScalarBoolTransform::transform: super-bad loop visited. 
Shouldn't ever happen\n"); return; } - int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); + int num_threads = sd::math::nd4j_min(numTads, sd::Environment::getInstance()->maxThreads()); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; @@ -152,15 +152,15 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + if (kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) { transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_SIMD @@ -171,7 +171,7 @@ namespace functions { } else { uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index 5fa51f765..ed85e28ef 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -19,9 +19,9 @@ // #include "../scalar_int.h" -#include +#include #include -#include +#include #include #include "../legacy_ops.h" @@ -59,16 +59,16 @@ namespace functions { const int tadLength = 
shape::tadLength(xShapeInfo, dimension, dimensionLength); const int numTads = shape::length(xShapeInfo) / tadLength; - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xTadShapeInfo, zTadShapeInfo); + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXZ(xTadShapeInfo, zTadShapeInfo); - if (kindOfLoop != nd4j::LoopKind::EWS1 && kindOfLoop != nd4j::LoopKind::EWSNONZERO) { + if (kindOfLoop != sd::LoopKind::EWS1 && kindOfLoop != sd::LoopKind::EWSNONZERO) { printf("ScalarIntTransform::transform: super-bad loop visited. Shouldn't ever happen\n"); return; } - int num_threads = nd4j::math::nd4j_min(numTads, nd4j::Environment::getInstance()->maxThreads()); + int num_threads = sd::math::nd4j_min(numTads, sd::Environment::getInstance()->maxThreads()); - if (kindOfLoop == nd4j::LoopKind::EWS1) { + if (kindOfLoop == sd::LoopKind::EWS1) { for (auto r = start; r < stop; r++) { auto oZ = z + zTadOffsets[r]; auto oX = x + xTadOffsets[r]; @@ -154,15 +154,15 @@ namespace functions { auto zEws = shape::elementWiseStride(zShapeInfo); auto len = shape::length(xShapeInfo); - nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); + sd::LoopKind::Kind kindOfLoop = sd::LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); - if (kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) { + if (kindOfLoop == sd::LoopKind::EWS1 || kindOfLoop == sd::LoopKind::EWSNONZERO) { transform(x, xEws, z, zEws, vscalar, extraParams, len, start, stop); return; } uint xShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_SIMD @@ -173,7 +173,7 @@ namespace functions { } else { uint zShapeInfoCast[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + const 
bool canCastZ = sd::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_SIMD for (auto i = start; i < stop; i++) { diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index ec3c847ec..f6b44b75c 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -19,7 +19,7 @@ // #include -#include +#include #include #include #include @@ -89,7 +89,7 @@ namespace functions { auto length = shape::length(xShapeInfo); uint xShapeInfoCast[MAX_RANK]; - const bool canCast = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCast = sd::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); for (Nd4jLong i = 0; i < length; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); @@ -118,8 +118,8 @@ namespace functions { auto extraParams = reinterpret_cast(vextraParams); auto resultLength = shape::length(zShapeInfo); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; SummaryStatsData comp; comp.initWithValue(x[0]); @@ -138,7 +138,7 @@ namespace functions { if (dimensionLength < 1) return; - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); //pre squeezed: this is for keeping the pointer to the original //shape information for tad offset @@ -155,7 +155,7 @@ namespace functions { auto tadOrder = shape::order(tadPack.primaryShapeInfo()); uint tadShapeShapeInfoCast[MAX_RANK]; - const bool canCast = tadEWS == 1 && tadOrder == 'c' ? 
false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast); + const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : sd::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast); auto func = PRAGMA_THREADS_FOR { for (auto r = start; r < stop; r++) { diff --git a/libnd4j/include/loops/cpu/transform/transform_any.cpp b/libnd4j/include/loops/cpu/transform/transform_any.cpp index 5b3c4a0f8..3fc9af1b3 100644 --- a/libnd4j/include/loops/cpu/transform/transform_any.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_any.cpp @@ -18,8 +18,8 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include #include #include @@ -52,7 +52,7 @@ void _CUDA_H TransformAny::exec(void *vx, Nd4jLong *xShapeInfo, auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); + sd::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } diff --git a/libnd4j/include/loops/cpu/transform/transform_bool.cpp b/libnd4j/include/loops/cpu/transform/transform_bool.cpp index fdfde93f5..7302ef970 100644 --- a/libnd4j/include/loops/cpu/transform/transform_bool.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_bool.cpp @@ -18,8 +18,8 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include #include #include @@ -54,7 +54,7 @@ namespace functions { auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); + sd::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_float.cpp 
b/libnd4j/include/loops/cpu/transform/transform_float.cpp index 8e164a90f..833b263f1 100644 --- a/libnd4j/include/loops/cpu/transform/transform_float.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_float.cpp @@ -18,8 +18,8 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include #include #include @@ -53,7 +53,7 @@ namespace functions { auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); + sd::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_same.cpp b/libnd4j/include/loops/cpu/transform/transform_same.cpp index 67f7762f0..bc9d2e525 100644 --- a/libnd4j/include/loops/cpu/transform/transform_same.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_same.cpp @@ -18,8 +18,8 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include #include #include @@ -52,7 +52,7 @@ namespace functions { auto extraParams = reinterpret_cast(vextraParams); - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); + sd::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cpu/transform/transform_strict.cpp b/libnd4j/include/loops/cpu/transform/transform_strict.cpp index 29964e3e0..2ef3b808e 100644 --- a/libnd4j/include/loops/cpu/transform/transform_strict.cpp +++ b/libnd4j/include/loops/cpu/transform/transform_strict.cpp @@ -18,8 +18,8 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include #include #include @@ -53,7 +53,7 
@@ namespace functions { auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - nd4j::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); + sd::TransformLoops::template loopTransform(x, xShapeInfo, z, zShapeInfo, extraParams, threadId, numThreads); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); diff --git a/libnd4j/include/loops/cuda/TrueBroadcastHelper.cu b/libnd4j/include/loops/cuda/TrueBroadcastHelper.cu index 12c3eb0c5..9a775b2e7 100644 --- a/libnd4j/include/loops/cuda/TrueBroadcastHelper.cu +++ b/libnd4j/include/loops/cuda/TrueBroadcastHelper.cu @@ -19,18 +19,18 @@ // // #include -#include -#include +#include +#include #include -#include -#include +#include +#include #include // #include // #include using namespace simdOps; -namespace nd4j { +namespace sd { namespace helpers { //////////////////////////////////////////////////////////////////////// @@ -100,7 +100,7 @@ void TrueBroadcastHelper::execLauncher(dim3 launchDims, cudaStream_t *str ////////////////////////////////////////////////////////////////////////// template -void TrueBroadcastHelper::exec(const nd4j::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { +void TrueBroadcastHelper::exec(const sd::broadcast::Ops opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { dim3 launchDims; @@ -186,7 +186,7 @@ void TrueBroadcastBoolHelper::execLauncher(dim3 launchDims, cudaStream_t *s ////////////////////////////////////////////////////////////////////////// template -void TrueBroadcastBoolHelper::exec(const nd4j::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { +void TrueBroadcastBoolHelper::exec(const sd::broadcast::BoolOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { dim3 launchDims; @@ -272,7 +272,7 @@ void TrueBroadcastIntHelper::execLauncher(dim3 launchDims, cudaStream_t *stre 
////////////////////////////////////////////////////////////////////////// template -void TrueBroadcastIntHelper::exec(const nd4j::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { +void TrueBroadcastIntHelper::exec(const sd::broadcast::IntOps opNum, const NDArray& xArr, const NDArray& yArr, NDArray& zArr) { dim3 launchDims; diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp index 086e216e6..f8d232349 100644 --- a/libnd4j/include/loops/cuda/broadcasting.chpp +++ b/libnd4j/include/loops/cuda/broadcasting.chpp @@ -18,17 +18,17 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -#include +#include #include #include #include #include -#include -#include +#include +#include using namespace simdOps; diff --git a/libnd4j/include/loops/cuda/broadcasting.cu b/libnd4j/include/loops/cuda/broadcasting.cu index 8846e5473..55c882c3f 100644 --- a/libnd4j/include/loops/cuda/broadcasting.cu +++ b/libnd4j/include/loops/cuda/broadcasting.cu @@ -18,17 +18,17 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -#include +#include #include #include #include #include -#include -#include +#include +#include namespace functions { namespace broadcast { diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu index d5a45ceec..47df0fd2b 100644 --- a/libnd4j/include/loops/cuda/broadcasting_bool.cu +++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu @@ -18,16 +18,16 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -#include +#include #include #include #include #include -#include +#include using namespace simdOps; @@ -70,7 +70,7 @@ namespace functions { template __host__ void BroadcastBool::intermediateBroadcast(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void 
*extraParams, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { broadcastBoolSimple<<>>(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, extraParams, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ); - nd4j::DebugHelper::checkErrorCode(stream, "intermediateBroadcastBool(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "intermediateBroadcastBool(...) failed"); } ////////////////////////////////////////////////////////////////////////// @@ -86,7 +86,7 @@ namespace functions { template __host__ void BroadcastBool::intermediateInverseBroadcast(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShapeInfo, void *y, Nd4jLong *yShapeInfo, void *z, Nd4jLong *zShapeInfo, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { broadcastBoolInverseSimple<<>>(x, xShapeInfo, y, yShapeInfo, z, zShapeInfo, extraParams, dimension, dimensionLength, tadOnlyShapeInfo, tadOffsets, tadOnlyShapeInfoZ, tadOffsetsZ); - nd4j::DebugHelper::checkErrorCode(stream, "intermediateBroadcastBool(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "intermediateBroadcastBool(...) 
failed"); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu index f183c009e..9a725a886 100644 --- a/libnd4j/include/loops/cuda/broadcasting_int.cu +++ b/libnd4j/include/loops/cuda/broadcasting_int.cu @@ -18,16 +18,16 @@ // @author raver119@gmail.com // -#include +#include #include #include #include -#include +#include #include #include #include #include -#include +#include using namespace simdOps; diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu index aeb2d9d36..6383458c9 100644 --- a/libnd4j/include/loops/cuda/indexreduce.cu +++ b/libnd4j/include/loops/cuda/indexreduce.cu @@ -18,9 +18,9 @@ // Created by raver on 4/9/2018. // -#include +#include #include "../indexreduce.h" -#include +#include #include #include @@ -230,9 +230,9 @@ namespace functions { } __syncthreads(); - if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { + if(sd::ArrayOptions::arrayType(xShapeInfo) == sd::ArrayType::EMPTY) { - if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) + if(sd::ArrayOptions::arrayType(zShapeInfo) == sd::ArrayType::EMPTY) return; for (uint i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) @@ -268,7 +268,7 @@ namespace functions { } __syncthreads(); - aggregatePartials(&sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength),extraParams); + aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength),extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -289,7 +289,7 @@ namespace functions { } __syncthreads(); - aggregatePartials(&sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength),extraParams); + aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength),extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -320,7 +320,7 @@ 
namespace functions { sPartials[threadIdx.x] = reduction; __syncthreads(); - aggregatePartials(&sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, (int) n),extraParams); + aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, (int) n),extraParams); __syncthreads(); if (gridDim.x > 1) { @@ -352,7 +352,7 @@ namespace functions { } __syncthreads(); - aggregatePartials(&sPartials, threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x),extraParams); + aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x),extraParams); __syncthreads(); if (tid == 0) { diff --git a/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h index 5df583e61..1989cadc5 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h @@ -24,8 +24,8 @@ #include #include -#include -#include +#include +#include using namespace simdOps; @@ -118,7 +118,7 @@ namespace functions { sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, len), extraParams); __syncthreads(); @@ -150,7 +150,7 @@ namespace functions { sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], reductionBuffer[i], extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x), extraParams); __syncthreads(); if (threadIdx.x == 0) { diff --git a/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h index 9e061003d..df1a87ba8 
100644 --- a/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h @@ -24,8 +24,8 @@ #include #include -#include -#include +#include +#include using namespace simdOps; diff --git a/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h index b10b23d09..c4b94fca5 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h @@ -23,8 +23,8 @@ #include #include -#include -#include +#include +#include using namespace simdOps; diff --git a/libnd4j/include/loops/cuda/legacy/grid_shaped.legacy b/libnd4j/include/loops/cuda/legacy/grid_shaped.legacy index 6a43107f7..086647728 100644 --- a/libnd4j/include/loops/cuda/legacy/grid_shaped.legacy +++ b/libnd4j/include/loops/cuda/legacy/grid_shaped.legacy @@ -17,8 +17,8 @@ -#include -#include +#include +#include #include #include #include diff --git a/libnd4j/include/loops/cuda/legacy/grid_strided.legacy b/libnd4j/include/loops/cuda/legacy/grid_strided.legacy index 74c5322a3..f5f88f5a1 100644 --- a/libnd4j/include/loops/cuda/legacy/grid_strided.legacy +++ b/libnd4j/include/loops/cuda/legacy/grid_strided.legacy @@ -17,8 +17,8 @@ -#include -#include +#include +#include #include #include #include diff --git a/libnd4j/include/loops/cuda/legacy/reduce.legacy b/libnd4j/include/loops/cuda/legacy/reduce.legacy index 1ae7985de..1f9b1a1c5 100644 --- a/libnd4j/include/loops/cuda/legacy/reduce.legacy +++ b/libnd4j/include/loops/cuda/legacy/reduce.legacy @@ -18,7 +18,7 @@ // @author raver119@gmail.com // -#include +#include #include #include #include @@ -172,7 +172,7 @@ namespace functions { DISPATCH_SIMPLE(reduceScalarSimple, float, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) - nd4j::DebugHelper::checkErrorCode(stream, 
"execReduceScalarFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); } template <> @@ -180,7 +180,7 @@ namespace functions { DISPATCH_SIMPLE(reduceScalarSimple, float16, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) - nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarHalf(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceScalarHalf(...) failed"); } template <> @@ -188,7 +188,7 @@ namespace functions { DISPATCH_SIMPLE(reduceScalarSimple, double, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) - nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarDouble(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceScalarDouble(...) failed"); } template <> @@ -304,7 +304,7 @@ namespace functions { __syncthreads(); // aggregate. do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); @@ -364,7 +364,7 @@ namespace functions { } __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, n), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, n), extraParams); __syncthreads(); @@ -398,7 +398,7 @@ namespace functions { - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x), extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -468,7 +468,7 @@ namespace functions { __syncthreads(); // aggregate. 
do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); if (threadIdx.x == 0) @@ -533,7 +533,7 @@ namespace functions { __syncthreads(); // aggregate. do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); diff --git a/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy b/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy index 7bc30271f..f9ea86883 100644 --- a/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy +++ b/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy @@ -15,7 +15,7 @@ ******************************************************************************/ -#include +#include diff --git a/libnd4j/include/loops/cuda/legacy/transform.legacy b/libnd4j/include/loops/cuda/legacy/transform.legacy index 6a8344916..e7f76751a 100644 --- a/libnd4j/include/loops/cuda/legacy/transform.legacy +++ b/libnd4j/include/loops/cuda/legacy/transform.legacy @@ -18,9 +18,9 @@ // @author raver119@gmail.com // -#include +#include #include -#include +#include #include #include @@ -173,7 +173,7 @@ namespace functions { DISPATCH_SIMPLE(transformShaped, float16, PARAMS(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), OPS_A(TRANSFORM_OPS)) - if (nd4j::Environment::getInstance()->isDebug()) + if (sd::Environment::getInstance()->isDebug()) checkCudaErrors(cudaStreamSynchronize(*stream)); } diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu index 05adbbce4..f697de814 100644 --- a/libnd4j/include/loops/cuda/pairwise_bool.cu +++ 
b/libnd4j/include/loops/cuda/pairwise_bool.cu @@ -104,8 +104,8 @@ void _CUDA_H PairWiseBoolTransform::intermediateShaped(dim3& launchDims, cu //////////////////////////////////////////////////////////////////////////////// template void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vextraParams) { - auto xType = nd4j::DataTypeUtils::fromT(); - auto yType = nd4j::DataTypeUtils::fromT(); + auto xType = sd::DataTypeUtils::fromT(); + auto yType = sd::DataTypeUtils::fromT(); DISPATCH_BY_OPNUM_TT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_BOOL_OPS); } diff --git a/libnd4j/include/loops/cuda/pairwise_int.cu b/libnd4j/include/loops/cuda/pairwise_int.cu index 85dce56f2..44447605e 100644 --- a/libnd4j/include/loops/cuda/pairwise_int.cu +++ b/libnd4j/include/loops/cuda/pairwise_int.cu @@ -104,7 +104,7 @@ void _CUDA_H PairWiseIntTransform::intermediateShaped(dim3& launchDims, cudaS //////////////////////////////////////////////////////////////////////////////// template void PairWiseIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vextraParams) { - auto xType = nd4j::DataTypeUtils::fromT(); + auto xType = sd::DataTypeUtils::fromT(); DISPATCH_BY_OPNUM_T(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_INT_OPS); } diff --git a/libnd4j/include/loops/cuda/random.cu b/libnd4j/include/loops/cuda/random.cu index 47ced2769..c7550b926 100644 --- a/libnd4j/include/loops/cuda/random.cu +++ b/libnd4j/include/loops/cuda/random.cu @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #include -#include +#include #include #include #include -#include 
+#include using namespace randomOps; @@ -130,10 +130,10 @@ namespace functions { __shared__ char yOrder; __shared__ char zOrder; - __shared__ nd4j::graph::RandomGenerator *buffer; + __shared__ sd::graph::RandomGenerator *buffer; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - nd4j::graph::RandomGenerator *devBuffer; + sd::graph::RandomGenerator *devBuffer; if (threadIdx.x == 0) { length = shape::length(zShapeBuffer); xEWS = shape::elementWiseStride(xShapeBuffer); @@ -144,15 +144,15 @@ namespace functions { zOrder = shape::order(zShapeBuffer); extern __shared__ unsigned char shmem[]; - buffer = (nd4j::graph::RandomGenerator *) shmem; + buffer = (sd::graph::RandomGenerator *) shmem; cB = shmem; - devBuffer = reinterpret_cast (state); + devBuffer = reinterpret_cast (state); dB = reinterpret_cast (state); } __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -192,16 +192,16 @@ namespace functions { __shared__ char xOrder; __shared__ char zOrder; - __shared__ nd4j::graph::RandomGenerator *buffer; + __shared__ sd::graph::RandomGenerator *buffer; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator *devBuffer; + __shared__ sd::graph::RandomGenerator *devBuffer; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - buffer = (nd4j::graph::RandomGenerator *) shmem; + buffer = (sd::graph::RandomGenerator *) shmem; cB = shmem; - devBuffer = reinterpret_cast (state); + devBuffer = reinterpret_cast (state); dB = reinterpret_cast (state); length = shape::length(zShapeBuffer); @@ -213,7 +213,7 @@ namespace functions { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < 
sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -245,16 +245,16 @@ namespace functions { __shared__ Nd4jLong length; __shared__ Nd4jLong ews; - __shared__ nd4j::graph::RandomGenerator *buffer; + __shared__ sd::graph::RandomGenerator *buffer; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator *devBuffer; + __shared__ sd::graph::RandomGenerator *devBuffer; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - buffer = (nd4j::graph::RandomGenerator *) shmem; + buffer = (sd::graph::RandomGenerator *) shmem; cB = shmem; - devBuffer = reinterpret_cast (state); + devBuffer = reinterpret_cast (state); dB = reinterpret_cast (state); length = shape::length(zShapeBuffer); ews = shape::elementWiseStride(zShapeBuffer); @@ -262,7 +262,7 @@ namespace functions { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_bool.cu b/libnd4j/include/loops/cuda/reduce/reduce_bool.cu index 52ca3decc..3aa2626a2 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_bool.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_bool.cu @@ -19,7 +19,7 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #include #include #include @@ -135,7 +135,7 @@ __device__ void ReduceBoolFunction::transformCudaXD( void *vx, Nd4jLong *xS __syncthreads(); // aggregate. 
do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); @@ -183,7 +183,7 @@ __device__ void ReduceBoolFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, len), extraParams); __syncthreads(); if (gridDim.x > 1) { @@ -214,7 +214,7 @@ __device__ void ReduceBoolFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], reductionBuffer[i], extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x), extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -246,19 +246,19 @@ __host__ void ReduceBoolFunction::intermediateXD(dim3 launchDims, cudaStrea const auto startingVal = static_cast(OpType::startingValue(reinterpret_cast(x))); - auto res = cudaMemcpyAsync(nd4j::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); + auto res = cudaMemcpyAsync(sd::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceBoolFunction::intermediateXD: failed to copy temporary scalar", res); + throw sd::cuda_exception::build("ReduceBoolFunction::intermediateXD: failed to copy temporary scalar", res); - auto ptr = nd4j::LaunchContext::defaultContext()->getScalarPointer(); + auto ptr = 
sd::LaunchContext::defaultContext()->getScalarPointer(); // scalar assign functions::scalar::ScalarTransform::executeCudaShaped(launchDims, stream, 14, z, zShapeInfo, hZShapeInfo, z, zShapeInfo, hZShapeInfo, ptr, nullptr); - nd4j::DebugHelper::checkErrorCode(stream, "reduceBoolDim empty(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "reduceBoolDim empty(...) failed"); } else { simpleReduce<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, reductionPointer, tadShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "reduceBoolDim(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "reduceBoolDim(...) failed"); } } @@ -276,14 +276,14 @@ __host__ void ReduceBoolFunction::intermediateScalar(dim3 launchDims, cudaS auto res = cudaMemcpyAsync(z, &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceBoolFunction::intermediateScalar: failed to copy resulting scalar", res); + throw sd::cuda_exception::build("ReduceBoolFunction::intermediateScalar: failed to copy resulting scalar", res); - nd4j::DebugHelper::checkErrorCode(stream, "reduceBoolScalar empty(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "reduceBoolScalar empty(...) failed"); } else { simpleScalar<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo); - nd4j::DebugHelper::checkErrorCode(stream, "reduceBoolScalar(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "reduceBoolScalar(...) 
failed"); } } @@ -292,7 +292,7 @@ template _CUDA_H void ReduceBoolFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { DISPATCH_BY_OPNUM_TT(intermediateScalar, PARAMS(launchDims, stream, x, xShapeInfo, hXShapeInfo, extraParams, z, zShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_BOOL_OPS)); - nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cuda/reduce/reduce_float.chpp b/libnd4j/include/loops/cuda/reduce/reduce_float.chpp index 110cc0f68..e1b95ae55 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_float.chpp +++ b/libnd4j/include/loops/cuda/reduce/reduce_float.chpp @@ -20,13 +20,13 @@ #include #include -#include +#include #include #include #include #include #include -#include +#include #include #include @@ -135,7 +135,7 @@ __device__ void ReduceFloatFunction::transformCudaXD( void *vx, Nd4jLong *x __syncthreads(); // aggregate. 
do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); if (threadIdx.x == 0) @@ -182,7 +182,7 @@ __device__ void ReduceFloatFunction::execScalarCuda(void *vx, Nd4jLong *xSh sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, len), extraParams); __syncthreads(); if (gridDim.x > 1) { @@ -213,7 +213,7 @@ __device__ void ReduceFloatFunction::execScalarCuda(void *vx, Nd4jLong *xSh sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], reductionBuffer[i], extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x), extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -241,12 +241,12 @@ __host__ void ReduceFloatFunction::intermediateXD(dim3 launchDims, cudaStre if(shape::isEmpty(hZShapeInfo)) return; - const auto startingVal = std::is_same>::value ? nd4j::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(reinterpret_cast(x))); - auto res = cudaMemcpyAsync(nd4j::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); + const auto startingVal = std::is_same>::value ? 
sd::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(reinterpret_cast(x))); + auto res = cudaMemcpyAsync(sd::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceFloatFunction::intermediateXD: failed to copy temporary scalar", res); + throw sd::cuda_exception::build("ReduceFloatFunction::intermediateXD: failed to copy temporary scalar", res); - auto ptr = nd4j::LaunchContext::defaultContext()->getScalarPointer(); + auto ptr = sd::LaunchContext::defaultContext()->getScalarPointer(); // scalar assign functions::scalar::ScalarTransform::executeCudaShaped(launchDims, stream, 14, z, zShape, hZShapeInfo, z, zShape, hZShapeInfo, ptr, nullptr); @@ -266,11 +266,11 @@ __host__ void ReduceFloatFunction::intermediateScalar(dim3 launchDims, cuda if (shape::isEmpty(hZShapeInfo)) return; - const auto startingVal = std::is_same>::value ? nd4j::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(reinterpret_cast(x))); + const auto startingVal = std::is_same>::value ? 
sd::DataTypeUtils::nanOrZero() : static_cast(OpType::startingValue(reinterpret_cast(x))); auto res = cudaMemcpyAsync(z, &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceFloatFunction::intermediateScalar: failed to copy resulting scalar", res); + throw sd::cuda_exception::build("ReduceFloatFunction::intermediateScalar: failed to copy resulting scalar", res); } else { simpleScalar << < launchDims.x, launchDims.y, launchDims.z, *stream>>>(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo); @@ -282,7 +282,7 @@ template _CUDA_H void ReduceFloatFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong *hXShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, Nd4jLong *hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { DISPATCH_BY_OPNUM_TT(intermediateScalar, PARAMS(launchDims, stream, x, xShapeInfo, hXShapeInfo, extraParams, z, zShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_FLOAT_OPS)); - nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cuda/reduce/reduce_long.cu b/libnd4j/include/loops/cuda/reduce/reduce_long.cu index 79ab25280..e55ecd11c 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_long.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_long.cu @@ -19,7 +19,7 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #include #include #include @@ -156,7 +156,7 @@ __device__ void ReduceLongFunction::transformCudaXD( void *vx, Nd4jLong *xS __syncthreads(); // aggregate. 
do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); if (threadIdx.x == 0) @@ -203,7 +203,7 @@ __device__ void ReduceLongFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, len), extraParams); __syncthreads(); if (gridDim.x > 1) { @@ -233,7 +233,7 @@ __device__ void ReduceLongFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], reductionBuffer[i], extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x), extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -263,11 +263,11 @@ __host__ void ReduceLongFunction::intermediateXD(dim3 launchDims, cudaStrea const auto startingVal = static_cast(OpType::startingValue(reinterpret_cast(x))); - auto res = cudaMemcpyAsync(nd4j::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); + auto res = cudaMemcpyAsync(sd::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceLongFunction::intermediateXD: failed to copy temporary scalar", res); + throw sd::cuda_exception::build("ReduceLongFunction::intermediateXD: failed to copy temporary scalar", res); - auto ptr = 
nd4j::LaunchContext::defaultContext()->getScalarPointer(); + auto ptr = sd::LaunchContext::defaultContext()->getScalarPointer(); // scalar assign functions::scalar::ScalarTransform::executeCudaShaped(launchDims, stream, 14, z, zShapeInfo, hXShapeInfo, z, zShapeInfo, hZShapeInfo, ptr, nullptr); @@ -291,7 +291,7 @@ __host__ void ReduceLongFunction::intermediateScalar(dim3 launchDims, cudaS auto res = cudaMemcpyAsync(z, &startingVal, sizeof(Z), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceLongFunction::intermediateScalar: failed to copy resulting scalar", res); + throw sd::cuda_exception::build("ReduceLongFunction::intermediateScalar: failed to copy resulting scalar", res); } else { simpleScalar<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo); @@ -303,7 +303,7 @@ template _CUDA_H void ReduceLongFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { DISPATCH_BY_OPNUM_TT(intermediateScalar, PARAMS(launchDims, stream, x, xShapeInfo, hXShapeInfo, extraParams, z, zShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_LONG_OPS)); - nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) 
failed"); } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cuda/reduce/reduce_same.cu b/libnd4j/include/loops/cuda/reduce/reduce_same.cu index bcf5bab7f..c3c74c806 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_same.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_same.cu @@ -19,7 +19,7 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #include #include #include @@ -145,7 +145,7 @@ __device__ void ReduceSameFunction::transformCudaXD( void *vx, Nd4jLong *xSha __syncthreads(); // aggregate. do NOT reduce for elements > tadLength - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); if (threadIdx.x == 0) @@ -200,7 +200,7 @@ __device__ void ReduceSameFunction::execScalarCuda(void *vx, Nd4jLong *xShape sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, len), extraParams); __syncthreads(); if (gridDim.x > 1) { @@ -230,7 +230,7 @@ __device__ void ReduceSameFunction::execScalarCuda(void *vx, Nd4jLong *xShape sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], reductionBuffer[i], extraParams); __syncthreads(); - aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x), extraParams); + aggregatePartials(sPartials, threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x), extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -260,11 +260,11 @@ __host__ void ReduceSameFunction::intermediateXD(dim3 launchDims, cudaStream_ const auto startingVal = static_cast(OpType::startingValue(reinterpret_cast(x))); - 
auto res = cudaMemcpyAsync(nd4j::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(X), cudaMemcpyHostToDevice, *stream); + auto res = cudaMemcpyAsync(sd::LaunchContext::defaultContext()->getScalarPointer(), &startingVal, sizeof(X), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceSameFunction::intermediateXD: failed to copy temporary scalar", res); + throw sd::cuda_exception::build("ReduceSameFunction::intermediateXD: failed to copy temporary scalar", res); - auto ptr = nd4j::LaunchContext::defaultContext()->getScalarPointer(); + auto ptr = sd::LaunchContext::defaultContext()->getScalarPointer(); // scalar assign functions::scalar::ScalarTransform::executeCudaShaped(launchDims, stream, 14, z, zShapeInfo, hXShapeInfo, z, zShapeInfo, hZShapeInfo, ptr, nullptr); @@ -288,7 +288,7 @@ __host__ void ReduceSameFunction::intermediateScalar(dim3 launchDims, cudaStr auto res = cudaMemcpyAsync(z, &startingVal, sizeof(X), cudaMemcpyHostToDevice, *stream); if (res != 0) - throw nd4j::cuda_exception::build("ReduceSameFunction::intermediateScalar: failed to copy resulting scalar", res); + throw sd::cuda_exception::build("ReduceSameFunction::intermediateScalar: failed to copy resulting scalar", res); } else { simpleScalar<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo); @@ -300,7 +300,7 @@ template _CUDA_H void ReduceSameFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, void *x, Nd4jLong *xShapeInfo, Nd4jLong* hXShapeInfo, void *extraParams, void *z, Nd4jLong *zShapeInfo, Nd4jLong* hZShapeInfo, int *dimension, int dimensionLength, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { DISPATCH_BY_OPNUM_T(intermediateScalar, PARAMS(launchDims, stream, x, xShapeInfo, hXShapeInfo, extraParams, z, zShapeInfo, hZShapeInfo, dimension, dimensionLength, reductionBuffer, tadOnlyShapeInfo), REDUCE_SAME_OPS); - 
nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarSame(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execReduceScalarSame(...) failed"); } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp index ac1d1adc3..2fa16e9ac 100644 --- a/libnd4j/include/loops/cuda/reduce3.chpp +++ b/libnd4j/include/loops/cuda/reduce3.chpp @@ -18,11 +18,11 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 19.11.2018 -#include +#include #include #include #include -#include +#include using namespace simdOps; @@ -168,7 +168,7 @@ __device__ void Reduce3::execScalarCuda( void *vx, Nd4jLong *xShapeInfo, } __syncthreads(); - aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, nd4j::math::nd4j_min(blockDim.x, length), extraZ); + aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, sd::math::nd4j_min(blockDim.x, length), extraZ); __syncthreads(); if (gridDim.x > 1) { @@ -214,7 +214,7 @@ __device__ void Reduce3::execScalarCuda( void *vx, Nd4jLong *xShapeInfo, sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], static_cast(reductionBuffer)[i], extraZ); __syncthreads(); - aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, nd4j::math::nd4j_min(gridDim.x, blockDim.x), extraZ); + aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, sd::math::nd4j_min(gridDim.x, blockDim.x), extraZ); __syncthreads(); if (threadIdx.x == 0) @@ -324,7 +324,7 @@ __device__ void Reduce3::transformAll( void *vx, Nd4jLong *xShapeInfo, __syncthreads(); } - aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, nd4j::math::nd4j_min(blockDim.x, xTadLength), extraZ); + aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, sd::math::nd4j_min(blockDim.x, xTadLength), extraZ); __syncthreads(); if (threadIdx.x == 0) { @@ -410,7 +410,7 @@ __device__ void Reduce3::transform(void *vx, Nd4jLong *xShapeInfo, } __syncthreads(); - 
aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLen), extraZ); + aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLen), extraZ); __syncthreads(); if (threadIdx.x == 0) @@ -440,7 +440,7 @@ __device__ void Reduce3::transform(void *vx, Nd4jLong *xShapeInfo, } __syncthreads(); - aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLen), extraZ); + aggregatePartials(reinterpret_cast(sPartials), threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLen), extraZ); __syncthreads(); if (threadIdx.x == 0) @@ -515,7 +515,7 @@ __host__ void Reduce3::exec(dim3 launchDims, cudaStream_t *stream, Nd4jLong *yTadOnlyShapeInfo, Nd4jLong *yTadOffsets) { execGeneric<<>>(opNum, vx, xShapeInfo, vy, yShapeInfo, extraParams, vz, zShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationPointer, tadOnlyShapeInfo, tadOffsets, yTadOnlyShapeInfo, yTadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "reduce3exec(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "reduce3exec(...) failed"); } //////////////////////////////////////////////////////////////////////// @@ -533,7 +533,7 @@ __host__ void Reduce3::exec(dim3 launchDims, cudaStream_t *stream, Nd4jLong *yTadOnlyShapeInfo, Nd4jLong *yTadOffsets) { execAllGeneric<<>>(opNum, vx, xShapeInfo, vy, yShapeInfo, extraParams, vz, zShapeInfo, dimension, dimensionLength, postProcessOrNot, allocationPointer, tadOnlyShapeInfo, tadOffsets, yTadOnlyShapeInfo, yTadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "execAllGeneric(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execAllGeneric(...) 
failed"); } //////////////////////////////////////////////////////////////////////// @@ -549,7 +549,7 @@ __host__ void Reduce3::execScalar(dim3 launchDims, cudaStream_t *stream, Nd4jLong *tadOnlyShapeInfo) { execScalarGeneric<<>>(opNum, vx, xShapeInfo, vy, yShapeInfo, extraParams, vz, zShapeInfo, allocationPointer, reductionBuffer, tadOnlyShapeInfo); - nd4j::DebugHelper::checkErrorCode(stream, "execScalarGeneric(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execScalarGeneric(...) failed"); } diff --git a/libnd4j/include/loops/cuda/reduce3.cu b/libnd4j/include/loops/cuda/reduce3.cu index 4f0e0457c..c1d63e8dd 100644 --- a/libnd4j/include/loops/cuda/reduce3.cu +++ b/libnd4j/include/loops/cuda/reduce3.cu @@ -19,11 +19,11 @@ // -#include +#include #include #include #include -#include +#include namespace functions { namespace reduce3 { diff --git a/libnd4j/include/loops/cuda/scalar.chpp b/libnd4j/include/loops/cuda/scalar.chpp index 7277138f9..ec1b42334 100644 --- a/libnd4j/include/loops/cuda/scalar.chpp +++ b/libnd4j/include/loops/cuda/scalar.chpp @@ -24,7 +24,7 @@ #include "loops/scalar.h" #include #include -#include +#include #include #include @@ -137,7 +137,7 @@ void _CUDA_H ScalarTransform::intermediateShaped(dim3& launchDims, cudaSt auto length = shape::length(hxShapeInfo); scalarSimpleShaped<<>>(vx, vscalar, xShapeInfo, vextraParams, vz, zShapeInfo, allocPointer); - nd4j::DebugHelper::checkErrorCode(stream, "scalarSimpleShapedA(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "scalarSimpleShapedA(...) 
failed"); } //////////////////////////////////////////////////////////////////////////////// @@ -145,14 +145,14 @@ template template void _CUDA_H ScalarTransform::intermediateAlongDimension(dim3& launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShapeInfo, void *z, Nd4jLong *zShapeInfo, void *scalars, void *extraParams, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { scalarAlongDimension<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); - nd4j::DebugHelper::checkErrorCode(stream, "scalarAlongDimA(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "scalarAlongDimA(...) failed"); } //////////////////////////////////////////////////////////////////////////////// template void ScalarTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, void* vscalar, void *vextraParams) { - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("H14 opNum:[%i]\n", opNum); DISPATCH_BY_OPNUM_TTT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, hxShapeInfo, vz, zShapeInfo, hzShapeInfo, vscalar, vextraParams, nullptr), SCALAR_OPS); diff --git a/libnd4j/include/loops/cuda/scalar.cu b/libnd4j/include/loops/cuda/scalar.cu index 67cbc7a98..26c3e5cb8 100644 --- a/libnd4j/include/loops/cuda/scalar.cu +++ b/libnd4j/include/loops/cuda/scalar.cu @@ -21,7 +21,7 @@ #include "loops/scalar.h" #include #include -#include +#include #include #include diff --git a/libnd4j/include/loops/cuda/scalar_bool.cu b/libnd4j/include/loops/cuda/scalar_bool.cu index bb498c3a9..1c8929ef3 100644 --- a/libnd4j/include/loops/cuda/scalar_bool.cu +++ b/libnd4j/include/loops/cuda/scalar_bool.cu @@ -20,7 +20,7 @@ // #include 
"../scalar_bool.h" -#include +#include #include #include "../legacy_ops.h" @@ -193,7 +193,7 @@ _CUDA_H void ScalarBoolTransform::intermediateAlongDimension(dim3& launchD Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { scalarAlongDimension<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); - nd4j::DebugHelper::checkErrorCode(stream, "scalarAlongDim(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "scalarAlongDim(...) failed"); } //////////////////////////////////////////////////////////////////////// @@ -206,7 +206,7 @@ void _CUDA_H ScalarBoolTransform::intermediateShaped(dim3& launchDims, cuda void *vextraParams, int *allocPointer){ scalarSimpleShaped<<>>(vx, vscalar, xShapeInfo, vextraParams, vz, zShapeInfo, allocPointer); - nd4j::DebugHelper::checkErrorCode(stream, "scalarSimpleShaped(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "scalarSimpleShaped(...) failed"); } //////////////////////////////////////////////////////////////////////// @@ -218,7 +218,7 @@ void ScalarBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_t void* vscalar, void *vextraParams) { - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("H14 opNum:[%i]\n", opNum); DISPATCH_BY_OPNUM_TT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vz, zShapeInfo, vscalar, vextraParams, nullptr), SCALAR_BOOL_OPS); diff --git a/libnd4j/include/loops/cuda/scalar_int.cu b/libnd4j/include/loops/cuda/scalar_int.cu index f25beca82..bb761c76c 100644 --- a/libnd4j/include/loops/cuda/scalar_int.cu +++ b/libnd4j/include/loops/cuda/scalar_int.cu @@ -20,7 +20,7 @@ // #include "../scalar_int.h" -#include +#include #include #include "../legacy_ops.h" @@ -216,7 +216,7 @@ void ScalarIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *st void* vscalar, void *vextraParams) { - if 
(nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("H14 opNum:[%i]\n", opNum); DISPATCH_BY_OPNUM_T(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vz, zShapeInfo, vscalar, vextraParams, nullptr), SCALAR_INT_OPS); diff --git a/libnd4j/include/loops/cuda/specials/accumulateKernel.cu b/libnd4j/include/loops/cuda/specials/accumulateKernel.cu index c649d6834..6d6dd42a4 100644 --- a/libnd4j/include/loops/cuda/specials/accumulateKernel.cu +++ b/libnd4j/include/loops/cuda/specials/accumulateKernel.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// /** @@ -83,7 +83,7 @@ namespace nd4j { accumulateKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void **vx, void *vz, int n, const Nd4jLong length) { execAccumulateKernel<<< launchDims.x, launchDims.y, launchDims.z, *stream>>> (vx, vz, n, length); - nd4j::DebugHelper::checkErrorCode(stream, "accumulate(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "accumulate(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT accumulateKernelGeneric, (dim3 & launchDims, cudaStream_t * stream, void * *vx, void * vz, int n, const Nd4jLong length), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/averagingKernel.cu b/libnd4j/include/loops/cuda/specials/averagingKernel.cu index 0b6872c75..798b273cf 100644 --- a/libnd4j/include/loops/cuda/specials/averagingKernel.cu +++ b/libnd4j/include/loops/cuda/specials/averagingKernel.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// template @@ -97,7 +97,7 @@ namespace nd4j { bool propagate) { execAveragingKernel<<< launchDims.x, launchDims.y, launchDims.z, *stream>>>(vdx, vdz, n, length, propagate); - nd4j::DebugHelper::checkErrorCode(stream, "averaging(...) 
failed"); + sd::DebugHelper::checkErrorCode(stream, "averaging(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT averagingKernelGeneric, (dim3 & launchDims, cudaStream_t * stream, void * *vdx, void * vdz, int n, Nd4jLong length, bool propagate), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/concatKernel.cu b/libnd4j/include/loops/cuda/specials/concatKernel.cu index b6ba2f00e..59c6c5380 100644 --- a/libnd4j/include/loops/cuda/specials/concatKernel.cu +++ b/libnd4j/include/loops/cuda/specials/concatKernel.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// template __device__ void concatKernel(int numArrays, @@ -263,7 +263,7 @@ namespace nd4j { execConcatKernel<<>>(numArrays, data, inputShapeInfos, vz, zShapeInfo, tadPointers, offsetPointers, zTadShape, zOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "concatGenericLegacy(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "concatGenericLegacy(...) 
failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT concatKernelGeneric, (dim3 & launchDims, cudaStream_t * stream, int numArrays, Nd4jPointer * data, Nd4jPointer * inputShapeInfos, void * vz, Nd4jLong *zShapeInfo, Nd4jPointer * tadPointers, Nd4jPointer * offsetPointers, Nd4jLong * zTadShape, Nd4jLong * zOffsets), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/concatKernelHStack.cu b/libnd4j/include/loops/cuda/specials/concatKernelHStack.cu index 229e65f34..8ef9dfd24 100644 --- a/libnd4j/include/loops/cuda/specials/concatKernelHStack.cu +++ b/libnd4j/include/loops/cuda/specials/concatKernelHStack.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// template @@ -88,7 +88,7 @@ namespace nd4j { void *vz, Nd4jLong *zShapeInfo) { execConcatKernelHStack<<>>(numArrays, data, inputShapeInfos, vz, zShapeInfo); - nd4j::DebugHelper::checkErrorCode(stream, "concatHStack(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "concatHStack(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT concatKernelHStackGeneric, (dim3 & launchDims, cudaStream_t * stream, int numArrays, Nd4jPointer * data, Nd4jPointer * inputShapeInfos, void * vz, Nd4jLong * zShapeInfo), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/concatKernelScalar.cu b/libnd4j/include/loops/cuda/specials/concatKernelScalar.cu index f08c3fae0..6614480f2 100644 --- a/libnd4j/include/loops/cuda/specials/concatKernelScalar.cu +++ b/libnd4j/include/loops/cuda/specials/concatKernelScalar.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// template @@ -48,7 +48,7 @@ namespace nd4j { concatKernelScalarGeneric(dim3 &launchDims, cudaStream_t *stream, int numArrays, Nd4jPointer *data, void *vz) { execConcatKernelScalar<<>>(numArrays, data, vz); - nd4j::DebugHelper::checkErrorCode(stream, "concatScalar(...) 
failed"); + sd::DebugHelper::checkErrorCode(stream, "concatScalar(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT concatKernelScalarGeneric, (dim3 & launchDims, cudaStream_t * stream, int numArrays, Nd4jPointer * data, void * vz), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/concatKernelVStack.cu b/libnd4j/include/loops/cuda/specials/concatKernelVStack.cu index 27011e9d8..f95bad413 100644 --- a/libnd4j/include/loops/cuda/specials/concatKernelVStack.cu +++ b/libnd4j/include/loops/cuda/specials/concatKernelVStack.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// template @@ -78,7 +78,7 @@ namespace nd4j { void *vz, Nd4jLong *zShapeInfo) { execConcatKernelVStack<<>>(numArrays, data, inputShapeInfos, vz, zShapeInfo); - nd4j::DebugHelper::checkErrorCode(stream, "concatVStack(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "concatVStack(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT concatKernelVStackGeneric, (dim3 & launchDims, cudaStream_t * stream, int numArrays, Nd4jPointer * data, Nd4jPointer * inputShapeInfos, void * vz, Nd4jLong *zShapeInfo), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/convertHalfs.cu b/libnd4j/include/loops/cuda/specials/convertHalfs.cu index b1cd645ab..dec1705a4 100644 --- a/libnd4j/include/loops/cuda/specials/convertHalfs.cu +++ b/libnd4j/include/loops/cuda/specials/convertHalfs.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// template @@ -39,7 +39,7 @@ namespace nd4j { __host__ void convertHalfsToGeneric(dim3 &launchDims, cudaStream_t *stream, half *dx, Nd4jLong n, void *dz) { execConvertHalfs<<>>(dx, n, dz); - nd4j::DebugHelper::checkErrorCode(stream, "convertHalfsToGeneric(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "convertHalfsToGeneric(...) 
failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT convertHalfsToGeneric, (dim3 & launchDims, cudaStream_t * stream, half * dx, Nd4jLong n, void * dz), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/convertToHalf.cu b/libnd4j/include/loops/cuda/specials/convertToHalf.cu index c3bfa3dc3..d86982d03 100644 --- a/libnd4j/include/loops/cuda/specials/convertToHalf.cu +++ b/libnd4j/include/loops/cuda/specials/convertToHalf.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// template @@ -37,7 +37,7 @@ namespace nd4j { template __host__ void convertToHalfGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong n, half *dz) { execConvertToHalf<<>>(dx, n, dz); - nd4j::DebugHelper::checkErrorCode(stream, "convertToHalfs(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "convertToHalfs(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT convertToHalfGeneric, (dim3 & launchDims, cudaStream_t * stream, void * dx, Nd4jLong n, half * dz), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu index e39ff6bec..813de162d 100644 --- a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu +++ b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// @@ -88,7 +88,7 @@ namespace nd4j { Nd4jLong *tadOffsets) { execfillDimensionalIsMax<<>>(dX, dZ, zShapeInfo, tadOnlyShapeInfo, dimension, dimensionLength, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "fillDimensionalIsMax(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "fillDimensionalIsMax(...) 
failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT fillDimensionalIsMaxGeneric, (dim3& launchDims, cudaStream_t *stream, void *dX, void *dZ, Nd4jLong *zShapeInfo, Nd4jLong *tadOnlyShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOffsets), LIBND4J_TYPES); } \ No newline at end of file diff --git a/libnd4j/include/loops/cuda/specials/fillIsMax.cu b/libnd4j/include/loops/cuda/specials/fillIsMax.cu index c9ed51d28..1a994a13c 100644 --- a/libnd4j/include/loops/cuda/specials/fillIsMax.cu +++ b/libnd4j/include/loops/cuda/specials/fillIsMax.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// template @@ -37,7 +37,7 @@ namespace nd4j { template __host__ void fillIsMaxGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong *xShapeInfo, Nd4jLong length, long idx) { execFillIsMax<<>>(dx, xShapeInfo, length, idx); - nd4j::DebugHelper::checkErrorCode(stream, "fillIsMax(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "fillIsMax(...) failed"); } diff --git a/libnd4j/include/loops/cuda/specials/flatten.cu b/libnd4j/include/loops/cuda/specials/flatten.cu index faec2ec90..b0bbf58e1 100644 --- a/libnd4j/include/loops/cuda/specials/flatten.cu +++ b/libnd4j/include/loops/cuda/specials/flatten.cu @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// template @@ -62,7 +62,7 @@ __host__ void flattenKernelGeneric(dim3& launchDims, cudaStream_t *stream, void *vy, Nd4jLong *yShapeInfo) { flattenKernel<<>>(extraPointers, dOffset, order, vz, zShapeInfo, vy, yShapeInfo); - nd4j::DebugHelper::checkErrorCode(stream, "flattenGeneric(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "flattenGeneric(...) 
failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT flattenKernelGeneric, (dim3& launchDims, cudaStream_t *stream, Nd4jPointer *extraPointers, int dOffset, char order, void *vz, Nd4jLong *zShapeInfo, void *vy, Nd4jLong *yShapeInfo), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu b/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu index 9730565e6..7ef6a46db 100644 --- a/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu +++ b/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { /////////////////////////////////////////////////////////////////////// template @@ -84,7 +84,7 @@ namespace nd4j { Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { execPullRowsKernel<<>>(vx, vz, len, indexes, tadShapeInfo, tadOffsets, zTadShapeInfo, zTadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "pullRows(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "pullRows(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT pullRowsKernelGeneric, (dim3 & launchDims, cudaStream_t * stream, void * vx, void * vz, Nd4jLong len, Nd4jLong * indexes, Nd4jLong * tadShapeInfo, Nd4jLong * tadOffsets, Nd4jLong *zTadShapeInfo, Nd4jLong * zTadOffsets), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu b/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu index 809a318ee..bb063180c 100644 --- a/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu +++ b/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu @@ -19,8 +19,8 @@ // #include -#include -namespace nd4j { +#include +namespace sd { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // set up given value to upper diagonal given diff --git a/libnd4j/include/loops/cuda/specials/shuffleKernel.cu b/libnd4j/include/loops/cuda/specials/shuffleKernel.cu index c842cad4a..db63c2af7 100644 --- 
a/libnd4j/include/loops/cuda/specials/shuffleKernel.cu +++ b/libnd4j/include/loops/cuda/specials/shuffleKernel.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////// template @@ -118,7 +118,7 @@ namespace nd4j { Nd4jLong **tadOnlyShapeInfo, Nd4jLong **tadOffsets) { execShuffleKernel<<>>(vdX, xShapeInfo, vdZ, N, shuffleMap, tadOnlyShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "shuffleGeneric(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "shuffleGeneric(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT shuffleKernelGeneric, (dim3 & launchDims, cudaStream_t * stream, void * *vdX, Nd4jLong * *xShapeInfo, void **vdZ, int N, int * shuffleMap, Nd4jLong * *tadOnlyShapeInfo, Nd4jLong * *tadOffsets), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu b/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu index 1b7820f64..796ea85c0 100644 --- a/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu +++ b/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // kernel to swap two NDArrays vals as linear sequences @@ -47,7 +47,7 @@ namespace nd4j { auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape); auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape); - nd4j::math::nd4j_swap(output[xOffset], input[yOffset]); + sd::math::nd4j_swap(output[xOffset], input[yOffset]); } } diff --git a/libnd4j/include/loops/cuda/specials/tearKernel.cu b/libnd4j/include/loops/cuda/specials/tearKernel.cu index e12aa485f..a6285b5a5 100644 --- a/libnd4j/include/loops/cuda/specials/tearKernel.cu +++ b/libnd4j/include/loops/cuda/specials/tearKernel.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { 
//////////////////////////////////////////////////////////////////////// template @@ -88,7 +88,7 @@ namespace nd4j { Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { execTearKernel<<>>(vx, xShapeInfo, targets, zShapeInfo, tadShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "tear(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "tear(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT tearKernelGeneric, (dim3 & launchDims, cudaStream_t * stream, void * vx, Nd4jLong * xShapeInfo, Nd4jPointer *targets, Nd4jLong * zShapeInfo, Nd4jLong * tadShapeInfo, Nd4jLong * tadOffsets), LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/specials/tileKernel.cu b/libnd4j/include/loops/cuda/specials/tileKernel.cu index 257942484..885978fef 100644 --- a/libnd4j/include/loops/cuda/specials/tileKernel.cu +++ b/libnd4j/include/loops/cuda/specials/tileKernel.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) { return shape::getIndexOffset(index, shapeInfo); } diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu index e505929e6..c858d8098 100644 --- a/libnd4j/include/loops/cuda/summarystatsreduce.cu +++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu @@ -19,19 +19,19 @@ // -#include +#include #include #include -#include +#include #include #include #include -#include -#include +#include +#include #include #include #include -#include +#include using namespace simdOps; @@ -211,7 +211,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa sPartials[threadIdx.x] = update(sPartials[threadIdx.x], OpType::op(indexVal2, extraParams), extraParams); } __syncthreads(); - aggregatePartials(&sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), 
extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -237,7 +237,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa } __syncthreads(); - aggregatePartials(&sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, tadLength), extraParams); + aggregatePartials(&sPartials, threadIdx.x, sd::math::nd4j_min(blockDim.x, tadLength), extraParams); __syncthreads(); if (threadIdx.x == 0) { @@ -344,7 +344,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa auto z = reinterpret_cast(vz); auto reductionPointerA = reinterpret_cast(reductionBuffer); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("D16 opNum:[%i]\n", opNum); summaryStatsReduceT<<>>( @@ -359,7 +359,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa 1,biasCorrected, nullptr, reductionPointerA, tadShapeInfo, tadOffsets); // this is blocking method since method should return scalar - nd4j::DebugHelper::checkErrorCode(stream, "execSSReduceScalar(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execSSReduceScalar(...) 
failed"); } template @@ -369,7 +369,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa auto z = static_cast(vz); auto extraParams = static_cast(vextraParams); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("F17 opNum:[%i]\n", opNum); auto reductionPointerA = reinterpret_cast(reductionBuffer); @@ -396,7 +396,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa auto z = static_cast(vz); auto extraParams = static_cast(vextraParams); - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + if (sd::Environment::getInstance()->isDebugAndVerbose()) printf("D18 opNum:[%i]\n", opNum); summaryStatsReduceT<<>>( diff --git a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 37a0ac804..d13b94599 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include #include -#include +#include #include #include @@ -111,7 +111,7 @@ namespace functions { template _CUDA_H void TransformAny::intermediateShaped(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { transformAnySimple<<>>(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "transformAny(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "transformAny(...) 
failed"); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformAny, , LIBND4J_TYPES, LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index d64328494..fec14a745 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include #include -#include +#include #include #include @@ -117,7 +117,7 @@ namespace functions { template _CUDA_H void TransformBool::intermediateShaped(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { transformBoolSimple<<>>(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "transformBool(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "transformBool(...) 
failed"); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformBool, , LIBND4J_TYPES, BOOL_TYPES); diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 2e82efdb3..f631fd4d7 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include #include -#include +#include #include #include @@ -139,7 +139,7 @@ namespace functions { template _CUDA_H void TransformFloat::intermediateShaped(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { transformFloatSimple<<>>(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "transformFloat(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "transformFloat(...) 
failed"); } BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT TransformFloat, , LIBND4J_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index 0a66590a5..368a9b602 100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include #include -#include +#include #include #include @@ -115,7 +115,7 @@ namespace functions { template _CUDA_H void TransformSame::intermediateShaped(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { transformSameSimple<<>>(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "transformSame(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "transformSame(...) 
failed"); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformSame, , LIBND4J_TYPES); diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index 35ab0b1dc..155e5aa23 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include #include -#include +#include #include #include @@ -116,7 +116,7 @@ namespace functions { template _CUDA_H void TransformStrict::intermediateShaped(dim3 launchDims, cudaStream_t *stream, void *x, Nd4jLong *xShape, int xRank, void *extraParams, void *z, Nd4jLong *zShape, int zRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { transformStrictSimple<<>>(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); - nd4j::DebugHelper::checkErrorCode(stream, "transformStrict(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "transformStrict(...) failed"); } BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT TransformStrict, , FLOAT_TYPES); diff --git a/libnd4j/include/loops/cuda/type_conversions.cu b/libnd4j/include/loops/cuda/type_conversions.cu index 5afb23f79..8c38561f4 100644 --- a/libnd4j/include/loops/cuda/type_conversions.cu +++ b/libnd4j/include/loops/cuda/type_conversions.cu @@ -22,13 +22,13 @@ #include #include -namespace nd4j { +namespace sd { template void TypeCast::convertGenericCuda(Nd4jPointer *extras, void *dx, Nd4jLong N, void *dz) { auto stream = reinterpret_cast(&extras[1]); - nd4j::convertKernel<<<256, 1024, 1024, *stream>>>(dx, N, dz); - nd4j::DebugHelper::checkErrorCode(stream, "convertGeneric(...) failed"); + sd::convertKernel<<<256, 1024, 1024, *stream>>>(dx, N, dz); + sd::DebugHelper::checkErrorCode(stream, "convertGeneric(...) 
failed"); }; @@ -228,7 +228,7 @@ template __host__ void encoderKernelP1Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold) { execEncoderKernelP1<<>>(dx, N, dz, threshold); - nd4j::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "encoderP1(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP1Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz, float threshold), LIBND4J_TYPES); @@ -243,7 +243,7 @@ __global__ static void execEncoderKernelP3(void *dx, int *offsets, Nd4jLong N, v template __host__ void encoderKernelP3Generic(dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz) { execEncoderKernelP3<<>>(dx, offsets, N, dz); - nd4j::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "encoderP3(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT encoderKernelP3Generic, (dim3 &launchDims, cudaStream_t *stream, void *dx, int *offsets, Nd4jLong N, void *dz), LIBND4J_TYPES); @@ -259,7 +259,7 @@ template __host__ void decoderKernelGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz) { execDecoderKernel<<>>(dx, N, dz); - nd4j::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "execDecoder(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT decoderKernelGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES); @@ -276,7 +276,7 @@ template __host__ void cudaEncodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold) { execCudaEncodeBitmapKernel<<>>(vdx, N, dz, scalar, reductionBuffer, threshold); - nd4j::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) 
failed"); + sd::DebugHelper::checkErrorCode(stream, "encodeBitmap(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaEncodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *vdx, Nd4jLong N, int *dz, int *scalar, int *reductionBuffer, float threshold), LIBND4J_TYPES); @@ -293,7 +293,7 @@ template __host__ void cudaDecodeBitmapGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz) { execCudaDecodeBitmapKernel<<>>(dx, N, vdz); - nd4j::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "cudeDecodeBitmap(...) failed"); } BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong N, void *vdz), LIBND4J_TYPES); @@ -301,7 +301,7 @@ BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 & template __host__ void prescanLauncher(dim3 &blocks, dim3 &threads, int shmem, cudaStream_t *stream, int *g_odata, const int *g_idata, int *g_blockSums, int n, int blockIndex, int baseIndex) { prescan<<>>(g_odata, g_idata, g_blockSums, n, blockIndex, baseIndex); - nd4j::DebugHelper::checkErrorCode(stream, "prescan(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "prescan(...) 
failed"); }; template @@ -309,7 +309,7 @@ BUILD_SINGLE_TEMPLATE(template void ND4J_EXPORT cudaDecodeBitmapGeneric, (dim3 & auto x = reinterpret_cast(dx); auto z = reinterpret_cast(dz); - nd4j::convertKernelGeneric(x, N, z); + sd::convertKernelGeneric(x, N, z); } diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp index 36c95e731..16914bd86 100644 --- a/libnd4j/include/loops/impl/type_conversions.cpp +++ b/libnd4j/include/loops/impl/type_conversions.cpp @@ -19,12 +19,12 @@ // #include -#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { template _CUDA_H void TypeCast::convertFromQuantized(Nd4jPointer *extras, void *dx, Nd4jLong N, void *dz) { @@ -32,15 +32,15 @@ namespace nd4j { auto z = reinterpret_cast(dz); auto fx = reinterpret_cast(dx); - auto amin = nd4j::math::nd4j_abs(fx[0]); - auto amax = nd4j::math::nd4j_abs(fx[1]); + auto amin = sd::math::nd4j_abs(fx[0]); + auto amax = sd::math::nd4j_abs(fx[1]); auto x = reinterpret_cast(dx) + 8; for (Nd4jLong e = 0; e < N; e++) { - z[e] = static_cast(static_cast(x[e]) / static_cast(DataTypeUtils::max()) * nd4j::math::nd4j_max(amin, amax)); + z[e] = static_cast(static_cast(x[e]) / static_cast(DataTypeUtils::max()) * sd::math::nd4j_max(amin, amax)); } } @@ -76,13 +76,13 @@ namespace nd4j { fz[0] = min; fz[1] = max; - auto amax = nd4j::math::nd4j_abs(max); - auto amin = nd4j::math::nd4j_abs(min); + auto amax = sd::math::nd4j_abs(max); + auto amin = sd::math::nd4j_abs(min); // now we actually apply quantization auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { - rz[e] = static_cast(nd4j::math::nd4j_round( 1.0f * static_cast(x[e]) / nd4j::math::nd4j_max(amax, amin) * max_byte)); + rz[e] = static_cast(sd::math::nd4j_round( 1.0f * static_cast(x[e]) / sd::math::nd4j_max(amax, amin) * max_byte)); } }; @@ -179,7 +179,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write) auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; 
e++) { int el = x[e]; - int ael = nd4j::math::nd4j_abs(el) - 1; + int ael = sd::math::nd4j_abs(el) - 1; z[ael] += el > 0 ? static_cast(threshold) : static_cast(-threshold); } }; diff --git a/libnd4j/include/loops/indexreduce.h b/libnd4j/include/loops/indexreduce.h index ad4472dec..677d83db9 100755 --- a/libnd4j/include/loops/indexreduce.h +++ b/libnd4j/include/loops/indexreduce.h @@ -26,10 +26,10 @@ #ifdef _OPENMP #include #endif -#include +#include #include -#include -#include +#include +#include #include #ifdef __CUDACC__ @@ -41,7 +41,7 @@ #include -#include "../pairwise_util.h" +#include "system/pairwise_util.h" #include "legacy_ops.h" diff --git a/libnd4j/include/loops/pairwise_bool.h b/libnd4j/include/loops/pairwise_bool.h index f7a65c3f5..fee96df84 100644 --- a/libnd4j/include/loops/pairwise_bool.h +++ b/libnd4j/include/loops/pairwise_bool.h @@ -26,13 +26,13 @@ #ifdef _OPENMP #include #endif -#include +#include #include -#include -#include +#include +#include #include #include -#include +#include #include #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/pairwise_int.h b/libnd4j/include/loops/pairwise_int.h index aa6437d17..4144963c7 100644 --- a/libnd4j/include/loops/pairwise_int.h +++ b/libnd4j/include/loops/pairwise_int.h @@ -26,13 +26,13 @@ #ifdef _OPENMP #include #endif -#include +#include #include -#include -#include +#include +#include #include #include -#include +#include #include #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/pairwise_transform.h b/libnd4j/include/loops/pairwise_transform.h index 0109b309f..8576481f5 100755 --- a/libnd4j/include/loops/pairwise_transform.h +++ b/libnd4j/include/loops/pairwise_transform.h @@ -27,10 +27,10 @@ #include #endif -#include +#include #include #include -#include +#include #include #include "legacy_ops.h" #include diff --git a/libnd4j/include/loops/reduce3.h b/libnd4j/include/loops/reduce3.h index 178bac7c2..597e450b1 100755 --- a/libnd4j/include/loops/reduce3.h +++ b/libnd4j/include/loops/reduce3.h @@ 
-26,17 +26,17 @@ #define EXTRA_PARAMS_LENGTH 10 -#include +#include #ifdef _OPENMP #include #endif -#include -#include +#include +#include #include #include #include -#include -#include +#include +#include #include #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/reduce_bool.h b/libnd4j/include/loops/reduce_bool.h index 540a6041d..815557d41 100644 --- a/libnd4j/include/loops/reduce_bool.h +++ b/libnd4j/include/loops/reduce_bool.h @@ -17,18 +17,18 @@ #ifndef REDUCE_BOOL_H #define REDUCE_BOOL_H -#include +#include //#include #include #include #ifdef _OPENMP #include #endif -#include -#include -#include +#include +#include +#include #include -#include +#include #pragma once #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/reduce_float.h b/libnd4j/include/loops/reduce_float.h index ff2c0e668..6ff3f88ab 100644 --- a/libnd4j/include/loops/reduce_float.h +++ b/libnd4j/include/loops/reduce_float.h @@ -17,18 +17,18 @@ #ifndef REDUCE_FLOAT_H #define REDUCE_FLOAT_H -#include +#include //#include #include #include #ifdef _OPENMP #include #endif -#include -#include -#include +#include +#include +#include #include -#include +#include #pragma once #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/reduce_long.h b/libnd4j/include/loops/reduce_long.h index a5d2a9498..4c83e1057 100644 --- a/libnd4j/include/loops/reduce_long.h +++ b/libnd4j/include/loops/reduce_long.h @@ -17,18 +17,18 @@ #ifndef REDUCE_LONG_H #define REDUCE_LONG_H -#include +#include //#include #include #include #ifdef _OPENMP #include #endif -#include -#include -#include +#include +#include +#include #include -#include +#include #pragma once #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/reduce_same.h b/libnd4j/include/loops/reduce_same.h index e828ecf46..641551b6f 100644 --- a/libnd4j/include/loops/reduce_same.h +++ b/libnd4j/include/loops/reduce_same.h @@ -17,18 +17,18 @@ #ifndef REDUCE_SAME_H #define REDUCE_SAME_H -#include +#include //#include #include #include #ifdef _OPENMP #include #endif 
-#include -#include -#include +#include +#include +#include #include -#include +#include #pragma once #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/scalar.h b/libnd4j/include/loops/scalar.h index 0f32dedf3..dc3a5b16c 100755 --- a/libnd4j/include/loops/scalar.h +++ b/libnd4j/include/loops/scalar.h @@ -23,16 +23,16 @@ #ifndef SCALAR_H_ #define SCALAR_H_ -#include -#include +#include +#include #include #ifdef __JNI__ #include #endif -#include +#include #include -#include +#include #include "helpers/logger.h" #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/scalar_bool.h b/libnd4j/include/loops/scalar_bool.h index a5931ddfb..0b26531b2 100644 --- a/libnd4j/include/loops/scalar_bool.h +++ b/libnd4j/include/loops/scalar_bool.h @@ -23,16 +23,16 @@ #ifndef SCALAR_BOOL_H_ #define SCALAR_BOOL_H_ -#include +#include #ifdef __JNI__ #include #endif -#include +#include #include -#include +#include #include "helpers/logger.h" -#include +#include #include #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/scalar_int.h b/libnd4j/include/loops/scalar_int.h index 509d7574f..dde7af4c7 100644 --- a/libnd4j/include/loops/scalar_int.h +++ b/libnd4j/include/loops/scalar_int.h @@ -23,16 +23,16 @@ #ifndef SCALAR_INT_H_ #define SCALAR_INT_H_ -#include +#include #ifdef __JNI__ #include #endif -#include +#include #include -#include +#include #include "helpers/logger.h" -#include +#include #include #ifdef __CUDACC__ diff --git a/libnd4j/include/loops/special_kernels.h b/libnd4j/include/loops/special_kernels.h index df7c0c617..52cdb7fdd 100644 --- a/libnd4j/include/loops/special_kernels.h +++ b/libnd4j/include/loops/special_kernels.h @@ -23,9 +23,9 @@ #include -#include +#include #include -#include +#include #include #include #include @@ -33,7 +33,7 @@ #include #include -namespace nd4j { +namespace sd { template _CUDA_H void fillIsMaxGeneric(dim3 &launchDims, cudaStream_t *stream, void *dx, Nd4jLong *xShapeInfo, Nd4jLong length, long idx); diff --git 
a/libnd4j/include/loops/summarystatsreduce.h b/libnd4j/include/loops/summarystatsreduce.h index afaee9c47..0a429cd2b 100755 --- a/libnd4j/include/loops/summarystatsreduce.h +++ b/libnd4j/include/loops/summarystatsreduce.h @@ -23,8 +23,8 @@ #ifndef SUMMARYSTATSREDUCE_H_ #define SUMMARYSTATSREDUCE_H_ -#include -#include +#include +#include #include #ifdef __CUDACC__ @@ -41,7 +41,7 @@ #endif #include -#include +#include #include "legacy_ops.h" @@ -123,7 +123,7 @@ namespace functions { return M2 / n; } - _CUDA_HD double skewness() { return M2 > 0.0 ? nd4j::math::nd4j_sqrt(n) * M3 / nd4j::math::nd4j_pow(M2, 1.5) : 0.0; } + _CUDA_HD double skewness() { return M2 > 0.0 ? sd::math::nd4j_sqrt(n) * M3 / sd::math::nd4j_pow(M2, 1.5) : 0.0; } _CUDA_HD double kurtosis() { return M2 > 0.0 ? n * M4 / (M2 * M2) : 0; } @@ -247,8 +247,8 @@ namespace functions { //Basic number of samples (n), min, and max vz.n = n; - vz.min = nd4j::math::nd4j_min(x.min, y.min); - vz.max = nd4j::math::nd4j_max(x.max, y.max); + vz.min = sd::math::nd4j_min(x.min, y.min); + vz.max = sd::math::nd4j_max(x.max, y.max); double meanD = x.mean + delta * y.n / n; vz.mean = meanD; double M2D = x.M2 + y.M2; diff --git a/libnd4j/include/loops/transform_any.h b/libnd4j/include/loops/transform_any.h index d97e3e90e..22d56a4d3 100644 --- a/libnd4j/include/loops/transform_any.h +++ b/libnd4j/include/loops/transform_any.h @@ -25,14 +25,14 @@ #ifndef TRANSFORM_ANY_H_ #define TRANSFORM_ANY_H_ #include -#include +#include #include #ifdef _OPENMP #include #endif -#include -#include +#include +#include //#include //#include diff --git a/libnd4j/include/loops/transform_bool.h b/libnd4j/include/loops/transform_bool.h index 4c87ae58c..56a7f8f7e 100644 --- a/libnd4j/include/loops/transform_bool.h +++ b/libnd4j/include/loops/transform_bool.h @@ -25,14 +25,14 @@ #ifndef TRANSFORM_BOOL_H_ #define TRANSFORM_BOOL_H_ #include -#include +#include #include #ifdef _OPENMP #include #endif -#include -#include +#include +#include //#include 
//#include diff --git a/libnd4j/include/loops/transform_float.h b/libnd4j/include/loops/transform_float.h index ae28e069f..1d9b6fb71 100644 --- a/libnd4j/include/loops/transform_float.h +++ b/libnd4j/include/loops/transform_float.h @@ -25,14 +25,14 @@ #ifndef TRANSFORM_FLOAT_H_ #define TRANSFORM_FLOAT_H_ #include -#include +#include #include #ifdef _OPENMP #include #endif -#include -#include +#include +#include //#include //#include diff --git a/libnd4j/include/loops/transform_same.h b/libnd4j/include/loops/transform_same.h index ae5b498e6..cb36ba872 100644 --- a/libnd4j/include/loops/transform_same.h +++ b/libnd4j/include/loops/transform_same.h @@ -25,14 +25,14 @@ #ifndef TRANSFORM_SAME_H_ #define TRANSFORM_SAME_H_ #include -#include +#include #include #ifdef _OPENMP #include #endif -#include -#include +#include +#include //#include //#include diff --git a/libnd4j/include/loops/transform_strict.h b/libnd4j/include/loops/transform_strict.h index 96917ebc1..b7ba63e46 100644 --- a/libnd4j/include/loops/transform_strict.h +++ b/libnd4j/include/loops/transform_strict.h @@ -25,14 +25,14 @@ #ifndef TRANSFORM_STRICT_H_ #define TRANSFORM_STRICT_H_ #include -#include +#include #include #ifdef _OPENMP #include #endif -#include -#include +#include +#include //#include //#include diff --git a/libnd4j/include/loops/type_conversions.h b/libnd4j/include/loops/type_conversions.h index d6029d7af..ff5ac5400 100644 --- a/libnd4j/include/loops/type_conversions.h +++ b/libnd4j/include/loops/type_conversions.h @@ -36,20 +36,20 @@ #define ND4J_FLOAT24 119 // not supported after all. might want to add support later. 
#include -#include +#include #include #include #include #include #include #include -#include +#include #define NUM_BANKS 32 #define LOG_NUM_BANKS 4 -namespace nd4j { +namespace sd { typedef union { float f_; @@ -121,7 +121,7 @@ namespace nd4j { //basically, for phase One we want do calculation: how many eligible values we have, and which blocks will be holding data Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; - int pass = tid < N && nd4j::math::nd4j_abs(x[tid]) >= static_cast(threshold) ? 1 : 0; + int pass = tid < N && sd::math::nd4j_abs(x[tid]) >= static_cast(threshold) ? 1 : 0; int bp=__syncthreads_count(pass); if (threadIdx.x == 0) { @@ -168,7 +168,7 @@ namespace nd4j { if (tid < N) { T value = x[tid]; - int pred = nd4j::math::nd4j_abs(value) >= static_cast(threshold) ? 1 : 0; + int pred = sd::math::nd4j_abs(value) >= static_cast(threshold) ? 1 : 0; int w_i = threadIdx.x/warpSize; //warp index int w_l = tid % warpSize;//thread index within a warp unsigned int t_m = INT_MAX >> (warpSize-w_l-1); //thread mask (ERROR IN THE PAPER minus one is required) @@ -231,7 +231,7 @@ namespace nd4j { for (int e = tid; e < limit; e += blockDim.x * gridDim.x) { int el = x[e+4]; - int ael = nd4j::math::nd4j_abs(el) - 1; + int ael = sd::math::nd4j_abs(el) - 1; // TODO: investigate, if += would work better here, as in "decoded accumulation" z[ael] += el > 0 ? threshold : -threshold; @@ -267,7 +267,7 @@ namespace nd4j { for (Nd4jLong i = tid; i < loopLimit; i += blockDim.x * gridDim.x) { // all threads in block reading stuff T val = i < N ? 
dx[i] : off; - T abs = nd4j::math::nd4j_abs(val); + T abs = sd::math::nd4j_abs(val); int byteId = i / 16 + 4; int bitId = i % 16; diff --git a/libnd4j/include/platformmath.h b/libnd4j/include/math/platformmath.h similarity index 99% rename from libnd4j/include/platformmath.h rename to libnd4j/include/math/platformmath.h index 5c0a1d07f..e4990cc87 100644 --- a/libnd4j/include/platformmath.h +++ b/libnd4j/include/math/platformmath.h @@ -23,7 +23,7 @@ #include #include -#include +#include #include #ifdef __CUDACC__ @@ -84,7 +84,7 @@ union PAIR { #endif -namespace nd4j { +namespace sd { namespace math { template math_def FORCEINLINE T p_exp(T value); @@ -838,7 +838,7 @@ namespace nd4j { template <> math_def FORCEINLINE uint64_t _rotate_left(uint64_t value, uint64_t shift) { -#ifdef ARM_BUILD +#ifdef SD_ARM_BUILD // TODO: eventually remove this once gcc fixes the bug Nd4jLong val = _rotate_left(*reinterpret_cast(&value), *reinterpret_cast(&shift)); return *reinterpret_cast(&val); @@ -849,7 +849,7 @@ namespace nd4j { template <> math_def FORCEINLINE uint64_t _rotate_right(uint64_t value, uint64_t shift) { -#ifdef ARM_BUILD +#ifdef SD_ARM_BUILD // TODO: eventually remove this once gcc fixes the bug Nd4jLong val = _rotate_right(*reinterpret_cast(&value), *reinterpret_cast(&shift)); return *reinterpret_cast(&val); diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/math/templatemath.h similarity index 97% rename from libnd4j/include/templatemath.h rename to libnd4j/include/math/templatemath.h index 6163488e3..c220231d8 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/math/templatemath.h @@ -25,10 +25,10 @@ #ifndef TEMPLATEMATH_H_ #define TEMPLATEMATH_H_ -#include -#include -#include -#include +#include +#include +#include +#include #define BFLOAT16_MAX_VALUE 32737. #define HALF_MAX_VALUE 65504. 
@@ -44,7 +44,7 @@ #define M_PI 3.14159265358979323846 #endif -namespace nd4j { +namespace sd { #ifdef __CUDACC__ #endif @@ -200,7 +200,7 @@ namespace nd4j { template math_def inline Z nd4j_softsign(T val) { - return val / ((T) 1.0f + nd4j::math::nd4j_abs(val)); + return val / ((T) 1.0f + sd::math::nd4j_abs(val)); } template @@ -584,7 +584,7 @@ namespace nd4j { template math_def inline bool nd4j_eq(T d1, T d2, double eps) { - if (nd4j::math::nd4j_isinf(d1) && nd4j::math::nd4j_isinf(d2)) { + if (sd::math::nd4j_isinf(d1) && sd::math::nd4j_isinf(d2)) { if (d1 > 0 && d2 > 0) return true; else if (d1 < 0 && d2 < 0) @@ -593,7 +593,7 @@ namespace nd4j { return false; } - auto diff = static_cast(nd4j::math::nd4j_abs(d1 - d2)); + auto diff = static_cast(sd::math::nd4j_abs(d1 - d2)); // works well except in the range of very large numbers @@ -602,7 +602,7 @@ namespace nd4j { // Knuth approach // works well except in the range of very small numbers - if (diff <= nd4j::math::nd4j_max(nd4j::math::nd4j_abs(static_cast(d1)), nd4j::math::nd4j_abs(static_cast(d2))) * eps) + if (diff <= sd::math::nd4j_max(sd::math::nd4j_abs(static_cast(d1)), sd::math::nd4j_abs(static_cast(d2))) * eps) return true; return false; @@ -789,7 +789,7 @@ namespace nd4j { X t = static_cast(2.0f); X e = static_cast(M_E); - auto p = nd4j::math::nd4j_pow(e, val * t); + auto p = sd::math::nd4j_pow(e, val * t); return (p - o)/ (p + o); } @@ -799,7 +799,7 @@ namespace nd4j { X t = static_cast(-2.0f); X e = static_cast(M_E); - auto p = nd4j::math::nd4j_pow(e, val * t); + auto p = sd::math::nd4j_pow(e, val * t); return (o - p) / (o + p); } @@ -807,7 +807,7 @@ namespace nd4j { math_def inline float neu_tanh(float val, float sign) { float e(M_E); float av = sign * val; - auto p = nd4j::math::nd4j_pow(e, -av * 2.f); + auto p = sd::math::nd4j_pow(e, -av * 2.f); return (1 - p) / (1 + p); } @@ -949,7 +949,7 @@ namespace nd4j { // return DataTypeUtils::infOrMax(); } - return 
nd4j::math::nd4j_exp(nd4j::math::nd4j_lgamma(a)); + return sd::math::nd4j_exp(sd::math::nd4j_lgamma(a)); } template @@ -1705,29 +1705,29 @@ inline __device__ bfloat16 nd4j_atomicDiv(bfloat16* address, bfloat16 #endif #pragma omp declare reduction(maxTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + omp_out = sd::math::nd4j_max(omp_in, omp_out) )\ initializer (omp_priv=-MAX_FLOAT) #pragma omp declare reduction(minTF : float,double,float16,bfloat16 : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + omp_out = sd::math::nd4j_min(omp_in, omp_out) )\ initializer (omp_priv=MAX_FLOAT) #pragma omp declare reduction(maxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(omp_in, omp_out) )\ + omp_out = sd::math::nd4j_max(omp_in, omp_out) )\ initializer (omp_priv=0) #pragma omp declare reduction(minT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(omp_in, omp_out) )\ + omp_out = sd::math::nd4j_min(omp_in, omp_out) )\ initializer (omp_priv=0) #pragma omp declare reduction(amaxT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_max(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + omp_out = sd::math::nd4j_max(sd::math::nd4j_abs(omp_in), sd::math::nd4j_abs(omp_out)) ) #pragma omp declare reduction(aminT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = nd4j::math::nd4j_min(nd4j::math::nd4j_abs(omp_in), nd4j::math::nd4j_abs(omp_out)) ) + omp_out = sd::math::nd4j_min(sd::math::nd4j_abs(omp_in), sd::math::nd4j_abs(omp_out)) ) #pragma omp declare reduction(asumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ - omp_out = 
nd4j::math::nd4j_abs(omp_in) + nd4j::math::nd4j_abs(omp_out))\ + omp_out = sd::math::nd4j_abs(omp_in) + sd::math::nd4j_abs(omp_out))\ initializer (omp_priv=0) #pragma omp declare reduction(sumT : float,double,float16,bfloat16,int,Nd4jLong,Nd4jULong,int8_t,uint8_t,bool,int16_t,uint16_t,uint32_t : \ diff --git a/libnd4j/include/memory/AllocationEntry.h b/libnd4j/include/memory/AllocationEntry.h index 3e690466b..815a5c992 100644 --- a/libnd4j/include/memory/AllocationEntry.h +++ b/libnd4j/include/memory/AllocationEntry.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_ALLOCATIONENTRY_H #define DEV_TESTS_ALLOCATIONENTRY_H -#include +#include #include #include -namespace nd4j { +namespace sd { namespace memory { class AllocationEntry { private: diff --git a/libnd4j/include/memory/ExternalWorkspace.h b/libnd4j/include/memory/ExternalWorkspace.h index c1f3b4864..772afc608 100644 --- a/libnd4j/include/memory/ExternalWorkspace.h +++ b/libnd4j/include/memory/ExternalWorkspace.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_EXTERNALWORKSPACE_H #define LIBND4J_EXTERNALWORKSPACE_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace memory { class ND4J_EXPORT ExternalWorkspace { private: diff --git a/libnd4j/include/memory/MemoryCounter.h b/libnd4j/include/memory/MemoryCounter.h index bf8ff60dc..91aaeecff 100644 --- a/libnd4j/include/memory/MemoryCounter.h +++ b/libnd4j/include/memory/MemoryCounter.h @@ -21,13 +21,13 @@ #ifndef SD_MEMORYCOUNTER_H #define SD_MEMORYCOUNTER_H -#include -#include +#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace memory { /** * This class provides simple per-device counter @@ -44,13 +44,13 @@ namespace nd4j { // TODO: change this wrt heterogenous stuff on next iteration // per-group counters - std::map _groupCounters; + std::map _groupCounters; // per-device limits std::map _deviceLimits; // per-group limits - std::map _groupLimits; + std::map _groupLimits; MemoryCounter(); ~MemoryCounter() = 
default; @@ -79,7 +79,7 @@ namespace nd4j { * @param numBytes * @return TRUE if allocated ammount will keep us below limit, FALSE otherwise */ - bool validateGroup(nd4j::memory::MemoryType group, Nd4jLong numBytes); + bool validateGroup(sd::memory::MemoryType group, Nd4jLong numBytes); /** * This method adds specified number of bytes to specified counter @@ -87,7 +87,7 @@ namespace nd4j { * @param numBytes */ void countIn(int deviceId, Nd4jLong numBytes); - void countIn(nd4j::memory::MemoryType group, Nd4jLong numBytes); + void countIn(sd::memory::MemoryType group, Nd4jLong numBytes); /** * This method subtracts specified number of bytes from specified counter @@ -95,7 +95,7 @@ namespace nd4j { * @param numBytes */ void countOut(int deviceId, Nd4jLong numBytes); - void countOut(nd4j::memory::MemoryType group, Nd4jLong numBytes); + void countOut(sd::memory::MemoryType group, Nd4jLong numBytes); /** * This method returns amount of memory allocated on specified device @@ -109,7 +109,7 @@ namespace nd4j { * @param group * @return */ - Nd4jLong allocatedGroup(nd4j::memory::MemoryType group); + Nd4jLong allocatedGroup(sd::memory::MemoryType group); /** * This method allows to set per-device memory limits @@ -130,14 +130,14 @@ namespace nd4j { * @param group * @param numBytes */ - void setGroupLimit(nd4j::memory::MemoryType group, Nd4jLong numBytes); + void setGroupLimit(sd::memory::MemoryType group, Nd4jLong numBytes); /** * This method returns current group limit in bytes * @param group * @return */ - Nd4jLong groupLimit(nd4j::memory::MemoryType group); + Nd4jLong groupLimit(sd::memory::MemoryType group); }; } } diff --git a/libnd4j/include/memory/MemoryRegistrator.h b/libnd4j/include/memory/MemoryRegistrator.h index a286923ad..ad1b0333a 100644 --- a/libnd4j/include/memory/MemoryRegistrator.h +++ b/libnd4j/include/memory/MemoryRegistrator.h @@ -22,13 +22,13 @@ #define LIBND4J_MEMORYREGISTRATOR_H #include "Workspace.h" -#include +#include #include #include #include 
-#include +#include -namespace nd4j { +namespace sd { namespace memory { class ND4J_EXPORT MemoryRegistrator { protected: diff --git a/libnd4j/include/memory/MemoryReport.h b/libnd4j/include/memory/MemoryReport.h index 636178d45..647886ab5 100644 --- a/libnd4j/include/memory/MemoryReport.h +++ b/libnd4j/include/memory/MemoryReport.h @@ -21,10 +21,10 @@ #ifndef LIBND4J_MEMORYREPORT_H #define LIBND4J_MEMORYREPORT_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace memory { class ND4J_EXPORT MemoryReport { private: diff --git a/libnd4j/include/memory/MemoryTracker.h b/libnd4j/include/memory/MemoryTracker.h index 097d2903d..38bb926ca 100644 --- a/libnd4j/include/memory/MemoryTracker.h +++ b/libnd4j/include/memory/MemoryTracker.h @@ -23,12 +23,12 @@ #include #include -#include +#include #include #include "AllocationEntry.h" -#include +#include -namespace nd4j { +namespace sd { namespace memory { /** * This class is used for tracking memory allocation wrt their allocation points in code diff --git a/libnd4j/include/memory/MemoryType.h b/libnd4j/include/memory/MemoryType.h index 9a2e9b0e3..113d8d16d 100644 --- a/libnd4j/include/memory/MemoryType.h +++ b/libnd4j/include/memory/MemoryType.h @@ -5,7 +5,7 @@ #ifndef DEV_TESTS_MEMORYTYPE_H #define DEV_TESTS_MEMORYTYPE_H -namespace nd4j { +namespace sd { namespace memory { enum MemoryType { HOST = 0, diff --git a/libnd4j/include/memory/MemoryUtils.h b/libnd4j/include/memory/MemoryUtils.h index 5fe27898c..027008238 100644 --- a/libnd4j/include/memory/MemoryUtils.h +++ b/libnd4j/include/memory/MemoryUtils.h @@ -22,9 +22,9 @@ #define LIBND4J_MEMORYUTILS_H #include "MemoryReport.h" -#include +#include -namespace nd4j { +namespace sd { namespace memory { class ND4J_EXPORT MemoryUtils { public: diff --git a/libnd4j/include/memory/Workspace.h b/libnd4j/include/memory/Workspace.h index 269ed9ca7..c97f6a178 100644 --- a/libnd4j/include/memory/Workspace.h +++ b/libnd4j/include/memory/Workspace.h @@ 
-27,13 +27,13 @@ #include #include #include -#include -#include +#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace memory { class ND4J_EXPORT Workspace { diff --git a/libnd4j/include/memory/cpu/Workspace.cpp b/libnd4j/include/memory/cpu/Workspace.cpp index e297e28b0..ae60f1eea 100644 --- a/libnd4j/include/memory/cpu/Workspace.cpp +++ b/libnd4j/include/memory/cpu/Workspace.cpp @@ -21,17 +21,17 @@ // -#include +#include #include #include #include #include "../Workspace.h" #include -#include +#include #include -namespace nd4j { +namespace sd { namespace memory { Workspace::Workspace(ExternalWorkspace *external) { if (external->sizeHost() > 0) { @@ -182,7 +182,7 @@ namespace nd4j { return _spillsSize.load(); } - void* Workspace::allocateBytes(nd4j::memory::MemoryType type, Nd4jLong numBytes) { + void* Workspace::allocateBytes(sd::memory::MemoryType type, Nd4jLong numBytes) { if (type == DEVICE) throw std::runtime_error("CPU backend doesn't have device memory"); @@ -211,7 +211,7 @@ namespace nd4j { Workspace* Workspace::clone() { // for clone we take whatever is higher: current allocated size, or allocated size of current loop - return new Workspace(nd4j::math::nd4j_max(this->getCurrentSize(), this->_cycleAllocations.load())); + return new Workspace(sd::math::nd4j_max(this->getCurrentSize(), this->_cycleAllocations.load())); } } } diff --git a/libnd4j/include/memory/cuda/Workspace.cu b/libnd4j/include/memory/cuda/Workspace.cu index aeb6b4752..9d2286156 100644 --- a/libnd4j/include/memory/cuda/Workspace.cu +++ b/libnd4j/include/memory/cuda/Workspace.cu @@ -20,20 +20,20 @@ // @author raver119@gmail.com // -#include +#include #include #include #include #include "../Workspace.h" #include -#include +#include #include -#include +#include #include #include -namespace nd4j { +namespace sd { namespace memory { Workspace::Workspace(ExternalWorkspace *external) { if (external->sizeHost() > 0) { @@ -162,7 +162,7 @@ namespace nd4j { void* 
Workspace::allocateBytes(Nd4jLong numBytes) { - return allocateBytes(nd4j::memory::MemoryType::HOST, numBytes); + return allocateBytes(sd::memory::MemoryType::HOST, numBytes); } Nd4jLong Workspace::getAllocatedSize() { @@ -183,7 +183,7 @@ namespace nd4j { return _spillsSize.load(); } - void* Workspace::allocateBytes(nd4j::memory::MemoryType type, Nd4jLong numBytes) { + void* Workspace::allocateBytes(sd::memory::MemoryType type, Nd4jLong numBytes) { switch (type) { case HOST: { if (numBytes < 1) @@ -270,7 +270,7 @@ namespace nd4j { Workspace* Workspace::clone() { // for clone we take whatever is higher: current allocated size, or allocated size of current loop - return new Workspace(nd4j::math::nd4j_max(this->getCurrentSize(), this->_cycleAllocations.load())); + return new Workspace(sd::math::nd4j_max(this->getCurrentSize(), this->_cycleAllocations.load())); } Nd4jLong Workspace::getAllocatedSecondarySize() { diff --git a/libnd4j/include/memory/impl/AllocationEntry.cpp b/libnd4j/include/memory/impl/AllocationEntry.cpp index c0df16a3f..6b4d85bb1 100644 --- a/libnd4j/include/memory/impl/AllocationEntry.cpp +++ b/libnd4j/include/memory/impl/AllocationEntry.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace memory { AllocationEntry::AllocationEntry(MemoryType type, Nd4jLong ptr, Nd4jLong numBytes, std::string &stack) { _pointer = ptr; diff --git a/libnd4j/include/memory/impl/ExternalWorkspace.cpp b/libnd4j/include/memory/impl/ExternalWorkspace.cpp index fffba2216..c4feb181d 100644 --- a/libnd4j/include/memory/impl/ExternalWorkspace.cpp +++ b/libnd4j/include/memory/impl/ExternalWorkspace.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace memory { ExternalWorkspace::ExternalWorkspace(Nd4jPointer ptrH, Nd4jLong sizeH, Nd4jPointer ptrD, Nd4jLong sizeD) { _ptrH = ptrH; diff --git a/libnd4j/include/memory/impl/MemoryCounter.cpp b/libnd4j/include/memory/impl/MemoryCounter.cpp index 0dc845e37..96be34681 100644 --- 
a/libnd4j/include/memory/impl/MemoryCounter.cpp +++ b/libnd4j/include/memory/impl/MemoryCounter.cpp @@ -20,14 +20,14 @@ #include "../MemoryCounter.h" #include -#include +#include #include -namespace nd4j { +namespace sd { namespace memory { MemoryCounter::MemoryCounter() { - auto numDevices = nd4j::AffinityManager::numberOfDevices(); + auto numDevices = sd::AffinityManager::numberOfDevices(); // setting default 0s for (int e = 0; e < numDevices; e++) { @@ -36,12 +36,12 @@ namespace nd4j { } // setting initial values for limits - _groupLimits[nd4j::memory::MemoryType::HOST] = nd4j::Environment::getInstance()->maxPrimaryMemory(); - _groupLimits[nd4j::memory::MemoryType::DEVICE] = nd4j::Environment::getInstance()->maxSpecialMemory(); + _groupLimits[sd::memory::MemoryType::HOST] = sd::Environment::getInstance()->maxPrimaryMemory(); + _groupLimits[sd::memory::MemoryType::DEVICE] = sd::Environment::getInstance()->maxSpecialMemory(); // setting initial counter values - _groupCounters[nd4j::memory::MemoryType::HOST] = 0; - _groupCounters[nd4j::memory::MemoryType::DEVICE] = 0; + _groupCounters[sd::memory::MemoryType::HOST] = 0; + _groupCounters[sd::memory::MemoryType::DEVICE] = 0; } MemoryCounter* MemoryCounter::getInstance() { @@ -56,7 +56,7 @@ namespace nd4j { _deviceCounters[deviceId] += numBytes; } - void MemoryCounter::countIn(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + void MemoryCounter::countIn(sd::memory::MemoryType group, Nd4jLong numBytes) { std::lock_guard lock(_locker); _groupCounters[group] += numBytes; } @@ -66,13 +66,13 @@ namespace nd4j { _deviceCounters[deviceId] -= numBytes; } - void MemoryCounter::countOut(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + void MemoryCounter::countOut(sd::memory::MemoryType group, Nd4jLong numBytes) { std::lock_guard lock(_locker); _groupCounters[group] -= numBytes; } bool MemoryCounter::validate(Nd4jLong numBytes) { - auto deviceId = nd4j::AffinityManager::currentDeviceId(); + auto deviceId = 
sd::AffinityManager::currentDeviceId(); return validateDevice(deviceId, numBytes); } @@ -87,7 +87,7 @@ namespace nd4j { return numBytes + dAlloc <= dLimit; } - bool MemoryCounter::validateGroup(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + bool MemoryCounter::validateGroup(sd::memory::MemoryType group, Nd4jLong numBytes) { std::lock_guard lock(_locker); auto gLimit = _groupLimits[group]; if (gLimit <= 0) @@ -103,7 +103,7 @@ namespace nd4j { return _deviceCounters[deviceId]; } - Nd4jLong MemoryCounter::allocatedGroup(nd4j::memory::MemoryType group) { + Nd4jLong MemoryCounter::allocatedGroup(sd::memory::MemoryType group) { std::lock_guard lock(_locker); return _groupCounters[group]; } @@ -113,7 +113,7 @@ namespace nd4j { _deviceLimits[deviceId] = numBytes; } - void MemoryCounter::setGroupLimit(nd4j::memory::MemoryType group, Nd4jLong numBytes) { + void MemoryCounter::setGroupLimit(sd::memory::MemoryType group, Nd4jLong numBytes) { std::lock_guard lock(_locker); _groupLimits[group] = numBytes; } @@ -123,7 +123,7 @@ namespace nd4j { return _deviceLimits[deviceId]; } - Nd4jLong MemoryCounter::groupLimit(nd4j::memory::MemoryType group) { + Nd4jLong MemoryCounter::groupLimit(sd::memory::MemoryType group) { std::lock_guard lock(_locker); return _groupLimits[group]; } diff --git a/libnd4j/include/memory/impl/MemoryRegistrator.cpp b/libnd4j/include/memory/impl/MemoryRegistrator.cpp index 0eba40af9..31b4b0eae 100644 --- a/libnd4j/include/memory/impl/MemoryRegistrator.cpp +++ b/libnd4j/include/memory/impl/MemoryRegistrator.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace memory { MemoryRegistrator::MemoryRegistrator() { diff --git a/libnd4j/include/memory/impl/MemoryReport.cpp b/libnd4j/include/memory/impl/MemoryReport.cpp index 533f7ac4f..0c623b0ce 100644 --- a/libnd4j/include/memory/impl/MemoryReport.cpp +++ b/libnd4j/include/memory/impl/MemoryReport.cpp @@ -20,42 +20,42 @@ #include "memory/MemoryReport.h" -bool 
nd4j::memory::MemoryReport::operator<(const nd4j::memory::MemoryReport &other) const { +bool sd::memory::MemoryReport::operator<(const sd::memory::MemoryReport &other) const { return this->_rss < other._rss; } -bool nd4j::memory::MemoryReport::operator>(const nd4j::memory::MemoryReport &other) const { +bool sd::memory::MemoryReport::operator>(const sd::memory::MemoryReport &other) const { return this->_rss > other._rss; } -bool nd4j::memory::MemoryReport::operator==(const nd4j::memory::MemoryReport &other) const { +bool sd::memory::MemoryReport::operator==(const sd::memory::MemoryReport &other) const { return this->_rss == other._rss; } -bool nd4j::memory::MemoryReport::operator!=(const nd4j::memory::MemoryReport &other) const { +bool sd::memory::MemoryReport::operator!=(const sd::memory::MemoryReport &other) const { return this->_rss != other._rss; } -bool nd4j::memory::MemoryReport::operator<=(const nd4j::memory::MemoryReport &other) const { +bool sd::memory::MemoryReport::operator<=(const sd::memory::MemoryReport &other) const { return this->_rss <= other._rss; } -bool nd4j::memory::MemoryReport::operator>=(const nd4j::memory::MemoryReport &other) const { +bool sd::memory::MemoryReport::operator>=(const sd::memory::MemoryReport &other) const { return this->_rss >= other._rss; } -Nd4jLong nd4j::memory::MemoryReport::getVM() const { +Nd4jLong sd::memory::MemoryReport::getVM() const { return _vm; } -void nd4j::memory::MemoryReport::setVM(Nd4jLong _vm) { +void sd::memory::MemoryReport::setVM(Nd4jLong _vm) { MemoryReport::_vm = _vm; } -Nd4jLong nd4j::memory::MemoryReport::getRSS() const { +Nd4jLong sd::memory::MemoryReport::getRSS() const { return _rss; } -void nd4j::memory::MemoryReport::setRSS(Nd4jLong _rss) { +void sd::memory::MemoryReport::setRSS(Nd4jLong _rss) { MemoryReport::_rss = _rss; } diff --git a/libnd4j/include/memory/impl/MemoryTracker.cpp b/libnd4j/include/memory/impl/MemoryTracker.cpp index 53a64f14d..be3019b08 100644 --- 
a/libnd4j/include/memory/impl/MemoryTracker.cpp +++ b/libnd4j/include/memory/impl/MemoryTracker.cpp @@ -25,7 +25,7 @@ #include -#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(ANDROID_BUILD) && !defined(IOS_BUILD) && !defined(APPLE_BUILD) +#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(SD_ANDROID_BUILD) && !defined(SD_IOS_BUILD) && !defined(SD_APPLE_BUILD) #include #include @@ -33,7 +33,7 @@ #endif -namespace nd4j { +namespace sd { namespace memory { MemoryTracker::MemoryTracker() { @@ -47,7 +47,7 @@ namespace nd4j { return _INSTANCE; } -#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(ANDROID_BUILD) && !defined(IOS_BUILD) && !defined(APPLE_BUILD) +#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(SD_ANDROID_BUILD) && !defined(SD_IOS_BUILD) && !defined(SD_APPLE_BUILD) std::string demangle(char *message) { char *mangled_name = 0, *offset_begin = 0, *offset_end = 0; @@ -95,7 +95,7 @@ namespace nd4j { #endif void MemoryTracker::countIn(MemoryType type, Nd4jPointer ptr, Nd4jLong numBytes) { -#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(ANDROID_BUILD) && !defined(IOS_BUILD) && !defined(APPLE_BUILD) +#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(SD_ANDROID_BUILD) && !defined(SD_IOS_BUILD) && !defined(SD_APPLE_BUILD) if (Environment::getInstance()->isDetectingLeaks()) { auto lptr = reinterpret_cast(ptr); @@ -129,7 +129,7 @@ namespace nd4j { } void MemoryTracker::countOut(Nd4jPointer ptr) { -#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(ANDROID_BUILD) && !defined(IOS_BUILD) && !defined(APPLE_BUILD) +#if defined(__GNUC__) && !defined(__MINGW64__) && !defined(SD_ANDROID_BUILD) && !defined(SD_IOS_BUILD) && !defined(SD_APPLE_BUILD) if (Environment::getInstance()->isDetectingLeaks()) { auto lptr = reinterpret_cast(ptr); diff --git a/libnd4j/include/memory/impl/MemoryUtils.cpp b/libnd4j/include/memory/impl/MemoryUtils.cpp index ac611fd64..8500a044e 100644 --- 
a/libnd4j/include/memory/impl/MemoryUtils.cpp +++ b/libnd4j/include/memory/impl/MemoryUtils.cpp @@ -35,7 +35,7 @@ #endif -bool nd4j::memory::MemoryUtils::retrieveMemoryStatistics(nd4j::memory::MemoryReport &report) { +bool sd::memory::MemoryUtils::retrieveMemoryStatistics(sd::memory::MemoryReport &report) { #if defined(__APPLE__) nd4j_debug("APPLE route\n", ""); /* diff --git a/libnd4j/include/ops/BroadcastBoolOpsTuple.h b/libnd4j/include/ops/BroadcastBoolOpsTuple.h index 7b0f96505..188186b4c 100644 --- a/libnd4j/include/ops/BroadcastBoolOpsTuple.h +++ b/libnd4j/include/ops/BroadcastBoolOpsTuple.h @@ -21,28 +21,28 @@ #ifndef DEV_TESTS_BROADCASTBOOLOPSTUPLE_H #define DEV_TESTS_BROADCASTBOOLOPSTUPLE_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT BroadcastBoolOpsTuple { private: public: - nd4j::scalar::BoolOps s; - nd4j::pairwise::BoolOps p; - nd4j::broadcast::BoolOps b; + sd::scalar::BoolOps s; + sd::pairwise::BoolOps p; + sd::broadcast::BoolOps b; BroadcastBoolOpsTuple() = default; ~BroadcastBoolOpsTuple() = default; - BroadcastBoolOpsTuple(nd4j::scalar::BoolOps scalar, nd4j::pairwise::BoolOps pairwise, nd4j::broadcast::BoolOps broadcast) { + BroadcastBoolOpsTuple(sd::scalar::BoolOps scalar, sd::pairwise::BoolOps pairwise, sd::broadcast::BoolOps broadcast) { s = scalar; p = pairwise; b = broadcast; } - static BroadcastBoolOpsTuple custom(nd4j::scalar::BoolOps scalar, nd4j::pairwise::BoolOps pairwise, nd4j::broadcast::BoolOps broadcast); + static BroadcastBoolOpsTuple custom(sd::scalar::BoolOps scalar, sd::pairwise::BoolOps pairwise, sd::broadcast::BoolOps broadcast); }; } diff --git a/libnd4j/include/ops/BroadcastIntOpsTuple.h b/libnd4j/include/ops/BroadcastIntOpsTuple.h index c96244b1a..258719004 100644 --- a/libnd4j/include/ops/BroadcastIntOpsTuple.h +++ b/libnd4j/include/ops/BroadcastIntOpsTuple.h @@ -21,28 +21,28 @@ #ifndef DEV_TESTS_BROADCASTINTOPSTUPLE_H #define DEV_TESTS_BROADCASTINTOPSTUPLE_H -#include -#include 
+#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT BroadcastIntOpsTuple { private: public: - nd4j::scalar::IntOps s; - nd4j::pairwise::IntOps p; - nd4j::broadcast::IntOps b; + sd::scalar::IntOps s; + sd::pairwise::IntOps p; + sd::broadcast::IntOps b; BroadcastIntOpsTuple() = default; ~BroadcastIntOpsTuple() = default; - BroadcastIntOpsTuple(nd4j::scalar::IntOps scalar, nd4j::pairwise::IntOps pairwise, nd4j::broadcast::IntOps broadcast) { + BroadcastIntOpsTuple(sd::scalar::IntOps scalar, sd::pairwise::IntOps pairwise, sd::broadcast::IntOps broadcast) { s = scalar; p = pairwise; b = broadcast; } - static BroadcastIntOpsTuple custom(nd4j::scalar::IntOps scalar, nd4j::pairwise::IntOps pairwise, nd4j::broadcast::IntOps broadcast); + static BroadcastIntOpsTuple custom(sd::scalar::IntOps scalar, sd::pairwise::IntOps pairwise, sd::broadcast::IntOps broadcast); }; } diff --git a/libnd4j/include/ops/BroadcastOpsTuple.h b/libnd4j/include/ops/BroadcastOpsTuple.h index 1bcd2df8b..34e2c6039 100644 --- a/libnd4j/include/ops/BroadcastOpsTuple.h +++ b/libnd4j/include/ops/BroadcastOpsTuple.h @@ -21,28 +21,28 @@ #ifndef DEV_TESTS_BROADCASTOPSTUPLE_H #define DEV_TESTS_BROADCASTOPSTUPLE_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT BroadcastOpsTuple { private: public: - nd4j::scalar::Ops s; - nd4j::pairwise::Ops p; - nd4j::broadcast::Ops b; + sd::scalar::Ops s; + sd::pairwise::Ops p; + sd::broadcast::Ops b; BroadcastOpsTuple() = default; ~BroadcastOpsTuple() = default; - BroadcastOpsTuple(nd4j::scalar::Ops scalar, nd4j::pairwise::Ops pairwise, nd4j::broadcast::Ops broadcast) { + BroadcastOpsTuple(sd::scalar::Ops scalar, sd::pairwise::Ops pairwise, sd::broadcast::Ops broadcast) { s = scalar; p = pairwise; b = broadcast; } - static BroadcastOpsTuple custom(nd4j::scalar::Ops scalar, nd4j::pairwise::Ops pairwise, nd4j::broadcast::Ops broadcast); + static BroadcastOpsTuple custom(sd::scalar::Ops scalar, sd::pairwise::Ops 
pairwise, sd::broadcast::Ops broadcast); static BroadcastOpsTuple Add(); static BroadcastOpsTuple Assign(); diff --git a/libnd4j/include/ops/InputType.h b/libnd4j/include/ops/InputType.h index 44bd92b65..4deff4900 100644 --- a/libnd4j/include/ops/InputType.h +++ b/libnd4j/include/ops/InputType.h @@ -21,7 +21,7 @@ #ifndef ND4J_INPUTTYPE_H #define ND4J_INPUTTYPE_H -namespace nd4j { +namespace sd { namespace ops { enum InputType { InputType_BOOLEAN = 0, diff --git a/libnd4j/include/ops/declarable/BooleanOp.h b/libnd4j/include/ops/declarable/BooleanOp.h index c13555407..b04ca8eca 100644 --- a/libnd4j/include/ops/declarable/BooleanOp.h +++ b/libnd4j/include/ops/declarable/BooleanOp.h @@ -25,7 +25,7 @@ #include "OpDescriptor.h" #include "DeclarableOp.h" -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT BooleanOp : public DeclarableOp { protected: @@ -36,12 +36,12 @@ namespace nd4j { public: BooleanOp(const char *name, int numInputs, bool scalar); - bool verify(const std::vector& args); - bool verify(nd4j::graph::Context& block); + bool verify(const std::vector& args); + bool verify(sd::graph::Context& block); Nd4jStatus execute(Context* block) override; - ShapeList *calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context& block) override; + ShapeList *calculateOutputShape(ShapeList *inputShape, sd::graph::Context& block) override; }; } } diff --git a/libnd4j/include/ops/declarable/BroadcastableOp.h b/libnd4j/include/ops/declarable/BroadcastableOp.h index 39435195b..9bc756128 100644 --- a/libnd4j/include/ops/declarable/BroadcastableOp.h +++ b/libnd4j/include/ops/declarable/BroadcastableOp.h @@ -26,7 +26,7 @@ #include "DeclarableOp.h" #include "DeclarableCustomOp.h" -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT BroadcastableOp : public DeclarableCustomOp{ protected: @@ -34,7 +34,7 @@ namespace nd4j { public: BroadcastableOp(const char *name, int numTArgs, int numIArgs); - ShapeList *calculateOutputShape(ShapeList 
*inputShape, nd4j::graph::Context& block) override; + ShapeList *calculateOutputShape(ShapeList *inputShape, sd::graph::Context& block) override; }; } } diff --git a/libnd4j/include/ops/declarable/CustomOperations.h b/libnd4j/include/ops/declarable/CustomOperations.h index 0b0e42809..1a1624c08 100644 --- a/libnd4j/include/ops/declarable/CustomOperations.h +++ b/libnd4j/include/ops/declarable/CustomOperations.h @@ -45,19 +45,19 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include -#include +#include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { struct ND4J_EXPORT _loader { _loader(); }; diff --git a/libnd4j/include/ops/declarable/DeclarableCustomOp.h b/libnd4j/include/ops/declarable/DeclarableCustomOp.h index 49d3735d4..4aa133a4b 100644 --- a/libnd4j/include/ops/declarable/DeclarableCustomOp.h +++ b/libnd4j/include/ops/declarable/DeclarableCustomOp.h @@ -23,9 +23,9 @@ #include -namespace nd4j { +namespace sd { namespace ops { - class ND4J_EXPORT DeclarableCustomOp : public nd4j::ops::DeclarableOp { + class ND4J_EXPORT DeclarableCustomOp : public sd::ops::DeclarableOp { protected: /** * This method executes this Op @@ -34,7 +34,7 @@ namespace nd4j { public: DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ShapeList* calculateOutputShape(ShapeList* inputShapes, nd4j::graph::Context& block) override = 0; + ShapeList* calculateOutputShape(ShapeList* inputShapes, sd::graph::Context& block) override = 0; }; } } diff --git a/libnd4j/include/ops/declarable/DeclarableListOp.h b/libnd4j/include/ops/declarable/DeclarableListOp.h index 2d6115027..3031611f8 100644 --- a/libnd4j/include/ops/declarable/DeclarableListOp.h +++ b/libnd4j/include/ops/declarable/DeclarableListOp.h @@ -26,15 +26,15 @@ #include #include -using namespace nd4j::graph; +using namespace sd::graph; -namespace nd4j { +namespace sd { namespace ops { - class 
ND4J_EXPORT DeclarableListOp : public nd4j::ops::DeclarableOp { + class ND4J_EXPORT DeclarableListOp : public sd::ops::DeclarableOp { protected: Nd4jStatus validateAndExecute(Context& block) override = 0; - nd4j::NDArray* getZ(Context& block, int inputId) ; + sd::NDArray* getZ(Context& block, int inputId) ; void setupResult(NDArray* array, Context& block); void setupResultList(NDArrayList* arrayList, Context& block); @@ -48,7 +48,7 @@ namespace nd4j { ResultSet* execute(NDArrayList* list, std::initializer_list inputs, std::initializer_list tArgs, std::initializer_list iArgs); ResultSet* execute(NDArrayList* list, std::vector& inputs, std::vector& tArgs, std::vector& iArgs); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; }; } } diff --git a/libnd4j/include/ops/declarable/DeclarableOp.h b/libnd4j/include/ops/declarable/DeclarableOp.h index 39a0b7041..fd95f382d 100644 --- a/libnd4j/include/ops/declarable/DeclarableOp.h +++ b/libnd4j/include/ops/declarable/DeclarableOp.h @@ -23,15 +23,15 @@ #include #include -#include -#include +#include +#include #include #include "OpDescriptor.h" #include #include #include #include -#include +#include #include //#include @@ -39,9 +39,9 @@ #include #include -using namespace nd4j::graph; +using namespace sd::graph; -namespace nd4j { +namespace sd { namespace ops { Nd4jStatus ND4J_EXPORT conditionHelper(const char *file, int line, int condition, int argNumber, const char *format, ...); @@ -105,7 +105,7 @@ namespace nd4j { */ void storeResult(Context &block, int outputNumber, NDArray& array); void storeResult(Context &block, int outputNumber, NDArray* array); - nd4j::NDArray* getZ(Context& block, int inputId = 0); + sd::NDArray* getZ(Context& block, int inputId = 0); /** * This method pre-allocates NDArrays for Op output, in case they are not available at op execution time @@ -137,7 +137,7 @@ 
namespace nd4j { /** * This method should be available in each implemented Op, and should return Op output shape(s), for a given input shape(s) */ - virtual ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) = 0; + virtual ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) = 0; /** * Returns opName @@ -174,19 +174,19 @@ namespace nd4j { template ::value>> Nd4jStatus execute(const std::vector &inputs, const std::vector &outputs, std::initializer_list tArgs); - Nd4jStatus execute(const std::vector &inputs, const std::vector &outputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs = std::vector(), const std::vector &dArgs = std::vector(), bool isInplace = false); + Nd4jStatus execute(const std::vector &inputs, const std::vector &outputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs = std::vector(), const std::vector &dArgs = std::vector(), bool isInplace = false); - nd4j::ResultSet* evaluate(const std::vector &inputs); + sd::ResultSet* evaluate(const std::vector &inputs); template ::value>> - nd4j::ResultSet* evaluate(const std::vector &inputs, std::initializer_list args); + sd::ResultSet* evaluate(const std::vector &inputs, std::initializer_list args); - nd4j::ResultSet* evaluate(const std::vector &inputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs = std::vector(), const std::vector &dArgs = std::vector(), bool isInplace = false); + sd::ResultSet* evaluate(const std::vector &inputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs = std::vector(), const std::vector &dArgs = std::vector(), bool isInplace = false); - Nd4jStatus execute(nd4j::graph::RandomGenerator& rng, const std::vector& inputs, const std::vector& outputs, const std::vector& tArgs, const std::vector& iArgs, const std::vector& bArgs, const std::vector &dArgs = std::vector(), bool isInplace = false, nd4j::DataType type 
= nd4j::DataType::FLOAT32); + Nd4jStatus execute(sd::graph::RandomGenerator& rng, const std::vector& inputs, const std::vector& outputs, const std::vector& tArgs, const std::vector& iArgs, const std::vector& bArgs, const std::vector &dArgs = std::vector(), bool isInplace = false, sd::DataType type = sd::DataType::FLOAT32); - nd4j::ResultSet* execute(const nd4j::OpArgsHolder& holder, bool isInplace = false); + sd::ResultSet* execute(const sd::OpArgsHolder& holder, bool isInplace = false); // There methods provide various validation options Nd4jStatus validateNonEmptyInput(Context& block); diff --git a/libnd4j/include/ops/declarable/DeclarableReductionOp.h b/libnd4j/include/ops/declarable/DeclarableReductionOp.h index 5306f60eb..11f4ec410 100644 --- a/libnd4j/include/ops/declarable/DeclarableReductionOp.h +++ b/libnd4j/include/ops/declarable/DeclarableReductionOp.h @@ -23,9 +23,9 @@ #include -namespace nd4j { +namespace sd { namespace ops { - class ND4J_EXPORT DeclarableReductionOp : public nd4j::ops::DeclarableOp { + class ND4J_EXPORT DeclarableReductionOp : public sd::ops::DeclarableOp { protected: /** * This method executes this Op @@ -34,7 +34,7 @@ namespace nd4j { public: DeclarableReductionOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; }; } } diff --git a/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h b/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h index d72ed612d..67787ca4b 100644 --- a/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyBroadcastBoolOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for broadcast operations. 
@@ -35,7 +35,7 @@ namespace nd4j { LegacyBroadcastBoolOp(); LegacyBroadcastBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyBroadcastOp.h b/libnd4j/include/ops/declarable/LegacyBroadcastOp.h index 7502b6ce7..755277397 100644 --- a/libnd4j/include/ops/declarable/LegacyBroadcastOp.h +++ b/libnd4j/include/ops/declarable/LegacyBroadcastOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for broadcast operations. @@ -35,7 +35,7 @@ namespace nd4j { LegacyBroadcastOp(); LegacyBroadcastOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h b/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h index b023cdc0d..fae0c5e8f 100644 --- a/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h +++ b/libnd4j/include/ops/declarable/LegacyIndexReduceOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for IndexAccumulation operations. i.e. 
IndexMax or IndexAbsoluteMin etc @@ -37,7 +37,7 @@ namespace nd4j { LegacyIndexReduceOp(); LegacyIndexReduceOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyOp.h b/libnd4j/include/ops/declarable/LegacyOp.h index a7c7ad055..0dfd91a42 100644 --- a/libnd4j/include/ops/declarable/LegacyOp.h +++ b/libnd4j/include/ops/declarable/LegacyOp.h @@ -22,9 +22,9 @@ #define LIBND4J_LEGACYOP_H #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { /** @@ -48,7 +48,7 @@ namespace nd4j { ~LegacyOp() = default; // All Op classes provide own specific implementation for this method - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override = 0; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override = 0; virtual LegacyOp* clone() = 0; }; } diff --git a/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h b/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h index 5a2eb431f..16a482811 100644 --- a/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyPairwiseTransformBoolOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Pairwise transform operations @@ -35,7 +35,7 @@ namespace nd4j { LegacyPairwiseTransformBoolOp(); LegacyPairwiseTransformBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h b/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h index 
27a3a6f8d..81bbdc715 100644 --- a/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h +++ b/libnd4j/include/ops/declarable/LegacyPairwiseTransformOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Pairwise transform operations @@ -35,7 +35,7 @@ namespace nd4j { LegacyPairwiseTransformOp(); LegacyPairwiseTransformOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyRandomOp.h b/libnd4j/include/ops/declarable/LegacyRandomOp.h index 072825ef0..b6e6f5b16 100644 --- a/libnd4j/include/ops/declarable/LegacyRandomOp.h +++ b/libnd4j/include/ops/declarable/LegacyRandomOp.h @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Random operations (i.e. 
linspace or Uniform) @@ -41,11 +41,11 @@ namespace nd4j { template Nd4jStatus validateAndExecute_(Context &block); - nd4j::ResultSet* execute(nd4j::graph::RandomGenerator& rng, std::initializer_list inputs, std::initializer_list tArgs, std::initializer_list iArgs, bool isInplace = false); - nd4j::ResultSet* execute(nd4j::graph::RandomGenerator& rng, std::vector& inputs, std::vector& tArgs, std::vector& iArgs, bool isInplace = false); + sd::ResultSet* execute(sd::graph::RandomGenerator& rng, std::initializer_list inputs, std::initializer_list tArgs, std::initializer_list iArgs, bool isInplace = false); + sd::ResultSet* execute(sd::graph::RandomGenerator& rng, std::vector& inputs, std::vector& tArgs, std::vector& iArgs, bool isInplace = false); Nd4jStatus execute(Context* block) override; - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyReduce3Op.h b/libnd4j/include/ops/declarable/LegacyReduce3Op.h index 9882f4cae..b0a06bd94 100644 --- a/libnd4j/include/ops/declarable/LegacyReduce3Op.h +++ b/libnd4j/include/ops/declarable/LegacyReduce3Op.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Reduce3 operations (i.e. 
dot, cosineDistance etc) @@ -35,7 +35,7 @@ namespace nd4j { LegacyReduce3Op(); LegacyReduce3Op(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h b/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h index e685cd38c..11cd52146 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceBoolOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT LegacyReduceBoolOp : public LegacyOp { protected: @@ -32,7 +32,7 @@ namespace nd4j { LegacyReduceBoolOp(); LegacyReduceBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h b/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h index f85b98384..ed36a04fe 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceFloatOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT LegacyReduceFloatOp : public LegacyOp { protected: @@ -32,7 +32,7 @@ namespace nd4j { LegacyReduceFloatOp(); LegacyReduceFloatOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyReduceLongOp.h b/libnd4j/include/ops/declarable/LegacyReduceLongOp.h index 171739379..4f23a9717 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceLongOp.h 
+++ b/libnd4j/include/ops/declarable/LegacyReduceLongOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT LegacyReduceLongOp : public LegacyOp { protected: @@ -32,7 +32,7 @@ namespace nd4j { LegacyReduceLongOp(); LegacyReduceLongOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyReduceOp.h b/libnd4j/include/ops/declarable/LegacyReduceOp.h index 4796afdb1..3e289fe25 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceOp.h @@ -23,7 +23,7 @@ //#include /* -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT LegacyReduceOp : public LegacyOp { protected: @@ -32,7 +32,7 @@ namespace nd4j { LegacyReduceOp(); LegacyReduceOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block); + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block); virtual LegacyOp* clone(); }; } diff --git a/libnd4j/include/ops/declarable/LegacyReduceSameOp.h b/libnd4j/include/ops/declarable/LegacyReduceSameOp.h index daee7c16c..86cc06a0e 100644 --- a/libnd4j/include/ops/declarable/LegacyReduceSameOp.h +++ b/libnd4j/include/ops/declarable/LegacyReduceSameOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { class ND4J_EXPORT LegacyReduceSameOp: public LegacyOp { protected: @@ -32,7 +32,7 @@ namespace nd4j { LegacyReduceSameOp(); LegacyReduceSameOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h 
b/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h index 915caa980..0d52eee9d 100644 --- a/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyScalarBoolOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for scalar transform operations, i.e. a + b = c, where either a or b is scalar primitive and other operand is NDArray @@ -37,7 +37,7 @@ namespace nd4j { LegacyScalarBoolOp(int opNum); LegacyScalarBoolOp(int opNum, NDArray &scalar); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyScalarOp.h b/libnd4j/include/ops/declarable/LegacyScalarOp.h index 3cc000c85..9f2a1a23a 100644 --- a/libnd4j/include/ops/declarable/LegacyScalarOp.h +++ b/libnd4j/include/ops/declarable/LegacyScalarOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for scalar transform operations, i.e. 
a + b = c, where either a or b is scalar primitive and other operand is NDArray @@ -37,7 +37,7 @@ namespace nd4j { LegacyScalarOp(int opNum); LegacyScalarOp(int opNum, NDArray &scalar); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context& block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyStatsOp.h b/libnd4j/include/ops/declarable/LegacyStatsOp.h index 81ffd4d1b..74520b9dd 100644 --- a/libnd4j/include/ops/declarable/LegacyStatsOp.h +++ b/libnd4j/include/ops/declarable/LegacyStatsOp.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for SummaryStats operations: Variance and Standard Deviation @@ -35,7 +35,7 @@ namespace nd4j { LegacyStatsOp(); LegacyStatsOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h b/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h index 34c30ba09..f98ccd4c8 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformAnyOp.h @@ -24,7 +24,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Transform operations (i.e. 
Pow or OneMinus) @@ -36,7 +36,7 @@ namespace nd4j { LegacyTransformAnyOp(); LegacyTransformAnyOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h b/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h index 23e761979..d64dd4b01 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformBoolOp.h @@ -25,7 +25,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Transform operations (i.e. Pow or OneMinus) @@ -37,7 +37,7 @@ namespace nd4j { LegacyTransformBoolOp(); LegacyTransformBoolOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h b/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h index 3327ad9f3..37bd0edce 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformFloatOp.h @@ -24,7 +24,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Transform operations (i.e. 
Pow or OneMinus) @@ -36,7 +36,7 @@ namespace nd4j { LegacyTransformFloatOp(); LegacyTransformFloatOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyTransformOp.h b/libnd4j/include/ops/declarable/LegacyTransformOp.h index f5524768e..7eb265bcb 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformOp.h @@ -24,7 +24,7 @@ //#include #ifdef ONLY_SAME_TRANSFORM -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Transform operations (i.e. Pow or OneMinus) @@ -36,7 +36,7 @@ namespace nd4j { LegacyTransformOp(); LegacyTransformOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block); + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block); virtual LegacyOp* clone(); }; } diff --git a/libnd4j/include/ops/declarable/LegacyTransformSameOp.h b/libnd4j/include/ops/declarable/LegacyTransformSameOp.h index 7b847562b..4d9312daf 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformSameOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformSameOp.h @@ -25,7 +25,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Transform operations (i.e. 
Pow or OneMinus) @@ -37,7 +37,7 @@ namespace nd4j { LegacyTransformSameOp(); LegacyTransformSameOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h b/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h index 4d1722b01..ee48c02b7 100644 --- a/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h +++ b/libnd4j/include/ops/declarable/LegacyTransformStrictOp.h @@ -25,7 +25,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This class provides wrapper for Transform operations (i.e. Pow or OneMinus) @@ -37,7 +37,7 @@ namespace nd4j { LegacyTransformStrictOp(); LegacyTransformStrictOp(int opNum); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block) override; LegacyOp* clone() override; }; } diff --git a/libnd4j/include/ops/declarable/LogicOp.h b/libnd4j/include/ops/declarable/LogicOp.h index 70fa3a6ff..d3ad59af2 100644 --- a/libnd4j/include/ops/declarable/LogicOp.h +++ b/libnd4j/include/ops/declarable/LogicOp.h @@ -23,7 +23,7 @@ #include "DeclarableOp.h" -namespace nd4j { +namespace sd { namespace ops { /** @@ -34,11 +34,11 @@ namespace nd4j { */ class ND4J_EXPORT LogicOp : public DeclarableOp { protected: - Nd4jStatus validateAndExecute(nd4j::graph::Context& block) override; + Nd4jStatus validateAndExecute(sd::graph::Context& block) override; public: LogicOp(const char *name); - ShapeList* calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context &block) override; + ShapeList* calculateOutputShape(ShapeList* inputShape, sd::graph::Context &block) override; }; } } diff --git a/libnd4j/include/ops/declarable/OpDescriptor.h 
b/libnd4j/include/ops/declarable/OpDescriptor.h index 72f09f96d..3feff5916 100644 --- a/libnd4j/include/ops/declarable/OpDescriptor.h +++ b/libnd4j/include/ops/declarable/OpDescriptor.h @@ -29,7 +29,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { /** @@ -52,7 +52,7 @@ namespace nd4j { int _numOutputs = 1; // enum for ops. deprecated. will be removed - nd4j::graph::OpClass _opClass; + sd::graph::OpClass _opClass; // special flag for divergent ops - ops that CAN and WILL modify graph behavior. Literally: IF, CASE. bool _divergent = false; @@ -79,18 +79,18 @@ namespace nd4j { bool _sameMode = false; - std::vector _allowedIns; - std::vector _allowedOuts; + std::vector _allowedIns; + std::vector _allowedOuts; // optional per-input configuration - MAP_IMPL> _outputTypes; - MAP_IMPL> _inputTypes; + MAP_IMPL> _outputTypes; + MAP_IMPL> _inputTypes; // field for ops that allow data type override at runtime bool _dtypeOverride = false; - bool checkDataTypesMatch(nd4j::DataType needle, std::vector &haystack) const; + bool checkDataTypesMatch(sd::DataType needle, std::vector &haystack) const; public: // default constructor OpDescriptor(int numInputs, int numOutputs, std::string opName, bool allowsInplace); @@ -162,23 +162,23 @@ namespace nd4j { OpDescriptor* setInputType(InputType type); - OpDescriptor* setAllowedInputTypes(const std::initializer_list &dtype); - OpDescriptor* setAllowedOutputTypes(const std::initializer_list &dtype); - OpDescriptor* setAllowedInputTypes(int index, const std::vector &dtype); - OpDescriptor* setAllowedOutputTypes(int index, const std::vector &dtype); - OpDescriptor* setAllowedInputTypes(int index, nd4j::DataType dtype); - OpDescriptor* setAllowedOutputTypes(int index, nd4j::DataType dtype); - OpDescriptor* setAllowedInputTypes(nd4j::DataType dtype); - OpDescriptor* setAllowedOutputTypes(nd4j::DataType dtype); + OpDescriptor* setAllowedInputTypes(const std::initializer_list &dtype); + OpDescriptor* 
setAllowedOutputTypes(const std::initializer_list &dtype); + OpDescriptor* setAllowedInputTypes(int index, const std::vector &dtype); + OpDescriptor* setAllowedOutputTypes(int index, const std::vector &dtype); + OpDescriptor* setAllowedInputTypes(int index, sd::DataType dtype); + OpDescriptor* setAllowedOutputTypes(int index, sd::DataType dtype); + OpDescriptor* setAllowedInputTypes(sd::DataType dtype); + OpDescriptor* setAllowedOutputTypes(sd::DataType dtype); OpDescriptor* allowOverride(bool reallyAllow); OpDescriptor* setSameMode(bool reallySame); - OpDescriptor* setInputType(int idx, nd4j::DataType dtype); - OpDescriptor* setOutputType(int idx, nd4j::DataType dtype); + OpDescriptor* setInputType(int idx, sd::DataType dtype); + OpDescriptor* setOutputType(int idx, sd::DataType dtype); - std::vector getOutputTypesForOutput(int index); + std::vector getOutputTypesForOutput(int index); - bool checkInputMatch(int index, nd4j::DataType dataType); - bool checkOutputMatch(int index, nd4j::DataType dataType); + bool checkInputMatch(int index, sd::DataType dataType); + bool checkOutputMatch(int index, sd::DataType dataType); bool isSameMode(); bool isInherit(int index); diff --git a/libnd4j/include/ops/declarable/OpRegistrator.h b/libnd4j/include/ops/declarable/OpRegistrator.h index cd747498f..3a9fb3df6 100644 --- a/libnd4j/include/ops/declarable/OpRegistrator.h +++ b/libnd4j/include/ops/declarable/OpRegistrator.h @@ -21,7 +21,7 @@ #ifndef LIBND4J_OPREGISTRATOR_H #define LIBND4J_OPREGISTRATOR_H -#include +#include #include #include #include @@ -53,7 +53,7 @@ namespace std { #endif -namespace nd4j { +namespace sd { namespace ops { /** * This class provides runtime ops lookup, based on opName or opHash. 
@@ -82,14 +82,14 @@ namespace nd4j { MAP_IMPL _msvc; // pointers to our operations - MAP_IMPL _declarablesLD; - MAP_IMPL _declarablesD; - std::vector _uniqueD; + MAP_IMPL _declarablesLD; + MAP_IMPL _declarablesD; + std::vector _uniqueD; // pointers to platform-specific helpers - MAP_IMPL, nd4j::ops::platforms::PlatformHelper*> _helpersLH; - MAP_IMPL, nd4j::ops::platforms::PlatformHelper*> _helpersH; - std::vector _uniqueH; + MAP_IMPL, sd::ops::platforms::PlatformHelper*> _helpersLH; + MAP_IMPL, sd::ops::platforms::PlatformHelper*> _helpersH; + std::vector _uniqueH; std::mutex _locker; std::string _opsList; @@ -114,18 +114,18 @@ namespace nd4j { * * @param op */ - bool registerOperation(const char* name, nd4j::ops::DeclarableOp* op); - bool registerOperation(nd4j::ops::DeclarableOp *op); + bool registerOperation(const char* name, sd::ops::DeclarableOp* op); + bool registerOperation(sd::ops::DeclarableOp *op); - void registerHelper(nd4j::ops::platforms::PlatformHelper* op); + void registerHelper(sd::ops::platforms::PlatformHelper* op); bool hasHelper(Nd4jLong hash, samediff::Engine engine); - nd4j::ops::DeclarableOp* getOperation(const char *name); - nd4j::ops::DeclarableOp* getOperation(Nd4jLong hash); - nd4j::ops::DeclarableOp* getOperation(std::string &name); + sd::ops::DeclarableOp* getOperation(const char *name); + sd::ops::DeclarableOp* getOperation(Nd4jLong hash); + sd::ops::DeclarableOp* getOperation(std::string &name); - nd4j::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash, samediff::Engine engine); + sd::ops::platforms::PlatformHelper* getPlatformHelper(Nd4jLong hash, samediff::Engine engine); std::vector getAllHashes(); diff --git a/libnd4j/include/ops/declarable/OpTuple.h b/libnd4j/include/ops/declarable/OpTuple.h index fc0fd594a..7458ef3d0 100644 --- a/libnd4j/include/ops/declarable/OpTuple.h +++ b/libnd4j/include/ops/declarable/OpTuple.h @@ -23,24 +23,24 @@ #include #include -#include +#include -namespace nd4j { +namespace sd { 
namespace ops { class ND4J_EXPORT OpTuple { public: std::string _opName; - std::vector _inputs; - std::vector _outputs; + std::vector _inputs; + std::vector _outputs; std::vector _tArgs; std::vector _iArgs; OpTuple(const char *opName); - OpTuple(const char *opName, std::initializer_list&& inputs, std::initializer_list&& tArgs, std::initializer_list&& iArgs); + OpTuple(const char *opName, std::initializer_list&& inputs, std::initializer_list&& tArgs, std::initializer_list&& iArgs); ~OpTuple(); - OpTuple* addInput(nd4j::NDArray *array); - OpTuple* addOutput(nd4j::NDArray *array); + OpTuple* addInput(sd::NDArray *array); + OpTuple* addOutput(sd::NDArray *array); OpTuple* setTArgs(std::initializer_list tArgs); OpTuple* setIArgs(std::initializer_list iArgs); }; diff --git a/libnd4j/include/ops/declarable/PlatformHelper.h b/libnd4j/include/ops/declarable/PlatformHelper.h index afa0107fc..b34a936ee 100644 --- a/libnd4j/include/ops/declarable/PlatformHelper.h +++ b/libnd4j/include/ops/declarable/PlatformHelper.h @@ -21,14 +21,14 @@ #ifndef SD_PLATFORMHELPER_H #define SD_PLATFORMHELPER_H -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { /** @@ -77,7 +77,7 @@ namespace nd4j { * @param inputId * @return */ - nd4j::NDArray *getZ(graph::Context &ctx, int inputId); + sd::NDArray *getZ(graph::Context &ctx, int inputId); }; } } diff --git a/libnd4j/include/ops/declarable/generic/CustomOperations.cpp b/libnd4j/include/ops/declarable/generic/CustomOperations.cpp index b2b0d165e..c13430ce3 100644 --- a/libnd4j/include/ops/declarable/generic/CustomOperations.cpp +++ b/libnd4j/include/ops/declarable/generic/CustomOperations.cpp @@ -24,7 +24,7 @@ #include #include -namespace nd4j { +namespace sd { _loader::_loader() { // @@ -47,5 +47,5 @@ namespace nd4j { //#endif }; - static nd4j::_loader loader; + static sd::_loader loader; } \ No newline at end of file diff --git 
a/libnd4j/include/ops/declarable/generic/activations/crelu.cpp b/libnd4j/include/ops/declarable/generic/activations/crelu.cpp index a0ba6aa11..4ce40f121 100644 --- a/libnd4j/include/ops/declarable/generic/activations/crelu.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/crelu.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_crelu) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(crelu, 1, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -32,7 +32,7 @@ namespace nd4j { REQUIRE_TRUE(x->isR(), 0, "CRELU: input must be real type"); auto tmp = x->dup(); - tmp.applyTransform(nd4j::transform::Neg, tmp); + tmp.applyTransform(sd::transform::Neg, tmp); auto z = OUTPUT_VARIABLE(0); @@ -41,7 +41,7 @@ namespace nd4j { // TODO: make this configurable? double threshold = 0.0; - z->applyScalar(nd4j::scalar::RELU, threshold, *z); + z->applyScalar(sd::scalar::RELU, threshold, *z); STORE_RESULT(z); @@ -72,7 +72,7 @@ namespace nd4j { auto epsilon = OUTPUT_VARIABLE(0); // at first step we build fwd activation - nd4j::ops::crelu op; + sd::ops::crelu op; auto tmpResult = op.evaluate({input}); if (tmpResult->status() != ND4J_STATUS_OK) return tmpResult->status(); @@ -83,7 +83,7 @@ namespace nd4j { //actv->applyPairwiseTransform(pairwise::RELUDerivativeE, *epsilon, nullptr); helpers::reluDerivative(block.launchContext(), actv, epsilonNext); // now we split updated array into 2 chunks along last dimension - nd4j::ops::concat_bp opc; + sd::ops::concat_bp opc; auto dec = opc.evaluate({input, input, actv}, {-1}); if (dec->status() != ND4J_STATUS_OK) return dec->status(); @@ -92,7 +92,7 @@ namespace nd4j { auto pos = dec->at(0); auto neg = dec->at(1); - pos->applyPairwiseTransform(nd4j::pairwise::Subtract, *neg, *epsilon); + pos->applyPairwiseTransform(sd::pairwise::Subtract, *neg, *epsilon); delete tmpResult; delete dec; diff --git 
a/libnd4j/include/ops/declarable/generic/activations/cube.cpp b/libnd4j/include/ops/declarable/generic/activations/cube.cpp index 75a33ab79..d71906d29 100644 --- a/libnd4j/include/ops/declarable/generic/activations/cube.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/cube.cpp @@ -19,19 +19,19 @@ // -#include +#include #if NOT_EXCLUDED(OP_cube) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(cube, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - input->applyTransform(nd4j::transform::Cube, *output); + input->applyTransform(sd::transform::Cube, *output); STORE_RESULT(output); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/activations/elu.cpp b/libnd4j/include/ops/declarable/generic/activations/elu.cpp index 85becd858..f89f0d2c7 100644 --- a/libnd4j/include/ops/declarable/generic/activations/elu.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/elu.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_elu) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(elu, 1, 1, true, -2, 0) { @@ -32,7 +32,7 @@ namespace nd4j { const auto alpha = block.numT() > 0 ? 
T_ARG(0) : 1.f; - input->applyScalar(nd4j::scalar::ELU, alpha, *output); + input->applyScalar(sd::scalar::ELU, alpha, *output); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/activations/hardsigmoid.cpp b/libnd4j/include/ops/declarable/generic/activations/hardsigmoid.cpp index d8b937a0a..ba498fea9 100644 --- a/libnd4j/include/ops/declarable/generic/activations/hardsigmoid.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/hardsigmoid.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_hardsigmoid) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(hardsigmoid, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - input->applyTransform(nd4j::transform::HardSigmoid, *output); + input->applyTransform(sd::transform::HardSigmoid, *output); STORE_RESULT(output); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/activations/hardtanh.cpp b/libnd4j/include/ops/declarable/generic/activations/hardtanh.cpp index a4d9fe4e6..0a245e6a0 100644 --- a/libnd4j/include/ops/declarable/generic/activations/hardtanh.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/hardtanh.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_hardtanh) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(hardtanh, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - input->applyTransform(nd4j::transform::HardTanh, *output); + input->applyTransform(sd::transform::HardTanh, *output); STORE_RESULT(output); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/activations/identity.cpp b/libnd4j/include/ops/declarable/generic/activations/identity.cpp index 465231937..38e4a3ae8 100644 --- a/libnd4j/include/ops/declarable/generic/activations/identity.cpp +++ 
b/libnd4j/include/ops/declarable/generic/activations/identity.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_identity) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(identity, 1, 1, true) { auto z = OUTPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/activations/identity_n.cpp b/libnd4j/include/ops/declarable/generic/activations/identity_n.cpp index b96ab9a3f..4b7088660 100644 --- a/libnd4j/include/ops/declarable/generic/activations/identity_n.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/identity_n.cpp @@ -18,12 +18,12 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_identity_n) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(identity_n, 1, 1, true, 0, 0) { @@ -52,8 +52,8 @@ namespace nd4j { DECLARE_TYPES(identity_n) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY); } } diff --git a/libnd4j/include/ops/declarable/generic/activations/lrelu.cpp b/libnd4j/include/ops/declarable/generic/activations/lrelu.cpp index 80404135f..2f4c2dc04 100644 --- a/libnd4j/include/ops/declarable/generic/activations/lrelu.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/lrelu.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_lrelu) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(lrelu, 1, 1, true, -2, 0) { auto input = INPUT_VARIABLE(0); @@ -31,7 +31,7 @@ namespace nd4j { float alpha = block.numT() > 0 ? 
T_ARG(0) : 0.01f; - input->applyScalar(nd4j::scalar::LeakyRELU, alpha, *output); + input->applyScalar(sd::scalar::LeakyRELU, alpha, *output); STORE_RESULT(output); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/activations/prelu.cpp b/libnd4j/include/ops/declarable/generic/activations/prelu.cpp index 2bf3578b1..b7d260a4c 100644 --- a/libnd4j/include/ops/declarable/generic/activations/prelu.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/prelu.cpp @@ -19,14 +19,14 @@ // -#include +#include #if NOT_EXCLUDED(OP_prelu) #include #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/activations/rationaltanh.cpp b/libnd4j/include/ops/declarable/generic/activations/rationaltanh.cpp index 5bae4d2dc..3386d1578 100644 --- a/libnd4j/include/ops/declarable/generic/activations/rationaltanh.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/rationaltanh.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_rationaltanh) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(rationaltanh, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - input->applyTransform(nd4j::transform::RationalTanh, *output); + input->applyTransform(sd::transform::RationalTanh, *output); STORE_RESULT(output); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/activations/rectifiedtanh.cpp b/libnd4j/include/ops/declarable/generic/activations/rectifiedtanh.cpp index 40738c343..641ee0d0e 100644 --- a/libnd4j/include/ops/declarable/generic/activations/rectifiedtanh.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/rectifiedtanh.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_rectifiedtanh) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(rectifiedtanh, 1, 1, true, 
0, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - input->applyTransform(nd4j::transform::RectifiedTanh, *output); + input->applyTransform(sd::transform::RectifiedTanh, *output); STORE_RESULT(output); return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/activations/relu.cpp b/libnd4j/include/ops/declarable/generic/activations/relu.cpp index 2c8b978ff..3b42c2e5a 100644 --- a/libnd4j/include/ops/declarable/generic/activations/relu.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/relu.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_relu) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(relu, 1, 1, true, 1, 0) { auto first = INPUT_VARIABLE(0); @@ -32,7 +32,7 @@ namespace nd4j { auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0; - first->applyScalar(nd4j::scalar::RELU, scalar, *z); + first->applyScalar(sd::scalar::RELU, scalar, *z); STORE_RESULT(*z); diff --git a/libnd4j/include/ops/declarable/generic/activations/relu6.cpp b/libnd4j/include/ops/declarable/generic/activations/relu6.cpp index cf12d1592..129c09480 100644 --- a/libnd4j/include/ops/declarable/generic/activations/relu6.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/relu6.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma, created on 16.02.2018 // -#include +#include #if NOT_EXCLUDED(OP_relu6) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -33,7 +33,7 @@ CONFIGURABLE_OP_IMPL(relu6, 1, 1, true, 1, 0) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - input->applyScalar(nd4j::scalar::RELU6, T_ARG(0), *output); + input->applyScalar(sd::scalar::RELU6, T_ARG(0), *output); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/activations/selu.cpp b/libnd4j/include/ops/declarable/generic/activations/selu.cpp index ca16f6832..7fc6aa11a 100644 --- 
a/libnd4j/include/ops/declarable/generic/activations/selu.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/selu.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_selu) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(selu, 1, 1, true, 0, 0) { auto first = INPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0); - first->applyTransform(nd4j::transform::SELU, *z); + first->applyTransform(sd::transform::SELU, *z); STORE_RESULT(*z); diff --git a/libnd4j/include/ops/declarable/generic/activations/sigmoid.cpp b/libnd4j/include/ops/declarable/generic/activations/sigmoid.cpp index fb8e507a7..047d973e6 100644 --- a/libnd4j/include/ops/declarable/generic/activations/sigmoid.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/sigmoid.cpp @@ -18,18 +18,18 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_sigmoid) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(sigmoid, 1, 1, true, 0, 0) { auto first = INPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0); - first->applyTransform(nd4j::transform::Sigmoid, *z); + first->applyTransform(sd::transform::Sigmoid, *z); STORE_RESULT(*z); diff --git a/libnd4j/include/ops/declarable/generic/activations/softplus.cpp b/libnd4j/include/ops/declarable/generic/activations/softplus.cpp index bd538ab71..5cd17e752 100644 --- a/libnd4j/include/ops/declarable/generic/activations/softplus.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/softplus.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_softplus) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(softplus, 1, 1, true, 0, 0) { auto first = INPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0); - first->applyTransform(nd4j::transform::SoftPlus, *z); + first->applyTransform(sd::transform::SoftPlus, *z); STORE_RESULT(*z); diff --git 
a/libnd4j/include/ops/declarable/generic/activations/softsign.cpp b/libnd4j/include/ops/declarable/generic/activations/softsign.cpp index 99e52ab68..c7fb15fdd 100644 --- a/libnd4j/include/ops/declarable/generic/activations/softsign.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/softsign.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_softsign) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(softsign, 1, 1, true, 0, 0) { auto first = INPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0); - first->applyTransform(nd4j::transform::SoftSign, *z); + first->applyTransform(sd::transform::SoftSign, *z); STORE_RESULT(*z); diff --git a/libnd4j/include/ops/declarable/generic/activations/tanh.cpp b/libnd4j/include/ops/declarable/generic/activations/tanh.cpp index 5677da728..a42552f75 100644 --- a/libnd4j/include/ops/declarable/generic/activations/tanh.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/tanh.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_tanh) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(tanh, 1, 1, true, 0, 0) { auto first = INPUT_VARIABLE(0); auto z = OUTPUT_VARIABLE(0); - first->applyTransform(nd4j::transform::Tanh, *z); + first->applyTransform(sd::transform::Tanh, *z); STORE_RESULT(*z); diff --git a/libnd4j/include/ops/declarable/generic/activations/thresholdedrelu.cpp b/libnd4j/include/ops/declarable/generic/activations/thresholdedrelu.cpp index b45327dea..a0cba155a 100644 --- a/libnd4j/include/ops/declarable/generic/activations/thresholdedrelu.cpp +++ b/libnd4j/include/ops/declarable/generic/activations/thresholdedrelu.cpp @@ -19,13 +19,13 @@ // -#include +#include #if NOT_EXCLUDED(OP_thresholdedrelu) #include #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git 
a/libnd4j/include/ops/declarable/generic/bitwise/bits_hamming_distance.cpp b/libnd4j/include/ops/declarable/generic/bitwise/bits_hamming_distance.cpp index f2a39b270..10f7095e0 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/bits_hamming_distance.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/bits_hamming_distance.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_bits_hamming_distance) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(bits_hamming_distance, 2, 1, true, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -41,7 +41,7 @@ namespace nd4j { } DECLARE_SHAPE_FN(bits_hamming_distance) { - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::INT64)); } DECLARE_TYPES(bits_hamming_distance) { diff --git a/libnd4j/include/ops/declarable/generic/bitwise/bitwise_and.cpp b/libnd4j/include/ops/declarable/generic/bitwise/bitwise_and.cpp index 6eb3728ed..1e951c1d9 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/bitwise_and.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/bitwise_and.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_bitwise_and) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(bitwise_and, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/bitwise/bitwise_or.cpp b/libnd4j/include/ops/declarable/generic/bitwise/bitwise_or.cpp index 4683e3f3e..cd20a8434 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/bitwise_or.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/bitwise_or.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_bitwise_or) #include #include #include -namespace nd4j { +namespace sd { namespace ops { 
BROADCASTABLE_OP_IMPL(bitwise_or, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/bitwise/bitwise_xor.cpp b/libnd4j/include/ops/declarable/generic/bitwise/bitwise_xor.cpp index 1d79a84f3..0af9fe759 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/bitwise_xor.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/bitwise_xor.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_bitwise_xor) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(bitwise_xor, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/bitwise/cyclic_rshift.cpp b/libnd4j/include/ops/declarable/generic/bitwise/cyclic_rshift.cpp index 7a2c61c95..cc0c4827b 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/cyclic_rshift.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/cyclic_rshift.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_cyclic_rshift_bits) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(cyclic_rshift_bits, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/bitwise/cyclic_shift.cpp b/libnd4j/include/ops/declarable/generic/bitwise/cyclic_shift.cpp index 0a1c3d5c8..f2b36a6d8 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/cyclic_shift.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/cyclic_shift.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_cyclic_shift_bits) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(cyclic_shift_bits, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/bitwise/rshift.cpp b/libnd4j/include/ops/declarable/generic/bitwise/rshift.cpp index 0543cc72d..8b44d2a6f 100644 --- 
a/libnd4j/include/ops/declarable/generic/bitwise/rshift.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/rshift.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_rshift_bits) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(rshift_bits, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/bitwise/shift.cpp b/libnd4j/include/ops/declarable/generic/bitwise/shift.cpp index 4f0fec82d..7d0647e1b 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/shift.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/shift.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_shift_bits) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(shift_bits, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/bitwise/toggle_bits.cpp b/libnd4j/include/ops/declarable/generic/bitwise/toggle_bits.cpp index 4aaae3c0d..0ba6fbcc7 100644 --- a/libnd4j/include/ops/declarable/generic/bitwise/toggle_bits.cpp +++ b/libnd4j/include/ops/declarable/generic/bitwise/toggle_bits.cpp @@ -18,14 +18,14 @@ // Created by raver119 on 23.11.17. 
// -#include +#include #if NOT_EXCLUDED(OP_toggle_bits) #include #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(toggle_bits, -1, -1, true) { diff --git a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp index 65d20589f..7c1150599 100644 --- a/libnd4j/include/ops/declarable/generic/blas/axpy.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/axpy.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_axpy) #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(axpy, 2, 1, false, -2, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/blas/batched_gemm.cpp b/libnd4j/include/ops/declarable/generic/blas/batched_gemm.cpp index 67a839e7b..194af35b8 100644 --- a/libnd4j/include/ops/declarable/generic/blas/batched_gemm.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/batched_gemm.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_batched_gemm) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(batched_gemm, -1, -1, false, 0, 9) { @@ -85,7 +85,7 @@ CUSTOM_OP_IMPL(batched_gemm, -1, -1, false, 0, 9) { REQUIRE_TRUE(vA.size() == vB.size() && vA.size() == vC.size() && vA.size() == batchSize, 0, "BatchedGemm: mismatched numbers of A, B, C for unknown reason"); - nd4j::ops::helpers::bgemm(vA, vB, vC, alpha, beta, transA, transB, M, N, K, ldA, ldB, ldC); + sd::ops::helpers::bgemm(vA, vB, vC, alpha, beta, transA, transB, M, N, K, ldA, ldB, ldC); return Status::OK(); }; diff --git a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp index a673b1988..6209e7bbf 100644 --- a/libnd4j/include/ops/declarable/generic/blas/matmul.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/matmul.cpp @@ -20,13 +20,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), fully 
rewritten // -#include +#include #if NOT_EXCLUDED(OP_matmul) #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////// @@ -163,7 +163,7 @@ F F T [a,b] [b,c] [c,a] [c,a] */ - nd4j::ops::matmul op; + sd::ops::matmul op; op.execute({eps, y}, {dldx}, {}, {transZ, !transY, transX}, {}); op.execute({x, eps}, {dldy}, {}, {!transX, transZ, transY}, {}); diff --git a/libnd4j/include/ops/declarable/generic/blas/svd.cpp b/libnd4j/include/ops/declarable/generic/blas/svd.cpp index 8db2c2ff3..ca5fd52c2 100644 --- a/libnd4j/include/ops/declarable/generic/blas/svd.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/svd.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 20.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_svd) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(svd, 1, 1, false, 0, 3) { diff --git a/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp b/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp index 3db3b6097..889bd4957 100644 --- a/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp +++ b/libnd4j/include/ops/declarable/generic/blas/tensormmul.cpp @@ -18,16 +18,16 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_tensormmul) #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { //////////////////////////////////////////////////////////////////////// @@ -78,7 +78,7 @@ DECLARE_SHAPE_FN(tensormmul) { // evaluate shapes std::vector permutAt, permutBt; std::vector shapeAt, shapeBt; - auto outShape = nd4j::ShapeUtils::evalShapeForTensorDot(aShapeInfo, bShapeInfo, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); + auto outShape = sd::ShapeUtils::evalShapeForTensorDot(aShapeInfo, bShapeInfo, axes_0, axes_1, permutAt, permutBt, shapeAt, shapeBt); return 
SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(ArrayOptions::dataType(aShapeInfo), 'c', outShape))); } diff --git a/libnd4j/include/ops/declarable/generic/boolean/boolean_not.cpp b/libnd4j/include/ops/declarable/generic/boolean/boolean_not.cpp index 83cbc9004..56047f16c 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/boolean_not.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/boolean_not.cpp @@ -18,12 +18,12 @@ // Created by raver on 6/6/2018. // -#include +#include #if NOT_EXCLUDED(OP_boolean_not) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(boolean_not, 1, 1,true) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/choose.cpp b/libnd4j/include/ops/declarable/generic/boolean/choose.cpp index c0e6ee49f..9689c9cd5 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/choose.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/choose.cpp @@ -18,14 +18,14 @@ // @author Adam Gibson // -#include +#include #if NOT_EXCLUDED(OP_choose) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(choose, -1, 2, false, -2, -1) { @@ -88,7 +88,7 @@ namespace nd4j { auto newShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(numResults.e(0), ArrayOptions::dataType(inputShape->at(0))); - auto shapeScalar = ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64); + auto shapeScalar = ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::INT64); return SHAPELIST(newShape, shapeScalar); } diff --git a/libnd4j/include/ops/declarable/generic/boolean/eq_scalar.cpp b/libnd4j/include/ops/declarable/generic/boolean/eq_scalar.cpp index 08db9aa1f..c0623ebb7 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/eq_scalar.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/eq_scalar.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 13.10.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_eq_scalar) #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(eq_scalar, 2, true) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/gt_scalar.cpp b/libnd4j/include/ops/declarable/generic/boolean/gt_scalar.cpp index ca7ffb63f..d40c501d4 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/gt_scalar.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/gt_scalar.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 13.10.2017. // -#include +#include #if NOT_EXCLUDED(OP_gt_scalar) #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(gt_scalar, 2, true) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/gte_scalar.cpp b/libnd4j/include/ops/declarable/generic/boolean/gte_scalar.cpp index 7c28159d5..d555f5d24 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/gte_scalar.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/gte_scalar.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 13.10.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_gte_scalar) #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(gte_scalar, 2, true) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/is_non_decreasing.cpp b/libnd4j/include/ops/declarable/generic/boolean/is_non_decreasing.cpp index 37fa1350a..4dd8ca605 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/is_non_decreasing.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/is_non_decreasing.cpp @@ -18,13 +18,13 @@ // @author @cpuheater // -#include +#include #if NOT_EXCLUDED(OP_is_non_decreasing) #include #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(is_non_decreasing, 1, true) { auto input = INPUT_VARIABLE(0); @@ -35,7 +35,7 @@ namespace nd4j { bool isNonDecreasing = true; - nd4j::ops::helpers::compare_elem(block.launchContext(), input, false, isNonDecreasing); + sd::ops::helpers::compare_elem(block.launchContext(), input, false, isNonDecreasing); if (isNonDecreasing) return ND4J_STATUS_TRUE; diff --git a/libnd4j/include/ops/declarable/generic/boolean/is_numeric_tensor.cpp b/libnd4j/include/ops/declarable/generic/boolean/is_numeric_tensor.cpp index cdaa49241..184b7b0a6 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/is_numeric_tensor.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/is_numeric_tensor.cpp @@ -18,13 +18,13 @@ // @author @cpuheater // -#include +#include #if NOT_EXCLUDED(OP_is_numeric_tensor) #include #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(is_numeric_tensor, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/boolean/is_strictly_increasing.cpp b/libnd4j/include/ops/declarable/generic/boolean/is_strictly_increasing.cpp index a41852394..0c434cf57 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/is_strictly_increasing.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/is_strictly_increasing.cpp @@ -18,13 +18,13 @@ // @author 
@cpuheater // -#include +#include #if NOT_EXCLUDED(OP_is_strictly_increasing) #include #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(is_strictly_increasing, 1, true) { auto input = INPUT_VARIABLE(0); @@ -35,7 +35,7 @@ namespace nd4j { bool isStrictlyIncreasing = true; - nd4j::ops::helpers::compare_elem(block.launchContext(), input, true, isStrictlyIncreasing); + sd::ops::helpers::compare_elem(block.launchContext(), input, true, isStrictlyIncreasing); if (isStrictlyIncreasing) return ND4J_STATUS_TRUE; diff --git a/libnd4j/include/ops/declarable/generic/boolean/lt_scalar.cpp b/libnd4j/include/ops/declarable/generic/boolean/lt_scalar.cpp index 722d3cbd1..1c4f7ab27 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/lt_scalar.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/lt_scalar.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 13.10.2017. // -#include +#include #if NOT_EXCLUDED(OP_lt_scalar) #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(lt_scalar, 2, true) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/lte_scalar.cpp b/libnd4j/include/ops/declarable/generic/boolean/lte_scalar.cpp index c950445dd..07a72cfed 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/lte_scalar.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/lte_scalar.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 13.10.2017. // -#include +#include #if NOT_EXCLUDED(OP_lte_scalar) #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(lte_scalar, 2, true) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/neq_scalar.cpp b/libnd4j/include/ops/declarable/generic/boolean/neq_scalar.cpp index a90f90516..1c05b9fc1 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/neq_scalar.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/neq_scalar.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 13.10.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_neq_scalar) #include -namespace nd4j { +namespace sd { namespace ops { BOOLEAN_OP_IMPL(neq_scalar, 2, true) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/select.cpp b/libnd4j/include/ops/declarable/generic/boolean/select.cpp index 92cb5e421..e8e257258 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/select.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/select.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_select) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(select, 3, 1, false, 0, 0) { auto cond = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/boolean/where.cpp b/libnd4j/include/ops/declarable/generic/boolean/where.cpp index 6aa646cb6..c72c10d6b 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/where.cpp +++ b/libnd4j/include/ops/declarable/generic/boolean/where.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_where) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(Where, 1, 1, false, 0, 0) { auto condition = INPUT_VARIABLE(0); @@ -112,12 +112,12 @@ namespace nd4j { newShape[5] = 0; newShape[6] = 1; newShape[7] = 99; - ShapeUtils::updateStridesAndType(newShape, nd4j::DataType::INT64, 'c'); + ShapeUtils::updateStridesAndType(newShape, sd::DataType::INT64, 'c'); newShape = CONSTANT(newShape); } else { - newShape = ConstantShapeHelper::getInstance()->emptyShapeInfo(nd4j::DataType::INT64); + newShape = ConstantShapeHelper::getInstance()->emptyShapeInfo(sd::DataType::INT64); } return SHAPELIST(newShape); diff --git a/libnd4j/include/ops/declarable/generic/boolean/where_np.cpp b/libnd4j/include/ops/declarable/generic/boolean/where_np.cpp index aa6169bae..9a03acf11 100644 --- a/libnd4j/include/ops/declarable/generic/boolean/where_np.cpp +++ 
b/libnd4j/include/ops/declarable/generic/boolean/where_np.cpp @@ -18,7 +18,7 @@ // @author Adam Gibson // -#include +#include #include #if NOT_EXCLUDED(OP_where_np) @@ -26,7 +26,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(where_np, -1, 1, false, 0, 0) { auto condition = INPUT_VARIABLE(0); @@ -102,7 +102,7 @@ namespace nd4j { REQUIRE_TRUE(block.width() == 1, 0, "Where op takes either 1 or 3 operands, But got %d operands instead", block.width()); // if (output->isEmpty()) Nd4jLong width = condition->rankOf(); - nd4j::ops::Where op; + sd::ops::Where op; std::unique_ptr res(op.evaluate({condition})); REQUIRE_OK(res->status()); NDArray* whereTrue = res->at(0); @@ -137,11 +137,11 @@ namespace nd4j { // output shape - a tuple of rank(inShape) 1D tensors with numOfTrue len if (numOfTrue) { for (Nd4jLong e = 0; e < condition->rankOf(); ++e) { - shapes->push_back(ConstantShapeHelper::getInstance()->vectorShapeInfo(numOfTrue, nd4j::DataType::INT64)); + shapes->push_back(ConstantShapeHelper::getInstance()->vectorShapeInfo(numOfTrue, sd::DataType::INT64)); } } else { - shapes->push_back(ConstantShapeHelper::getInstance()->emptyShapeInfo(nd4j::DataType::INT64)); + shapes->push_back(ConstantShapeHelper::getInstance()->emptyShapeInfo(sd::DataType::INT64)); } } return shapes; @@ -149,9 +149,9 @@ namespace nd4j { DECLARE_TYPES(where_np) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::BOOL) - ->setAllowedInputTypes(1, nd4j::DataType::ANY) - ->setAllowedInputTypes(2, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::BOOL) + ->setAllowedInputTypes(1, sd::DataType::ANY) + ->setAllowedInputTypes(2, sd::DataType::ANY) ->setAllowedOutputTypes( {ALL_FLOATS, ALL_INTS}); } } diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/add.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/add.cpp index 415a2c37a..936addea5 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/add.cpp +++ 
b/libnd4j/include/ops/declarable/generic/broadcastable/add.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_add) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(add, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -33,7 +33,7 @@ namespace nd4j { BROADCAST_CHECK_EMPTY(x,y,z); - auto tZ = BroadcastHelper::broadcastApply(nd4j::BroadcastOpsTuple::Add(), x, y, z); + auto tZ = BroadcastHelper::broadcastApply(sd::BroadcastOpsTuple::Add(), x, y, z); if (tZ == nullptr) return ND4J_STATUS_KERNEL_FAILURE; else if (tZ != z) @@ -71,7 +71,7 @@ namespace nd4j { gradX->assign(epsNext); } else if (y->isScalar()) { // scalar case - auto tmp = epsNext->reduceNumber(nd4j::reduce::Sum); + auto tmp = epsNext->reduceNumber(sd::reduce::Sum); gradY->assign(tmp); gradX->assign(epsNext); } else { @@ -80,13 +80,13 @@ namespace nd4j { auto axisY = ShapeUtils::evalBroadcastBackwardAxis(y->shapeInfo(), epsNext->shapeInfo()); if (axisX.size() > 0) { - auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisX); + auto sum = epsNext->reduceAlongDimension(sd::reduce::Sum, axisX); gradX->assign(sum); } else gradX->assign(epsNext); if (axisY.size() > 0) { - auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisY); + auto sum = epsNext->reduceAlongDimension(sd::reduce::Sum, axisY); gradY->assign(sum); } else gradY->assign(epsNext); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/assign.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/assign.cpp index 24b673a8c..aeaa5d128 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/assign.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/assign.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 24.11.17. 
// -#include +#include #if NOT_EXCLUDED(OP_assign) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(assign, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -33,7 +33,7 @@ namespace nd4j { BROADCAST_CHECK_EMPTY(x,y,z); - auto tZ = BroadcastHelper::broadcastApply(nd4j::BroadcastOpsTuple::Assign(), x, y, z); + auto tZ = BroadcastHelper::broadcastApply(sd::BroadcastOpsTuple::Assign(), x, y, z); if (tZ == nullptr) return ND4J_STATUS_KERNEL_FAILURE; else if (tZ != z) { @@ -71,14 +71,14 @@ namespace nd4j { if (x->isSameShape(y)) { gradY->assign(epsNext); } else if (y->isScalar()) { - auto sum = epsNext->reduceNumber(nd4j::reduce::Sum); + auto sum = epsNext->reduceNumber(sd::reduce::Sum); gradY->assign(sum); } else { // broadcastable auto axisY = ShapeUtils::evalBroadcastBackwardAxis(y->shapeInfo(), epsNext->shapeInfo()); if (axisY.size() > 0) { - auto sum = epsNext->reduceAlongDimension(nd4j::reduce::Sum, axisY); + auto sum = epsNext->reduceAlongDimension(sd::reduce::Sum, axisY); gradY->assign(sum); } else gradY->assign(epsNext); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/atan2.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/atan2.cpp index 32a7d7d65..ed60f5925 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/atan2.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/atan2.cpp @@ -19,12 +19,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_tf_atan2) #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(tf_atan2, 0, 0) { @@ -36,7 +36,7 @@ BROADCASTABLE_OP_IMPL(tf_atan2, 0, 0) { BROADCAST_CHECK_EMPTY(x,y,z); // auto tZ = BroadcastHelper::template broadcastApply>(y, x, z); - x->applyTrueBroadcast(nd4j::BroadcastOpsTuple::custom(scalar::Atan2, pairwise::Atan2, broadcast::Atan2), *y, *z, true); + x->applyTrueBroadcast(sd::BroadcastOpsTuple::custom(scalar::Atan2, pairwise::Atan2, broadcast::Atan2), *y, *z, true); // 
if (tZ == nullptr) // return ND4J_STATUS_KERNEL_FAILURE; diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/boolean_and.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/boolean_and.cpp index f10bf97e7..32593ecf6 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/boolean_and.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/boolean_and.cpp @@ -18,12 +18,12 @@ // Created by raver on 6/6/2018. // -#include +#include #if NOT_EXCLUDED(OP_boolean_or) #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(boolean_and, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/boolean_or.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/boolean_or.cpp index bf0f66410..1dbb69f30 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/boolean_or.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/boolean_or.cpp @@ -18,12 +18,12 @@ // Created by raver on 6/6/2018. // -#include +#include #if NOT_EXCLUDED(OP_boolean_or) #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(boolean_or, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/boolean_xor.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/boolean_xor.cpp index 6e83f31ea..8f242fbda 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/boolean_xor.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/boolean_xor.cpp @@ -18,12 +18,12 @@ // Created by raver on 6/6/2018. 
// -#include +#include #if NOT_EXCLUDED(OP_boolean_xor) #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(boolean_xor, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/divide.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/divide.cpp index 1811781f1..cd907de36 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/divide.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/divide.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_divide) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(divide, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/divide_no_nan.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/divide_no_nan.cpp index 3cf808b8e..9ef300e1c 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/divide_no_nan.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/divide_no_nan.cpp @@ -18,13 +18,13 @@ // @author George A. 
Shulinok // -#include +#include #if NOT_EXCLUDED(OP_divide_no_nan) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(divide_no_nan, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/equals.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/equals.cpp index 440cdf0d3..c82fe6748 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/equals.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/equals.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(equals, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/floordiv.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/floordiv.cpp index 10d898522..d0a59bcc1 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/floordiv.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/floordiv.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_floordiv) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(floordiv, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/floormod.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/floormod.cpp index 062d3cfab..8ed93dd3a 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/floormod.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/floormod.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // modified by sgazeos@gmail.com with backprop implementation. 
// -#include +#include #if NOT_EXCLUDED(OP_floormod) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(floormod, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -65,7 +65,7 @@ namespace nd4j { auto gradX = OUTPUT_VARIABLE(0); auto gradY = OUTPUT_VARIABLE(1); gradX->assign(epsNext); - nd4j::ops::floormod op; + sd::ops::floormod op; std::unique_ptr tmpResult(op.evaluate({x, y})); if (gradY->rankOf() == gradX->rankOf()) diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/greater.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/greater.cpp index 9f381f403..961259946 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/greater.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/greater.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(greater, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/greater_equal.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/greater_equal.cpp index caa1f05a0..1adbad420 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/greater_equal.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/greater_equal.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(greater_equal, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/igamma.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/igamma.cpp index 6bd1c88ed..9fa07424c 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/igamma.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/igamma.cpp @@ -18,13 +18,13 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_igamma) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(igamma, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git 
a/libnd4j/include/ops/declarable/generic/broadcastable/igammac.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/igammac.cpp index 89494dc4b..deeacd4ef 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/igammac.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/igammac.cpp @@ -18,13 +18,13 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_igammac) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(igammac, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/less.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/less.cpp index c1715bec2..ba5c72fa4 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/less.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/less.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(less, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/less_equal.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/less_equal.cpp index 3041938c6..b602f1374 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/less_equal.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/less_equal.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(less_equal, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/maximum.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/maximum.cpp index a18626b48..dfb6b3d66 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/maximum.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/maximum.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 12.10.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_maximum) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(maximum, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/meshgrid.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/meshgrid.cpp index 04444e53a..b07f50202 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/meshgrid.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/meshgrid.cpp @@ -18,14 +18,14 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 05.02.2018 // -#include +#include #if NOT_EXCLUDED(OP_meshgrid) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(meshgrid, -1, -1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/minimum.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/minimum.cpp index d2fb43558..ef8645d1d 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/minimum.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/minimum.cpp @@ -18,14 +18,14 @@ // Created by raver119 on 12.10.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_minimum) #include #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(minimum, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/mod.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/mod.cpp index 5ae075c99..95c710d17 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/mod.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/mod.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_mod) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(mod, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/multiply.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/multiply.cpp index d50ffacaa..3ddbe57ca 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/multiply.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/multiply.cpp @@ -19,12 +19,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_multiply) #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(multiply, 0, 0) { @@ -38,7 +38,7 @@ namespace ops { const bool areShapesBroadcastable = ShapeUtils::evalBroadcastShapeInfo(x->getShapeInfo(), y->getShapeInfo(), true, zShapeInfo, block.getWorkspace()); REQUIRE_TRUE(areShapesBroadcastable, 0, "MULTIPLY OP: the shapes of x %s and y %s are not suitable for broadcast !", ShapeUtils::shapeAsString(x).c_str(), ShapeUtils::shapeAsString(y).c_str()); - auto tZ = BroadcastHelper::broadcastApply(nd4j::BroadcastOpsTuple::Multiply(), x, y, z); + auto tZ = BroadcastHelper::broadcastApply(sd::BroadcastOpsTuple::Multiply(), x, y, z); if (tZ == nullptr) return ND4J_STATUS_KERNEL_FAILURE; else if (tZ != z) diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/not_equals.cpp 
b/libnd4j/include/ops/declarable/generic/broadcastable/not_equals.cpp index 1dede042d..fddd653e9 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/not_equals.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/not_equals.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(not_equals, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/percentile.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/percentile.cpp index 3c50edbc7..e2bf723b3 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/percentile.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/percentile.cpp @@ -18,14 +18,14 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 17.05.2018 // -#include +#include #if NOT_EXCLUDED(OP_percentile) #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(percentile, 1, 1, false, 1, -2) { diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp index 1fe3e359b..5a1ac02c5 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/pow.cpp @@ -19,13 +19,13 @@ // @author Oleh Semeniv (oleg.semeniv@gmail.com) // -#include +#include #if NOT_EXCLUDED(OP_Pow) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(Pow, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -75,7 +75,7 @@ namespace ops { // dL/dy = x^y * log(x) * dL/dz auto temp = x->applyTrueBroadcast(BroadcastOpsTuple::Pow(), *y); // a = x^y x->applyTransform(transform::Log, *dLdx); // b = log(x) - dLdx->applyScalar(nd4j::scalar::ReplaceNans, 0, *dLdx); + dLdx->applyScalar(sd::scalar::ReplaceNans, 0, *dLdx); temp *= *dLdx; // c = b*a temp *= *dLdz; // dL/dy = c * dL/dz if (dLdy->isSameShape(*dLdz)) { diff --git 
a/libnd4j/include/ops/declarable/generic/broadcastable/realdiv.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/realdiv.cpp index 3e7445cf0..3691ffb55 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/realdiv.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/realdiv.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_realdiv) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(realdiv, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/reverse_divide.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/reverse_divide.cpp index 04c4c926e..0b6ea7d2a 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/reverse_divide.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/reverse_divide.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_reversedivide) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(reversedivide, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/reverse_mod.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/reverse_mod.cpp index 9dea93699..bb25fada6 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/reverse_mod.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/reverse_mod.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_reversemod) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(reversemod, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/reverse_subtract.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/reverse_subtract.cpp index dbb14c78b..5d33c7cea 100644 --- 
a/libnd4j/include/ops/declarable/generic/broadcastable/reverse_subtract.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/reverse_subtract.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_reversesubtract) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(reversesubtract, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/squared_subtract.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/squared_subtract.cpp index ae9c93d4d..6f5482512 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/squared_subtract.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/squared_subtract.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 23.11.17. // -#include +#include #if NOT_EXCLUDED(OP_squaredsubtract) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(squaredsubtract, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/subtract.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/subtract.cpp index 40bbb8559..fac1c5dfa 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/subtract.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/subtract.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_subtract) #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(subtract, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/broadcastable/template.tpl b/libnd4j/include/ops/declarable/generic/broadcastable/template.tpl index 65b90dccf..cc311b9f4 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/template.tpl +++ b/libnd4j/include/ops/declarable/generic/broadcastable/template.tpl @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { } diff 
--git a/libnd4j/include/ops/declarable/generic/broadcastable/truncatediv.cpp b/libnd4j/include/ops/declarable/generic/broadcastable/truncatediv.cpp index 33aae6d78..60900a5d9 100644 --- a/libnd4j/include/ops/declarable/generic/broadcastable/truncatediv.cpp +++ b/libnd4j/include/ops/declarable/generic/broadcastable/truncatediv.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { BROADCASTABLE_OP_IMPL(truncatediv, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/compat/compat_sparse_to_dense.cpp b/libnd4j/include/ops/declarable/generic/compat/compat_sparse_to_dense.cpp index 4a84dbdac..2d6cf5f12 100644 --- a/libnd4j/include/ops/declarable/generic/compat/compat_sparse_to_dense.cpp +++ b/libnd4j/include/ops/declarable/generic/compat/compat_sparse_to_dense.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_split_string) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(compat_sparse_to_dense, 4, 1, false, 0, 0) { auto indices = INPUT_VARIABLE(0); @@ -37,7 +37,7 @@ namespace nd4j { if (block.width() > 3) def = INPUT_VARIABLE(3); - nd4j::ops::helpers::compat_sparse_to_dense(*values, *indices, def, *output); + sd::ops::helpers::compat_sparse_to_dense(*values, *indices, def, *output); return Status::OK(); }; @@ -63,9 +63,9 @@ namespace nd4j { getOpDescriptor() ->setAllowedInputTypes(0, {ALL_INTS}) // indices ->setAllowedInputTypes(1, {ALL_INTS}) // shape - ->setAllowedInputTypes(2,nd4j::DataType::ANY) // sparse values - ->setAllowedInputTypes(3,nd4j::DataType::ANY) // default value - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(2,sd::DataType::ANY) // sparse values + ->setAllowedInputTypes(3,sd::DataType::ANY) // default value + ->setAllowedOutputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp 
b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp index ac88a4a60..e835dc711 100644 --- a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp +++ b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_split_string) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(compat_string_split, 2, 2, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -122,8 +122,8 @@ namespace nd4j { // values tensor is going to be vector always // indices tensor is going to be vector with length equal to values.length * output rank - auto valuesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(cnt, nd4j::DataType::UTF8); - auto indicesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(cnt * (input->rankOf() + 1), nd4j::DataType::INT64); + auto valuesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(cnt, sd::DataType::UTF8); + auto indicesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(cnt * (input->rankOf() + 1), sd::DataType::INT64); return SHAPELIST(indicesShape, valuesShape); } diff --git a/libnd4j/include/ops/declarable/generic/datatypes/bitcast.cpp b/libnd4j/include/ops/declarable/generic/datatypes/bitcast.cpp index 8591d3449..fe42d7057 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/bitcast.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/bitcast.cpp @@ -18,13 +18,13 @@ // @author George A. 
Shulinok // -#include +#include #if NOT_EXCLUDED(OP_bitcast) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(bitcast, 1, 1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -94,8 +94,8 @@ namespace nd4j { DECLARE_TYPES(bitcast) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp index ad7b7fee2..cf8729d2f 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/cast.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_cast) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(cast, 1, 1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -54,8 +54,8 @@ namespace nd4j { DECLARE_TYPES(cast) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/datatypes/to_double.cpp b/libnd4j/include/ops/declarable/generic/datatypes/to_double.cpp index 9d3e5aaed..4eae77a5a 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/to_double.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/to_double.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_to_double) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(to_double, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -39,8 +39,8 @@ namespace nd4j { DECLARE_TYPES(to_double) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - 
->setAllowedOutputTypes(nd4j::DataType::DOUBLE); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::DOUBLE); } DECLARE_SHAPE_FN(to_double) { diff --git a/libnd4j/include/ops/declarable/generic/datatypes/to_float16.cpp b/libnd4j/include/ops/declarable/generic/datatypes/to_float16.cpp index d6818bec4..aa8ceb045 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/to_float16.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/to_float16.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_to_float16) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(to_float16, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -39,8 +39,8 @@ namespace nd4j { DECLARE_TYPES(to_float16) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::HALF); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::HALF); } DECLARE_SHAPE_FN(to_float16) { diff --git a/libnd4j/include/ops/declarable/generic/datatypes/to_float32.cpp b/libnd4j/include/ops/declarable/generic/datatypes/to_float32.cpp index 4ca46bb82..23a924f9c 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/to_float32.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/to_float32.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_to_float32) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(to_float32, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -39,8 +39,8 @@ namespace nd4j { DECLARE_TYPES(to_float32) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::FLOAT32); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::FLOAT32); } DECLARE_SHAPE_FN(to_float32) { diff --git a/libnd4j/include/ops/declarable/generic/datatypes/to_int32.cpp 
b/libnd4j/include/ops/declarable/generic/datatypes/to_int32.cpp index 897868be5..c28fa6049 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/to_int32.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/to_int32.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_to_int32) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(to_int32, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -39,8 +39,8 @@ namespace nd4j { DECLARE_TYPES(to_int32) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::INT32); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::INT32); } DECLARE_SHAPE_FN(to_int32) { auto outShape = ShapeBuilders::copyShapeInfoAndType(inputShape->at(0), DataType::INT32, true, block.workspace()); diff --git a/libnd4j/include/ops/declarable/generic/datatypes/to_int64.cpp b/libnd4j/include/ops/declarable/generic/datatypes/to_int64.cpp index 6fa728254..cb994ccfe 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/to_int64.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/to_int64.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_to_int64) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(to_int64, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -39,8 +39,8 @@ namespace nd4j { DECLARE_TYPES(to_int64) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::INT64); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::INT64); } DECLARE_SHAPE_FN(to_int64) { auto outShape = ShapeBuilders::copyShapeInfoAndType(inputShape->at(0), DataType::INT64, true, block.workspace()); diff --git a/libnd4j/include/ops/declarable/generic/datatypes/to_uint32.cpp b/libnd4j/include/ops/declarable/generic/datatypes/to_uint32.cpp index 
6805855f1..f62d9cd9b 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/to_uint32.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/to_uint32.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_to_uint32) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(to_uint32, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -39,8 +39,8 @@ namespace nd4j { DECLARE_TYPES(to_uint32) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::INT32); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::INT32); } DECLARE_SHAPE_FN(to_uint32) { auto outShape = ShapeBuilders::copyShapeInfoAndType(inputShape->at(0), DataType::UINT32, true, block.workspace()); diff --git a/libnd4j/include/ops/declarable/generic/datatypes/to_uint64.cpp b/libnd4j/include/ops/declarable/generic/datatypes/to_uint64.cpp index fe61821a5..dc337ea1b 100644 --- a/libnd4j/include/ops/declarable/generic/datatypes/to_uint64.cpp +++ b/libnd4j/include/ops/declarable/generic/datatypes/to_uint64.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_to_uint64) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(to_uint64, 1, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -39,8 +39,8 @@ namespace nd4j { DECLARE_TYPES(to_uint64) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::INT8); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::INT8); } DECLARE_SHAPE_FN(to_uint64) { auto outShape = ShapeBuilders::copyShapeInfoAndType(inputShape->at(0), DataType::UINT64, true, block.workspace()); diff --git a/libnd4j/include/ops/declarable/generic/flow/flow_control_ops.cpp b/libnd4j/include/ops/declarable/generic/flow/flow_control_ops.cpp index 5296e8844..108660c7b 100644 --- 
a/libnd4j/include/ops/declarable/generic/flow/flow_control_ops.cpp +++ b/libnd4j/include/ops/declarable/generic/flow/flow_control_ops.cpp @@ -20,9 +20,9 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { /** * This operation is, basically IF statement diff --git a/libnd4j/include/ops/declarable/generic/grad/broadcast_gradient_args.cpp b/libnd4j/include/ops/declarable/generic/grad/broadcast_gradient_args.cpp index 34d979160..e4dbcd6d5 100644 --- a/libnd4j/include/ops/declarable/generic/grad/broadcast_gradient_args.cpp +++ b/libnd4j/include/ops/declarable/generic/grad/broadcast_gradient_args.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_broadcastgradientargs) #include -namespace nd4j { +namespace sd { namespace ops { /** * PLEASE NOTE: This op is disabled atm, and reserved for future releases. @@ -38,7 +38,7 @@ namespace nd4j { DECLARE_TYPES(broadcastgradientargs) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/helpers/BroadcastHelper.h b/libnd4j/include/ops/declarable/generic/helpers/BroadcastHelper.h index e497be416..7df331c4d 100644 --- a/libnd4j/include/ops/declarable/generic/helpers/BroadcastHelper.h +++ b/libnd4j/include/ops/declarable/generic/helpers/BroadcastHelper.h @@ -21,17 +21,17 @@ #ifndef LIBND4J_BROADCAST_HELPER_H #define LIBND4J_BROADCAST_HELPER_H -#include +#include #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { class BroadcastHelper { public: - static FORCEINLINE NDArray* broadcastApply(nd4j::BroadcastOpsTuple op, NDArray* x, NDArray* y, NDArray* z, ExtraArguments *extraArgs = nullptr) { + static FORCEINLINE NDArray* broadcastApply(sd::BroadcastOpsTuple op, NDArray* x, NDArray* y, NDArray* z, ExtraArguments *extraArgs = nullptr) { if(x->isEmpty() || y->isEmpty()) { if(!z->isEmpty()) @@ -98,7 +98,7 @@ 
namespace nd4j { return z; } - static FORCEINLINE NDArray* broadcastApply(nd4j::BroadcastBoolOpsTuple op, NDArray* x, NDArray* y, NDArray* z, ExtraArguments *extraArgs = nullptr) { + static FORCEINLINE NDArray* broadcastApply(sd::BroadcastBoolOpsTuple op, NDArray* x, NDArray* y, NDArray* z, ExtraArguments *extraArgs = nullptr) { if(x->isEmpty() || y->isEmpty()) { if(!z->isEmpty()) diff --git a/libnd4j/include/ops/declarable/generic/helpers/ScatterHelper.h b/libnd4j/include/ops/declarable/generic/helpers/ScatterHelper.h index d802ee8fd..4d464a745 100644 --- a/libnd4j/include/ops/declarable/generic/helpers/ScatterHelper.h +++ b/libnd4j/include/ops/declarable/generic/helpers/ScatterHelper.h @@ -22,14 +22,14 @@ #ifndef LIBND4J_SCATTERHELPER_H #define LIBND4J_SCATTERHELPER_H -#include -#include -#include +#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/images/hsvToRgb.cpp b/libnd4j/include/ops/declarable/generic/images/hsvToRgb.cpp index f1f8522d7..d5211e498 100644 --- a/libnd4j/include/ops/declarable/generic/images/hsvToRgb.cpp +++ b/libnd4j/include/ops/declarable/generic/images/hsvToRgb.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(hsv_to_rgb, 1, 1, true, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/images/rgbToGrs.cpp b/libnd4j/include/ops/declarable/generic/images/rgbToGrs.cpp index aa2bec9da..f7378d333 100644 --- a/libnd4j/include/ops/declarable/generic/images/rgbToGrs.cpp +++ b/libnd4j/include/ops/declarable/generic/images/rgbToGrs.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(rgb_to_grs, 1, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/images/rgbToHsv.cpp b/libnd4j/include/ops/declarable/generic/images/rgbToHsv.cpp index 2ba45bea9..ac5a27c66 100644 --- 
a/libnd4j/include/ops/declarable/generic/images/rgbToHsv.cpp +++ b/libnd4j/include/ops/declarable/generic/images/rgbToHsv.cpp @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(rgb_to_hsv, 1, 1, true, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/images/rgbToYiq.cpp b/libnd4j/include/ops/declarable/generic/images/rgbToYiq.cpp index 6d202ee4a..40c936e4f 100644 --- a/libnd4j/include/ops/declarable/generic/images/rgbToYiq.cpp +++ b/libnd4j/include/ops/declarable/generic/images/rgbToYiq.cpp @@ -1,60 +1,60 @@ -/******************************************************************************* - * Copyright (c) 2019 Konduit K.K. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - - // - // @author AbdelRauf (rauf@konduit.ai) - // - -#include -#include -#include -#include - -namespace nd4j { - namespace ops { - - - - CONFIGURABLE_OP_IMPL(rgb_to_yiq, 1, 1, true, 0, 0) { - - auto input = INPUT_VARIABLE(0); - auto output = OUTPUT_VARIABLE(0); - - if (input->isEmpty()) - return Status::OK(); - - const int rank = input->rankOf(); - const int arg_size = block.getIArguments()->size(); - const int dimC = arg_size > 0 ? (INT_ARG(0) >= 0 ? 
INT_ARG(0) : INT_ARG(0) + rank) : rank - 1; - - REQUIRE_TRUE(rank >= 1, 0, "RGBtoYIQ: Fails to meet the rank requirement: %i >= 1 ", rank); - if (arg_size > 0) { - REQUIRE_TRUE(dimC >= 0 && dimC < rank, 0, "Index of the Channel dimension out of range: %i not in [%i,%i) ", INT_ARG(0), -rank, rank); - } - REQUIRE_TRUE(input->sizeAt(dimC) == 3, 0, "RGBtoYIQ: operation expects 3 channels (R, G, B), but got %i instead", input->sizeAt(dimC)); - - helpers::transformRgbYiq(block.launchContext(), input, output, dimC); - - return Status::OK(); - } - - - DECLARE_TYPES(rgb_to_yiq) { - getOpDescriptor()->setAllowedInputTypes({ ALL_FLOATS }) - ->setSameMode(true); - } - } -} +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + + // + // @author AbdelRauf (rauf@konduit.ai) + // + +#include +#include +#include +#include + +namespace sd { + namespace ops { + + + + CONFIGURABLE_OP_IMPL(rgb_to_yiq, 1, 1, true, 0, 0) { + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + if (input->isEmpty()) + return Status::OK(); + + const int rank = input->rankOf(); + const int arg_size = block.getIArguments()->size(); + const int dimC = arg_size > 0 ? (INT_ARG(0) >= 0 ? 
INT_ARG(0) : INT_ARG(0) + rank) : rank - 1; + + REQUIRE_TRUE(rank >= 1, 0, "RGBtoYIQ: Fails to meet the rank requirement: %i >= 1 ", rank); + if (arg_size > 0) { + REQUIRE_TRUE(dimC >= 0 && dimC < rank, 0, "Index of the Channel dimension out of range: %i not in [%i,%i) ", INT_ARG(0), -rank, rank); + } + REQUIRE_TRUE(input->sizeAt(dimC) == 3, 0, "RGBtoYIQ: operation expects 3 channels (R, G, B), but got %i instead", input->sizeAt(dimC)); + + helpers::transformRgbYiq(block.launchContext(), input, output, dimC); + + return Status::OK(); + } + + + DECLARE_TYPES(rgb_to_yiq) { + getOpDescriptor()->setAllowedInputTypes({ ALL_FLOATS }) + ->setSameMode(true); + } + } +} diff --git a/libnd4j/include/ops/declarable/generic/images/rgbToYuv.cpp b/libnd4j/include/ops/declarable/generic/images/rgbToYuv.cpp index 58dd8a432..b52b5a8a6 100644 --- a/libnd4j/include/ops/declarable/generic/images/rgbToYuv.cpp +++ b/libnd4j/include/ops/declarable/generic/images/rgbToYuv.cpp @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(rgb_to_yuv, 1, 1, true, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/images/yiqToRgb.cpp b/libnd4j/include/ops/declarable/generic/images/yiqToRgb.cpp index 287aa150a..e339fb74b 100644 --- a/libnd4j/include/ops/declarable/generic/images/yiqToRgb.cpp +++ b/libnd4j/include/ops/declarable/generic/images/yiqToRgb.cpp @@ -1,61 +1,61 @@ -/******************************************************************************* - * Copyright (c) 2019 Konduit K.K. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author AbdelRauf (rauf@konduit.ai) -// - -#include -#include -#include -#include - -namespace nd4j { - namespace ops { - - - - CONFIGURABLE_OP_IMPL(yiq_to_rgb, 1, 1, true, 0, 0) { - - auto input = INPUT_VARIABLE(0); - auto output = OUTPUT_VARIABLE(0); - - if (input->isEmpty()) - return Status::OK(); - - const int rank = input->rankOf(); - const int arg_size = block.getIArguments()->size(); - const int dimC = arg_size > 0 ? (INT_ARG(0) >= 0 ? INT_ARG(0) : INT_ARG(0) + rank) : rank - 1; - - REQUIRE_TRUE(rank >= 1, 0, "YIQtoRGB: Fails to meet the rank requirement: %i >= 1 ", rank); - if (arg_size > 0) { - REQUIRE_TRUE(dimC >= 0 && dimC < rank, 0, "Index of the Channel dimension out of range: %i not in [%i,%i) ", INT_ARG(0), -rank, rank); - } - REQUIRE_TRUE(input->sizeAt(dimC) == 3, 0, "YIQtoRGB: operation expects 3 channels (Y, I, Q), but got %i instead", input->sizeAt(dimC)); - - helpers::transformYiqRgb(block.launchContext(), input, output, dimC); - - return Status::OK(); - } - - - DECLARE_TYPES(yiq_to_rgb) { - getOpDescriptor()->setAllowedInputTypes({ ALL_FLOATS }) - ->setSameMode(true); - } - - } -} +/******************************************************************************* + * Copyright (c) 2019 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author AbdelRauf (rauf@konduit.ai) +// + +#include +#include +#include +#include + +namespace sd { + namespace ops { + + + + CONFIGURABLE_OP_IMPL(yiq_to_rgb, 1, 1, true, 0, 0) { + + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + if (input->isEmpty()) + return Status::OK(); + + const int rank = input->rankOf(); + const int arg_size = block.getIArguments()->size(); + const int dimC = arg_size > 0 ? (INT_ARG(0) >= 0 ? INT_ARG(0) : INT_ARG(0) + rank) : rank - 1; + + REQUIRE_TRUE(rank >= 1, 0, "YIQtoRGB: Fails to meet the rank requirement: %i >= 1 ", rank); + if (arg_size > 0) { + REQUIRE_TRUE(dimC >= 0 && dimC < rank, 0, "Index of the Channel dimension out of range: %i not in [%i,%i) ", INT_ARG(0), -rank, rank); + } + REQUIRE_TRUE(input->sizeAt(dimC) == 3, 0, "YIQtoRGB: operation expects 3 channels (Y, I, Q), but got %i instead", input->sizeAt(dimC)); + + helpers::transformYiqRgb(block.launchContext(), input, output, dimC); + + return Status::OK(); + } + + + DECLARE_TYPES(yiq_to_rgb) { + getOpDescriptor()->setAllowedInputTypes({ ALL_FLOATS }) + ->setSameMode(true); + } + + } +} diff --git a/libnd4j/include/ops/declarable/generic/images/yuvToRgb.cpp b/libnd4j/include/ops/declarable/generic/images/yuvToRgb.cpp index 90ca217ce..48d4e379a 100644 --- a/libnd4j/include/ops/declarable/generic/images/yuvToRgb.cpp +++ b/libnd4j/include/ops/declarable/generic/images/yuvToRgb.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(yuv_to_rgb, 1, 1, true, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/kernels/knn_mindistance.cpp b/libnd4j/include/ops/declarable/generic/kernels/knn_mindistance.cpp index a7e825a9c..8ef699aa2 100644 --- 
a/libnd4j/include/ops/declarable/generic/kernels/knn_mindistance.cpp +++ b/libnd4j/include/ops/declarable/generic/kernels/knn_mindistance.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_knn_mindistance) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(knn_mindistance, 3, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/list/clone_list.cpp b/libnd4j/include/ops/declarable/generic/list/clone_list.cpp index 46160d041..d100153ec 100644 --- a/libnd4j/include/ops/declarable/generic/list/clone_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/clone_list.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_clone_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(clone_list, 1, 1, 0, 0) { auto list = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/list/create_list.cpp b/libnd4j/include/ops/declarable/generic/list/create_list.cpp index 40ef17e73..606558e7e 100644 --- a/libnd4j/include/ops/declarable/generic/list/create_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/create_list.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 06.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_create_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(create_list, 1, 2, 0, -2) { int height = 0; diff --git a/libnd4j/include/ops/declarable/generic/list/gather_list.cpp b/libnd4j/include/ops/declarable/generic/list/gather_list.cpp index 4c3b0a389..943313ad0 100644 --- a/libnd4j/include/ops/declarable/generic/list/gather_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/gather_list.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_gather_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(gather_list, 2, 1, 0, -2) { auto list = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/list/pick_list.cpp b/libnd4j/include/ops/declarable/generic/list/pick_list.cpp index 8b7cc17db..1254456bd 100644 --- a/libnd4j/include/ops/declarable/generic/list/pick_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/pick_list.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 06.11.2017. // -#include +#include #if NOT_EXCLUDED(OP_pick_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(pick_list, 1, 1, 0, -2) { auto list = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/list/read_list.cpp b/libnd4j/include/ops/declarable/generic/list/read_list.cpp index 92d08d977..a1320b9b3 100644 --- a/libnd4j/include/ops/declarable/generic/list/read_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/read_list.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 06.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_read_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(read_list, 1, 1, 0, 0) { auto list = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/list/scatter_list.cpp b/libnd4j/include/ops/declarable/generic/list/scatter_list.cpp index 2d854ae0b..38a4da7bd 100644 --- a/libnd4j/include/ops/declarable/generic/list/scatter_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/scatter_list.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 06.11.2017. // -#include +#include #if NOT_EXCLUDED(OP_scatter_list) #include #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(scatter_list, 1, 1, 0, -2) { NDArrayList *list = nullptr; diff --git a/libnd4j/include/ops/declarable/generic/list/size_list.cpp b/libnd4j/include/ops/declarable/generic/list/size_list.cpp index fa0aa1e0d..9c4d7ff70 100644 --- a/libnd4j/include/ops/declarable/generic/list/size_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/size_list.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 06.11.2017. // -#include +#include #if NOT_EXCLUDED(OP_size_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(size_list, 1, 1, 0, 0) { auto list = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/list/split_list.cpp b/libnd4j/include/ops/declarable/generic/list/split_list.cpp index 5a403dd06..c49047961 100644 --- a/libnd4j/include/ops/declarable/generic/list/split_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/split_list.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 06.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_split_list) #include #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(split_list, 2, 1, 0, -2) { NDArrayList *list = nullptr; diff --git a/libnd4j/include/ops/declarable/generic/list/stack_list.cpp b/libnd4j/include/ops/declarable/generic/list/stack_list.cpp index a97b30179..a0f0f4220 100644 --- a/libnd4j/include/ops/declarable/generic/list/stack_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/stack_list.cpp @@ -19,12 +19,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_stack_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(stack_list, 1, 1, 0, 0) { auto list = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/list/unstack_list.cpp b/libnd4j/include/ops/declarable/generic/list/unstack_list.cpp index b5e5f207e..5f4522949 100644 --- a/libnd4j/include/ops/declarable/generic/list/unstack_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/unstack_list.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_unstack_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(unstack_list, 1, 1, 0, 0) { auto outputList = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/list/write_list.cpp b/libnd4j/include/ops/declarable/generic/list/write_list.cpp index c9b32234e..c61bcb68b 100644 --- a/libnd4j/include/ops/declarable/generic/list/write_list.cpp +++ b/libnd4j/include/ops/declarable/generic/list/write_list.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_write_list) #include -namespace nd4j { +namespace sd { namespace ops { LIST_OP_IMPL(write_list, 2, 1, 0, -2) { auto list = INPUT_LIST(0); diff --git a/libnd4j/include/ops/declarable/generic/loss/absoluteDifference.cpp b/libnd4j/include/ops/declarable/generic/loss/absoluteDifference.cpp index ba488df65..812588710 100644 --- 
a/libnd4j/include/ops/declarable/generic/loss/absoluteDifference.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/absoluteDifference.cpp @@ -18,11 +18,11 @@ // Created by Yurii Shyrma on 20.11.2017. // -#include +#include #if NOT_EXCLUDED(OP_absolute_difference_loss) #include -namespace nd4j { +namespace sd { namespace ops { @@ -51,7 +51,7 @@ CUSTOM_OP_IMPL(absolute_difference_loss, 3, 1, false, 0, 1) { if(!weights->isScalar() && !weights->isSameShape(predictions)) weightsBroad = new NDArray(weights->tileToShape(predictions->getShapeInfo())); - NDArray E = (*predictions - *labels).transform(nd4j::transform::Abs); + NDArray E = (*predictions - *labels).transform(sd::transform::Abs); E *= *weightsBroad; switch (reductionMode) { @@ -101,7 +101,7 @@ CUSTOM_OP_IMPL(absolute_difference_loss, 3, 1, false, 0, 1) { } DECLARE_TYPES(absolute_difference_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(absolute_difference_loss) { @@ -169,10 +169,10 @@ CUSTOM_OP_IMPL(absolute_difference_loss_grad, 3, 3, false, 0, 1) { NDArray E = *predictions - *labels; // dE_i/dp_i = sign(p_i - y_i) - E.applyTransform(nd4j::transform::Sign, *dLdp); // dE/dp + E.applyTransform(sd::transform::Sign, *dLdp); // dE/dp // dE_i/dy_i = -sign(p_i - y_i) - E.applyTransform(nd4j::transform::Abs, E); + E.applyTransform(sd::transform::Abs, E); switch (reductionMode) { @@ -261,7 +261,7 @@ CUSTOM_OP_IMPL(absolute_difference_loss_grad, 3, 3, false, 0, 1) { DECLARE_TYPES(absolute_difference_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(absolute_difference_loss_grad) { diff --git a/libnd4j/include/ops/declarable/generic/loss/cosineDistance.cpp 
b/libnd4j/include/ops/declarable/generic/loss/cosineDistance.cpp index 7fe75c03a..10995c90b 100644 --- a/libnd4j/include/ops/declarable/generic/loss/cosineDistance.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/cosineDistance.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 22.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_cosine_distance_loss) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -118,7 +118,7 @@ CUSTOM_OP_IMPL(cosine_distance_loss, 3, 1, false, 0, 2) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(cosine_distance_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -304,7 +304,7 @@ CUSTOM_OP_IMPL(cosine_distance_loss_grad, 3, 3, false, 0, 2) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(cosine_distance_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/generic/loss/hingeLoss.cpp b/libnd4j/include/ops/declarable/generic/loss/hingeLoss.cpp index 8670bf9e1..244083a03 100644 --- a/libnd4j/include/ops/declarable/generic/loss/hingeLoss.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/hingeLoss.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 23.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_hinge_loss) #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -106,7 +106,7 @@ 
namespace nd4j { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(hinge_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -176,7 +176,7 @@ namespace nd4j { // turn E into gradient mask NDArray gradientMask(E.getShapeInfo(), block.getWorkspace()); - E.applyTransform(nd4j::transform::Sign, gradientMask); + E.applyTransform(sd::transform::Sign, gradientMask); dLdp->assign(-z * gradientMask); dLdl->assign(-2.f * (*logits) * gradientMask); @@ -271,7 +271,7 @@ namespace nd4j { DECLARE_TYPES(hinge_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(hinge_loss_grad) { diff --git a/libnd4j/include/ops/declarable/generic/loss/huberLoss.cpp b/libnd4j/include/ops/declarable/generic/loss/huberLoss.cpp index e844b4126..0c05de0ba 100644 --- a/libnd4j/include/ops/declarable/generic/loss/huberLoss.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/huberLoss.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 23.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_huber_loss) #include -namespace nd4j { +namespace sd { namespace ops { @@ -111,7 +111,7 @@ CUSTOM_OP_IMPL(huber_loss, 3, 1, false, 1, 1) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(huber_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -286,7 +286,7 @@ 
DECLARE_SHAPE_FN(huber_loss) { DECLARE_TYPES(huber_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(huber_loss_grad) { diff --git a/libnd4j/include/ops/declarable/generic/loss/l2_loss.cpp b/libnd4j/include/ops/declarable/generic/loss/l2_loss.cpp index c42789015..3afeea2ba 100644 --- a/libnd4j/include/ops/declarable/generic/loss/l2_loss.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/l2_loss.cpp @@ -18,12 +18,12 @@ // Created by GS 31.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_l2_loss) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(l2_loss, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -43,7 +43,7 @@ namespace nd4j { DECLARE_TYPES(l2_loss) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/loss/logLoss.cpp b/libnd4j/include/ops/declarable/generic/loss/logLoss.cpp index f83947c69..dc889d5c9 100644 --- a/libnd4j/include/ops/declarable/generic/loss/logLoss.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/logLoss.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 23.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_log_loss) #include -namespace nd4j { +namespace sd { namespace ops { @@ -108,7 +108,7 @@ CUSTOM_OP_IMPL(log_loss, 3, 1, false, 1, 1) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(log_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -277,7 +277,7 @@ 
CUSTOM_OP_IMPL(log_loss_grad, 3, 3, false, 1, 1) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(log_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/generic/loss/log_poisson_loss.cpp b/libnd4j/include/ops/declarable/generic/loss/log_poisson_loss.cpp index 0d85c6e23..9a00b4eb4 100644 --- a/libnd4j/include/ops/declarable/generic/loss/log_poisson_loss.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/log_poisson_loss.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_log_poisson_loss) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(log_poisson_loss, 3, 1, true, 0, 1) { auto log_predictions = INPUT_VARIABLE(0); @@ -111,7 +111,7 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(log_poisson_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -279,7 +279,7 @@ namespace ops { DECLARE_TYPES(log_poisson_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(log_poisson_loss_grad) { diff --git a/libnd4j/include/ops/declarable/generic/loss/meanPairWsSqErr.cpp b/libnd4j/include/ops/declarable/generic/loss/meanPairWsSqErr.cpp index ef511921f..312a32674 100644 --- 
a/libnd4j/include/ops/declarable/generic/loss/meanPairWsSqErr.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/meanPairWsSqErr.cpp @@ -22,14 +22,14 @@ // @author Paul Dubs // -#include +#include #if NOT_EXCLUDED(OP_mean_pairwssqerr_loss) #include #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -181,7 +181,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(mean_pairwssqerr_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -349,7 +349,7 @@ namespace nd4j { } DECLARE_TYPES(mean_pairwssqerr_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(mean_pairwssqerr_loss_grad) { diff --git a/libnd4j/include/ops/declarable/generic/loss/meanSqErr.cpp b/libnd4j/include/ops/declarable/generic/loss/meanSqErr.cpp index f446d0bf0..c5925fe90 100644 --- a/libnd4j/include/ops/declarable/generic/loss/meanSqErr.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/meanSqErr.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 25.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_mean_sqerr_loss) #include -namespace nd4j { +namespace sd { namespace ops { @@ -107,7 +107,7 @@ CUSTOM_OP_IMPL(mean_sqerr_loss, 3, 1, false, 0, 1) { DECLARE_TYPES(mean_sqerr_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } @@ -268,7 +268,7 @@ CUSTOM_OP_IMPL(mean_sqerr_loss_grad, 3, 3, false, 0, 1) { 
DECLARE_TYPES(mean_sqerr_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(mean_sqerr_loss_grad) { diff --git a/libnd4j/include/ops/declarable/generic/loss/sigmCrossEntropy.cpp b/libnd4j/include/ops/declarable/generic/loss/sigmCrossEntropy.cpp index 5b0075466..4d3c5749c 100644 --- a/libnd4j/include/ops/declarable/generic/loss/sigmCrossEntropy.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/sigmCrossEntropy.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 25.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_sigm_cross_entropy_loss) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -118,7 +118,7 @@ CUSTOM_OP_IMPL(sigm_cross_entropy_loss, 3, 1, false, 1, 1) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(sigm_cross_entropy_loss) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -292,7 +292,7 @@ CUSTOM_OP_IMPL(sigm_cross_entropy_loss_grad, 3, 3, false, 1, 1) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(sigm_cross_entropy_loss_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropy.cpp b/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropy.cpp index a1a197fae..3ea9ce2bd 100644 --- 
a/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropy.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropy.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 25.11.2017. // -#include +#include #if NOT_EXCLUDED(OP_softmax_cross_entropy_loss) #include -namespace nd4j { +namespace sd { namespace ops { @@ -253,8 +253,8 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_grad, 3, 3, false, 1, 1) { *dLdl *= *weights; } else { - dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, *weightsBroad, *dLdp); - dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, *weightsBroad, *dLdl); + dLdp->applyBroadcast(sd::broadcast::Multiply, dimensions, *weightsBroad, *dLdp); + dLdl->applyBroadcast(sd::broadcast::Multiply, dimensions, *weightsBroad, *dLdl); if(weights != weightsBroad) { std::vector axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); @@ -289,8 +289,8 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_grad, 3, 3, false, 1, 1) { else { NDArray temp = *weightsBroad / sum; - dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdp); - dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdl); + dLdp->applyBroadcast(sd::broadcast::Multiply, dimensions, temp, *dLdp); + dLdl->applyBroadcast(sd::broadcast::Multiply, dimensions, temp, *dLdl); if(weights != weightsBroad) { std::vector axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); @@ -326,8 +326,8 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_grad, 3, 3, false, 1, 1) { } else { NDArray temp = *weightsBroad / numOfNonZeroWeights; - dLdp->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdp); - dLdl->applyBroadcast(nd4j::broadcast::Multiply, dimensions, temp, *dLdl); + dLdp->applyBroadcast(sd::broadcast::Multiply, dimensions, temp, *dLdp); + dLdl->applyBroadcast(sd::broadcast::Multiply, dimensions, 
temp, *dLdl); if(weights != weightsBroad) { std::vector axesToReduceAlong = ShapeUtils::evalBroadcastBackwardAxis(weights->getShapeInfo(), weightsBroad->getShapeInfo()); diff --git a/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropyWithLogits.cpp b/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropyWithLogits.cpp index 5e88ec0e6..6dab14365 100644 --- a/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropyWithLogits.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/softmaxCrossEntropyWithLogits.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 18.06.2018 // -#include +#include #if NOT_EXCLUDED(OP_softmax_cross_entropy_loss_with_logits) #include -namespace nd4j { +namespace sd { namespace ops { @@ -53,7 +53,7 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_with_logits, 2, 1, false, 0, 0) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(softmax_cross_entropy_loss_with_logits) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// @@ -113,7 +113,7 @@ CUSTOM_OP_IMPL(softmax_cross_entropy_loss_with_logits_grad, 2, 2, false, 0, 0) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(softmax_cross_entropy_loss_with_logits_grad) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY)->setAllowedOutputTypes({ALL_FLOATS}); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/generic/loss/sparseSoftmaxCrossEntropyWithLogits.cpp b/libnd4j/include/ops/declarable/generic/loss/sparseSoftmaxCrossEntropyWithLogits.cpp index e7c8da123..c641bf12f 
100644 --- a/libnd4j/include/ops/declarable/generic/loss/sparseSoftmaxCrossEntropyWithLogits.cpp +++ b/libnd4j/include/ops/declarable/generic/loss/sparseSoftmaxCrossEntropyWithLogits.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 29.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_sparse_softmax_cross_entropy_loss_with_logits) #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/nlp/cbow.cpp b/libnd4j/include/ops/declarable/generic/nlp/cbow.cpp index 21906f4eb..9b5ed1918 100644 --- a/libnd4j/include/ops/declarable/generic/nlp/cbow.cpp +++ b/libnd4j/include/ops/declarable/generic/nlp/cbow.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_cbow) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(cbow, 15, 15, true, 0, 0) { auto target = INPUT_VARIABLE(0); @@ -62,7 +62,7 @@ namespace nd4j { REQUIRE_TRUE(syn0->dataType() == expTable->dataType(), 0, "CBOW: expTable must have the same data type as syn0 table"); - nd4j::ops::helpers::cbow(*syn0, *syn1, *syn1neg, *expTable, *negTable, *target, *ngStarter, nsRounds, *context, *lockedWords, *indices, *codes, *alpha, *randomValue, *numLabels, *inferenceVector, trainWords, numWorkers); + sd::ops::helpers::cbow(*syn0, *syn1, *syn1neg, *expTable, *negTable, *target, *ngStarter, nsRounds, *context, *lockedWords, *indices, *codes, *alpha, *randomValue, *numLabels, *inferenceVector, trainWords, numWorkers); return Status::OK(); @@ -70,22 +70,22 @@ namespace nd4j { DECLARE_TYPES(cbow) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::INT32) - ->setAllowedInputTypes(1, nd4j::DataType::INT32) - ->setAllowedInputTypes(2, nd4j::DataType::INT32) - ->setAllowedInputTypes(3, nd4j::DataType::INT32) - ->setAllowedInputTypes(4, nd4j::DataType::INT8) + ->setAllowedInputTypes(0, sd::DataType::INT32) + ->setAllowedInputTypes(1, sd::DataType::INT32) 
+ ->setAllowedInputTypes(2, sd::DataType::INT32) + ->setAllowedInputTypes(3, sd::DataType::INT32) + ->setAllowedInputTypes(4, sd::DataType::INT8) ->setAllowedInputTypes(5, {ALL_FLOATS}) ->setAllowedInputTypes(6, {ALL_FLOATS}) ->setAllowedInputTypes(7, {ALL_FLOATS}) ->setAllowedInputTypes(8, {ALL_FLOATS}) ->setAllowedInputTypes(9, {ALL_FLOATS}) ->setAllowedInputTypes(10, {ALL_FLOATS}) - ->setAllowedInputTypes(11, nd4j::DataType::INT64) - ->setAllowedInputTypes(12, nd4j::DataType::INT32) - ->setAllowedInputTypes(13, nd4j::DataType::INT32) + ->setAllowedInputTypes(11, sd::DataType::INT64) + ->setAllowedInputTypes(12, sd::DataType::INT32) + ->setAllowedInputTypes(13, sd::DataType::INT32) ->setAllowedInputTypes(14, {ALL_FLOATS}) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedOutputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/nlp/skipgram.cpp b/libnd4j/include/ops/declarable/generic/nlp/skipgram.cpp index a97e1a79e..921662fa6 100644 --- a/libnd4j/include/ops/declarable/generic/nlp/skipgram.cpp +++ b/libnd4j/include/ops/declarable/generic/nlp/skipgram.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_skipgram) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(skipgram, 12, 12, true, 0, 0) { auto target = INPUT_VARIABLE(0); @@ -60,26 +60,26 @@ namespace nd4j { REQUIRE_TRUE(syn0->dataType() == expTable->dataType(), 0, "SkipGram: expTable must have the same data type as syn0 table"); - nd4j::ops::helpers::skipgram(*syn0, *syn1, *syn1neg, *expTable, *negTable, *target, *ngStarter, nsRounds, *indices, *codes, *alpha, *randomValue, *inferenceVector, isPreciseMode, numWorkers); + sd::ops::helpers::skipgram(*syn0, *syn1, *syn1neg, *expTable, *negTable, *target, *ngStarter, nsRounds, *indices, *codes, *alpha, *randomValue, *inferenceVector, isPreciseMode, numWorkers); return Status::OK(); } DECLARE_TYPES(skipgram) { getOpDescriptor() - 
->setAllowedInputTypes(0, nd4j::DataType::INT32) - ->setAllowedInputTypes(1, nd4j::DataType::INT32) - ->setAllowedInputTypes(2, nd4j::DataType::INT32) - ->setAllowedInputTypes(3, nd4j::DataType::INT8) + ->setAllowedInputTypes(0, sd::DataType::INT32) + ->setAllowedInputTypes(1, sd::DataType::INT32) + ->setAllowedInputTypes(2, sd::DataType::INT32) + ->setAllowedInputTypes(3, sd::DataType::INT8) ->setAllowedInputTypes(4, {ALL_FLOATS}) ->setAllowedInputTypes(5, {ALL_FLOATS}) ->setAllowedInputTypes(6, {ALL_FLOATS}) ->setAllowedInputTypes(7, {ALL_FLOATS}) ->setAllowedInputTypes(8, {ALL_FLOATS}) ->setAllowedInputTypes(9, {ALL_FLOATS}) - ->setAllowedInputTypes(10, nd4j::DataType::INT64) + ->setAllowedInputTypes(10, sd::DataType::INT64) ->setAllowedInputTypes(11, {ALL_FLOATS}) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedOutputTypes(sd::DataType::ANY); } /* diff --git a/libnd4j/include/ops/declarable/generic/nn/apply_sgd.cpp b/libnd4j/include/ops/declarable/generic/nn/apply_sgd.cpp index cc558c905..389d07c7b 100644 --- a/libnd4j/include/ops/declarable/generic/nn/apply_sgd.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/apply_sgd.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_apply_sgd) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(apply_sgd, 2, 1, true, -2, 0) { auto parameters = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp index ca9622af9..e69b370ca 100644 --- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp @@ -20,13 +20,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_batchnorm) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -90,7 +90,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { // formula: output = gamma * ((input - 
mean) / sqrt(variance + epsilon)) + beta // auto v = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); - // auto m = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // auto m = input->reduceAlongDimension(sd::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); helpers::batchnorm(input, mean, variance, gamma, beta, output, axes, epsilon); @@ -101,11 +101,11 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { // stdInv *= *gamma; // // empty array with same shape as input - // input->applyBroadcast(nd4j::broadcast::Subtract, axes, m, output); - // output->applyBroadcast(nd4j::broadcast::Multiply, axes, &stdInv); + // input->applyBroadcast(sd::broadcast::Subtract, axes, m, output); + // output->applyBroadcast(sd::broadcast::Multiply, axes, &stdInv); // if(applyOffset) - // output->applyBroadcast(nd4j::broadcast::Add, axes, beta); + // output->applyBroadcast(sd::broadcast::Add, axes, beta); // delete v; // delete m; @@ -218,7 +218,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { // dLdB = g_sum // variance = input->varianceAlongDimension(variance::SummaryStatsVariance, false, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); - // mean = input->reduceAlongDimension(nd4j::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); + // mean = input->reduceAlongDimension(sd::reduce::Mean, ShapeUtils::evalDimsToExclude(input->rankOf(), axes)); const auto excludedAxes = ShapeUtils::evalDimsToExclude(inRank, axes); const bool keepUnitiesInShape = inRank == mean->rankOf(); @@ -228,7 +228,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { // input - mean NDArray xMinusMean(input); // empty array with same shape as input - input->applyBroadcast(nd4j::broadcast::Subtract, axes, *mean, xMinusMean); + input->applyBroadcast(sd::broadcast::Subtract, axes, *mean, xMinusMean); // stdInv NDArray stdInv = *variance + 
epsilon; @@ -236,11 +236,11 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { stdInv.applyTransform(transform::Sqrt, stdInv); // 1 / (variance + epsilon)^0.5 // dvdm (use dLdM as storage for dvdm) - xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, *dLdM, excludedAxes, keepUnitiesInShape); + xMinusMean.reduceAlongDimension(sd::reduce::Sum, *dLdM, excludedAxes, keepUnitiesInShape); *dLdM *= -Ninv; // g_sum - auto gSum = dLdO->reduceAlongDimension(nd4j::reduce::Sum, excludedAxes, keepUnitiesInShape); + auto gSum = dLdO->reduceAlongDimension(sd::reduce::Sum, excludedAxes, keepUnitiesInShape); // dLdB if(applyOffset) @@ -248,11 +248,11 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { // stdInv * (g - g_sum/N) (use dLdI as storage for this expression) gSum *= Ninv; - dLdO->applyBroadcast(nd4j::broadcast::Subtract, axes, gSum, *dLdI); - dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, stdInv, *dLdI); + dLdO->applyBroadcast(sd::broadcast::Subtract, axes, gSum, *dLdI); + dLdI->applyBroadcast(sd::broadcast::Multiply, axes, stdInv, *dLdI); // dLdV <- [g*(x - m)]_sum - (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, *dLdV, excludedAxes, keepUnitiesInShape); + (xMinusMean * *dLdO).reduceAlongDimension(sd::reduce::Sum, *dLdV, excludedAxes, keepUnitiesInShape); // dLdG *dLdV *= stdInv; @@ -264,13 +264,13 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { *dLdV *= -Ninv; // -0.5f * (2 / N); // dfdv * (dvdm + (x - m)) (use xMinusMean as storage for this expression) - xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, *dLdM, xMinusMean); - xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, *dLdV, xMinusMean); + xMinusMean.applyBroadcast(sd::broadcast::Add, axes, *dLdM, xMinusMean); + xMinusMean.applyBroadcast(sd::broadcast::Multiply, axes, *dLdV, xMinusMean); // dLdI *dLdI += xMinusMean; if(applyScale) - dLdI->applyBroadcast(nd4j::broadcast::Multiply, axes, *gamma, *dLdI); + dLdI->applyBroadcast(sd::broadcast::Multiply, axes, *gamma, *dLdI); 
*dLdM = 0; // put zeros so far *dLdV = 0; // put zeros so far @@ -280,11 +280,11 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { // std.applyTransform(transform::Reciprocal); // 1 / (variance + epsilon) // std.applyTransform(transform::Sqrt); // 1 / (variance + epsilon)^0.5 // NDArray xMu(input); - // input->applyBroadcast(nd4j::broadcast::Subtract, axes, mean, &xMu); + // input->applyBroadcast(sd::broadcast::Subtract, axes, mean, &xMu); // NDArray xHat(input); - // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, &std, &xHat); + // xMu.applyBroadcast(sd::broadcast::Multiply, axes, &std, &xHat); // NDArray dxhat(input); - // dLdO->applyBroadcast(nd4j::broadcast::Multiply, axes, gamma, &dxhat); + // dLdO->applyBroadcast(sd::broadcast::Multiply, axes, gamma, &dxhat); // NDArray temp = dxhat*xMu; // temp.reduceAlongDimension(reduce::Sum, dLdV, excludedAxes, keepUnitiesInShape); // *dLdV *= -0.5f * std*std*std; @@ -295,10 +295,10 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { // NDArray dLdmu = *dxmu1 + *dxmu2; // dLdmu *= (1.f /N); // *dLdV *= (2.f/N); - // dxhat.applyBroadcast(nd4j::broadcast::Multiply, axes, &std); - // xMu.applyBroadcast(nd4j::broadcast::Multiply, axes, dLdV); + // dxhat.applyBroadcast(sd::broadcast::Multiply, axes, &std); + // xMu.applyBroadcast(sd::broadcast::Multiply, axes, dLdV); // dxhat += xMu; - // dxhat.applyBroadcast(nd4j::broadcast::Add, axes, &dLdmu, dLdI); + // dxhat.applyBroadcast(sd::broadcast::Add, axes, &dLdmu, dLdI); // delete dxmu1; // delete dxmu2; // xHat *= *dLdO; @@ -309,12 +309,12 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { DECLARE_TYPES(batchnorm_bp) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) - ->setAllowedInputTypes(1, nd4j::DataType::ANY) - ->setAllowedInputTypes(2, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) + ->setAllowedInputTypes(1, sd::DataType::ANY) + ->setAllowedInputTypes(2, sd::DataType::ANY) ->setAllowedInputTypes(3, {ALL_FLOATS}) - 
->setAllowedInputTypes(4, nd4j::DataType::ANY) - ->setAllowedInputTypes(5, nd4j::DataType::ANY) + ->setAllowedInputTypes(4, sd::DataType::ANY) + ->setAllowedInputTypes(5, sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp index d09ccdb68..b68c4c211 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/col2im.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 17.10.2017. // -#include +#include #if NOT_EXCLUDED(OP_col2im) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(col2im, 1, 1, false, 0, 9) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp index c5e26c73e..91a189794 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv1d.cpp @@ -19,14 +19,14 @@ // @author Yurii Shyrma -#include +#include #if NOT_EXCLUDED(OP_conv1d) #include #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -82,7 +82,7 @@ CUSTOM_OP_IMPL(conv1d, 2, 1, false, 0, 5) { auto outputReshaped = output ->reshape(output->ordering(), reshapeForOutput, false); auto weightsReshaped = weights->reshape(weights->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] - nd4j::ops::conv2d conv2d; + sd::ops::conv2d conv2d; const Nd4jStatus status = conv2d.execute({&inputReshaped, &weightsReshaped, bias}, {&outputReshaped}, {}, {1,kW, 1,sW, 0,pW, 1,dW, paddingMode, !isNCW}, {}); if (status != ND4J_STATUS_OK) return status; @@ -221,7 +221,7 @@ CUSTOM_OP_IMPL(conv1d_bp, 3, 2, false, 0, 5) { auto weightsReshaped = weights->reshape(weights->ordering(),{1, weights->sizeAt(0), weights->sizeAt(1), 
weights->sizeAt(2)}); // [kW, iC, oC] -> [1, kW, iC, oC] auto gradWReshaped = gradW ->reshape(gradW->ordering(), {1, weights->sizeAt(0), weights->sizeAt(1), weights->sizeAt(2)}, false);// [kW, iC, oC] -> [1, kW, iC, oC] - nd4j::ops::conv2d_bp conv2dBP; + sd::ops::conv2d_bp conv2dBP; auto status = conv2dBP.execute({&inputReshaped, &weightsReshaped, bias, &gradOReshaped}, {&gradIReshaped, &gradWReshaped, gradB}, {}, {1,kW, 1,sW, 0,pW, 1,dW, paddingMode, !isNCW}, {}); if (status != ND4J_STATUS_OK) return status; diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp index 26d03358a..1217eb60e 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv2d.cpp @@ -21,16 +21,16 @@ #ifndef LIBND4J_CONVO_OPS_H #define LIBND4J_CONVO_OPS_H -#include +#include #if NOT_EXCLUDED(OP_conv2d) -#include +#include #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { @@ -140,7 +140,7 @@ DECLARE_SHAPE_FN(conv2d) { DECLARE_TYPES(conv2d) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedOutputTypes({ALL_FLOATS}); @@ -148,7 +148,7 @@ DECLARE_SHAPE_FN(conv2d) { DECLARE_TYPES(conv2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -312,7 +312,7 @@ CUSTOM_OP_IMPL(conv2d_input_bp, 3, 1, false, 0, 9) { DECLARE_TYPES(conv2d_input_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp index 7ce42756d..72cf8b9f4 
100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/conv3d.cpp @@ -19,15 +19,15 @@ // @author Yurii Shyrma, created on 05.02.2018 // -#include +#include #if NOT_EXCLUDED(OP_conv3dnew) #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) { @@ -92,7 +92,7 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) { DECLARE_TYPES(conv3dnew) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedOutputTypes({ALL_FLOATS}); @@ -259,7 +259,7 @@ CUSTOM_OP_IMPL(conv3dnew_bp, 3, 2, false, 0, 13) { DECLARE_TYPES(conv3dnew_bp) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedInputTypes(3, {ALL_FLOATS}) diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp index e3632f36a..9f94d1459 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d.cpp @@ -19,17 +19,17 @@ // @author Yurii Shyrma // -#include +#include #if NOT_EXCLUDED(OP_deconv2d) #include -#include -#include +#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) { @@ -74,7 +74,7 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) { //----- calculation of output -----// // NHWC: [kH, kW, oC, iC] x [bS, iH, iW, iC] = [kH, kW, oC, bS, iH, iW] // NCHW: [kH, kW, oC, iC] x [bS, iC, iH, iW] = [kH, kW, oC, bS, iH, iW] - nd4j::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, {2, 3, 1, 0, 4, 5}); + 
sd::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, {2, 3, 1, 0, 4, 5}); LaunchContext* ctx = block.launchContext(); helpers::col2im(*ctx, columns, *output, sH, sW, pH, pW, oH, oW, dH, dW); // [bS, oC, kH, kW, iH, iW] is de-convoluted to [bS, oC, oH, oW] @@ -90,7 +90,7 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) { } DECLARE_TYPES(deconv2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -155,7 +155,7 @@ DECLARE_SHAPE_FN(deconv2d) { DECLARE_TYPES(deconv2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -208,7 +208,7 @@ CUSTOM_OP_IMPL(deconv2d_bp, 3, 2, false, 0, 9) { // ----- calculation of gradI -> pass it through conv2d_ff ----- // - nd4j::ops::conv2d conv2d; + sd::ops::conv2d conv2d; const Nd4jStatus status = conv2d.execute({gradO, weights}, {gradI}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, !isNCHW}, {}); if (status != ND4J_STATUS_OK) return status; diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp index d5a61d397..70fc46e0c 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv2d_tf.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma // -#include +#include #if NOT_EXCLUDED(OP_deconv2d) #include #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -77,7 +77,7 @@ CUSTOM_OP_IMPL(deconv2d_tf, 3, 1, false, 0, 9) { DECLARE_TYPES(deconv2d_tf) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp 
b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp index 78d275c69..d4899fbab 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/deconv3d.cpp @@ -18,15 +18,15 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 05.09.2018 // -#include +#include #if NOT_EXCLUDED(OP_deconv3d) #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) { @@ -75,7 +75,7 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) { //----- calculation of output -----// // NDHWC: [kD, kH, kW, oC, iC] x [bS, iD, iH, iW, iC] = [kD, kH, kW, oC, bS, iD, iH, iW] // NCDHW: [kD, kH, kW, oC, iC] x [bS, iC, iD, iH, iW] = [kD, kH, kW, oC, bS, iD, iH, iW] - nd4j::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, {2, 3, 4, 1, 0, 5, 6, 7}); // [bS, oC, kD, kH, kW, iD, iH, iW] -> [kD, kH, kW, oC, bS, iD, iH, iW] + sd::MmulHelper::tensorDot(weights, input, &columns, {indWiC}, {indIOioC}, {2, 3, 4, 1, 0, 5, 6, 7}); // [bS, oC, kD, kH, kW, iD, iH, iW] -> [kD, kH, kW, oC, bS, iD, iH, iW] ConvolutionUtils::col2vol(block, columns, *output, sD, sH, sW, pD, pH, pW, dD, dH, dW); // [bS, oC, kD, kH, kW, iD, iH, iW] is de-convoluted to [bS, oC, oD, oH, oW] //----- add biases if required -----// @@ -92,7 +92,7 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) { DECLARE_TYPES(deconv3d) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedOutputTypes({ALL_FLOATS}); @@ -220,7 +220,7 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) { ConvolutionUtils::calcPadding3D(pD, pH, pW, iD, iH, iW, oD, oH, oW, kD, kH, kW, sD, sH, sW, dD, dH, dW); // ----- calculation of gradI -> pass it through conv3d_ff ----- // - nd4j::ops::conv3dnew conv3d; + sd::ops::conv3dnew conv3d; const Nd4jStatus 
status = conv3d.execute({gradO, weights}, {gradI}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, isSameMode, !isNCDHW}, {}); if (status != ND4J_STATUS_OK) return status; @@ -257,7 +257,7 @@ CUSTOM_OP_IMPL(deconv3d_bp, 3, 2, false, 0, 13) { DECLARE_TYPES(deconv3d_bp) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedInputTypes(3, {ALL_FLOATS}) diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp index e18836688..c04bcf6dd 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/depthwiseConv2d.cpp @@ -18,15 +18,15 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_depthwise_conv2d) -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(depthwise_conv2d, 2, 1, false, 0, 9) { @@ -69,7 +69,7 @@ CUSTOM_OP_IMPL(depthwise_conv2d, 2, 1, false, 0, 9) { DECLARE_TYPES(depthwise_conv2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(depthwise_conv2d) { @@ -140,7 +140,7 @@ DECLARE_SHAPE_FN(depthwise_conv2d) { DECLARE_TYPES(depthwise_conv2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/dilation2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/dilation2d.cpp index 6a85d2c7d..ea1193400 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/dilation2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/dilation2d.cpp @@ -18,13 +18,13 @@ // @author 
raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_dilation2d) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(dilation2d, 2, 1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -83,7 +83,7 @@ namespace ops { DECLARE_TYPES(dilation2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp index e9084fb61..179dd3005 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/im2col.cpp @@ -18,7 +18,7 @@ // Created by raver119 on 17.10.2017. // -#include +#include #if NOT_EXCLUDED(OP_im2col) #include @@ -26,7 +26,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(im2col, 1, 1, false, 0, 9) { auto x = INPUT_VARIABLE(0); @@ -51,7 +51,7 @@ namespace nd4j { // FIXME: zeropad value is void LaunchContext* ctx = block.launchContext(); - nd4j::ops::helpers::im2col(*ctx, *x, *z, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dY, dX, NDArrayFactory::create(zeroPadVal, block.launchContext())); + sd::ops::helpers::im2col(*ctx, *x, *z, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, dY, dX, NDArrayFactory::create(zeroPadVal, block.launchContext())); STORE_RESULT(*z); diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/ismax.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/ismax.cpp index 13de73e81..d786504ad 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/ismax.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/ismax.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_ismax) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(ismax, 1, 1, true, 
0, -2) { diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp index 69435ecb2..554802307 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/pointwiseConv2d.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -65,7 +65,7 @@ CUSTOM_OP_IMPL(pointwise_conv2d, 2, 1, false, 0, 0) { DECLARE_TYPES(pointwise_conv2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp index 392cd3128..02b7ab50c 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/sconv2d.cpp @@ -19,14 +19,14 @@ // @author Yurii Shyrma, changed on 20.03.2018 // -#include +#include #if NOT_EXCLUDED(OP_sconv2d) #include #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -95,7 +95,7 @@ CUSTOM_OP_IMPL(sconv2d, 2, 1, false, 0, 9) { DECLARE_TYPES(sconv2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -186,7 +186,7 @@ DECLARE_SHAPE_FN(sconv2d) { DECLARE_TYPES(sconv2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -265,7 +265,7 @@ CUSTOM_OP_IMPL(sconv2d_bp, 3, 2, false, 0, 9) { // if (iC == 1) { // nd4j_debug(" SCONV2D_BP OP: for input_channels=1 this op is equivalent to standard conv2d_bp \n",""); - // nd4j::ops::conv2d_bp op; + // sd::ops::conv2d_bp op; // return op.execute(&block); // } diff --git 
a/libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp index 2978feff1..4f04eb921 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling2d.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), changed on 03.05.2018 // -#include +#include #if NOT_EXCLUDED(OP_upsampling2d) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -49,7 +49,7 @@ DECLARE_SYN(upsampling, upsampling2d); DECLARE_TYPES(upsampling2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -87,7 +87,7 @@ DECLARE_SHAPE_FN(upsampling2d) { DECLARE_TYPES(upsampling2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp index 9cdbbffc2..f88f3705f 100644 --- a/libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/convo/upsampling3d.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 04.05.2018 // -#include +#include #if NOT_EXCLUDED(OP_upsampling3d) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -48,7 +48,7 @@ CUSTOM_OP_IMPL(upsampling3d, 1, 1, false, 0, 3) { DECLARE_TYPES(upsampling3d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -89,7 +89,7 @@ DECLARE_SHAPE_FN(upsampling3d) { DECLARE_TYPES(upsampling3d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) 
->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/dot_product_attention.cpp b/libnd4j/include/ops/declarable/generic/nn/dot_product_attention.cpp index 726083deb..bd0cf329a 100644 --- a/libnd4j/include/ops/declarable/generic/nn/dot_product_attention.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/dot_product_attention.cpp @@ -18,14 +18,14 @@ // @author Paul Dubs // -#include +#include #if NOT_EXCLUDED(OP_dot_product_attention) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(dot_product_attention, 3, -1, false, 0, 2) { @@ -67,7 +67,7 @@ namespace ops { "dot_product_attention: Keys and Values must have the same timestep length. " "But got keys = %i, values = %i", keys->sizeAt(-1), values->sizeAt(-1)); - nd4j::ops::matmul mmul; + sd::ops::matmul mmul; mmul.execute({keys, queries}, {weights}, {}, {1}, {}); if(normalization) { *weights /= sqrt((double)keys->sizeAt(-2)); @@ -90,7 +90,7 @@ namespace ops { *weights += (reshapedMask - 1) * 1e9; } - nd4j::ops::softmax softmax; + sd::ops::softmax softmax; softmax.execute({weights}, std::vector{weights}, {}, {-2}, {}, {}, true); mmul.execute({values, weights}, {output}, {}, {}, {}); @@ -113,8 +113,8 @@ namespace ops { auto keys_shape = inputShape->at(1); auto values_shape = inputShape->at(2); - auto weights_shape = ConstantShapeHelper::getInstance()->createShapeInfo(nd4j::ArrayOptions::dataType(values_shape), 'c', ShapeUtils::evalShapeForMatmul(keys_shape, query_shape, true, false)); - auto output_shape = ConstantShapeHelper::getInstance()->createShapeInfo(nd4j::ArrayOptions::dataType(values_shape), 'c', ShapeUtils::evalShapeForMatmul(values_shape, weights_shape, false, false)); + auto weights_shape = ConstantShapeHelper::getInstance()->createShapeInfo(sd::ArrayOptions::dataType(values_shape), 'c', ShapeUtils::evalShapeForMatmul(keys_shape, query_shape, true, false)); + auto output_shape = 
ConstantShapeHelper::getInstance()->createShapeInfo(sd::ArrayOptions::dataType(values_shape), 'c', ShapeUtils::evalShapeForMatmul(values_shape, weights_shape, false, false)); if(INT_ARG(1)){ return SHAPELIST(output_shape, weights_shape); @@ -166,7 +166,7 @@ namespace ops { auto weightShape = ShapeUtils::evalShapeForMatmul(keys->getShapeInfo(), queries->getShapeInfo(), true, false); - nd4j::ops::matmul mmul; + sd::ops::matmul mmul; NDArray preSoftmax('c', weightShape, values->dataType(), block.launchContext()); mmul.execute({keys, queries}, {&preSoftmax},{}, {1}, {}); @@ -184,15 +184,15 @@ namespace ops { } NDArray weights('c', weightShape, values->dataType(), block.launchContext()); - nd4j::ops::softmax softmax; + sd::ops::softmax softmax; softmax.execute({&preSoftmax}, {&weights},{}, {-2}, {}); - nd4j::ops::matmul_bp mmul_bp; + sd::ops::matmul_bp mmul_bp; NDArray dLdw(weights.getShapeInfo(), block.workspace()); mmul_bp.execute({values, &weights, eps}, std::vector{dLdv, &dLdw}, {}, {}, {}); NDArray dLds(preSoftmax.shapeInfo(), block.workspace()); - nd4j::ops::softmax_bp softmax_bp; + sd::ops::softmax_bp softmax_bp; softmax_bp.execute({&preSoftmax, &dLdw}, {&dLds}, {}, {-2}, {}); if(normalization) diff --git a/libnd4j/include/ops/declarable/generic/nn/fusedBatchNorm.cpp b/libnd4j/include/ops/declarable/generic/nn/fusedBatchNorm.cpp index 0754000a3..6e911e405 100644 --- a/libnd4j/include/ops/declarable/generic/nn/fusedBatchNorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/fusedBatchNorm.cpp @@ -18,17 +18,17 @@ // Created by raver119 on 29/10/17. 
// -#include +#include #if NOT_EXCLUDED(OP_fused_batch_norm) #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(fused_batch_norm) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/logSoftmax.cpp b/libnd4j/include/ops/declarable/generic/nn/logSoftmax.cpp index f5cc78e2b..64aadce37 100644 --- a/libnd4j/include/ops/declarable/generic/nn/logSoftmax.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/logSoftmax.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 01.02.2018 // -#include +#include #if NOT_EXCLUDED(OP_log_softmax) #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/nn/lrn.cpp b/libnd4j/include/ops/declarable/generic/nn/lrn.cpp index 17a2d7175..e9546d1db 100644 --- a/libnd4j/include/ops/declarable/generic/nn/lrn.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/lrn.cpp @@ -20,18 +20,18 @@ // @author Yurii Shyrma (iuriish@yahoo.com) -> back prop author // -#include +#include #if NOT_EXCLUDED(OP_lrn) #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(lrn) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -51,7 +51,7 @@ namespace nd4j { DECLARE_TYPES(lrn_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp index cff8545b2..f9b7284f1 100644 --- a/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp +++ 
b/libnd4j/include/ops/declarable/generic/nn/multi_head_dot_product_attention.cpp @@ -18,13 +18,13 @@ // @author Paul Dubs // -#include +#include #if NOT_EXCLUDED(OP_multi_head_dot_product_attention) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(multi_head_dot_product_attention, 7, -1, false, 0, 2) { @@ -100,14 +100,14 @@ namespace ops { // Apply Attention // attnResults = [minibatch, numHeads, projectedSize, seqLenth NDArray attnResults('c', {projectedQueries.sizeAt(0), projectedValues.sizeAt(1), projectedValues.sizeAt(2), projectedQueries.sizeAt(3)}, projectedValues.dataType(), block.launchContext()); - nd4j::ops::dot_product_attention attention; + sd::ops::dot_product_attention attention; attention.execute({&projectedQueries, &projectedKeys, &projectedValues, mask}, {&attnResults, weights ? OUTPUT_VARIABLE(1) : nullptr}, {}, {normalization, weights}, {}); // Project attention results attnResults.permutei({0, 3, 1, 2}); attnResults.reshapei(attnResults.ordering(), {miniBatchSize * queryCount, numHeads * projectedValuesSize}); - nd4j::ops::matmul mmul; + sd::ops::matmul mmul; NDArray projRes('c', {attnResults.sizeAt(0), Wo->sizeAt(1)}, values->dataType(), block.launchContext()); mmul.execute({&attnResults, Wo},{&projRes}, {}, {}, {}); projRes.reshapei(projRes.ordering(), {miniBatchSize, queryCount, outSize}); @@ -138,8 +138,8 @@ namespace ops { auto numHeads = shape::sizeAt(WkShape, 0); auto timeSteps = shape::sizeAt(keysShape, 2); - auto weightsShape = ConstantShapeHelper::getInstance()->createShapeInfo(nd4j::ArrayOptions::dataType(valuesShape), 'c', {batchSize, numHeads, timeSteps, queryCount}); - auto outputShape = ConstantShapeHelper::getInstance()->createShapeInfo(nd4j::ArrayOptions::dataType(valuesShape), 'c', {batchSize, outSize, queryCount}); + auto weightsShape = ConstantShapeHelper::getInstance()->createShapeInfo(sd::ArrayOptions::dataType(valuesShape), 'c', {batchSize, numHeads, timeSteps, queryCount}); + auto 
outputShape = ConstantShapeHelper::getInstance()->createShapeInfo(sd::ArrayOptions::dataType(valuesShape), 'c', {batchSize, outSize, queryCount}); if(INT_ARG(1)){ return SHAPELIST(outputShape, weightsShape); @@ -227,7 +227,7 @@ namespace ops { // Apply Attention NDArray attnResults('c', {projectedQueries.sizeAt(0), projectedValues.sizeAt(1), projectedValues.sizeAt(2), projectedQueries.sizeAt(3)}, projectedValues.dataType(), block.launchContext()); - nd4j::ops::dot_product_attention attention; + sd::ops::dot_product_attention attention; attention.execute({&projectedQueries, &projectedKeys, &projectedValues, mask}, {&attnResults}, {}, {normalization, 0}, {}); // Project attention results @@ -237,7 +237,7 @@ namespace ops { // dLdWo auto epsPerm = eps->permute({0, 2, 1}); auto epsPostReshape = epsPerm.reshape(eps->ordering(), {miniBatchSize * queryCount, outSize}); - nd4j::ops::matmul_bp matmulBp; + sd::ops::matmul_bp matmulBp; NDArray dLdPreWo(attnResults.shapeInfo(), false, block.launchContext()); matmulBp.execute({&attnResults, Wo, &epsPostReshape}, std::vector{&dLdPreWo, dLdWo}, {}, {}, {}); @@ -245,7 +245,7 @@ namespace ops { dLdPreWo.reshapei({miniBatchSize, queryCount, numHeads, projectedValues.sizeAt(2)}); dLdPreWo.permutei({0, 2, 3, 1}); - nd4j::ops::dot_product_attention_bp attentionBp; + sd::ops::dot_product_attention_bp attentionBp; NDArray dLdProjectedQueries(projectedQueries.shapeInfo(), false, block.launchContext()); NDArray dLdProjectedKeys(projectedKeys.shapeInfo(), false, block.launchContext()); NDArray dLdProjectedValues(projectedValues.shapeInfo(), false, block.launchContext()); diff --git a/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp index 873ac545a..500c159d9 100644 --- a/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool2d.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma 
(iuriish@yahoo.com), changed on 14.05.2018 // -#include +#include #if NOT_EXCLUDED(OP_avgpool2d) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(avgpool2d, 1, 1, false, 0, 10) { @@ -83,7 +83,7 @@ DECLARE_SYN(avgpool, avgpool2d); DECLARE_TYPES(avgpool2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -138,7 +138,7 @@ DECLARE_SHAPE_FN(avgpool2d) { DECLARE_TYPES(avgpool2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp index b72a1f6f7..529180cbb 100644 --- a/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/pooling/avgpool3d.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 01.03.2018 // -#include +#include #if NOT_EXCLUDED(OP_avgpool3dnew) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -81,7 +81,7 @@ CUSTOM_OP_IMPL(avgpool3dnew, 1, 1, false, 0, 14) { DECLARE_TYPES(avgpool3dnew) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -140,7 +140,7 @@ DECLARE_SHAPE_FN(avgpool3dnew) { DECLARE_TYPES(avgpool3dnew_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp index 13ba252e7..51b9db928 100644 --- a/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool2d.cpp 
@@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), changed on 09.05.2018 // -#include +#include #if NOT_EXCLUDED(OP_maxpool2d) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -87,7 +87,7 @@ DECLARE_SYN(maxpool, maxpool2d); DECLARE_TYPES(maxpool2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } @@ -141,7 +141,7 @@ DECLARE_SHAPE_FN(maxpool2d) { DECLARE_TYPES(maxpool2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool3d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool3d.cpp index be905e22f..321ff027b 100644 --- a/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool3d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool3d.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 19.02.2018 // -#include +#include #if NOT_EXCLUDED(OP_maxpool3dnew) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -82,7 +82,7 @@ CUSTOM_OP_IMPL(maxpool3dnew, 1, 1, false, 0, 14) { DECLARE_TYPES(maxpool3dnew) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } @@ -142,7 +142,7 @@ DECLARE_SHAPE_FN(maxpool3dnew) { DECLARE_TYPES(maxpool3dnew_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool_with_argmax.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool_with_argmax.cpp index 5fe7455fc..fabfd9bad 100644 --- a/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool_with_argmax.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/pooling/maxpool_with_argmax.cpp 
@@ -18,14 +18,14 @@ // Created by GS at 2/20/18 // -#include +#include #if NOT_EXCLUDED(OP_max_pool_with_argmax) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(max_pool_with_argmax, 1, 2, false, 0, 9) { @@ -44,7 +44,7 @@ namespace nd4j { DECLARE_TYPES(max_pool_with_argmax) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(0, DataType::INHERIT) ->setAllowedOutputTypes(1, {ALL_INTS}); diff --git a/libnd4j/include/ops/declarable/generic/nn/pooling/pnormpool2d.cpp b/libnd4j/include/ops/declarable/generic/nn/pooling/pnormpool2d.cpp index 5c7dc28cd..5ece99d36 100644 --- a/libnd4j/include/ops/declarable/generic/nn/pooling/pnormpool2d.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/pooling/pnormpool2d.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), changed on 14.05.2018 // -#include +#include #if NOT_EXCLUDED(OP_pnormpool2d) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(pnormpool2d, 1, 1, false, 0, 10) { @@ -83,7 +83,7 @@ namespace nd4j { DECLARE_TYPES(pnormpool2d) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -136,7 +136,7 @@ namespace nd4j { DECLARE_TYPES(pnormpool2d_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp index 22c7a9137..94a4a0ca4 100644 --- a/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/relu_layer.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(relu_layer, 3, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -35,13 +35,13 @@ 
namespace nd4j { auto output = OUTPUT_VARIABLE(0); - nd4j::ops::xw_plus_b op; + sd::ops::xw_plus_b op; auto status = op.execute({x, w, b}, {output}); REQUIRE_TRUE(Status::OK() == status, 0, "relu_layer: xw_plus_b op failed on input data."); auto scalar = block.numT() > 0 ? block.getTArguments()->at(0) : 0.0; - output->applyScalar(nd4j::scalar::RELU, scalar, *output); + output->applyScalar(sd::scalar::RELU, scalar, *output); return Status::OK(); } @@ -56,7 +56,7 @@ namespace nd4j { DECLARE_TYPES(relu_layer) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) // ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/nn/softmax.cpp b/libnd4j/include/ops/declarable/generic/nn/softmax.cpp index 06bd6d379..d5c58bb7a 100644 --- a/libnd4j/include/ops/declarable/generic/nn/softmax.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/softmax.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_softmax) #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(softmax) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops.cpp b/libnd4j/include/ops/declarable/generic/parity_ops.cpp index d64759faf..3595512a2 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops.cpp @@ -24,12 +24,12 @@ #define LIBND4J_PARITY_OPS_H #include -#include +#include #include -#include +#include #include #include -#include +#include #include #include #include @@ -39,7 +39,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp index 65f01cf6c..796dbb80b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp +++ 
b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_contrast.cpp @@ -18,13 +18,13 @@ // @author George A. Shulinok // -#include +#include #if NOT_EXCLUDED(OP_adjust_contrast) #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { //////////////////////////////////////////////////////////////////// @@ -68,7 +68,7 @@ CONFIGURABLE_OP_IMPL(adjust_contrast, 1, 1, true, 0, 0) { } DECLARE_TYPES(adjust_contrast) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}) ->setSameMode(true); } @@ -119,7 +119,7 @@ CONFIGURABLE_OP_IMPL(adjust_contrast_v2, 1, 1, true, 0, 0) { } DECLARE_TYPES(adjust_contrast_v2) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp index 003ff6e75..436fae28d 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_adjust_hue) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -69,7 +69,7 @@ CONFIGURABLE_OP_IMPL(adjust_hue, 1, 1, true, 0, 0) { } DECLARE_TYPES(adjust_hue) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp index 0a8eaf0c3..5be1699f4 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp +++ 
b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_adjust_saturation) #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(adjust_saturation, 1, 1, true, 0, 0) { @@ -66,7 +66,7 @@ CONFIGURABLE_OP_IMPL(adjust_saturation, 1, 1, true, 0, 0) { } DECLARE_TYPES(adjust_saturation) { - getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp index 10e036b61..928a0f7d0 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmax.cpp @@ -18,18 +18,18 @@ // Created by raver119 on 01.11.2017. // Modified by GS 4/5/2018 -#include +#include #if NOT_EXCLUDED(OP_argmax) #include #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(argmax) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS}); } @@ -78,8 +78,8 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == sd::DataTypeUtils::max())) { + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::INT64)); } return SHAPELIST(ShapeUtils::evalReduceShapeInfo('c', dims, inputShape->at(0), DataType::INT64, false, false, block.getWorkspace())); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp 
index 554b7b95b..f4fb25daa 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/argmin.cpp @@ -18,18 +18,18 @@ // Created by raver119 on 01.11.2017. // Modified by GS 4/5/2018. -#include +#include #if NOT_EXCLUDED(OP_argmin) #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(argmin) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS}); } @@ -77,7 +77,7 @@ namespace nd4j { } // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == sd::DataTypeUtils::max())) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp index cd8d50df2..362d51c83 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/assert.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_Assert) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(Assert, 1, 1, false) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp index fe2575f1f..e915df7f0 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp @@ -33,13 +33,13 @@ limitations under the License. 
// @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_batch_to_space) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -83,7 +83,7 @@ CUSTOM_OP_IMPL(batch_to_space, 2, 1, false, 0, 1) { //////////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(batch_to_space) { - getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp index 147944acc..312fff7ec 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp @@ -33,13 +33,13 @@ limitations under the License. // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_batch_to_space_nd) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -58,7 +58,7 @@ CUSTOM_OP_IMPL(batch_to_space_nd, 3, 1, false, 0, 0) { const uint numOfSpatialDims = blockShape->sizeAt(0); - const auto product = blockShape->reduceNumber(nd4j::reduce::Prod).e(0); + const auto product = blockShape->reduceNumber(sd::reduce::Prod).e(0); REQUIRE_TRUE(input->sizeAt(0) % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, input->sizeAt(0)); if(crop->sizeAt(0) != numOfSpatialDims || crop->sizeAt(1) != 2) { @@ -82,7 +82,7 @@ CUSTOM_OP_IMPL(batch_to_space_nd, 3, 1, false, 0, 0) { //////////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(batch_to_space_nd) { - getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(0, 
sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) ->setAllowedInputTypes(2, {ALL_INTS}) ->setSameMode(true); @@ -97,7 +97,7 @@ DECLARE_SHAPE_FN(batch_to_space_nd) { REQUIRE_TRUE(blockShapeInfo[0] == 1, 0, "BatchToSpaceND: rank of blockShape array must be equal to one, but got %i instead !", blockShapeInfo[0]); - const auto product = INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e(0); + const auto product = INPUT_VARIABLE(1)->reduceNumber(sd::reduce::Prod).e(0); REQUIRE_TRUE(inputShapeInfo[1] % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, inputShapeInfo[1]); const auto numOfSpatialDims = blockShapeInfo[1]; diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/betaInc.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/betaInc.cpp index 1b09bbf77..1850f10a1 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/betaInc.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/betaInc.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 12.12.2017 // -#include +#include #if NOT_EXCLUDED(OP_betainc) #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(betainc) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp index 0c88a9c53..eec864c5e 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_biasadd) #include #include -namespace nd4j { +namespace sd { namespace ops { //////////////////////////////////////////////////////////////////// @@ -46,7 +46,7 @@ CUSTOM_OP_IMPL(biasadd, 2, 1, true, 0, 0) { REQUIRE_TRUE(output->isSameShape(input), 0, "BIASADD CUSTOM_OP: wrong 
shape of output array, expected is %s but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(output).c_str()); helpers::addBias(block, *input, *bias, *output, isNCHW); - // input->applyBroadcast(nd4j::broadcast::Add, {channelDim}, bias, output); + // input->applyBroadcast(sd::broadcast::Add, {channelDim}, bias, output); return Status::OK(); } @@ -63,7 +63,7 @@ DECLARE_SHAPE_FN(biasadd) { DECLARE_TYPES(biasadd) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -82,7 +82,7 @@ CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) { gradI->assign(gradO); - gradO->reduceAlongDimension(nd4j::reduce::Sum, *gradB, ShapeUtils::evalDimsToExclude(gradO->rankOf(), {channelDim})); + gradO->reduceAlongDimension(sd::reduce::Sum, *gradB, ShapeUtils::evalDimsToExclude(gradO->rankOf(), {channelDim})); return ND4J_STATUS_OK; } @@ -104,7 +104,7 @@ DECLARE_SHAPE_FN(biasadd_bp) { DECLARE_TYPES(biasadd_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/bincount.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/bincount.cpp index 0eff84cf0..3b9fc3916 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/bincount.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/bincount.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_bincount) //#include #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(bincount) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::INT32) - ->setAllowedInputTypes(1, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::INT32) + ->setAllowedInputTypes(1, sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS, ALL_FLOATS}); } @@ -45,9 +45,9 @@ namespace nd4j 
{ maxLength = values->e(maxIndex) + 1; if (block.numI() > 0) { - minLength = nd4j::math::nd4j_max(INT_ARG(0), 0); + minLength = sd::math::nd4j_max(INT_ARG(0), 0); if (block.numI() == 2) - maxLength = nd4j::math::nd4j_min(maxLength, INT_ARG(1)); + maxLength = sd::math::nd4j_min(maxLength, INT_ARG(1)); } if (block.width() == 2) { // the second argument is weights @@ -69,8 +69,8 @@ namespace nd4j { REQUIRE_TRUE(values->isSameShape(weights), 0, "bincount: the input and weights shapes should be equals"); } - minLength = nd4j::math::nd4j_max(minLength, 0); - maxLength = nd4j::math::nd4j_min(maxLength, values->e(maxIndex) + 1); + minLength = sd::math::nd4j_max(minLength, 0); + maxLength = sd::math::nd4j_min(maxLength, values->e(maxIndex) + 1); auto result = OUTPUT_VARIABLE(0); result->assign(0.0f); @@ -83,32 +83,32 @@ namespace nd4j { DECLARE_SHAPE_FN(bincount) { auto shapeList = SHAPELIST(); auto in = INPUT_VARIABLE(0); - nd4j::DataType dtype = DataType::INT32; + sd::DataType dtype = DataType::INT32; if (block.width() > 1) dtype = ArrayOptions::dataType(inputShape->at(1)); else if (block.numI() > 2) - dtype = (nd4j::DataType)INT_ARG(2); + dtype = (sd::DataType)INT_ARG(2); int maxIndex = in->argMax(); int maxLength = in->e(maxIndex) + 1; int outLength = maxLength; if (block.numI() > 0) - outLength = nd4j::math::nd4j_max(maxLength, INT_ARG(0)); + outLength = sd::math::nd4j_max(maxLength, INT_ARG(0)); if (block.numI() > 1) - outLength = nd4j::math::nd4j_min(outLength, INT_ARG(1)); + outLength = sd::math::nd4j_min(outLength, INT_ARG(1)); if (block.width() == 3) { // the second argument is min and the third is max auto min= INPUT_VARIABLE(1)->e(0); auto max = INPUT_VARIABLE(2)->e(0); - outLength = nd4j::math::nd4j_max(maxLength, min); - outLength = nd4j::math::nd4j_min(outLength, max); + outLength = sd::math::nd4j_max(maxLength, min); + outLength = sd::math::nd4j_min(outLength, max); } else if (block.width() > 3) { auto min= INPUT_VARIABLE(2); auto max = INPUT_VARIABLE(3); - 
outLength = nd4j::math::nd4j_max(maxLength, min->e(0)); - outLength = nd4j::math::nd4j_min(outLength, max->e(0)); + outLength = sd::math::nd4j_max(maxLength, min->e(0)); + outLength = sd::math::nd4j_min(outLength, max->e(0)); } auto newshape = ConstantShapeHelper::getInstance()->vectorShapeInfo(outLength, dtype); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/broadcast_dynamic_shape.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/broadcast_dynamic_shape.cpp index fa95997be..2f90adb78 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/broadcast_dynamic_shape.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/broadcast_dynamic_shape.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_broadcast_dynamic_shape) #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -46,8 +46,8 @@ CUSTOM_OP_IMPL(broadcast_dynamic_shape, 2, 1, false, 0, 0) { // fill rank and data type xShapeInfo[0] = x->lengthOf(); yShapeInfo[0] = y->lengthOf(); - ArrayOptions::setDataType(xShapeInfo.data(), nd4j::DataType::INT64); // fill with some data type, it doesn't matter what type exactly to choose - ArrayOptions::setDataType(yShapeInfo.data(), nd4j::DataType::INT64); + ArrayOptions::setDataType(xShapeInfo.data(), sd::DataType::INT64); // fill with some data type, it doesn't matter what type exactly to choose + ArrayOptions::setDataType(yShapeInfo.data(), sd::DataType::INT64); for (Nd4jLong i = 0; i < x->lengthOf(); ++i) xShapeInfo[i + 1] = x->e(i); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/check_numerics.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/check_numerics.cpp index fee3f751c..561c6bb5b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/check_numerics.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/check_numerics.cpp @@ -18,12 +18,12 @@ // @author 
raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_check_numerics) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(check_numerics, 2, 1, true, 0, 0) { @@ -47,7 +47,7 @@ namespace nd4j { DECLARE_TYPES(check_numerics) { getOpDescriptor() ->setAllowedInputTypes(0, {ALL_FLOATS}) - ->setAllowedInputTypes(1, nd4j::DataType::UTF8) + ->setAllowedInputTypes(1, sd::DataType::UTF8) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/cholesky.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/cholesky.cpp index 815261f98..dfc3830ca 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/cholesky.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/cholesky.cpp @@ -18,12 +18,12 @@ // Created by GS at 11/12/2018 // -#include +#include #if NOT_EXCLUDED(OP_cholesky) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(cholesky, 1, 1, true) { NDArray* input = INPUT_VARIABLE(0); @@ -36,7 +36,7 @@ namespace nd4j { } DECLARE_TYPES(cholesky) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/compare_and_bitpack.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/compare_and_bitpack.cpp index 1a30e0c91..43a7c7c75 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/compare_and_bitpack.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/compare_and_bitpack.cpp @@ -21,9 +21,9 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(compare_and_bitpack, 2, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/confusion_matrix.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/confusion_matrix.cpp index 9139f63b2..cc8a64fa6 100644 --- 
a/libnd4j/include/ops/declarable/generic/parity_ops/confusion_matrix.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/confusion_matrix.cpp @@ -18,17 +18,17 @@ // @author @cpuheater // -#include +#include #if NOT_EXCLUDED(OP_confusion_matrix) #include #include -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(confusion_matrix) { getOpDescriptor() @@ -65,9 +65,9 @@ namespace nd4j { auto labels = INPUT_VARIABLE(0); auto predictions = INPUT_VARIABLE(1); auto dtype = block.dataType(); - dtype = nd4j::DataType::INT64; // dtype - should be a param with int argument + dtype = sd::DataType::INT64; // dtype - should be a param with int argument if (block.numI() > 1) - dtype = (nd4j::DataType)INT_ARG(1); + dtype = (sd::DataType)INT_ARG(1); int numClasses = 0; diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp index cdce8a95a..b8ce12d64 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/crop_and_resize.cpp @@ -18,14 +18,14 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_crop_and_resize) //#include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(crop_and_resize, 4, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/cross.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/cross.cpp index 57d7f3a87..0d701cf71 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/cross.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/cross.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_cross) #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(cross) { getOpDescriptor() diff --git 
a/libnd4j/include/ops/declarable/generic/parity_ops/depth_to_space.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/depth_to_space.cpp index 3e2ff1596..63c351c34 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/depth_to_space.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/depth_to_space.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_depth_to_space) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(depth_to_space, 1, 1, false, 0, 2) { int block_size = INT_ARG(0); @@ -53,7 +53,7 @@ namespace ops { DECLARE_TYPES(depth_to_space) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/diag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/diag.cpp index 7067b78e8..d67ca057b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/diag.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/diag.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 06.12.2017 // -#include +#include #if NOT_EXCLUDED(OP_diag) #include #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -48,7 +48,7 @@ DECLARE_SYN(MatrixDiag, diag); DECLARE_TYPES(diag) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/diagPart.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/diagPart.cpp index 4a93606f4..925c4b6c1 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/diagPart.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/diagPart.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 06.12.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_diag_part) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(diag_part, 1, 1, false, 0, 0) { @@ -46,7 +46,7 @@ namespace ops { DECLARE_TYPES(diag_part) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/digamma.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/digamma.cpp index 8a2894be7..17afcc10b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/digamma.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/digamma.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_digamma) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(digamma, 1, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/draw_bounding_boxes.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/draw_bounding_boxes.cpp index 8578f287e..d143bdcf8 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/draw_bounding_boxes.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/draw_bounding_boxes.cpp @@ -18,12 +18,12 @@ // @author George A. 
Shulinok // -#include +#include #if NOT_EXCLUDED(OP_draw_bounding_boxes) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(draw_bounding_boxes, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/dropout.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/dropout.cpp index f5b807c53..79c3c5cde 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/dropout.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/dropout.cpp @@ -18,13 +18,13 @@ // Created by GS // -#include +#include #if NOT_EXCLUDED(OP_dropout) #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_parititon.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_parititon.cpp index 49c9ed5e8..8c54ab129 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_parititon.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_parititon.cpp @@ -18,14 +18,14 @@ // @author GS // -#include +#include #if NOT_EXCLUDED(OP_dynamic_partition) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(dynamic_partition, 2, 1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -86,13 +86,13 @@ namespace ops { DECLARE_TYPES(dynamic_partition) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS, ALL_INTS}); } DECLARE_TYPES(dynamic_partition_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_stitch.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_stitch.cpp index e6913dc34..ecf0e5324 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_stitch.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/dynamic_stitch.cpp @@ 
-18,13 +18,13 @@ // @author GS // -#include +#include #if NOT_EXCLUDED(OP_dynamic_stitch) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(dynamic_stitch, 2, 1, false, 0, 0) { int numOfData = block.width(); @@ -54,7 +54,7 @@ namespace ops { DECLARE_TYPES(dynamic_stitch) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS, ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/embedding_lookup.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/embedding_lookup.cpp index 9df3d52b8..0c3dc35f3 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/embedding_lookup.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/embedding_lookup.cpp @@ -18,7 +18,7 @@ // Created by GS // -#include +#include #if NOT_EXCLUDED(OP_embedding_lookup) #include @@ -27,7 +27,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { @@ -64,7 +64,7 @@ CUSTOM_OP_IMPL(embedding_lookup, 2, 1, false, 0, 1) { int lastIndDim = indeces->lengthOf(); int partition_mode = INT_ARG(0); // partition_mode == 0 - i.e. 
'mod' , 1 - 'div' - nd4j::ops::gather op; + sd::ops::gather op; std::unique_ptr result(op.evaluate({input, indeces}, {0})); REQUIRE_TRUE(result->status() == Status::OK(), 0, "embedding_lookup: cannot retrieve results from gather op."); @@ -76,8 +76,8 @@ CUSTOM_OP_IMPL(embedding_lookup, 2, 1, false, 0, 1) { DECLARE_TYPES(embedding_lookup) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY); } DECLARE_SHAPE_FN(embedding_lookup) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/expose.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/expose.cpp index de67a283d..fd3315157 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/expose.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/expose.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(expose, -1, -1, true, 0, 0) { @@ -49,7 +49,7 @@ namespace nd4j { DECLARE_TYPES(expose) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/extract_image_patches.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/extract_image_patches.cpp index 6ebc0a72d..1bcb8ef36 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/extract_image_patches.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/extract_image_patches.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(extract_image_patches, 1, 1, false, 0, 7) { auto input = INPUT_VARIABLE(0); @@ -47,7 +47,7 @@ namespace nd4j { DECLARE_TYPES(extract_image_patches) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git 
a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars.cpp index ea16b2274..291e8b7c1 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars.cpp @@ -18,12 +18,12 @@ // @author George Shulinok (sgazeos@gmail.com), created on 13.11.2018 // -#include +#include #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(fake_quant_with_min_max_vars, 1, 1, true, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp index 8f379911b..4af483e22 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/fake_quant_with_min_max_vars_per_channel.cpp @@ -18,12 +18,12 @@ // @author George Shulinok , created on 08.10.2019 // -#include +#include #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars_per_channel) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(fake_quant_with_min_max_vars_per_channel, 3, 1, true, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/fill.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fill.cpp index 9fc508860..18b9ce2b8 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/fill.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/fill.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_fill) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(fill, 1, 1, false, -2, 0) { @@ -75,16 +75,16 @@ namespace nd4j { newShape[e+1] = 
shapeArray->e(e); } - nd4j::DataType dataType; + sd::DataType dataType; if (block.width() > 1) { dataType = INPUT_VARIABLE(1)->dataType(); } else if (block.numT() > 0) { dataType = Environment::getInstance()->defaultFloatDataType(); } else if (block.numI() > 0) { - dataType = nd4j::DataType::INT32; + dataType = sd::DataType::INT32; } else if (block.numB() > 0) { - dataType = nd4j::DataType::BOOL; + dataType = sd::DataType::BOOL; } else throw std::runtime_error("Fill: missing value to fill output array with"); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/fill_as.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/fill_as.cpp index b981804a4..f2d74572d 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/fill_as.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/fill_as.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 01.11.2017. // -#include +#include #if NOT_EXCLUDED(OP_fill_as) #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(fill_as, 1, 1, true, 0, 0) { auto output = OUTPUT_VARIABLE(0); @@ -47,7 +47,7 @@ namespace nd4j { DECLARE_TYPES(fill_as) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/image_resize.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/image_resize.cpp index cf26b69b3..3ceba93d8 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/image_resize.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/image_resize.cpp @@ -18,13 +18,13 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_image_resize) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(image_resize, 2, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/in_top_k.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/in_top_k.cpp index f0c7e7027..a243842d2 100644 
--- a/libnd4j/include/ops/declarable/generic/parity_ops/in_top_k.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/in_top_k.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_in_top_k) //#include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(in_top_k, 2, 1, true, 0, 1) { auto predictions = INPUT_VARIABLE(0); @@ -46,7 +46,7 @@ namespace nd4j { auto in = inputShape->at(1); int shapeRank = shape::rank(in); - auto aShape = ConstantShapeHelper::getInstance()->createShapeInfo(nd4j::DataType::BOOL, shape::order(in), shape::rank(in), shape::shapeOf(in)); + auto aShape = ConstantShapeHelper::getInstance()->createShapeInfo(sd::DataType::BOOL, shape::order(in), shape::rank(in), shape::shapeOf(in)); shapeList->push_back(aShape); return shapeList; } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/lgamma.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/lgamma.cpp index 615190c2f..c39f8b55d 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/lgamma.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/lgamma.cpp @@ -19,13 +19,13 @@ // @author George A. 
Shulinok // -#include +#include #if NOT_EXCLUDED(OP_lgamma) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(lgamma, 1, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/lin_space.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/lin_space.cpp index 8d30185b1..54fd8fb0e 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/lin_space.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/lin_space.cpp @@ -18,12 +18,12 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_lin_space) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(lin_space, 3, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/listdiff.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/listdiff.cpp index 4f20e19fa..49c7a2957 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/listdiff.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/listdiff.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_listdiff) #include #include // this op will probably never become GPU-compatible -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(listdiff, 2, 2, false, 0, 0) { auto values = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/lstsq.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/lstsq.cpp index 550165c6b..df55db586 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/lstsq.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/lstsq.cpp @@ -18,14 +18,15 @@ // Created by GS at 01/28/2020 // -#include +#include #if NOT_EXCLUDED(OP_lstsq) #include #include -namespace nd4j { +namespace sd { namespace ops { + CUSTOM_OP_IMPL(lstsq, 2, 1, false, 0, 0) { auto a = INPUT_VARIABLE(0); auto b = INPUT_VARIABLE(1); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/lup.cpp 
b/libnd4j/include/ops/declarable/generic/parity_ops/lup.cpp index e0e960159..e0b1eb8d7 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/lup.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/lup.cpp @@ -18,12 +18,12 @@ // Created by GS at 12/10/2019 // -#include +#include #if NOT_EXCLUDED(OP_matrix_inverse) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(lu, 1, 2, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -32,7 +32,7 @@ namespace nd4j { auto p = OUTPUT_VARIABLE(1); if (block.getIArguments()->size()) { DataType dtype = (DataType)INT_ARG(0); - REQUIRE_TRUE(dtype == nd4j::DataType::INT32 || dtype == nd4j::DataType::INT64, 0, "lu: Permutation data type should be 32bit or 64bit int only, but '%s' given.", DataTypeUtils::asString(dtype).c_str()); } + REQUIRE_TRUE(dtype == sd::DataType::INT32 || dtype == sd::DataType::INT64, 0, "lu: Permutation data type should be 32bit or 64bit int only, but '%s' given.", DataTypeUtils::asString(dtype).c_str()); } REQUIRE_TRUE(input->rankOf() >=2, 0, "lu: The rank of input array should not less than 2, but %i is given", input->rankOf()); REQUIRE_TRUE(input->sizeAt(-1) == input->sizeAt(-2), 0, "lu: The last two dimmensions should be equal, but %i and %i are given", input->sizeAt(-1), input->sizeAt(-2)); @@ -45,10 +45,10 @@ namespace nd4j { auto in = inputShape->at(0); auto shapeVector = ShapeUtils::shapeAsVector(in); auto luShape = ShapeBuilders::copyShapeInfoAndType(in, in, true, block.workspace()); - auto dtype = nd4j::DataType::INT32; + auto dtype = sd::DataType::INT32; if (block.getIArguments()->size()) { dtype = (DataType)INT_ARG(0); - REQUIRE_TRUE(dtype == nd4j::DataType::INT32 || dtype == nd4j::DataType::INT64, 0, "lu: Permutation data type should be 32bit or 64bit int only, but '%s' given.", DataTypeUtils::asString(dtype).c_str()); + REQUIRE_TRUE(dtype == sd::DataType::INT32 || dtype == sd::DataType::INT64, 0, "lu: Permutation data type should be 32bit or 64bit int 
only, but '%s' given.", DataTypeUtils::asString(dtype).c_str()); } auto luP = ShapeBuilders::createShapeInfo(dtype, shape::order(in), shapeVector.size() - 1, shapeVector.data(), block.workspace()); @@ -59,7 +59,7 @@ namespace nd4j { getOpDescriptor() ->setAllowedInputTypes({ALL_FLOATS}) ->setAllowedOutputTypes(0, {ALL_FLOATS}) - ->setAllowedOutputTypes(1, {nd4j::DataType::INT32, nd4j::DataType::INT64}) + ->setAllowedOutputTypes(1, {sd::DataType::INT32, sd::DataType::INT64}) ->setSameMode(false); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrixDiagPart.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrixDiagPart.cpp index 8cef439de..9d4a00be3 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/matrixDiagPart.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrixDiagPart.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(matrix_diag_part, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -43,7 +43,7 @@ namespace nd4j { REQUIRE_TRUE(inRank >= 2, 0, "CUSTOM_OP matrix_diag_part: input array must have rank >= 2, but %i given!", inRank); int outRank = inRank - 1; - int lastDimension = nd4j::math::nd4j_min(shape::sizeAt(in, -1), shape::sizeAt(in, -2)); + int lastDimension = sd::math::nd4j_min(shape::sizeAt(in, -1), shape::sizeAt(in, -2)); if(outRank == 1) { //output shape is a vector with size min(sizeAt(0), sizeAt(1)) outShapeInfo = ConstantShapeHelper::getInstance()->vectorShapeInfo(lastDimension, ArrayOptions::dataType(in)); @@ -63,7 +63,7 @@ namespace nd4j { DECLARE_TYPES(matrix_diag_part) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp index 3a52057a5..222074c81 100644 --- 
a/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrixSetDiag.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_matrix_set_diag) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) { @@ -38,7 +38,7 @@ CONFIGURABLE_OP_IMPL(matrix_set_diag, 2, 1, false, 0, 0) { for(int i = 0; i < diagonal->rankOf() - 1; ++i) REQUIRE_TRUE(diagonal->sizeAt(i) == input->sizeAt(i), 0, "MATRIX_SET_DIAG op: the shapes of diagonal and input arrays must be equal till last diagonal dimension but one, however got diagonal=%s and input=%s instead !", ShapeUtils::shapeAsString(diagonal).c_str(), ShapeUtils::shapeAsString(input).c_str()); - REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)nd4j::math::nd4j_min(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1)); + REQUIRE_TRUE(diagonal->sizeAt(-1) == (int)sd::math::nd4j_min(input->sizeAt(-1), input->sizeAt(-2)), 0, "MATRIX_SET_DIAG op: the value of last dimension of diagonal array must be equal to min(input_last_shape=%i, input_last_but_one_shape=%i), but got %i instead !", input->sizeAt(-1), input->sizeAt(-2), diagonal->sizeAt(-1)); helpers::matrixSetDiag(block.launchContext(), *input, *diagonal, *output, false); @@ -49,7 +49,7 @@ DECLARE_SYN(MatrixSetDiag, matrix_set_diag); DECLARE_TYPES(matrix_set_diag) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_band_part.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_band_part.cpp index 8e4be83b9..08e059c37 100644 --- 
a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_band_part.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_band_part.cpp @@ -18,12 +18,12 @@ // @author GS , created on 8/22/2018 // -#include +#include #if NOT_EXCLUDED(OP_matrix_band_part) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(matrix_band_part, 1, 1, true, 0, 2) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_determinant.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_determinant.cpp index 1beff9dab..6c1fd40fe 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_determinant.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_determinant.cpp @@ -18,12 +18,12 @@ // Created by GS at 2/26/2018 // -#include +#include #include #include #if NOT_EXCLUDED(OP_matrix_determinant) -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(matrix_determinant, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -55,7 +55,7 @@ namespace nd4j { DECLARE_TYPES(matrix_determinant) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } @@ -64,11 +64,11 @@ namespace nd4j { #endif #if NOT_EXCLUDED(OP_log_matrix_determinant) -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(log_matrix_determinant) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -104,11 +104,11 @@ namespace nd4j { #endif #if NOT_EXCLUDED(OP_logdet) -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(logdet) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp 
b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp index c3e73da84..6e95d127d 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(matrix_diag, 1, 1, false, 0, 0) { @@ -60,7 +60,7 @@ DECLARE_SHAPE_FN(matrix_diag) { DECLARE_TYPES(matrix_diag) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_inverse.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_inverse.cpp index 0b6ef5acb..6a595a92b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_inverse.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_inverse.cpp @@ -18,12 +18,12 @@ // Created by GS at 2/27/2018 // -#include +#include #if NOT_EXCLUDED(OP_matrix_inverse) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(matrix_inverse, 1, 1, true) { auto input = INPUT_VARIABLE(0); @@ -37,7 +37,7 @@ namespace nd4j { DECLARE_TYPES(matrix_inverse) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/moments.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/moments.cpp index 5e76fefec..c8fdf2e48 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/moments.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/moments.cpp @@ -18,13 +18,13 @@ // Created by sgazeos@gmail.com on 26.01.2018. 
// -#include +#include #if NOT_EXCLUDED(OP_moments) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(moments, 1, 2, false, 0, -2) { auto input = INPUT_VARIABLE(0); @@ -82,7 +82,7 @@ namespace nd4j { DECLARE_TYPES(moments) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp index e0812c403..c32ee1ba9 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_image_non_max_suppression) CUSTOM_OP_IMPL(non_max_suppression, 2, 1, false, 0, 0) { @@ -112,14 +112,14 @@ namespace nd4j { } DECLARE_TYPES(non_max_suppression) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INDICES}); } #endif #if NOT_EXCLUDED(OP_image_non_max_suppression_v3) DECLARE_TYPES(non_max_suppression_v3) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INDICES}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression_overlaps.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression_overlaps.cpp index 4f405d8c8..a8477c63a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression_overlaps.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/non_max_suppression_overlaps.cpp @@ -23,7 +23,7 @@ #if NOT_EXCLUDED(OP_image_non_max_suppression_overlaps) -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(non_max_suppression_overlaps, 2, 1, false, 0, 
0) { auto boxes = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/norm.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/norm.cpp index e74a28184..64c2e5ccb 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/norm.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/norm.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_norm) #include #include -namespace nd4j { +namespace sd { namespace ops { REDUCTION_OP_IMPL(norm, 1, 1, false, 1, -2) { auto input = INPUT_VARIABLE(0); @@ -97,7 +97,7 @@ namespace nd4j { DECLARE_TYPES(norm) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/normalize_moments.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/normalize_moments.cpp index 15f295995..d3ccff82a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/normalize_moments.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/normalize_moments.cpp @@ -18,12 +18,12 @@ // Created by george@skymind.io on 26.01.2018. 
// -#include +#include #if NOT_EXCLUDED(OP_normalize_moments) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(normalize_moments, 3, 2, false, 1, 0) { auto counts = INPUT_VARIABLE(0); @@ -75,7 +75,7 @@ namespace nd4j { DECLARE_TYPES(normalize_moments) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/nth_element.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/nth_element.cpp index 57d0191b2..b1b68c23d 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/nth_element.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/nth_element.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(nth_element, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -70,8 +70,8 @@ namespace nd4j { } DECLARE_TYPES(nth_element) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/onehot.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/onehot.cpp index 49d91275f..d64499ecf 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/onehot.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/onehot.cpp @@ -18,14 +18,14 @@ // Created by raver119 on 01/11/17. // -#include +#include #if NOT_EXCLUDED(OP_onehot) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(onehot, 1, 1, false, -2, -2) { auto input = INPUT_VARIABLE(0); @@ -74,7 +74,7 @@ namespace nd4j { DECLARE_SHAPE_FN(onehot) { auto inShape = inputShape->at(0); - nd4j::DataType dtype = block.numD() > 0 ? D_ARG(0) : nd4j::DataType::FLOAT32; + sd::DataType dtype = block.numD() > 0 ? 
D_ARG(0) : sd::DataType::FLOAT32; int depth = -1; Nd4jLong axis = -1; @@ -108,7 +108,7 @@ namespace nd4j { DECLARE_TYPES(onehot) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS, ALL_INTS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/ones_as.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/ones_as.cpp index 702aa6711..dccebf8c9 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/ones_as.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/ones_as.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 01.11.2017. // -#include +#include #if NOT_EXCLUDED(OP_ones_as) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(ones_as, 1, 1, false, 0, 0) { auto output = OUTPUT_VARIABLE(0); @@ -36,7 +36,7 @@ namespace nd4j { DECLARE_SHAPE_FN(ones_as) { auto in = inputShape->at(0); auto dtype = block.numD() ? D_ARG(0) : ArrayOptions::dataType(in); - auto shape = nd4j::ConstantShapeHelper::getInstance()->createShapeInfo(dtype, in); + auto shape = sd::ConstantShapeHelper::getInstance()->createShapeInfo(dtype, in); nd4j_printf("numD: %i; dtype: %s\n", block.numD(), DataTypeUtils::asString(dtype).c_str()); @@ -45,8 +45,8 @@ namespace nd4j { DECLARE_TYPES(ones_as) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY) ->setSameMode(false); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/parallelStack.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/parallelStack.cpp index 26ff1b596..2a8466b11 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/parallelStack.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/parallelStack.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 01.11.2017 // -#include +#include 
#if NOT_EXCLUDED(OP_parallel_stack) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -48,7 +48,7 @@ CUSTOM_OP_IMPL(parallel_stack, -1, 1, false, 0, 0) { DECLARE_TYPES(parallel_stack) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/polygamma.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/polygamma.cpp index 1cfd86a26..35ffdcbc6 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/polygamma.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/polygamma.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_polygamma) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(polygamma, 2, 1, false, 0, 0) { @@ -37,8 +37,8 @@ CONFIGURABLE_OP_IMPL(polygamma, 2, 1, false, 0, 0) { Nd4jLong arrLen = n->lengthOf(); // FIXME: this shit should be single op call, not a loop! 
- auto nNegative = n->reduceNumber(nd4j::reduce::IsNegative, nullptr); - auto xPositive = x->reduceNumber(nd4j::reduce::IsPositive, nullptr); + auto nNegative = n->reduceNumber(sd::reduce::IsNegative, nullptr); + auto xPositive = x->reduceNumber(sd::reduce::IsPositive, nullptr); bool nPositiveFlag = !nNegative.e(0); // require all n >= 0 bool xPositiveFlag = xPositive.e(0); // require all x > 0 REQUIRE_TRUE(nPositiveFlag, 0, "POLYGAMMA op: all elements of n array must be >= 0 !"); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/qr.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/qr.cpp index c4b8f8404..2cf9156ce 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/qr.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/qr.cpp @@ -18,12 +18,12 @@ // Created by GS at 12/20/2019 // -#include +#include #include #include #if NOT_EXCLUDED(OP_qr) -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(qr, 1, 2, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/range.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/range.cpp index 7faf82b08..a39e07912 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/range.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/range.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_range) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(range, -2, 1, false, -2, -2) { @@ -130,7 +130,7 @@ DECLARE_SHAPE_FN(range) { const int numIArgs = block.getIArguments()->size(); Nd4jLong steps = 0; - nd4j::DataType dataType = block.numD() ? D_ARG(0) : nd4j::DataType::INHERIT; + sd::DataType dataType = block.numD() ? 
D_ARG(0) : sd::DataType::INHERIT; if (numInArrs > 0) { auto isR = INPUT_VARIABLE(0)->isR(); @@ -213,16 +213,16 @@ DECLARE_SHAPE_FN(range) { if (limit == start){ //Return [0] to match TF - return SHAPELIST(ConstantShapeHelper::getInstance()->vectorShapeInfo(0, nd4j::DataType::INT32)); + return SHAPELIST(ConstantShapeHelper::getInstance()->vectorShapeInfo(0, sd::DataType::INT32)); } REQUIRE_TRUE(delta != 0, 0, "CUSTOM RANGE OP: delta should not be equal to zero !"); if (!block.numD()) { if (limit > DataTypeUtils::max()) - dataType = nd4j::DataType::INT64; + dataType = sd::DataType::INT64; else - dataType = nd4j::DataType::INT32; + dataType = sd::DataType::INT32; } steps = (limit - start) / delta; @@ -257,7 +257,7 @@ DECLARE_SHAPE_FN(range) { if (!block.numD()) { if (Environment::getInstance()->precisionBoostAllowed()) - dataType = nd4j::DataType::DOUBLE; + dataType = sd::DataType::DOUBLE; else dataType = Environment::getInstance()->defaultFloatDataType(); } @@ -276,7 +276,7 @@ DECLARE_SHAPE_FN(range) { DECLARE_TYPES(range) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS, ALL_INTS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/rank.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/rank.cpp index 6035b267d..8a617dc59 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/rank.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/rank.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 01.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_rank) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(rank, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -37,13 +37,13 @@ namespace nd4j { return Status::OK(); } DECLARE_SHAPE_FN(rank) { - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT32)); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::INT32)); } DECLARE_TYPES(rank) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS, ALL_FLOATS}) ->allowOverride(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduceMean.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduceMean.cpp index f83994606..90560bbb6 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduceMean.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduceMean.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -79,7 +79,7 @@ DECLARE_SHAPE_FN(reduce_mean) { DECLARE_TYPES(reduce_mean) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -150,7 +150,7 @@ DECLARE_SHAPE_FN(reduce_mean_bp) { DECLARE_TYPES(reduce_mean_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduceStDev.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduceStDev.cpp index 6a3e7c050..1682b9d72 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduceStDev.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduceStDev.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace 
sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -90,7 +90,7 @@ DECLARE_SHAPE_FN(reduce_stdev) { DECLARE_TYPES(reduce_stdev) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -169,7 +169,7 @@ DECLARE_SHAPE_FN(reduce_stdev_bp) { DECLARE_TYPES(reduce_stdev_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduceVariance.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduceVariance.cpp index 16bfdc8a9..cd7441304 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduceVariance.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduceVariance.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -88,7 +88,7 @@ DECLARE_SHAPE_FN(reduce_variance) { DECLARE_TYPES(reduce_variance) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -166,7 +166,7 @@ DECLARE_SHAPE_FN(reduce_variance_bp) { DECLARE_TYPES(reduce_variance_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_dot.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_dot.cpp index 9569f524e..75cb40ca2 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_dot.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_dot.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_dot_bp) @@ -114,7 
+114,7 @@ DECLARE_SHAPE_FN(reduce_dot_bp) { DECLARE_TYPES(reduce_dot_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_logsumexp.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_logsumexp.cpp index a02b4db9b..805db1883 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_logsumexp.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_logsumexp.cpp @@ -20,7 +20,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_logsumexp) diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_max.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_max.cpp index 870017e8d..3d2dbe57e 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_max.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_max.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_max) @@ -84,7 +84,7 @@ DECLARE_SHAPE_FN(reduce_max) { DECLARE_TYPES(reduce_max) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } @@ -116,12 +116,12 @@ CUSTOM_OP_IMPL(reduce_max_bp, 2, 1, false, 0, 0) { if(gradO->lengthOf() == 1) { - auto indOfMaxElem = input->indexReduceNumber(nd4j::indexreduce::IndexMax); + auto indOfMaxElem = input->indexReduceNumber(sd::indexreduce::IndexMax); gradI->p(indOfMaxElem.t(0), gradO->e(0)); } else { - auto indicesArr = input->applyIndexReduce(nd4j::indexreduce::IndexMax, dimensions); + auto indicesArr = input->applyIndexReduce(sd::indexreduce::IndexMax, dimensions); helpers::scatterSimple(block.launchContext(), 6, *gradI, *gradO, indicesArr, ShapeUtils::evalDimsToExclude(gradI->rankOf(), dimensions)); // 6 corresponds to copy operation } @@ -151,7 
+151,7 @@ DECLARE_SHAPE_FN(reduce_max_bp) { DECLARE_TYPES(reduce_max_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_min.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_min.cpp index e8b073de8..254cfe021 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_min.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_min.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_min) @@ -84,7 +84,7 @@ DECLARE_SHAPE_FN(reduce_min) { DECLARE_TYPES(reduce_min) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } @@ -119,12 +119,12 @@ CUSTOM_OP_IMPL(reduce_min_bp, 2, 1, false, 0, 0) { if(gradO->lengthOf() == 1) { - auto indOfMaxElem = input->indexReduceNumber(nd4j::indexreduce::IndexMin); + auto indOfMaxElem = input->indexReduceNumber(sd::indexreduce::IndexMin); gradI->p(indOfMaxElem.e(0), gradO->e(0)); } else { - auto indicesArr = input->applyIndexReduce(nd4j::indexreduce::IndexMin, dimensions); + auto indicesArr = input->applyIndexReduce(sd::indexreduce::IndexMin, dimensions); helpers::scatterSimple(block.launchContext(), 6, *gradI, *gradO, indicesArr, ShapeUtils::evalDimsToExclude(gradI->rankOf(), dimensions)); // 6 corresponds to copy operation } @@ -153,7 +153,7 @@ DECLARE_SHAPE_FN(reduce_min_bp) { DECLARE_TYPES(reduce_min_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm1.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm1.cpp index 172f3df8e..31261fe5c 100644 --- 
a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm1.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm1.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_norm1) @@ -82,7 +82,7 @@ DECLARE_SHAPE_FN(reduce_norm1) { DECLARE_TYPES(reduce_norm1) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } #endif @@ -100,7 +100,7 @@ CUSTOM_OP_IMPL(reduce_norm1_bp, 2, 1, false, 0, 0) { auto gradO = INPUT_VARIABLE(1); auto gradI = OUTPUT_VARIABLE(0); - input->applyTransform(nd4j::transform::Sign, *gradI); + input->applyTransform(sd::transform::Sign, *gradI); if (gradO->lengthOf() == 1) { *gradI *= *gradO; @@ -158,7 +158,7 @@ DECLARE_SHAPE_FN(reduce_norm1_bp) { DECLARE_TYPES(reduce_norm1_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm2.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm2.cpp index e54518359..c9ea8e374 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm2.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm2.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_norm2) @@ -82,7 +82,7 @@ DECLARE_SHAPE_FN(reduce_norm2) { DECLARE_TYPES(reduce_norm2) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } #endif @@ -156,7 +156,7 @@ DECLARE_SHAPE_FN(reduce_norm2_bp) { DECLARE_TYPES(reduce_norm2_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git 
a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm_max.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm_max.cpp index c71310947..b1a018900 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm_max.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_norm_max.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_norm_max) @@ -84,7 +84,7 @@ DECLARE_SHAPE_FN(reduce_norm_max) { DECLARE_TYPES(reduce_norm_max) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } #endif @@ -116,16 +116,16 @@ CUSTOM_OP_IMPL(reduce_norm_max_bp, 2, 1, false, 0, 0) { if(gradO->lengthOf() == 1) { - auto indOfAbsMaxElem = input->indexReduceNumber(nd4j::indexreduce::IndexAbsoluteMax); + auto indOfAbsMaxElem = input->indexReduceNumber(sd::indexreduce::IndexAbsoluteMax); const Nd4jLong ind = indOfAbsMaxElem.t(0); const int sign = input->e(ind) >= 0 ? 
1 : -1; gradI->p(ind, sign * gradO->e(0)); } else { - auto indicesArr = input->applyIndexReduce(nd4j::indexreduce::IndexAbsoluteMax, dimensions); + auto indicesArr = input->applyIndexReduce(sd::indexreduce::IndexAbsoluteMax, dimensions); helpers::scatterSimple(block.launchContext(), 6, *gradI, *gradO, indicesArr, ShapeUtils::evalDimsToExclude(gradI->rankOf(), dimensions)); // 6 corresponds to copy operation - *gradI *= input->transform(nd4j::transform::Sign); + *gradI *= input->transform(sd::transform::Sign); } return Status::OK(); @@ -152,7 +152,7 @@ DECLARE_SHAPE_FN(reduce_norm_max_bp) { DECLARE_TYPES(reduce_norm_max_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_prod.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_prod.cpp index 965b6dcaa..e873220ef 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_prod.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_prod.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_prod) @@ -82,7 +82,7 @@ DECLARE_SHAPE_FN(reduce_prod) { DECLARE_TYPES(reduce_prod) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -97,7 +97,7 @@ CUSTOM_OP_IMPL(reduce_prod_bp, 2, 1, false, 0, 0) { auto gradI = OUTPUT_VARIABLE(0); if (gradO->lengthOf() == 1) { - gradI->assign(input->reduceNumber(nd4j::reduce::Prod)); + gradI->assign(input->reduceNumber(sd::reduce::Prod)); *gradI /= *input; *gradI *= gradO->e(0); } @@ -124,7 +124,7 @@ CUSTOM_OP_IMPL(reduce_prod_bp, 2, 1, false, 0, 0) { // *** calculations *** // auto products = input->reduceAlongDimension(reduce::Prod, dimensions, true); - gradI->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), products, 
*gradI); + gradI->applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), products, *gradI); *gradI /= *input; if(!keepDims) { @@ -158,7 +158,7 @@ DECLARE_SHAPE_FN(reduce_prod_bp) { DECLARE_TYPES(reduce_prod_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sqnorm.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sqnorm.cpp index e42050ff6..0c53a261b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sqnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sqnorm.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_sqnorm) @@ -82,7 +82,7 @@ DECLARE_SHAPE_FN(reduce_sqnorm) { DECLARE_TYPES(reduce_sqnorm) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -155,7 +155,7 @@ DECLARE_SHAPE_FN(reduce_sqnorm_bp) { DECLARE_TYPES(reduce_sqnorm_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sum.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sum.cpp index 522164593..0f4a5f467 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sum.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/reduce_sum.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_reduce_sum) @@ -82,7 +82,7 @@ DECLARE_SHAPE_FN(reduce_sum) { DECLARE_TYPES(reduce_sum) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } #endif @@ -123,9 +123,9 @@ CUSTOM_OP_IMPL(reduce_sum_bp, 2, 1, 
false, 0, 0) { if(!keepDims) { auto gradOShapeKeepDims = ShapeUtils::evalReduceShapeInfo(gradO->ordering(), dimensions, *input, true, false, block.getWorkspace()); auto r = gradO->reshape(gradO->ordering(), ShapeUtils::pullShapeFromShapeInfo(gradOShapeKeepDims)); // for example could be something like [a,b] -> [1,a,1,b] - gradI->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), r, *gradI); + gradI->applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), r, *gradI); } else - gradI->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), *gradO, *gradI); + gradI->applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), *gradO, *gradI); } return Status::OK(); @@ -152,7 +152,7 @@ DECLARE_SHAPE_FN(reduce_sum_bp) { DECLARE_TYPES(reduce_sum_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp index dc304e4a9..4ae03cc25 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_area.cpp @@ -18,13 +18,13 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_resize_area) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(resize_area, 1, 1, false, 0, -2) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp index 63da432c7..a867a2147 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_bicubic.cpp @@ -18,13 +18,13 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_resize_bicubic) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(resize_bicubic, 2, 1, false, 0, 
0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp index fa7054c29..6d72bf889 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_linear.cpp @@ -19,13 +19,13 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_resize_bilinear) //#include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(resize_bilinear, 1, 1, false, 0, -2) { @@ -120,7 +120,7 @@ namespace nd4j { } DECLARE_TYPES(resize_bilinear) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp index 9d6ac8a81..3454fb897 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/resize_neighbor.cpp @@ -19,14 +19,14 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_resize_nearest_neighbor) //#include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(resize_nearest_neighbor, 1, 1, false, 0, -2) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/rint.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/rint.cpp index fbff41c47..1e1058397 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/rint.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/rint.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_rint) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(rint, 1, 1, true) { auto x = INPUT_VARIABLE(0); @@ -37,7 +37,7 @@ namespace nd4j { DECLARE_TYPES(rint) { getOpDescriptor() - 
->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/roll.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/roll.cpp index f93fc198e..75f102fa0 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/roll.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/roll.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_roll) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(roll, 1, 1, true, 0, 0) { @@ -98,10 +98,10 @@ namespace ops { DECLARE_TYPES(roll) { getOpDescriptor() - ->setAllowedInputTypes(0,nd4j::DataType::ANY) - ->setAllowedInputTypes(1,nd4j::DataType::INT32) // TODO: all ints in future - ->setAllowedInputTypes(2,nd4j::DataType::INT32) - ->setAllowedOutputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(0,sd::DataType::ANY) + ->setAllowedInputTypes(1,sd::DataType::INT32) // TODO: all ints in future + ->setAllowedInputTypes(2,sd::DataType::INT32) + ->setAllowedOutputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp index db4eeeff6..e624afeb1 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_scatter_add) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_add, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp index 40ddbd424..fd0b2a730 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp +++ 
b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_scatter_div) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_div, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp index 4ec55f088..b3342c5a5 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 1.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_scatter_max) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_max, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp index ea8dbf081..d37adb692 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 1.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_scatter_min) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_min, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp index 4685cef5d..9bf5be748 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_scatter_mul) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_mul, 3, 1, true) { auto input = 
INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp index 4c3884e07..7c2194c6c 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 21.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_scatter_nd) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(scatter_nd, 3, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp index 43c2c66ed..8fb4288ee 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 22.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_scatter_nd_add) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_nd_add, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp index eb4768f86..6cfa5d046 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 24.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_scatter_nd_sub) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_nd_sub, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp index e6bfb8703..b6122c724 100644 --- 
a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 24.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_scatter_nd_update) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_nd_update, 3, 1, true) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp index 1971b346f..c955ac042 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_scatter_sub) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_sub, 3, 1, true) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp index 081d1cd76..ef54b9813 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 24.11.17. 
// -#include +#include #if NOT_EXCLUDED(OP_scatter_upd) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(scatter_upd, 3, 1, true) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/segment_max.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/segment_max.cpp index f78f2034f..fe469e5ec 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/segment_max.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/segment_max.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(segment_max, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -94,7 +94,7 @@ namespace nd4j { } DECLARE_TYPES(segment_max_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(0, {ALL_FLOATS}) ->setAllowedOutputTypes(1, {ALL_INTS}) ->setSameMode(true); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/segment_mean.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/segment_mean.cpp index 7f9c0664a..ef35f4839 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/segment_mean.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/segment_mean.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(segment_mean, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -93,7 +93,7 @@ namespace nd4j { } DECLARE_TYPES(segment_mean_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(0, {ALL_FLOATS}) ->setAllowedOutputTypes(1, {ALL_INTS}) ->setSameMode(false); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/segment_min.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/segment_min.cpp index a8c9f1b01..9c4a11255 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/segment_min.cpp +++ 
b/libnd4j/include/ops/declarable/generic/parity_ops/segment_min.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(segment_min, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -91,7 +91,7 @@ namespace nd4j { } DECLARE_TYPES(segment_min_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(0, {ALL_FLOATS}) ->setAllowedOutputTypes(1, {ALL_INTS}) ->setSameMode(true); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/segment_prod.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/segment_prod.cpp index a2c24de56..576fae508 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/segment_prod.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/segment_prod.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(segment_prod, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -97,7 +97,7 @@ namespace nd4j { DECLARE_TYPES(segment_prod_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(0, {ALL_FLOATS}) ->setAllowedOutputTypes(1, {ALL_INTS}) ->setSameMode(false); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/segment_sum.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/segment_sum.cpp index 46efaf2a8..203797e34 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/segment_sum.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/segment_sum.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(segment_sum, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -81,12 +81,12 @@ namespace nd4j { DECLARE_TYPES(segment_sum) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } 
DECLARE_TYPES(segment_sum_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(0, {ALL_FLOATS}) ->setAllowedOutputTypes(1, {ALL_INTS}) ->setSameMode(false); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp index 477b298a3..310180685 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(sequence_mask, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -99,7 +99,7 @@ namespace nd4j { DECLARE_TYPES(sequence_mask) { getOpDescriptor() ->setAllowedInputTypes({ALL_INTS}) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedOutputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/size.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/size.cpp index ed8927f98..d31e782c6 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/size.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/size.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 01.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_size) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(size, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -37,12 +37,12 @@ namespace nd4j { return Status::OK(); } DECLARE_SHAPE_FN(size) { - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::INT64)); } DECLARE_TYPES(size) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS, ALL_FLOATS}) ->allowOverride(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/slice.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/slice.cpp index d2a390eb9..4516fcffc 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/slice.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/slice.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 02.11.2017. 
// -#include +#include //#if NOT_EXCLUDED(OP_slice) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(slice, 1, 1, false, 0, -2) { auto input = INPUT_VARIABLE(0); @@ -90,7 +90,7 @@ namespace nd4j { DECLARE_TYPES(slice) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } @@ -145,7 +145,7 @@ namespace nd4j { DECLARE_TYPES(slice_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/solve.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/solve.cpp index 1d1d80919..154001684 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/solve.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/solve.cpp @@ -18,12 +18,12 @@ // Created by GS at 01/22/2020 // -#include +#include #if NOT_EXCLUDED(OP_solve) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(solve, 2, 1, false, 0, 0) { auto a = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp index 2f297c893..12b981ac2 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp @@ -17,13 +17,13 @@ limitations under the License. 
// @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_space_to_batch) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -60,7 +60,7 @@ CUSTOM_OP_IMPL(space_to_batch, 2, 1, false, 0, 1) { //////////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(space_to_batch) { - getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp index d3939e303..a782f5b02 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp @@ -16,13 +16,13 @@ limitations under the License. // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_space_to_batch_nd) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -64,7 +64,7 @@ CUSTOM_OP_IMPL(space_to_batch_nd, 3, 1, false, 0, 0) { //////////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(space_to_batch_nd) { - getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) ->setAllowedInputTypes(2, {ALL_INTS}) ->setSameMode(true); @@ -88,7 +88,7 @@ DECLARE_SHAPE_FN(space_to_batch_nd) { std::vector outShape(inputShapeInfo + 1, inputShapeInfo + 1 + inputShapeInfo[0]); - outShape[0] *= INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e(0); + outShape[0] *= INPUT_VARIABLE(1)->reduceNumber(sd::reduce::Prod).e(0); for (uint i = 0; i < numOfSpatialDims; ++i) outShape[i + 1] = (outShape[i + 1] + INPUT_VARIABLE(2)->e(i,0) + INPUT_VARIABLE(2)->e(i,1)) / INPUT_VARIABLE(1)->e(i); diff --git 
a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_depth.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_depth.cpp index f6ad8c163..3daf62ccd 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_depth.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_depth.cpp @@ -18,19 +18,19 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_space_to_depth) #include #include #include -namespace nd4j { +namespace sd { namespace ops { DECLARE_TYPES(space_to_depth) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/split.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/split.cpp index 0e36f0913..c3a3779c6 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/split.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/split.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_split) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(split, 1, -1, false, 0, 1) { NDArray *input = nullptr; @@ -85,7 +85,7 @@ namespace ops { DECLARE_SHAPE_FN(split) { int num_splits = INT_ARG(0); Nd4jLong *input = nullptr; - nd4j::DataType dataType; + sd::DataType dataType; // axis is 0 by default int axis = 0; diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/split_v.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/split_v.cpp index 0e36c15ea..0bda3a6be 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/split_v.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/split_v.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_split_v) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(split_v, 2, -1, false, 0, -2) { auto input = INPUT_VARIABLE(0); diff --git 
a/libnd4j/include/ops/declarable/generic/parity_ops/square.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/square.cpp index 34da37897..adc3a2cb5 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/square.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/square.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 01/11/17. // -#include +#include #if NOT_EXCLUDED(OP_square) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(square, 1, 1, true) { auto input = INPUT_VARIABLE(0); @@ -37,7 +37,7 @@ namespace nd4j { DECLARE_TYPES(square) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/stack.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/stack.cpp index e52f83129..fef64e105 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/stack.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/stack.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 01.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_stack) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(stack, -1, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/stop_gradient.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/stop_gradient.cpp index 9bb70d38f..621df4462 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/stop_gradient.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/stop_gradient.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_stop_gradient) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(stop_gradient, 1, 1, true) { auto out = OUTPUT_VARIABLE(0); @@ -40,7 +40,7 @@ namespace nd4j { DECLARE_TYPES(stop_gradient) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/strided_slice.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/strided_slice.cpp index 4b622e821..f7d093da6 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/strided_slice.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/strided_slice.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include +#include #if NOT_EXCLUDED(OP_strided_slice) #include @@ -21,7 +21,7 @@ limitations under the License. 
#include #include -namespace nd4j { +namespace sd { namespace ops { constexpr int kShrinkAxis = -1, kNewAxis = -2; @@ -70,7 +70,7 @@ namespace nd4j { for (int e = 0; e < sparse_spec.dims; e++) { if ((1 << e) & sparse_spec.ellipsis_mask) { - int next_index = nd4j::math::nd4j_min(this->dims - (sparse_spec.dims - e) + 1 + sparse_spec.num_add_axis_after_ellipsis, this->dims); + int next_index = sd::math::nd4j_min(this->dims - (sparse_spec.dims - e) + 1 + sparse_spec.num_add_axis_after_ellipsis, this->dims); for (; full_index < next_index; full_index++) { // new_axis' aren't real axis so you have to skip @@ -646,13 +646,13 @@ namespace nd4j { DECLARE_TYPES(strided_slice) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } DECLARE_TYPES(strided_slice_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/sufficient_statistics.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/sufficient_statistics.cpp index ed7698d15..9a9fb730b 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/sufficient_statistics.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/sufficient_statistics.cpp @@ -18,12 +18,12 @@ // Created by george@skymind.io on 2/21/2018. 
// Modified by sgazeos@gmail.com on 4/4/2018 -#include +#include #if NOT_EXCLUDED(OP_sufficient_statistics) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(sufficient_statistics, 2, 3, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/tear.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/tear.cpp index c76435622..61850ab0e 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/tear.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/tear.cpp @@ -18,14 +18,14 @@ // Created by raver119 on 12.10.2017. // -#include +#include #if NOT_EXCLUDED(OP_tear) #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(tear, 1, -1, false, 0, -1) { auto input = INPUT_VARIABLE(0); @@ -57,7 +57,7 @@ namespace nd4j { if (dims.size() > 1) std::sort(dims.begin(), dims.end()); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(inShape, dims); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(inShape, dims); auto numTads = tadPack.numberOfTads(); auto result = SHAPELIST(); @@ -71,7 +71,7 @@ namespace nd4j { DECLARE_TYPES(tear) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/top_k.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/top_k.cpp index bd16cdd79..799572794 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/top_k.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/top_k.cpp @@ -18,14 +18,14 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_top_k) //#include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(top_k, 1, 2, false, 0, -1) { auto x = INPUT_VARIABLE(0); @@ -76,7 +76,7 @@ namespace nd4j { aShape[shapeRank] = k; shape::updateStrides(aShape, 
shape::order(in)); - shapeList->push_back(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(aShape, (e == 0?ArrayOptions::dataType(in):nd4j::DataType::INT64)))); + shapeList->push_back(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(aShape, (e == 0?ArrayOptions::dataType(in):sd::DataType::INT64)))); RELEASE(aShape, block.getWorkspace()); } @@ -85,8 +85,8 @@ namespace nd4j { DECLARE_TYPES(top_k) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(0, sd::DataType::ANY) ->setAllowedOutputTypes(1, {ALL_INDICES}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/triangular_solve.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/triangular_solve.cpp index 181f47d3d..c9d23753c 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/triangular_solve.cpp @@ -18,12 +18,12 @@ // Created by GS at 01/14/2020 // -#include +#include #if NOT_EXCLUDED(OP_triangual_solve) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(triangular_solve, 2, 1, false, 0, 0) { auto a = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unique.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unique.cpp index 715720e07..64b915c53 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unique.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unique.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_unique) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unique, 1, 2, false, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -53,7 +53,7 @@ namespace nd4j { valuesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(uniqueCount, ArrayOptions::dataType(in)); } // second 
output is always LONG - indicesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(shape::length(in), nd4j::DataType::INT64); + indicesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(shape::length(in), sd::DataType::INT64); //COPY_SHAPE_EX(in, indicesShape, block.getWorkspace()); @@ -80,17 +80,17 @@ namespace nd4j { auto valuesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(uniqueCount, source->dataType()); // second output is always LONG - auto indicesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(source->lengthOf(), nd4j::DataType::INT64); + auto indicesShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(source->lengthOf(), sd::DataType::INT64); // third one as well - auto countsShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(uniqueCount, nd4j::DataType::INT64); + auto countsShape = ConstantShapeHelper::getInstance()->vectorShapeInfo(uniqueCount, sd::DataType::INT64); return SHAPELIST(valuesShape, indicesShape, countsShape); } DECLARE_TYPES(unique) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(0, {ALL_INTS, ALL_FLOATS}) ->setAllowedOutputTypes(1, {ALL_INTS}); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_max.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_max.cpp index d8bf36aed..14bee3853 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_max.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_max.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unsorted_segment_max, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_mean.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_mean.cpp index 7e34cb296..38032c0d7 100644 --- 
a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_mean.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_mean.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unsorted_segment_mean, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_min.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_min.cpp index e011350ed..5d6c58b16 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_min.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_min.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unsorted_segment_min, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp index 695207c4b..1bb1d5bf5 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_prod.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unsorted_segment_prod, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sqrt_n.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sqrt_n.cpp index 1286002b8..e29a86a42 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sqrt_n.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sqrt_n.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unsorted_segment_sqrt_n, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git 
a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sum.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sum.cpp index a761718d1..89c6cb76f 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sum.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unsorted_segment_sum.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unsorted_segment_sum, 2, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -85,7 +85,7 @@ namespace nd4j { getOpDescriptor() ->setAllowedOutputTypes(0, {ALL_FLOATS}) ->setAllowedOutputTypes(1, {ALL_INTS}) - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(false); } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/unstack.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/unstack.cpp index a44510104..e77fdf4d5 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/unstack.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/unstack.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_unstack) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(unstack, 1, -1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -105,7 +105,7 @@ namespace nd4j { return result; } - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(inShape, dims); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(inShape, dims); auto numTads = tadPack.numberOfTads(); std::vector shape(shape::rank(tadPack.primaryShapeInfo())); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/weighted_cross_entropy_with_logits.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/weighted_cross_entropy_with_logits.cpp index ff6537b1a..71d337c7c 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/weighted_cross_entropy_with_logits.cpp +++ 
b/libnd4j/include/ops/declarable/generic/parity_ops/weighted_cross_entropy_with_logits.cpp @@ -18,13 +18,13 @@ // @author @shugeo // -#include +#include #if NOT_EXCLUDED(OP_weighted_cross_entropy_with_logits) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(weighted_cross_entropy_with_logits, 3, 1, true) { @@ -43,7 +43,7 @@ namespace ops { DECLARE_TYPES(weighted_cross_entropy_with_logits) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/xw_plus_b.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/xw_plus_b.cpp index ce68df1a0..ad7a430f4 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/xw_plus_b.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/xw_plus_b.cpp @@ -19,14 +19,14 @@ // // -#include +#include #if NOT_EXCLUDED(OP_xw_plus_b) #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(xw_plus_b, 3, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -55,7 +55,7 @@ namespace nd4j { DECLARE_TYPES(xw_plus_b) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/zero_fraction.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/zero_fraction.cpp index 2b623d23e..f70e92cf5 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/zero_fraction.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/zero_fraction.cpp @@ -18,12 +18,12 @@ // Created by GS 31.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_zero_fraction) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(zero_fraction, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -48,12 +48,12 @@ namespace nd4j { return Status::OK(); } 
DECLARE_SHAPE_FN(zero_fraction) { - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::DOUBLE)); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::DOUBLE)); } DECLARE_TYPES(zero_fraction) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/zeros_as.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/zeros_as.cpp index 56b4264d0..6d475af53 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/zeros_as.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/zeros_as.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 12.10.2017. // -#include +#include #if NOT_EXCLUDED(OP_zeros_as) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(zeros_as, 1, 1, false, 0, 0) { auto out = OUTPUT_VARIABLE(0); @@ -39,15 +39,15 @@ namespace nd4j { DECLARE_SHAPE_FN(zeros_as) { auto in = inputShape->at(0); auto dtype = block.numD() ? 
D_ARG(0) : ArrayOptions::dataType(in); - auto shape = nd4j::ConstantShapeHelper::getInstance()->createShapeInfo(dtype, in); + auto shape = sd::ConstantShapeHelper::getInstance()->createShapeInfo(dtype, in); return SHAPELIST(shape); } DECLARE_TYPES(zeros_as) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY) ->setSameMode(false); } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/zeta.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/zeta.cpp index eb10e53af..6aba1fc5f 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/zeta.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/zeta.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 12.12.2017 // -#include +#include #if NOT_EXCLUDED(OP_zeta) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(zeta, 2, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/random/bernoulli.cpp b/libnd4j/include/ops/declarable/generic/random/bernoulli.cpp index 338760921..1441448c9 100644 --- a/libnd4j/include/ops/declarable/generic/random/bernoulli.cpp +++ b/libnd4j/include/ops/declarable/generic/random/bernoulli.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_random_bernoulli) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(random_bernoulli, 1, 1, true, 1, 0) { auto rng = block.getRng(); @@ -60,7 +60,7 @@ namespace nd4j { DECLARE_TYPES(random_bernoulli) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/random/exponential.cpp b/libnd4j/include/ops/declarable/generic/random/exponential.cpp index 
bc942fc9b..8605ffafe 100644 --- a/libnd4j/include/ops/declarable/generic/random/exponential.cpp +++ b/libnd4j/include/ops/declarable/generic/random/exponential.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_random_exponential) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(random_exponential, 1, 1, true, 1, 0) { // uniform distribution @@ -69,7 +69,7 @@ namespace nd4j { DECLARE_TYPES(random_exponential) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/random/gamma.cpp b/libnd4j/include/ops/declarable/generic/random/gamma.cpp index 672eba422..d508e1929 100644 --- a/libnd4j/include/ops/declarable/generic/random/gamma.cpp +++ b/libnd4j/include/ops/declarable/generic/random/gamma.cpp @@ -18,13 +18,13 @@ // @author George A. Shulinok // -#include +#include #if NOT_EXCLUDED(OP_random_gamma) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(random_gamma, 2, 1, false, 0, 0) { // gamma distribution diff --git a/libnd4j/include/ops/declarable/generic/random/get_seed.cpp b/libnd4j/include/ops/declarable/generic/random/get_seed.cpp index 2161a2378..7042ae6dd 100644 --- a/libnd4j/include/ops/declarable/generic/random/get_seed.cpp +++ b/libnd4j/include/ops/declarable/generic/random/get_seed.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_get_seed) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(get_seed, -2, 1, false, 0, 0) { // REQUIRE_TRUE(block.getRNG() != nullptr, 0, "RNG should be defined in Graph"); @@ -42,7 +42,7 @@ namespace nd4j { DECLARE_TYPES(get_seed) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(DataType::INT64); } } diff --git 
a/libnd4j/include/ops/declarable/generic/random/multinomial.cpp b/libnd4j/include/ops/declarable/generic/random/multinomial.cpp index bbdee17f4..c86417ef0 100644 --- a/libnd4j/include/ops/declarable/generic/random/multinomial.cpp +++ b/libnd4j/include/ops/declarable/generic/random/multinomial.cpp @@ -19,14 +19,14 @@ // @author Oleh Semeniv (oleg.semeniv@gmail.com) // -#include +#include #if NOT_EXCLUDED(OP_random_multinomial) #include #include #include -namespace nd4j { +namespace sd { namespace ops { /////////////////////// /** @@ -98,14 +98,14 @@ namespace nd4j { auto dimA = (0 == dimC) ? 1 : 0; nShape[dimA] = numOfSamples; - DataType nType = (argSize > 1) ? ( INT_ARG(1) >= 0 ? static_cast(INT_ARG(1)) : nd4j::DataType::INT64) : nd4j::DataType::INT64; + DataType nType = (argSize > 1) ? ( INT_ARG(1) >= 0 ? static_cast(INT_ARG(1)) : sd::DataType::INT64) : sd::DataType::INT64; return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(nType, input->ordering(), nShape)); } DECLARE_TYPES(random_multinomial) { getOpDescriptor() ->setAllowedInputTypes(0, { ALL_FLOATS, ALL_INTS }) - ->setAllowedInputTypes(1, { nd4j::DataType::INT32 }) + ->setAllowedInputTypes(1, { sd::DataType::INT32 }) ->setAllowedOutputTypes(0, { ALL_INDICES }); } } diff --git a/libnd4j/include/ops/declarable/generic/random/normal.cpp b/libnd4j/include/ops/declarable/generic/random/normal.cpp index 781d495f0..8bfbd8db6 100644 --- a/libnd4j/include/ops/declarable/generic/random/normal.cpp +++ b/libnd4j/include/ops/declarable/generic/random/normal.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_random_normal) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(random_normal, 1, 1, true, 2, 0) { // normal distribution @@ -56,7 +56,7 @@ namespace nd4j { DECLARE_TYPES(random_normal) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) 
->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/random/poisson.cpp b/libnd4j/include/ops/declarable/generic/random/poisson.cpp index 935bed095..74f3a8570 100644 --- a/libnd4j/include/ops/declarable/generic/random/poisson.cpp +++ b/libnd4j/include/ops/declarable/generic/random/poisson.cpp @@ -18,13 +18,13 @@ // @author George A. Shulinok // -#include +#include #if NOT_EXCLUDED(OP_random_poisson) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(random_poisson, 2, 1, false, 0, 0) { // gamma distribution diff --git a/libnd4j/include/ops/declarable/generic/random/random_crop.cpp b/libnd4j/include/ops/declarable/generic/random/random_crop.cpp index 701a57c83..2ac2495d3 100644 --- a/libnd4j/include/ops/declarable/generic/random/random_crop.cpp +++ b/libnd4j/include/ops/declarable/generic/random/random_crop.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -65,7 +65,7 @@ DECLARE_SHAPE_FN(random_crop) { DECLARE_TYPES(random_crop) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp b/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp index e9eb73dbf..012d2e55a 100644 --- a/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp +++ b/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 26.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_random_shuffle) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(random_shuffle, 1, 1, true) { @@ -33,8 +33,8 @@ OP_IMPL(random_shuffle, 1, 1, true) { const bool isInplace = block.isInplace(); auto output = isInplace ? 
nullptr : OUTPUT_VARIABLE(0); -// nd4j::random::RandomBuffer* rng = block.getRNG(); - nd4j::graph::RandomGenerator rng = block.randomGenerator(); +// sd::random::RandomBuffer* rng = block.getRNG(); + sd::graph::RandomGenerator rng = block.randomGenerator(); // REQUIRE_TRUE(rng != nullptr, 0, "RANDOM_SHUFFLE op: RNG should be defined in Graph !"); helpers::randomShuffle(block.launchContext(), *input, *output, rng, isInplace); @@ -45,7 +45,7 @@ OP_IMPL(random_shuffle, 1, 1, true) { DECLARE_TYPES(random_shuffle) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/random/set_seed.cpp b/libnd4j/include/ops/declarable/generic/random/set_seed.cpp index fa9dcf992..f4c240d50 100644 --- a/libnd4j/include/ops/declarable/generic/random/set_seed.cpp +++ b/libnd4j/include/ops/declarable/generic/random/set_seed.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_set_seed) #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(set_seed, -2, 1, false, 0, -2) { // REQUIRE_TRUE(block.getRNG() != nullptr, 0, "RNG should be defined in Graph"); diff --git a/libnd4j/include/ops/declarable/generic/random/uniform.cpp b/libnd4j/include/ops/declarable/generic/random/uniform.cpp index fd65f842a..6dec96739 100644 --- a/libnd4j/include/ops/declarable/generic/random/uniform.cpp +++ b/libnd4j/include/ops/declarable/generic/random/uniform.cpp @@ -19,14 +19,14 @@ // Created by raver119 on 29/10/17. 
// -#include +#include #if NOT_EXCLUDED(OP_randomuniform) #include #include #include -namespace nd4j { +namespace sd { namespace ops { /////////////////////// /** diff --git a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp index 7f536a9ea..9fe70ee92 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/dynamicBidirectionalRNN.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { @@ -83,7 +83,7 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { // forward steps - nd4j::ops::dynamic_rnn dynamicRnn; + sd::ops::dynamic_rnn dynamicRnn; auto resultsFW = dynamicRnn.evaluate({x, WxFW, WhFW, bFW, h0FW, maxTimeStep}, {timeMajor}); hFW->assign(resultsFW->at(0)); // [time x bS x numUnitsFW] or [bS x time x numUnitsFW] hFWFinal->assign(resultsFW->at(1)); @@ -91,12 +91,12 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { auto seqLen = maxTimeStep; if(seqLen == nullptr) { // FIXME: which datatype should be used here? - seqLen = new NDArray(x->ordering(), {bS}, nd4j::DataType::INT64, block.launchContext()); + seqLen = new NDArray(x->ordering(), {bS}, sd::DataType::INT64, block.launchContext()); seqLen->assign(time); // set each element of seqLen to be equal to time } // reverse x - nd4j::ops::reverse_sequence reverse; + sd::ops::reverse_sequence reverse; auto resultsIn = timeMajor ? 
reverse.evaluate({x, seqLen}, {0, 1}) : reverse.evaluate({x, seqLen}, {1, 0}); REQUIRE_TRUE (resultsIn->status() == ND4J_STATUS_OK, 0, "dynamic_bidirectional_rnn: there is a problem with reverse on the sequence."); auto revInput = resultsIn->at(0); @@ -123,7 +123,7 @@ CUSTOM_OP_IMPL(dynamic_bidirectional_rnn, 7, 4, false, 0, 0) { DECLARE_TYPES(dynamic_bidirectional_rnn) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/dynamicRNN.cpp b/libnd4j/include/ops/declarable/generic/recurrent/dynamicRNN.cpp index 9fe30b345..68c71b2ac 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/dynamicRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/dynamicRNN.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -85,7 +85,7 @@ CUSTOM_OP_IMPL(dynamic_rnn, 4, 2, false, 0, 0) { DECLARE_TYPES(dynamic_rnn) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedInputTypes(3, {ALL_FLOATS}) diff --git a/libnd4j/include/ops/declarable/generic/recurrent/gru.cpp b/libnd4j/include/ops/declarable/generic/recurrent/gru.cpp index 7a9ca3d16..c24b36632 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/gru.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/gru.cpp @@ -18,13 +18,13 @@ // created by Yurii Shyrma on 15.02.2018 // -#include +#include #if NOT_EXCLUDED(OP_gru) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -67,7 +67,7 @@ CUSTOM_OP_IMPL(gru, 5, 1, false, 0, 0) { DECLARE_TYPES(gru) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git 
a/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp b/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp index ddd18dc84..67050742b 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp @@ -19,13 +19,13 @@ // @author Alex Black // -#include +#include #if NOT_EXCLUDED(OP_gruCell) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -70,7 +70,7 @@ CUSTOM_OP_IMPL(gruCell, 6, 4, false, 0, 0) { DECLARE_TYPES(gruCell) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedInputTypes(3, {ALL_FLOATS}) @@ -177,7 +177,7 @@ CUSTOM_OP_IMPL(gruCell_bp, 10, 6, false, 0, 0) { DECLARE_TYPES(gruCell_bp) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_FLOATS}) ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedInputTypes(3, {ALL_FLOATS}) diff --git a/libnd4j/include/ops/declarable/generic/recurrent/lstm.cpp b/libnd4j/include/ops/declarable/generic/recurrent/lstm.cpp index 463d8b507..0594dec4a 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/lstm.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/lstm.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma, created on 15.02.2018 // -#include +#include #if NOT_EXCLUDED(OP_lstm) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -90,7 +90,7 @@ CUSTOM_OP_IMPL(lstm, 8, 2, false, 3, 2) { DECLARE_TYPES(lstm) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp index 9517caa47..3225f3f74 100644 --- 
a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlock.cpp @@ -18,13 +18,13 @@ // @author Alex Black // -#include +#include #if NOT_EXCLUDED(OP_lstmBlock) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -68,7 +68,7 @@ CUSTOM_OP_IMPL(lstmBlock, 9, 7, false, 2, 2) { DECLARE_TYPES(lstmBlock) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlockCell.cpp b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlockCell.cpp index 446b523c1..333854ba3 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/lstmBlockCell.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/lstmBlockCell.cpp @@ -18,13 +18,13 @@ // @author Alex Black // -#include +#include #if NOT_EXCLUDED(OP_lstmBlockCell) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -79,7 +79,7 @@ CUSTOM_OP_IMPL(lstmBlockCell, 8, 7, false, 2, 1) { DECLARE_TYPES(lstmBlockCell) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/lstmCell.cpp b/libnd4j/include/ops/declarable/generic/recurrent/lstmCell.cpp index 9d7e70728..be6aafac3 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/lstmCell.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/lstmCell.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma, created on 30.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_lstmCell) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -90,7 +90,7 @@ CUSTOM_OP_IMPL(lstmCell, 8, 2, false, 3, 2) { DECLARE_TYPES(lstmCell) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) 
->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/lstmLayer.cpp b/libnd4j/include/ops/declarable/generic/recurrent/lstmLayer.cpp index 3b794b945..3a02b8a70 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/lstmLayer.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/lstmLayer.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_lstmLayer) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -304,7 +304,7 @@ CUSTOM_OP_IMPL(lstmLayer, 3, 1, false, 1, 5) { DECLARE_TYPES(lstmLayer) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -332,7 +332,7 @@ DECLARE_SHAPE_FN(lstmLayer) { if(x->isR()) type = x->dataType(); else - type = nd4j::DataType::FLOAT32; + type = sd::DataType::FLOAT32; std::vector shapes; diff --git a/libnd4j/include/ops/declarable/generic/recurrent/sru.cpp b/libnd4j/include/ops/declarable/generic/recurrent/sru.cpp index 6ca57d297..b60180622 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/sru.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/sru.cpp @@ -20,15 +20,15 @@ //@author Yurii Shyrma // -#include +#include #if NOT_EXCLUDED(OP_sru) #include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -87,7 +87,7 @@ CUSTOM_OP_IMPL(sru, 5, 2, false, 0, 0) { DECLARE_TYPES(sru) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -304,7 +304,7 @@ CUSTOM_OP_IMPL(sru_bp, 8, 4, true, 0, 0) { DECLARE_TYPES(sru_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -372,7 +372,7 @@ 
CUSTOM_OP_IMPL(sru_bi, 5, 2, true, 0, 0) { DECLARE_TYPES(sru_bi) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -422,7 +422,7 @@ DECLARE_SHAPE_FN(sru_bi) { DECLARE_TYPES(sru_bi_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } @@ -679,7 +679,7 @@ DECLARE_SHAPE_FN(sru_bi_bp) { // DECLARE_TYPES(sru_logic) { // getOpDescriptor() -// ->setAllowedInputTypes(nd4j::DataType::ANY) +// ->setAllowedInputTypes(sd::DataType::ANY) // ->setAllowedOutputTypes({ALL_FLOATS}); // } @@ -781,7 +781,7 @@ DECLARE_SHAPE_FN(sru_bi_bp) { // DECLARE_TYPES(sru_old) { // getOpDescriptor() -// ->setAllowedInputTypes(nd4j::DataType::ANY) +// ->setAllowedInputTypes(sd::DataType::ANY) // ->setAllowedOutputTypes({ALL_FLOATS}); // } @@ -952,7 +952,7 @@ DECLARE_SHAPE_FN(sru_bi_bp) { // DECLARE_TYPES(sru_bp_logic) { // getOpDescriptor() -// ->setAllowedInputTypes(nd4j::DataType::ANY) +// ->setAllowedInputTypes(sd::DataType::ANY) // ->setAllowedOutputTypes({ALL_FLOATS}); // } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/sruCell.cpp b/libnd4j/include/ops/declarable/generic/recurrent/sruCell.cpp index 23b2ec172..3528c0967 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/sruCell.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/sruCell.cpp @@ -18,14 +18,14 @@ // @author Yurii Shyrma, created on 05.12.2017 // -#include +#include #if NOT_EXCLUDED(OP_sruCell) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -64,7 +64,7 @@ CUSTOM_OP_IMPL(sruCell, 4, 2, false, 0, 0) { DECLARE_TYPES(sruCell) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/staticBidirectionalRNN.cpp 
b/libnd4j/include/ops/declarable/generic/recurrent/staticBidirectionalRNN.cpp index 453480ed6..6c9147bc0 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/staticBidirectionalRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/staticBidirectionalRNN.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -88,7 +88,7 @@ CUSTOM_OP_IMPL(static_bidirectional_rnn, 7, 3, false, 0, 0) { auto seqLen = maxTimeStep; if(seqLen == nullptr) { // seqLen = new NDArray(x->ordering(), {x->sizeAt(1)}, x->dataType(), block.launchContext()); // [bS] - seqLen = new NDArray(x->ordering(), {x->sizeAt(1)}, nd4j::DataType::INT64, block.launchContext()); // [bS] + seqLen = new NDArray(x->ordering(), {x->sizeAt(1)}, sd::DataType::INT64, block.launchContext()); // [bS] *seqLen = x->sizeAt(0); // set each element of seqLen to be equal to time } @@ -123,7 +123,7 @@ CUSTOM_OP_IMPL(static_bidirectional_rnn, 7, 3, false, 0, 0) { DECLARE_TYPES(static_bidirectional_rnn) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp b/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp index 7eea9e7f2..cb1781d92 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -73,7 +73,7 @@ CUSTOM_OP_IMPL(static_rnn, 4, 2, false, 0, 0) { DECLARE_TYPES(static_rnn) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/shape/broadcast_to.cpp b/libnd4j/include/ops/declarable/generic/shape/broadcast_to.cpp index ee8f0a864..49961bfe2 100644 --- 
a/libnd4j/include/ops/declarable/generic/shape/broadcast_to.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/broadcast_to.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 03.09.2018 // -#include +#include #if NOT_EXCLUDED(OP_broadcast_to) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(broadcast_to, 2, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/shape/create.cpp b/libnd4j/include/ops/declarable/generic/shape/create.cpp index c87f63a56..c79b55497 100644 --- a/libnd4j/include/ops/declarable/generic/shape/create.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/create.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_shapes_of) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(create, 1, 1, false, 0, 1) { @@ -44,13 +44,13 @@ namespace nd4j { auto shape = shapeInput->getBufferAsVector(); - return SHAPELIST(nd4j::ConstantShapeHelper::getInstance()->createShapeInfo(dtype, order, shape)); + return SHAPELIST(sd::ConstantShapeHelper::getInstance()->createShapeInfo(dtype, order, shape)); } DECLARE_TYPES(create) { getOpDescriptor() ->setAllowedInputTypes({ALL_INTS}) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedOutputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/shape/evaluate_reduction_shape.cpp b/libnd4j/include/ops/declarable/generic/shape/evaluate_reduction_shape.cpp index 85e51d136..6a0ad187c 100644 --- a/libnd4j/include/ops/declarable/generic/shape/evaluate_reduction_shape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/evaluate_reduction_shape.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_evaluate_reduction_shape) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(evaluate_reduction_shape, 2, 1, false, 0, 0) { auto inputShape = INPUT_VARIABLE(0); @@ -34,7 +34,7 @@ namespace 
nd4j { auto shape = inputShape->asVectorT(); - auto tempShapeInfo = ConstantShapeHelper::getInstance()->createShapeInfo(nd4j::DataType::INT64, 'c', shape); + auto tempShapeInfo = ConstantShapeHelper::getInstance()->createShapeInfo(sd::DataType::INT64, 'c', shape); auto tempReductionShapeInfo = ShapeUtils::evalReduceShapeInfo('c', axis, tempShapeInfo, keepDims, oldFormat, block.workspace()); REQUIRE_TRUE(output->lengthOf() == shape::rank(tempReductionShapeInfo), 0, "evaluate_reduction_shape: output length should be %i, but got %i instead", shape::rank(tempReductionShapeInfo), output->lengthOf()); @@ -49,7 +49,7 @@ namespace nd4j { getOpDescriptor() ->setAllowedInputTypes(0, {ALL_INTS}) ->setAllowedInputTypes(1, {ALL_INTS}) - ->setAllowedOutputTypes(0, nd4j::DataType::INT64); + ->setAllowedOutputTypes(0, sd::DataType::INT64); } DECLARE_SHAPE_FN(evaluate_reduction_shape) { @@ -64,16 +64,16 @@ namespace nd4j { if (keepDims) { if (oldFormat) { // for oldFormat we can't go below rank 2 - length = nd4j::math::nd4j_max(2, length); + length = sd::math::nd4j_max(2, length); } } else { length -= axis.size(); if (oldFormat) { - length = nd4j::math::nd4j_max(2, length); + length = sd::math::nd4j_max(2, length); } } - return SHAPELIST(ConstantShapeHelper::getInstance()->vectorShapeInfo(length, nd4j::DataType::INT64)); + return SHAPELIST(ConstantShapeHelper::getInstance()->vectorShapeInfo(length, sd::DataType::INT64)); } } } diff --git a/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp b/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp index 50aa0cb9c..86900c264 100644 --- a/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/expand_dims.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 02.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_expand_dims) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(expand_dims, 1, 1, false, 0, -2) { auto input = INPUT_VARIABLE(0); @@ -59,7 +59,7 @@ namespace nd4j { DECLARE_TYPES(expand_dims) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/shape/order.cpp b/libnd4j/include/ops/declarable/generic/shape/order.cpp index dfb1789bc..5b978f48f 100644 --- a/libnd4j/include/ops/declarable/generic/shape/order.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/order.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 12.02.18. // -#include +#include #if NOT_EXCLUDED(OP_order) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(order, 1, 1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -36,7 +36,7 @@ namespace nd4j { DECLARE_TYPES(order) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS}); } diff --git a/libnd4j/include/ops/declarable/generic/shape/permute.cpp b/libnd4j/include/ops/declarable/generic/shape/permute.cpp index 63c20e888..f612aec92 100644 --- a/libnd4j/include/ops/declarable/generic/shape/permute.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/permute.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_permute) #include #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -55,7 +55,7 @@ CUSTOM_OP_IMPL(permute, 1, 1, true, 0, -2) { ////////////////////////////////////////////////////////////////////////// DECLARE_TYPES(permute) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) 
->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp index dba15bf22..ace58a0b8 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 29/10/17. // -#include +#include #if NOT_EXCLUDED(OP_reshape) #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -145,7 +145,7 @@ CUSTOM_OP_IMPL(reshape, 1, 1, false, 0, -2) { DECLARE_TYPES(reshape) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp index 3035f104b..90e2ff398 100644 --- a/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/reshape_as.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 29/10/17. 
// -#include +#include #if NOT_EXCLUDED(OP_reshapeas) #include -namespace nd4j { +namespace sd { namespace ops { @@ -52,7 +52,7 @@ namespace nd4j { DECLARE_TYPES(reshapeas) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/shape/shape.cpp b/libnd4j/include/ops/declarable/generic/shape/shape.cpp index 676b347cb..e2db3db3e 100644 --- a/libnd4j/include/ops/declarable/generic/shape/shape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/shape.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_shape) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(shape_of, 1, 1, false, 0, 0) { auto x = INPUT_VARIABLE(0); @@ -51,7 +51,7 @@ namespace nd4j { DECLARE_TYPES(shape_of) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS}); } } diff --git a/libnd4j/include/ops/declarable/generic/shape/shapes.cpp b/libnd4j/include/ops/declarable/generic/shape/shapes.cpp index 8f772ef35..6481d1db3 100644 --- a/libnd4j/include/ops/declarable/generic/shape/shapes.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/shapes.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_shapes_of) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(shapes_of, -1, -1, false, 0, 0) { for (int e = 0; e < block.width(); e++) { @@ -43,7 +43,7 @@ namespace nd4j { for (int e = 0; e < inputShape->size(); e++) { auto inShape = inputShape->at(e); - shapeList->push_back(ConstantShapeHelper::getInstance()->vectorShapeInfo(shape::rank(inShape), nd4j::DataType::INT64)); + shapeList->push_back(ConstantShapeHelper::getInstance()->vectorShapeInfo(shape::rank(inShape), sd::DataType::INT64)); } return shapeList; @@ -51,7 +51,7 @@ namespace nd4j { 
DECLARE_TYPES(shapes_of) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INTS}); } } diff --git a/libnd4j/include/ops/declarable/generic/shape/size_at.cpp b/libnd4j/include/ops/declarable/generic/shape/size_at.cpp index 0f1e9c669..2c27b018a 100644 --- a/libnd4j/include/ops/declarable/generic/shape/size_at.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/size_at.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 12.02.18. // -#include +#include #if NOT_EXCLUDED(OP_size_at) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(size_at, 1, 1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -42,12 +42,12 @@ namespace nd4j { } DECLARE_SHAPE_FN(size_at) { - return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::INT64)); } DECLARE_TYPES(size_at) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes(DataType::INT64) ->allowOverride(true); } diff --git a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp index 22e229643..812947422 100644 --- a/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/squeeze.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_squeeze) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(squeeze, 1, 1, false, 0, -2) { auto input = INPUT_VARIABLE(0); @@ -86,7 +86,7 @@ namespace nd4j { DECLARE_TYPES(squeeze) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp 
b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp index d71fbddd5..687d79f25 100644 --- a/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/tile_to_shape.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_tile_to_shape) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(tile_to_shape, 1, 1, false, 0, -1) { @@ -55,13 +55,13 @@ namespace ops { DECLARE_TYPES(tile_to_shape) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } DECLARE_TYPES(tile_to_shape_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp index 4ec586370..0b12f415f 100644 --- a/libnd4j/include/ops/declarable/generic/shape/transpose.cpp +++ b/libnd4j/include/ops/declarable/generic/shape/transpose.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_transpose) #include #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -54,7 +54,7 @@ CUSTOM_OP_IMPL(transpose, 1, 1, false, 0, 0) { DECLARE_TYPES(transpose) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/strings/split_string.cpp b/libnd4j/include/ops/declarable/generic/strings/split_string.cpp index 4af4e3aac..e42591b5c 100644 --- a/libnd4j/include/ops/declarable/generic/strings/split_string.cpp +++ b/libnd4j/include/ops/declarable/generic/strings/split_string.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include 
+#include #if NOT_EXCLUDED(OP_split_string) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(split_string, 2, 1, true, 0, 0) { auto input = INPUT_VARIABLE(0); diff --git a/libnd4j/include/ops/declarable/generic/tests/noop.cpp b/libnd4j/include/ops/declarable/generic/tests/noop.cpp index 0c8e51804..37980c7e6 100644 --- a/libnd4j/include/ops/declarable/generic/tests/noop.cpp +++ b/libnd4j/include/ops/declarable/generic/tests/noop.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_noop) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(noop, -2, -2, true) { // Fastest op ever. @@ -32,7 +32,7 @@ namespace nd4j { DECLARE_TYPES(noop) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/tests/test_output_reshape.cpp b/libnd4j/include/ops/declarable/generic/tests/test_output_reshape.cpp index eab2cc320..7ded29e20 100644 --- a/libnd4j/include/ops/declarable/generic/tests/test_output_reshape.cpp +++ b/libnd4j/include/ops/declarable/generic/tests/test_output_reshape.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(test_output_reshape) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(test_output_reshape, 1, 1, true) { auto input = INPUT_VARIABLE(0); @@ -39,7 +39,7 @@ namespace nd4j { DECLARE_TYPES(test_output_reshape) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/tests/test_scalar.cpp b/libnd4j/include/ops/declarable/generic/tests/test_scalar.cpp index 0009e6696..437222052 100644 --- a/libnd4j/include/ops/declarable/generic/tests/test_scalar.cpp +++ b/libnd4j/include/ops/declarable/generic/tests/test_scalar.cpp @@ -18,12 +18,12 @@ // Created by raver119 on 
24.02.18. // -#include +#include #if NOT_EXCLUDED(OP_test_scalar) #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(test_scalar, 1, 1, false, 0, 0) { auto input = INPUT_VARIABLE(0); @@ -57,7 +57,7 @@ namespace nd4j { DECLARE_TYPES(test_scalar) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/tests/testcustom.cpp b/libnd4j/include/ops/declarable/generic/tests/testcustom.cpp index 73b3c1721..89480e5bc 100644 --- a/libnd4j/include/ops/declarable/generic/tests/testcustom.cpp +++ b/libnd4j/include/ops/declarable/generic/tests/testcustom.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_testcustom) #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// CUSTOM_OP_IMPL(testcustom, 1, 1, false, 0, -1) { @@ -46,7 +46,7 @@ namespace nd4j { DECLARE_TYPES(testcustom) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/tests/testop2i2o.cpp b/libnd4j/include/ops/declarable/generic/tests/testop2i2o.cpp index 7c257b903..f4d4d3159 100644 --- a/libnd4j/include/ops/declarable/generic/tests/testop2i2o.cpp +++ b/libnd4j/include/ops/declarable/generic/tests/testop2i2o.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_testop2i2o) #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// // test op, non-divergent @@ -46,7 +46,7 @@ namespace nd4j { DECLARE_TYPES(testop2i2o) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git 
a/libnd4j/include/ops/declarable/generic/tests/testreduction.cpp b/libnd4j/include/ops/declarable/generic/tests/testreduction.cpp index 9b59ee3e4..a0749ed7f 100644 --- a/libnd4j/include/ops/declarable/generic/tests/testreduction.cpp +++ b/libnd4j/include/ops/declarable/generic/tests/testreduction.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_testreduction) #include -namespace nd4j { +namespace sd { namespace ops { REDUCTION_OP_IMPL(testreduction, 1, 1, false, 0, -1) { auto z = OUTPUT_VARIABLE(0); @@ -34,7 +34,7 @@ namespace nd4j { DECLARE_TYPES(testreduction) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/thrid_party/firas_sparse.cpp b/libnd4j/include/ops/declarable/generic/thrid_party/firas_sparse.cpp index 21164f520..d385c2fa9 100644 --- a/libnd4j/include/ops/declarable/generic/thrid_party/firas_sparse.cpp +++ b/libnd4j/include/ops/declarable/generic/thrid_party/firas_sparse.cpp @@ -20,22 +20,22 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_firas_sparse) #ifndef LIBND4J_THIRD_PARTY_H #define LIBND4J_THIRD_PARTY_H -#include +#include #include -#include +#include #include -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -99,7 +99,7 @@ namespace nd4j { DECLARE_TYPES(firas_sparse) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp index d8633e125..958a90410 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/clip_by_averaged_norm.cpp @@ -18,13 
+18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_clipbyavgnorm) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(clipbyavgnorm, 1, 1, true, 1, 0) { @@ -42,7 +42,7 @@ CONFIGURABLE_OP_IMPL(clipbyavgnorm, 1, 1, true, 1, 0) { DECLARE_TYPES(clipbyavgnorm) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/clip_by_global_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/clip_by_global_norm.cpp index f36c2114e..99a01d390 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/clip_by_global_norm.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/clip_by_global_norm.cpp @@ -18,13 +18,13 @@ // @author sgazeos@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_clip_by_global_norm) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(clip_by_global_norm, 1, 2, true, 1, 0) { @@ -61,7 +61,7 @@ DECLARE_SHAPE_FN(clip_by_global_norm) { DECLARE_TYPES(clip_by_global_norm) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp index d558167eb..43b23ba18 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/clip_by_norm.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_clipbynorm) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(clipbynorm, 1, 1, true, 1, 0) { @@ -63,7 +63,7 @@ namespace ops { DECLARE_TYPES(clipbynorm) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + 
->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/clip_by_value.cpp b/libnd4j/include/ops/declarable/generic/transforms/clip_by_value.cpp index e4cb1eda5..4275e4837 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/clip_by_value.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/clip_by_value.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_clipbyvalue) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(clipbyvalue, 1, 1, true, 2, 0) { auto input = INPUT_VARIABLE(0); @@ -45,7 +45,7 @@ namespace nd4j { DECLARE_TYPES(clipbyvalue) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp index faa59fa6c..a639ab53a 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -117,7 +117,7 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { DECLARE_TYPES(concat) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY); // ->setSameMode(true); } @@ -243,7 +243,7 @@ DECLARE_SHAPE_FN(concat) { // if (_dimension < 0) // _dimension += first->rankOf(); - // if (nd4j::Environment::getInstance()->isDebugAndVerbose()) { + // if (sd::Environment::getInstance()->isDebugAndVerbose()) { // printf("Shape %i: ", 0); // shape::printShapeInfoLinear((Nd4jLong *) shapes[0]); // } @@ -261,12 +261,12 @@ DECLARE_SHAPE_FN(concat) { // oldScalars &= array->rankOf() == 2 && array->isScalar(); - // if 
(nd4j::Environment::getInstance()->isDebugAndVerbose()) { + // if (sd::Environment::getInstance()->isDebugAndVerbose()) { // printf("Shape %i: ", e); // shape::printShapeInfoLinear(array->shapeInfo()); // } // } - // if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + // if (sd::Environment::getInstance()->isDebugAndVerbose()) // fflush(stdout); // if (oldScalars) { @@ -274,11 +274,11 @@ DECLARE_SHAPE_FN(concat) { // _dimension = 1; // } - // nd4j::SpecialMethods::concatCpuGeneric(_dimension, elements, buffers, shapes, output->getBuffer(), output->getShapeInfo()); + // sd::SpecialMethods::concatCpuGeneric(_dimension, elements, buffers, shapes, output->getBuffer(), output->getShapeInfo()); // STORE_RESULT(*output); - // if (nd4j::Environment::getInstance()->isDebugAndVerbose()) + // if (sd::Environment::getInstance()->isDebugAndVerbose()) // output->printShapeInfo("Concat result shape"); // delete[] buffers; @@ -413,7 +413,7 @@ CUSTOM_OP_IMPL(concat_bp, -1, -1, false, 0, 0) { DECLARE_TYPES(concat_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/cumprod.cpp b/libnd4j/include/ops/declarable/generic/transforms/cumprod.cpp index e7cd1ccb9..c0b011f99 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/cumprod.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/cumprod.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_cumprod) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(cumprod, 1, 1, true, 0, 2) { auto input = INPUT_VARIABLE(0); @@ -42,7 +42,7 @@ namespace nd4j { if (block.getIArguments()->size() == 2 && block.width() == 1) { // all at once case - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Multiply, input, output, exclusive, reverse); + 
sd::ops::helpers::prefix(block.launchContext(), scalar::Multiply, input, output, exclusive, reverse); } else { std::vector dims(block.numI() - 2); @@ -59,7 +59,7 @@ namespace nd4j { if (dims[e] < 0) dims[e] += input->rankOf(); - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Multiply, input, output, dims, exclusive, reverse); + sd::ops::helpers::prefix(block.launchContext(), scalar::Multiply, input, output, dims, exclusive, reverse); } return Status::OK(); @@ -67,7 +67,7 @@ namespace nd4j { DECLARE_TYPES(cumprod) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) ->setAllowedOutputTypes({ALL_FLOATS}) ->setSameMode(true); @@ -75,7 +75,7 @@ namespace nd4j { DECLARE_TYPES(cumprod_bp) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS, ALL_FLOATS}) // there is a case when axes given as IArgs ->setAllowedInputTypes(2, {ALL_FLOATS}) ->setAllowedOutputTypes({ALL_FLOATS}) @@ -103,35 +103,35 @@ namespace nd4j { dims[e] = INT_ARG(e + 2); } - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Multiply, input, output, dims, exclusive, reverse); + sd::ops::helpers::prefix(block.launchContext(), scalar::Multiply, input, output, dims, exclusive, reverse); NDArray val = NDArray(output->dup()); gradOut->applyPairwiseTransform(pairwise::Multiply, *output, val); val.applyPairwiseTransform(pairwise::Divide, *input, val); if (!exclusive && !reverse) { if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, true, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, true, false); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, false, true); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, false, true); } else if (!exclusive 
&& reverse){ if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, false, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, false, false); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, false, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, false, false); } else if (exclusive && !reverse) { if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, true, true); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, true, true); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, true, true); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, true, true); } else { if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, true, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, dims, true, false); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, true, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, &val, output, true, false); } return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/transforms/cumsum.cpp b/libnd4j/include/ops/declarable/generic/transforms/cumsum.cpp index 866853c5e..97389fddb 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/cumsum.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/cumsum.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_cumsum) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(cumsum, 1, 1, true, 0, 2) { @@ -43,7 +43,7 @@ CONFIGURABLE_OP_IMPL(cumsum, 1, 1, true, 0, 2) { if (block.getIArguments()->size() == 2 && block.width() == 1) { // all at once case - 
nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, input, output, exclusive, reverse); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, input, output, exclusive, reverse); } else { std::vector dims(block.numI() - 2); @@ -62,7 +62,7 @@ CONFIGURABLE_OP_IMPL(cumsum, 1, 1, true, 0, 2) { if (dims[e] < 0) dims[e] += input->rankOf(); - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, input, output, dims, exclusive, reverse); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, input, output, dims, exclusive, reverse); } return Status::OK(); @@ -98,28 +98,28 @@ CUSTOM_OP_IMPL(cumsum_bp, 2, -1, true, 0, 2) { } if (!exclusive && !reverse) { if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, false, true); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, false, true); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, false, true); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, false, true); } else if (!exclusive && reverse){ if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, false, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, false, false); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, false, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, false, false); } else if (exclusive && !reverse) { if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, true, true); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, true, true); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, true, true); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, true, 
true); } else { if (dims.size()) - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, true, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, dims, true, false); else - nd4j::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, true, false); + sd::ops::helpers::prefix(block.launchContext(), scalar::Add, gradOut, output, true, false); } return Status::OK(); diff --git a/libnd4j/include/ops/declarable/generic/transforms/eye.cpp b/libnd4j/include/ops/declarable/generic/transforms/eye.cpp index c5f3dbff6..41469468c 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/eye.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/eye.cpp @@ -17,13 +17,13 @@ // // @author Yurii Shyrma (iuriish@yahoo.com), created on 22.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_eye) #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -44,7 +44,7 @@ namespace ops { std::vector params; - nd4j::DataType dtype = block.getTArguments()->empty() ? nd4j::DataType::FLOAT32 : nd4j::DataTypeUtils::fromInt(T_ARG(0)); + sd::DataType dtype = block.getTArguments()->empty() ? 
sd::DataType::FLOAT32 : sd::DataTypeUtils::fromInt(T_ARG(0)); if(block.width() == 0) { params = *block.getIArguments(); diff --git a/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp b/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp index b387a0970..19cc4f469 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/flatten.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(flatten, -1, 1, false, 0, 1) { auto output = OUTPUT_VARIABLE(0); @@ -48,13 +48,13 @@ namespace nd4j { } DECLARE_TYPES(flatten) { - getOpDescriptor()->setAllowedInputTypes({ALL_INTS, ALL_FLOATS, nd4j::DataType::BOOL}); - getOpDescriptor()->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS, nd4j::DataType::BOOL}); + getOpDescriptor()->setAllowedInputTypes({ALL_INTS, ALL_FLOATS, sd::DataType::BOOL}); + getOpDescriptor()->setAllowedOutputTypes(0, {ALL_FLOATS, ALL_INTS, sd::DataType::BOOL}); } DECLARE_SHAPE_FN(flatten) { Nd4jLong length = 0; - nd4j::DataType dtype = ArrayOptions::dataType(inputShape->at(0)); + sd::DataType dtype = ArrayOptions::dataType(inputShape->at(0)); for (int e = 0; e < inputShape->size(); e++) { length += shape::length(inputShape->at(e)); REQUIRE_TRUE(dtype == ArrayOptions::dataType(inputShape->at(e)), 0, "Flatten: all input arrays must have the same datatype"); diff --git a/libnd4j/include/ops/declarable/generic/transforms/floor.cpp b/libnd4j/include/ops/declarable/generic/transforms/floor.cpp index 5a8559075..984708de5 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/floor.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/floor.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_Floor) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(Floor, 1, 1, true) { auto first = 
INPUT_VARIABLE(0); @@ -39,7 +39,7 @@ namespace nd4j { DECLARE_TYPES(Floor) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/gather.cpp b/libnd4j/include/ops/declarable/generic/transforms/gather.cpp index 61ed3bc65..79ce8ad29 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/gather.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/gather.cpp @@ -18,7 +18,7 @@ // @author Shyrma Yurii (iuriish@yahoo.com), created on 16.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_gather) #include @@ -26,7 +26,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp b/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp index c889569e2..30b5b19ef 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp @@ -18,14 +18,14 @@ // @author Shyrma Yurii (iuriish@yahoo.com), created on 23.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_gather_nd) #include #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/transforms/hashcode.cpp b/libnd4j/include/ops/declarable/generic/transforms/hashcode.cpp index 123001dda..4196385c1 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/hashcode.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/hashcode.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_hashcode) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(hashcode, 1, 1, false, 0, 0) { REQUIRE_TRUE(block.width() == 1, 0, "hashcode: this op can't be applied along dimension"); @@ -41,7 +41,7 @@ namespace nd4j { }; DECLARE_SHAPE_FN(hashcode) { - return 
SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(nd4j::DataType::INT64)); + return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(sd::DataType::INT64)); } @@ -49,7 +49,7 @@ namespace nd4j { getOpDescriptor() ->setAllowedInputTypes(0, {ALL_INTS, ALL_FLOATS}) ->setAllowedInputTypes(1, {ALL_INTS}) - ->setAllowedOutputTypes({nd4j::DataType::INT64}); + ->setAllowedOutputTypes({sd::DataType::INT64}); }; } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/histogram.cpp b/libnd4j/include/ops/declarable/generic/transforms/histogram.cpp index ab5a70c4b..415361894 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/histogram.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/histogram.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_histogram) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(histogram, 1, 1, false, 0, 1) { auto input = INPUT_VARIABLE(0); @@ -43,7 +43,7 @@ namespace nd4j { DECLARE_SHAPE_FN(histogram) { auto numBins = INT_ARG(0); - return SHAPELIST(ConstantShapeHelper::getInstance()->vectorShapeInfo(numBins, nd4j::DataType::INT64)); + return SHAPELIST(ConstantShapeHelper::getInstance()->vectorShapeInfo(numBins, sd::DataType::INT64)); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp b/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp index 529446e12..36175fc01 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/histogram_fixed_width.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 31.08.2018 // -#include +#include #if NOT_EXCLUDED(OP_histogram_fixed_width) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(histogram_fixed_width, 2, 1, false, 0, 0) { @@ -48,7 +48,7 @@ 
CUSTOM_OP_IMPL(histogram_fixed_width, 2, 1, false, 0, 0) { DECLARE_TYPES(histogram_fixed_width) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_INDICES}); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/invertPermutation.cpp b/libnd4j/include/ops/declarable/generic/transforms/invertPermutation.cpp index e4bbf8dff..3814106cf 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/invertPermutation.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/invertPermutation.cpp @@ -18,13 +18,13 @@ // @author, Yurii Shyrma (iuriish@yahoo.com), created on 06.12.2017 // -#include +#include #if NOT_EXCLUDED(OP_invert_permutation) #include #include -namespace nd4j { +namespace sd { namespace ops { //////////////////////////////////////////////////////////////////////// @@ -44,7 +44,7 @@ DECLARE_SYN(InvertPermutation, invert_permutation); DECLARE_TYPES(invert_permutation) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp index 7afe9b3ed..5643932cb 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp @@ -18,14 +18,14 @@ // @author Paul Dubs // -#include +#include #if NOT_EXCLUDED(OP_layer_norm) #include #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(layer_norm, 2, 1, false, 0, -1) { @@ -48,18 +48,18 @@ namespace ops { std::vector longAxis = ArrayUtils::toLongVector(axis); - nd4j::ops::standardize standardizeOp; + sd::ops::standardize standardizeOp; std::vector inputs = {input}; std::vector outputs = {output}; std::vector targs = {}; std::vector bargs = {}; standardizeOp.execute(inputs, outputs, 
targs, longAxis, bargs); - // output->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Multiply(), gain, output); - output->applyBroadcast(nd4j::broadcast::Multiply, {dimC}, *gain, *output); + // output->applyTrueBroadcast(sd::BroadcastOpsTuple::Multiply(), gain, output); + output->applyBroadcast(sd::broadcast::Multiply, {dimC}, *gain, *output); if(bias != nullptr) { - // output->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), bias, output); - // output->applyBroadcast(nd4j::broadcast::Add, {dimC}, bias); + // output->applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), bias, output); + // output->applyBroadcast(sd::broadcast::Add, {dimC}, bias); helpers::addBias(block, *output, *bias, *output, isNCHW); } @@ -93,25 +93,25 @@ namespace ops { if(bias != nullptr) { REQUIRE_TRUE(bias->rankOf() == 1 && bias->sizeAt(0) == input->sizeAt(dimC), 0, "LAYER_NORM_BP OP: wrong shape of bias array, expected is {%i}, but got %s instead !", input->sizeAt(dimC), ShapeUtils::shapeAsString(bias).c_str()); - // eps->reduceAlongDimension(nd4j::reduce::Sum, *dLdb, {0}, true); - eps->reduceAlongDimension(nd4j::reduce::Sum, *dLdb, ShapeUtils::evalDimsToExclude(input->rankOf(), {dimC})); + // eps->reduceAlongDimension(sd::reduce::Sum, *dLdb, {0}, true); + eps->reduceAlongDimension(sd::reduce::Sum, *dLdb, ShapeUtils::evalDimsToExclude(input->rankOf(), {dimC})); } NDArray standardized(input->shapeInfo(), false, block.launchContext()); - nd4j::ops::standardize standardizeOp; + sd::ops::standardize standardizeOp; std::vector inputs = {input}; std::vector outputs = {&standardized}; std::vector targs = {}; std::vector bargs = {}; standardizeOp.execute(inputs, outputs, targs, longAxis, bargs); - standardized.applyPairwiseTransform(nd4j::pairwise::Multiply, *eps, standardized); - standardized.reduceAlongDimension(nd4j::reduce::Sum, *dLdg, ShapeUtils::evalDimsToExclude(input->rankOf(), {dimC})); + standardized.applyPairwiseTransform(sd::pairwise::Multiply, *eps, standardized); + 
standardized.reduceAlongDimension(sd::reduce::Sum, *dLdg, ShapeUtils::evalDimsToExclude(input->rankOf(), {dimC})); - nd4j::ops::standardize_bp standardizeBp; - // eps->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Multiply(), gain, dLdx); - eps->applyBroadcast(nd4j::broadcast::Multiply, {dimC}, *gain, *dLdx); + sd::ops::standardize_bp standardizeBp; + // eps->applyTrueBroadcast(sd::BroadcastOpsTuple::Multiply(), gain, dLdx); + eps->applyBroadcast(sd::broadcast::Multiply, {dimC}, *gain, *dLdx); auto dLdx_tmp = dLdx->dup(); std::vector standardizeBpArgs = {input, &dLdx_tmp}; diff --git a/libnd4j/include/ops/declarable/generic/transforms/log1p.cpp b/libnd4j/include/ops/declarable/generic/transforms/log1p.cpp index ef9bdb925..797ca8b2a 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/log1p.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/log1p.cpp @@ -18,12 +18,12 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_Log1p) #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(Log1p, 1, 1, true) { auto x = INPUT_VARIABLE(0); @@ -40,7 +40,7 @@ namespace nd4j { DECLARE_TYPES(Log1p) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/merge_add.cpp b/libnd4j/include/ops/declarable/generic/transforms/merge_add.cpp index c8a88c37e..a68a4ce02 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/merge_add.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/merge_add.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 24.11.17. 
// -#include +#include #if NOT_EXCLUDED(OP_mergeadd) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(mergeadd, -1, 1, false) { @@ -51,8 +51,8 @@ DECLARE_SYN(accumulate_n, mergeadd); DECLARE_TYPES(mergeadd) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY); } } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/merge_avg.cpp b/libnd4j/include/ops/declarable/generic/transforms/merge_avg.cpp index d9f9dd0dc..53e46c16e 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/merge_avg.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/merge_avg.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 24.11.17. // -#include +#include #if NOT_EXCLUDED(OP_mergeavg) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(mergeavg, -1, 1, false) { diff --git a/libnd4j/include/ops/declarable/generic/transforms/merge_max.cpp b/libnd4j/include/ops/declarable/generic/transforms/merge_max.cpp index c6f53e444..7a41f4c1b 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/merge_max.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/merge_max.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 24.11.17. 
// -#include +#include #if NOT_EXCLUDED(OP_mergemax) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(mergemax, -1, 1, false) { @@ -47,8 +47,8 @@ DECLARE_SYN(MergeMax, mergemax); DECLARE_TYPES(mergemax) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::ANY); } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp b/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp index e48761f8f..7fe727452 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/merge_max_idx.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 24.11.17. // -#include +#include #if NOT_EXCLUDED(OP_mergemaxindex) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(mergemaxindex, -1, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/transforms/mirrorPad.cpp b/libnd4j/include/ops/declarable/generic/transforms/mirrorPad.cpp index 2de8ee5a2..a4b934853 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/mirrorPad.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/mirrorPad.cpp @@ -17,13 +17,13 @@ // // @author Yurii Shyrma (iuriish@yahoo.com), created on 07.06.2018 // -#include +#include #if NOT_EXCLUDED(OP_mirror_pad) #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/transforms/pad.cpp b/libnd4j/include/ops/declarable/generic/transforms/pad.cpp index c6c8c8ff8..800a00f1b 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/pad.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/pad.cpp @@ -18,14 +18,14 @@ // @author Shyrma Yurii (iuriish@yahoo.com), created on 06.11.2017. 
// -#include +#include #if NOT_EXCLUDED(OP_pad) #include #include #include -namespace nd4j { +namespace sd { namespace ops { @@ -77,7 +77,7 @@ CUSTOM_OP_IMPL(pad, 2, 1, false, 0, 1) { DECLARE_TYPES(pad) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {DataType::INT32, DataType::INT64}) // INT32 with TF // ->setAllowedInputTypes(1, {DataType::INT32, DataType::INT64}) // INT32 with TF, but used also INT64 due long shapes ->setSameMode(true); diff --git a/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp b/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp index f3e7b84a5..99ab3d635 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_repeat) #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -50,7 +50,7 @@ CUSTOM_OP_IMPL(repeat, 1, 1, true, 0, -1) { DECLARE_TYPES(repeat) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp b/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp index 9f93d37a8..ceb953979 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/reverse.cpp @@ -18,14 +18,14 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 02.11.2017 // -#include +#include #if NOT_EXCLUDED(OP_reverse) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(reverse, 1, 1, true, 0, -2) { @@ -93,7 +93,7 @@ namespace ops { DECLARE_TYPES(reverse_bp) { getOpDescriptor() - 
->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp index b3c2a93d4..c7dcc6e36 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/reverseSequence.cpp @@ -18,13 +18,13 @@ // Created by Yurii Shyrma on 25.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_reverse_sequence) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(reverse_sequence, 2, 1, false, 0, 2) { diff --git a/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp b/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp index 247e454bd..d15b4c859 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp @@ -18,13 +18,13 @@ // Created by raver119 on 24.11.17. 
// -#include +#include #if NOT_EXCLUDED(OP_scatter_update) #include #include -namespace nd4j { +namespace sd { namespace ops { /** * scatter update operation @@ -51,7 +51,7 @@ namespace nd4j { DECLARE_TYPES(scatter_update) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/transforms/standardize.cpp b/libnd4j/include/ops/declarable/generic/transforms/standardize.cpp index 25efc1a73..f4e8a6f7a 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/standardize.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/standardize.cpp @@ -18,14 +18,14 @@ // @author Paul Dubs // -#include +#include #if NOT_EXCLUDED(OP_standardize) #include #include -namespace nd4j { +namespace sd { namespace ops { CONFIGURABLE_OP_IMPL(standardize, 1, 1, true, 0, -2) { @@ -48,9 +48,9 @@ namespace ops { auto stdev = input->varianceAlongDimension(variance::SummaryStatsStandardDeviation, false, axis); stdev.reshapei(means.getShapeAsVector()); - input->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Subtract(), means, *output, false); - output->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Divide(), stdev, *output, false); - output->applyScalar(nd4j::scalar::ReplaceNans, 0, *output); + input->applyTrueBroadcast(sd::BroadcastOpsTuple::Subtract(), means, *output, false); + output->applyTrueBroadcast(sd::BroadcastOpsTuple::Divide(), stdev, *output, false); + output->applyScalar(sd::scalar::ReplaceNans, 0, *output); return Status::OK(); } @@ -84,7 +84,7 @@ namespace ops { auto stdev = input->varianceAlongDimension(variance::SummaryStatsStandardDeviation, false, axis); stdev.reshapei(means.getShapeAsVector()); - eps->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Divide(), stdev, *output, false); + eps->applyTrueBroadcast(sd::BroadcastOpsTuple::Divide(), stdev, *output, false); NDArray dldu_sum = -output->reduceAlongDimension(reduce::Sum, axis, true); @@ -94,16 
+94,16 @@ namespace ops { std::vector meanBpTArgs = {}; std::vector meanBpBArgs = {}; - nd4j::ops::reduce_mean_bp meanBp; + sd::ops::reduce_mean_bp meanBp; meanBp.execute(meanBpArgs, meanBpOutput, meanBpTArgs, longAxis, meanBpBArgs); *output += dldx_u; // (eps * (means - input) / (stdev * stdev)) NDArray tmp(eps->shapeInfo(), false, block.launchContext()); - means.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Subtract(), *input, tmp, false); - tmp.applyPairwiseTransform(nd4j::pairwise::Multiply, *eps, tmp); - stdev.applyPairwiseTransform(nd4j::pairwise::Multiply, stdev, stdev); - tmp.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Divide(), stdev, tmp, false); + means.applyTrueBroadcast(sd::BroadcastOpsTuple::Subtract(), *input, tmp, false); + tmp.applyPairwiseTransform(sd::pairwise::Multiply, *eps, tmp); + stdev.applyPairwiseTransform(sd::pairwise::Multiply, stdev, stdev); + tmp.applyTrueBroadcast(sd::BroadcastOpsTuple::Divide(), stdev, tmp, false); auto dlds_sum = tmp.reduceAlongDimension(reduce::Sum, axis, true); NDArray dldx_s(input->shapeInfo(), false, block.launchContext()); @@ -111,18 +111,18 @@ namespace ops { std::vector stdevBpOutput = {&dldx_s}; std::vector stdevBpTArgs = {}; std::vector stdevBpBArgs = {}; - nd4j::ops::reduce_stdev_bp stdevBp; + sd::ops::reduce_stdev_bp stdevBp; stdevBp.execute(stdevBpArgs, stdevBpOutput, stdevBpTArgs, longAxis, stdevBpBArgs); *output += dldx_s; - output->applyScalar(nd4j::scalar::ReplaceNans, 0, *output); + output->applyScalar(sd::scalar::ReplaceNans, 0, *output); return Status::OK(); } DECLARE_TYPES(standardize_bp) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setAllowedOutputTypes({ALL_FLOATS}); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/tile.cpp b/libnd4j/include/ops/declarable/generic/transforms/tile.cpp index 8ef1032d5..6041d1c41 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/tile.cpp +++ 
b/libnd4j/include/ops/declarable/generic/transforms/tile.cpp @@ -19,13 +19,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #if NOT_EXCLUDED(OP_tile) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(tile, 1, 1, false, 0, -2) { @@ -60,9 +60,9 @@ CUSTOM_OP_IMPL(tile, 1, 1, false, 0, -2) { } DECLARE_TYPES(tile) { - getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY) + getOpDescriptor()->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_INTS}) - ->setAllowedOutputTypes(nd4j::DataType::ANY); + ->setAllowedOutputTypes(sd::DataType::ANY); } diff --git a/libnd4j/include/ops/declarable/generic/transforms/trace.cpp b/libnd4j/include/ops/declarable/generic/transforms/trace.cpp index cac0ddfc0..fa9fd5f56 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/trace.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/trace.cpp @@ -18,13 +18,13 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 24.01.2018 // -#include +#include #if NOT_EXCLUDED(OP_trace) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(trace, 1, 1, false, 0, 0) { diff --git a/libnd4j/include/ops/declarable/generic/transforms/tri.cpp b/libnd4j/include/ops/declarable/generic/transforms/tri.cpp index a6106f197..19144e2fb 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/tri.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/tri.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/generic/transforms/triu.cpp b/libnd4j/include/ops/declarable/generic/transforms/triu.cpp index b382cbfb1..839828f62 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/triu.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/triu.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { diff --git 
a/libnd4j/include/ops/declarable/generic/tsne/cell_contains.cpp b/libnd4j/include/ops/declarable/generic/tsne/cell_contains.cpp index e72799a54..e176797b0 100644 --- a/libnd4j/include/ops/declarable/generic/tsne/cell_contains.cpp +++ b/libnd4j/include/ops/declarable/generic/tsne/cell_contains.cpp @@ -18,13 +18,13 @@ // @author George A. Shulinok +#include #if NOT_EXCLUDED(OP_cell_contains) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(cell_contains, 3, 1, false, 0, 1) { @@ -40,13 +40,13 @@ namespace nd4j { DECLARE_TYPES(cell_contains) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes(nd4j::DataType::BOOL) + ->setAllowedInputTypes(sd::DataType::ANY) + ->setAllowedOutputTypes(sd::DataType::BOOL) ->setSameMode(false); } DECLARE_SHAPE_FN(cell_contains) { - return SHAPELIST(CONSTANT(ShapeBuilders::createScalarShapeInfo(nd4j::DataType::BOOL, block.workspace()))); + return SHAPELIST(CONSTANT(ShapeBuilders::createScalarShapeInfo(sd::DataType::BOOL, block.workspace()))); } } } diff --git a/libnd4j/include/ops/declarable/generic/tsne/edge_force.cpp b/libnd4j/include/ops/declarable/generic/tsne/edge_force.cpp index 75a984e66..1d409c51f 100644 --- a/libnd4j/include/ops/declarable/generic/tsne/edge_force.cpp +++ b/libnd4j/include/ops/declarable/generic/tsne/edge_force.cpp @@ -18,13 +18,13 @@ // @author George A. Shulinok , created on 4/18/2019. // -#include +#include #if NOT_EXCLUDED(OP_barnes_edge_force) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(barnes_edge_forces, 4, 1, false, 0, 1) { diff --git a/libnd4j/include/ops/declarable/generic/tsne/gains.cpp b/libnd4j/include/ops/declarable/generic/tsne/gains.cpp index 8aa701435..4fb943483 100644 --- a/libnd4j/include/ops/declarable/generic/tsne/gains.cpp +++ b/libnd4j/include/ops/declarable/generic/tsne/gains.cpp @@ -18,13 +18,13 @@ // @author George A. 
Shulinok +#include #if NOT_EXCLUDED(OP_barnes_gains) #include #include -namespace nd4j { +namespace sd { namespace ops { OP_IMPL(barnes_gains, 3, 1, true) { @@ -40,7 +40,7 @@ namespace ops { DECLARE_TYPES(barnes_gains) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes(sd::DataType::ANY) ->setSameMode(true); } } diff --git a/libnd4j/include/ops/declarable/generic/tsne/symmetrized.cpp b/libnd4j/include/ops/declarable/generic/tsne/symmetrized.cpp index 135426aa1..82dd8c36e 100644 --- a/libnd4j/include/ops/declarable/generic/tsne/symmetrized.cpp +++ b/libnd4j/include/ops/declarable/generic/tsne/symmetrized.cpp @@ -18,13 +18,13 @@ // @author George A. Shulinok , created on 4/18/2019. // -#include +#include #if NOT_EXCLUDED(OP_barnes_symmetrized) #include #include -namespace nd4j { +namespace sd { namespace ops { NDArray* rowCountsPtr = nullptr; @@ -80,9 +80,9 @@ namespace ops { // outShapeInfo[2] = len; // ShapeUtils::updateStridesAndType(outShapeInfo, ArrayOptions::dataType(valPShapeInfo), 'c'); //outShapeInfo = ShapeBuilders::createVectorShapeInfo(ArrayOptions::dataType(valPShapeInfo), len, block.workspace()); - outShapeInfo = nd4j::ShapeBuilders::createShapeInfo(ArrayOptions::dataType(valPShapeInfo), 'c', {1, len}, block.getWorkspace()); - auto outColsShapeInfo = nd4j::ShapeBuilders::createShapeInfo(dataType, 'c', {1, len}, block.getWorkspace()); - auto outRowsShapeInfo = nd4j::ShapeBuilders::createShapeInfo(dataType, 'c', {1, N + 1}, block.getWorkspace()); + outShapeInfo = sd::ShapeBuilders::createShapeInfo(ArrayOptions::dataType(valPShapeInfo), 'c', {1, len}, block.getWorkspace()); + auto outColsShapeInfo = sd::ShapeBuilders::createShapeInfo(dataType, 'c', {1, len}, block.getWorkspace()); + auto outRowsShapeInfo = sd::ShapeBuilders::createShapeInfo(dataType, 'c', {1, N + 1}, block.getWorkspace()); return SHAPELIST(CONSTANT(outRowsShapeInfo), CONSTANT(outColsShapeInfo), CONSTANT(outShapeInfo)); } diff --git 
a/libnd4j/include/ops/declarable/generic/util/print_affinity.cpp b/libnd4j/include/ops/declarable/generic/util/print_affinity.cpp index 6b1514ab9..0103e8672 100644 --- a/libnd4j/include/ops/declarable/generic/util/print_affinity.cpp +++ b/libnd4j/include/ops/declarable/generic/util/print_affinity.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_print_affinity) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(print_affinity, 1, 1, true, 0, 0) { // TODO: make this op compatible with ArrayList etc @@ -38,9 +38,9 @@ namespace nd4j { DECLARE_TYPES(print_affinity) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_STRINGS}) - ->setAllowedOutputTypes(0, nd4j::DataType::INT32); + ->setAllowedOutputTypes(0, sd::DataType::INT32); } DECLARE_SHAPE_FN(print_affinity) { diff --git a/libnd4j/include/ops/declarable/generic/util/print_variable.cpp b/libnd4j/include/ops/declarable/generic/util/print_variable.cpp index 6828b2f90..9d3369627 100644 --- a/libnd4j/include/ops/declarable/generic/util/print_variable.cpp +++ b/libnd4j/include/ops/declarable/generic/util/print_variable.cpp @@ -17,13 +17,13 @@ // // @author raver119@gmail.com // -#include +#include #if NOT_EXCLUDED(OP_print_variable) #include #include -namespace nd4j { +namespace sd { namespace ops { CUSTOM_OP_IMPL(print_variable, 1, 1, true, 0, 0) { // TODO: make this op compatible with ArrayList etc @@ -42,7 +42,7 @@ namespace nd4j { if (block.numB() > 0) printSpecial = B_ARG(0); - if (printSpecial && !nd4j::Environment::getInstance()->isCPU()) { + if (printSpecial && !sd::Environment::getInstance()->isCPU()) { // only specific backends support special printout. 
for cpu-based backends it's the same as regular print if (block.width() == 2) @@ -63,9 +63,9 @@ namespace nd4j { DECLARE_TYPES(print_variable) { getOpDescriptor() - ->setAllowedInputTypes(0, nd4j::DataType::ANY) + ->setAllowedInputTypes(0, sd::DataType::ANY) ->setAllowedInputTypes(1, {ALL_STRINGS}) - ->setAllowedOutputTypes(0, nd4j::DataType::INT32); + ->setAllowedOutputTypes(0, sd::DataType::INT32); } DECLARE_SHAPE_FN(print_variable) { diff --git a/libnd4j/include/ops/declarable/headers/BarnesHutTsne.h b/libnd4j/include/ops/declarable/headers/BarnesHutTsne.h index d3a4c042d..3f0d86e19 100644 --- a/libnd4j/include/ops/declarable/headers/BarnesHutTsne.h +++ b/libnd4j/include/ops/declarable/headers/BarnesHutTsne.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This operation used as helper with BarnesHutTsne class diff --git a/libnd4j/include/ops/declarable/headers/activations.h b/libnd4j/include/ops/declarable/headers/activations.h index 9d0b22198..db9e8186a 100644 --- a/libnd4j/include/ops/declarable/headers/activations.h +++ b/libnd4j/include/ops/declarable/headers/activations.h @@ -24,7 +24,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This is Sigmoid activation function implementation diff --git a/libnd4j/include/ops/declarable/headers/bitwise.h b/libnd4j/include/ops/declarable/headers/bitwise.h index cb395b496..b5f29896f 100644 --- a/libnd4j/include/ops/declarable/headers/bitwise.h +++ b/libnd4j/include/ops/declarable/headers/bitwise.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This operation toggles individual bits of each element in array diff --git a/libnd4j/include/ops/declarable/headers/blas.h b/libnd4j/include/ops/declarable/headers/blas.h index d94d365dd..09215e113 100644 --- a/libnd4j/include/ops/declarable/headers/blas.h +++ b/libnd4j/include/ops/declarable/headers/blas.h @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** 
diff --git a/libnd4j/include/ops/declarable/headers/boolean.h b/libnd4j/include/ops/declarable/headers/boolean.h index 21fe48202..75e95f630 100644 --- a/libnd4j/include/ops/declarable/headers/boolean.h +++ b/libnd4j/include/ops/declarable/headers/boolean.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** diff --git a/libnd4j/include/ops/declarable/headers/broadcastable.h b/libnd4j/include/ops/declarable/headers/broadcastable.h index 9a2dc9f62..691a1b7b2 100644 --- a/libnd4j/include/ops/declarable/headers/broadcastable.h +++ b/libnd4j/include/ops/declarable/headers/broadcastable.h @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { // TODO: make broadcastables separate class diff --git a/libnd4j/include/ops/declarable/headers/common.h b/libnd4j/include/ops/declarable/headers/common.h index 6b670e3be..d70e6beb8 100644 --- a/libnd4j/include/ops/declarable/headers/common.h +++ b/libnd4j/include/ops/declarable/headers/common.h @@ -22,9 +22,9 @@ #define LIBND4J_OPS_DECLARABLE_COMMON_H #include -#include +#include #include -#include +#include #include #include #include @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/headers/compat.h b/libnd4j/include/ops/declarable/headers/compat.h index 8ce73153e..37894517a 100644 --- a/libnd4j/include/ops/declarable/headers/compat.h +++ b/libnd4j/include/ops/declarable/headers/compat.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This operation splits input string into pieces separated by delimiter diff --git a/libnd4j/include/ops/declarable/headers/convo.h b/libnd4j/include/ops/declarable/headers/convo.h index 89824c342..d00da07f2 100644 --- a/libnd4j/include/ops/declarable/headers/convo.h +++ b/libnd4j/include/ops/declarable/headers/convo.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** diff 
--git a/libnd4j/include/ops/declarable/headers/datatypes.h b/libnd4j/include/ops/declarable/headers/datatypes.h index b82ab4ad6..e46753919 100644 --- a/libnd4j/include/ops/declarable/headers/datatypes.h +++ b/libnd4j/include/ops/declarable/headers/datatypes.h @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This operation casts elements of input array to double data type diff --git a/libnd4j/include/ops/declarable/headers/images.h b/libnd4j/include/ops/declarable/headers/images.h index 14acd1877..41974901a 100644 --- a/libnd4j/include/ops/declarable/headers/images.h +++ b/libnd4j/include/ops/declarable/headers/images.h @@ -30,7 +30,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { diff --git a/libnd4j/include/ops/declarable/headers/kernels.h b/libnd4j/include/ops/declarable/headers/kernels.h index 8fb2bab62..c4cc02cb5 100644 --- a/libnd4j/include/ops/declarable/headers/kernels.h +++ b/libnd4j/include/ops/declarable/headers/kernels.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_knn_mindistance) DECLARE_CUSTOM_OP(knn_mindistance, 3, 1, false, 0, 0); diff --git a/libnd4j/include/ops/declarable/headers/list.h b/libnd4j/include/ops/declarable/headers/list.h index 756895a1f..af4fb5706 100644 --- a/libnd4j/include/ops/declarable/headers/list.h +++ b/libnd4j/include/ops/declarable/headers/list.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { // list operations, basically all around NDArrayList diff --git a/libnd4j/include/ops/declarable/headers/loss.h b/libnd4j/include/ops/declarable/headers/loss.h index 04d14532d..3f3625040 100644 --- a/libnd4j/include/ops/declarable/headers/loss.h +++ b/libnd4j/include/ops/declarable/headers/loss.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// diff --git 
a/libnd4j/include/ops/declarable/headers/nlp.h b/libnd4j/include/ops/declarable/headers/nlp.h index 6da62e506..e12db1402 100644 --- a/libnd4j/include/ops/declarable/headers/nlp.h +++ b/libnd4j/include/ops/declarable/headers/nlp.h @@ -22,7 +22,7 @@ #define DEV_TESTS_NLP_H #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_skipgram) diff --git a/libnd4j/include/ops/declarable/headers/nn.h b/libnd4j/include/ops/declarable/headers/nn.h index 810733680..26699aa32 100644 --- a/libnd4j/include/ops/declarable/headers/nn.h +++ b/libnd4j/include/ops/declarable/headers/nn.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_softmax) @@ -175,7 +175,7 @@ namespace nd4j { * applies layer normalization to input * y = g * standardize(x) + b * - * see nd4j::ops::standardize + * see sd::ops::standardize * */ #if NOT_EXCLUDED(OP_layer_norm) diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h index 79ff1acd2..81742fa3d 100644 --- a/libnd4j/include/ops/declarable/headers/parity_ops.h +++ b/libnd4j/include/ops/declarable/headers/parity_ops.h @@ -24,7 +24,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This operation returns index of max element in a given NDArray (optionally: along given dimension(s)) diff --git a/libnd4j/include/ops/declarable/headers/random.h b/libnd4j/include/ops/declarable/headers/random.h index f52534411..367a41995 100644 --- a/libnd4j/include/ops/declarable/headers/random.h +++ b/libnd4j/include/ops/declarable/headers/random.h @@ -24,7 +24,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_set_seed) DECLARE_CUSTOM_OP(set_seed, -2, 1, false, 0, -2); diff --git a/libnd4j/include/ops/declarable/headers/recurrent.h b/libnd4j/include/ops/declarable/headers/recurrent.h index bf6aaa6bc..55138bb60 100644 --- a/libnd4j/include/ops/declarable/headers/recurrent.h +++ 
b/libnd4j/include/ops/declarable/headers/recurrent.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/headers/shape.h b/libnd4j/include/ops/declarable/headers/shape.h index c21cdb84d..7f9330342 100644 --- a/libnd4j/include/ops/declarable/headers/shape.h +++ b/libnd4j/include/ops/declarable/headers/shape.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_permute) DECLARE_CUSTOM_OP(permute, 1, 1, false, 0, -2); diff --git a/libnd4j/include/ops/declarable/headers/strings.h b/libnd4j/include/ops/declarable/headers/strings.h index 0849f118a..bd4b8b949 100644 --- a/libnd4j/include/ops/declarable/headers/strings.h +++ b/libnd4j/include/ops/declarable/headers/strings.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This operation splits input string into pieces separated by delimiter diff --git a/libnd4j/include/ops/declarable/headers/tests.h b/libnd4j/include/ops/declarable/headers/tests.h index da3cd8ecf..cad12b3c8 100644 --- a/libnd4j/include/ops/declarable/headers/tests.h +++ b/libnd4j/include/ops/declarable/headers/tests.h @@ -19,7 +19,7 @@ // #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_test_output_reshape) DECLARE_OP(test_output_reshape, 1, 1, true); diff --git a/libnd4j/include/ops/declarable/headers/third_party.h b/libnd4j/include/ops/declarable/headers/third_party.h index 38639d038..705a02903 100644 --- a/libnd4j/include/ops/declarable/headers/third_party.h +++ b/libnd4j/include/ops/declarable/headers/third_party.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_firas_sparse) DECLARE_CUSTOM_OP(firas_sparse, 1, 1, false, 0, -1); diff --git a/libnd4j/include/ops/declarable/headers/transforms.h b/libnd4j/include/ops/declarable/headers/transforms.h index 
ab4e962a3..0e14037df 100644 --- a/libnd4j/include/ops/declarable/headers/transforms.h +++ b/libnd4j/include/ops/declarable/headers/transforms.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { #if NOT_EXCLUDED(OP_clipbyvalue) DECLARE_CONFIGURABLE_OP(clipbyvalue, 1, 1, true, 2, 0); diff --git a/libnd4j/include/ops/declarable/headers/util.h b/libnd4j/include/ops/declarable/headers/util.h index aa1f52363..57b013f29 100644 --- a/libnd4j/include/ops/declarable/headers/util.h +++ b/libnd4j/include/ops/declarable/headers/util.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { /** * This operation prints out NDArray content, either on host or device. diff --git a/libnd4j/include/ops/declarable/helpers/BarnesHutTsne.h b/libnd4j/include/ops/declarable/helpers/BarnesHutTsne.h index 9e2258a59..f52dd9ba4 100644 --- a/libnd4j/include/ops/declarable/helpers/BarnesHutTsne.h +++ b/libnd4j/include/ops/declarable/helpers/BarnesHutTsne.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/activations.h b/libnd4j/include/ops/declarable/helpers/activations.h index 331170369..ab652ab24 100644 --- a/libnd4j/include/ops/declarable/helpers/activations.h +++ b/libnd4j/include/ops/declarable/helpers/activations.h @@ -23,27 +23,27 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - ND4J_EXPORT void softMaxForVector(nd4j::LaunchContext * context, const NDArray &input, NDArray &output); + ND4J_EXPORT void softMaxForVector(sd::LaunchContext * context, const NDArray &input, NDArray &output); - ND4J_EXPORT void logSoftMaxForVector(nd4j::LaunchContext * context, const NDArray &input, NDArray &output); + ND4J_EXPORT void logSoftMaxForVector(sd::LaunchContext * context, const NDArray &input, NDArray &output); - ND4J_EXPORT void softmax(nd4j::LaunchContext * context, const NDArray &input, NDArray &output, const int 
dimension); + ND4J_EXPORT void softmax(sd::LaunchContext * context, const NDArray &input, NDArray &output, const int dimension); - ND4J_EXPORT void logSoftmax(nd4j::LaunchContext * context, const NDArray &input, NDArray &output, const int dimension); + ND4J_EXPORT void logSoftmax(sd::LaunchContext * context, const NDArray &input, NDArray &output, const int dimension); - ND4J_EXPORT void softmaxDerivative(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension); + ND4J_EXPORT void softmaxDerivative(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension); - ND4J_EXPORT void prelu(nd4j::LaunchContext * context, const NDArray &input, const NDArray &alpha, NDArray &output); + ND4J_EXPORT void prelu(sd::LaunchContext * context, const NDArray &input, const NDArray &alpha, NDArray &output); - ND4J_EXPORT void preluBP(nd4j::LaunchContext * context, const NDArray &input, const NDArray &alpha, const NDArray &dLdO, NDArray &dLdI, NDArray &dLdA); + ND4J_EXPORT void preluBP(sd::LaunchContext * context, const NDArray &input, const NDArray &alpha, const NDArray &dLdO, NDArray &dLdI, NDArray &dLdA); - ND4J_EXPORT void thresholdRelu(nd4j::LaunchContext * context, const NDArray &input, double threshold, NDArray &output); + ND4J_EXPORT void thresholdRelu(sd::LaunchContext * context, const NDArray &input, double threshold, NDArray &output); - ND4J_EXPORT void thresholdReluDerivative(nd4j::LaunchContext * context, NDArray *input, double threshold, NDArray* dLdO, NDArray *output); + ND4J_EXPORT void thresholdReluDerivative(sd::LaunchContext * context, NDArray *input, double threshold, NDArray* dLdO, NDArray *output); } } } diff --git a/libnd4j/include/ops/declarable/helpers/addBias.h b/libnd4j/include/ops/declarable/helpers/addBias.h index c754c07de..8eff731f7 100644 --- a/libnd4j/include/ops/declarable/helpers/addBias.h +++ b/libnd4j/include/ops/declarable/helpers/addBias.h @@ -24,7 +24,7 @@ #include #include -namespace 
nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/adjust_hue.h b/libnd4j/include/ops/declarable/helpers/adjust_hue.h index afa7b2436..2d0e2f087 100644 --- a/libnd4j/include/ops/declarable/helpers/adjust_hue.h +++ b/libnd4j/include/ops/declarable/helpers/adjust_hue.h @@ -20,15 +20,15 @@ // @author Oleh Semeniv (oleg.semeniv@gmail.com) // -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void adjustHue(nd4j::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC); + void adjustHue(sd::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC); @@ -39,8 +39,8 @@ FORCEINLINE _CUDA_HD void rgbToHsv(const T& r, const T& g, const T& b, T& h, T& // h values are in range [0, 360) // s and v values are in range [0, 1] - const T max = nd4j::math::nd4j_max(r, nd4j::math::nd4j_max(g, b)); - const T min = nd4j::math::nd4j_min(r, nd4j::math::nd4j_min(g, b)); + const T max = sd::math::nd4j_max(r, sd::math::nd4j_max(g, b)); + const T min = sd::math::nd4j_min(r, sd::math::nd4j_min(g, b)); const T c = max - min; const T _p6 = (T)1 / (T)6; // calculate h diff --git a/libnd4j/include/ops/declarable/helpers/adjust_saturation.h b/libnd4j/include/ops/declarable/helpers/adjust_saturation.h index cd0930cc0..25dc30f10 100644 --- a/libnd4j/include/ops/declarable/helpers/adjust_saturation.h +++ b/libnd4j/include/ops/declarable/helpers/adjust_saturation.h @@ -19,21 +19,21 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void adjustSaturation(nd4j::LaunchContext* context, const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC); + void adjustSaturation(sd::LaunchContext* context, const NDArray *input, const 
NDArray* factorScalarArr, NDArray *output, const int dimC); /* template static FORCEINLINE _CUDA_HD void rgb_to_hsv(T r, T g, T b, T* h, T* s, T* v) { - T vv = nd4j::math::nd4j_max(r, nd4j::math::nd4j_max(g, b)); - T range = vv - nd4j::math::nd4j_min(r, nd4j::math::nd4j_min(g, b)); + T vv = sd::math::nd4j_max(r, sd::math::nd4j_max(g, b)); + T range = vv - sd::math::nd4j_min(r, sd::math::nd4j_min(g, b)); if (vv > 0) { *s = range / vv; } else { @@ -72,7 +72,7 @@ namespace helpers { while (fmodu >= (T) 2.0f) fmodu -= (T) 2.0f; - T x = c * (1. - nd4j::math::nd4j_abs(fmodu - 1.)); + T x = c * (1. - sd::math::nd4j_abs(fmodu - 1.)); switch (h_category) { case 0: rr = c; diff --git a/libnd4j/include/ops/declarable/helpers/axis.h b/libnd4j/include/ops/declarable/helpers/axis.h index 42876b6f0..76c5a070d 100644 --- a/libnd4j/include/ops/declarable/helpers/axis.h +++ b/libnd4j/include/ops/declarable/helpers/axis.h @@ -19,10 +19,10 @@ // #ifndef __AXIS_H_HELPERS__ #define __AXIS_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/batched_gemm.h b/libnd4j/include/ops/declarable/helpers/batched_gemm.h index 5bf1c6c99..26651cf3c 100644 --- a/libnd4j/include/ops/declarable/helpers/batched_gemm.h +++ b/libnd4j/include/ops/declarable/helpers/batched_gemm.h @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/batchnorm.h b/libnd4j/include/ops/declarable/helpers/batchnorm.h index 1df55f6f3..72bc69718 100644 --- a/libnd4j/include/ops/declarable/helpers/batchnorm.h +++ b/libnd4j/include/ops/declarable/helpers/batchnorm.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/betaInc.h b/libnd4j/include/ops/declarable/helpers/betaInc.h index 
9192bb1c1..4c37d7c5d 100644 --- a/libnd4j/include/ops/declarable/helpers/betaInc.h +++ b/libnd4j/include/ops/declarable/helpers/betaInc.h @@ -22,15 +22,15 @@ #define LIBND4J_BETAINC_H #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { const uint maxIter = MAX_NUM_THREADS /*articles propose 10000*/; // max number of loop iterations in function for continued fractions - void betaInc(nd4j::LaunchContext* context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output); + void betaInc(sd::LaunchContext* context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output); } diff --git a/libnd4j/include/ops/declarable/helpers/choose.h b/libnd4j/include/ops/declarable/helpers/choose.h index 2f2a67277..233c6b1ff 100644 --- a/libnd4j/include/ops/declarable/helpers/choose.h +++ b/libnd4j/include/ops/declarable/helpers/choose.h @@ -19,15 +19,15 @@ // #ifndef __CHOOSE_H_HELPERS__ #define __CHOOSE_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void chooseFunctorArray(nd4j::LaunchContext * context, NDArray* arg, NDArray* comp, int mode, NDArray* result, NDArray* numResults); - void chooseFunctorScalar(nd4j::LaunchContext * context, NDArray* arg, double scalar, int mode, NDArray* result, NDArray* numResults); + void chooseFunctorArray(sd::LaunchContext * context, NDArray* arg, NDArray* comp, int mode, NDArray* result, NDArray* numResults); + void chooseFunctorScalar(sd::LaunchContext * context, NDArray* arg, double scalar, int mode, NDArray* result, NDArray* numResults); } } diff --git a/libnd4j/include/ops/declarable/helpers/col2im.h b/libnd4j/include/ops/declarable/helpers/col2im.h index 66d7a684a..39a29da85 100644 --- a/libnd4j/include/ops/declarable/helpers/col2im.h +++ b/libnd4j/include/ops/declarable/helpers/col2im.h @@ -23,11 +23,11 @@ #include -namespace nd4j { +namespace sd { namespace ops { 
namespace helpers { - ND4J_EXPORT void col2im(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW); + ND4J_EXPORT void col2im(sd::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW); } diff --git a/libnd4j/include/ops/declarable/helpers/compare_elem.h b/libnd4j/include/ops/declarable/helpers/compare_elem.h index 016c73bdc..18faee328 100644 --- a/libnd4j/include/ops/declarable/helpers/compare_elem.h +++ b/libnd4j/include/ops/declarable/helpers/compare_elem.h @@ -19,13 +19,13 @@ #define LIBND4J_COMPARE_ELEM_H #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void compare_elem(nd4j::LaunchContext * context, NDArray* input, bool isStrictlyIncreasing, bool& output); + void compare_elem(sd::LaunchContext * context, NDArray* input, bool isStrictlyIncreasing, bool& output); } } } diff --git a/libnd4j/include/ops/declarable/helpers/confusion.h b/libnd4j/include/ops/declarable/helpers/confusion.h index a0debcedc..a4d27be4f 100644 --- a/libnd4j/include/ops/declarable/helpers/confusion.h +++ b/libnd4j/include/ops/declarable/helpers/confusion.h @@ -19,14 +19,14 @@ // #ifndef __CONFUSION_H_HELPERS__ #define __CONFUSION_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void confusionFunctor(nd4j::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output); + void confusionFunctor(sd::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output); } } diff --git a/libnd4j/include/ops/declarable/helpers/convolutions.h b/libnd4j/include/ops/declarable/helpers/convolutions.h index e8bf735bc..6ba6136a4 100644 
--- a/libnd4j/include/ops/declarable/helpers/convolutions.h +++ b/libnd4j/include/ops/declarable/helpers/convolutions.h @@ -21,13 +21,13 @@ #ifndef LIBND4J_CONVOLUTIONS_H #define LIBND4J_CONVOLUTIONS_H -#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { enum PoolingType { @@ -64,9 +64,9 @@ namespace nd4j { oW = (iW - ((kW - 1) * dW + 1) + 2 * pW) / sW + 1; } else if(paddingMode == 1) { // same - oD = (int) nd4j::math::nd4j_ceil(iD * 1. / sD); - oH = (int) nd4j::math::nd4j_ceil(iH * 1. / sH); - oW = (int) nd4j::math::nd4j_ceil(iW * 1. / sW); + oD = (int) sd::math::nd4j_ceil(iD * 1. / sD); + oH = (int) sd::math::nd4j_ceil(iH * 1. / sH); + oW = (int) sd::math::nd4j_ceil(iW * 1. / sW); } else { // causal @@ -254,39 +254,39 @@ namespace nd4j { // } // } - static void conv2d(nd4j::graph::Context &context, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); + static void conv2d(sd::graph::Context &context, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); - // static void conv2d(nd4j::graph::Context & block, const std::vector& inArrs, NDArray* output, const std::vector& intArgs); + // static void conv2d(sd::graph::Context & block, const std::vector& inArrs, NDArray* output, const std::vector& intArgs); - // static void conv2dBP(nd4j::graph::Context & block, const std::vector& inArrs, const std::vector& outArrs, const std::vector& intArgs); + // static void conv2dBP(sd::graph::Context & block, const std::vector& inArrs, const std::vector& outArrs, const std::vector& intArgs); - static void conv2dBP(nd4j::graph::Context & block, const NDArray* input, const NDArray* weights, 
const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); + static void conv2dBP(sd::graph::Context & block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); - static void depthwiseConv2d(nd4j::graph::Context & block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); + static void depthwiseConv2d(sd::graph::Context & block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); - static void depthwiseConv2dBP(nd4j::graph::Context & block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); + static void depthwiseConv2dBP(sd::graph::Context & block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); - static void sconv2d(nd4j::graph::Context & block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, 
const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); + static void sconv2d(sd::graph::Context & block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW); - static void vol2col(nd4j::graph::Context & block, const NDArray& vol, NDArray& col, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW); + static void vol2col(sd::graph::Context & block, const NDArray& vol, NDArray& col, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW); - static void col2vol(nd4j::graph::Context & block, const NDArray& col, NDArray& vol, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW); + static void col2vol(sd::graph::Context & block, const NDArray& col, NDArray& vol, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW); - static void upsampling2d(nd4j::graph::Context & block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW); + static void upsampling2d(sd::graph::Context & block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW); - static void upsampling3d(nd4j::graph::Context & block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW); + static void upsampling3d(sd::graph::Context & block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW); - static void upsampling2dBP(nd4j::graph::Context & block, 
const NDArray& gradO, NDArray& gradI, const bool isNCHW); + static void upsampling2dBP(sd::graph::Context & block, const NDArray& gradO, NDArray& gradI, const bool isNCHW); - static void upsampling3dBP(nd4j::graph::Context & block, const NDArray& gradO, NDArray& gradI, const bool isNCDHW); + static void upsampling3dBP(sd::graph::Context & block, const NDArray& gradO, NDArray& gradI, const bool isNCDHW); - static void pooling2d(nd4j::graph::Context & block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0); + static void pooling2d(sd::graph::Context & block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0); - static void pooling3d(nd4j::graph::Context & block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0); + static void pooling3d(sd::graph::Context & block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0); - static void pooling2dBP(nd4j::graph::Context & block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0); + static void pooling2dBP(sd::graph::Context & block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, 
const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0); - static void pooling3dBP(nd4j::graph::Context & block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0); + static void pooling3dBP(sd::graph::Context & block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0); }; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index ee45d46a7..5ac61964c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -188,9 +188,9 @@ namespace helpers { // gains = gains.add(.2).muli(sign(yGrads)).neq(sign(yIncs)).castTo(Nd4j.defaultFloatingPointType()) // .addi(gains.mul(0.8).muli(sign(yGrads)).neq(sign(yIncs))); auto gainsInternal = LAMBDA_TTT(x, grad, eps) { -// return T((x + 2.) * nd4j::math::nd4j_sign(grad) != nd4j::math::nd4j_sign(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign(grad) != nd4j::math::nd4j_sign(eps)); - //return T((x + 2.) * nd4j::math::nd4j_sign(grad) == nd4j::math::nd4j_sign(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign(grad) == nd4j::math::nd4j_sign(eps)); - T res = nd4j::math::nd4j_sign(grad) != nd4j::math::nd4j_sign(eps) ? x + T(.2) : x * T(.8); +// return T((x + 2.) * sd::math::nd4j_sign(grad) != sd::math::nd4j_sign(eps)) + T(x * 0.8 * sd::math::nd4j_sign(grad) != sd::math::nd4j_sign(eps)); + //return T((x + 2.) 
* sd::math::nd4j_sign(grad) == sd::math::nd4j_sign(eps)) + T(x * 0.8 * sd::math::nd4j_sign(grad) == sd::math::nd4j_sign(eps)); + T res = sd::math::nd4j_sign(grad) != sd::math::nd4j_sign(eps) ? x + T(.2) : x * T(.8); if(res < .01) res = .01; return res; }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 738da9bc5..d3e78368a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -20,12 +20,12 @@ // #include -#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -46,10 +46,10 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, if (inEWS == 1 && outEWS == 1) { for (int i = 0; i < length; i++) - max = nd4j::math::nd4j_max(max, inBuff[i]); + max = sd::math::nd4j_max(max, inBuff[i]); for (int i = 0; i < length; i++) { - outBuff[i] = nd4j::math::nd4j_exp(inBuff[i] - max); + outBuff[i] = sd::math::nd4j_exp(inBuff[i] - max); sum += outBuff[i]; } @@ -60,10 +60,10 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, else { for (int i = 0; i < length; i++) - max = nd4j::math::nd4j_max(max, inBuff[i * inEWS]); + max = sd::math::nd4j_max(max, inBuff[i * inEWS]); for (int i = 0; i < length; i++) { - T r = nd4j::math::nd4j_exp(inBuff[i * inEWS] - max); + T r = sd::math::nd4j_exp(inBuff[i * inEWS] - max); outBuff[i * outEWS] = r; sum += r; } @@ -77,7 +77,7 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, /////////////////////////////////////////////////////////////////// template - void static _softMaxDerivForVector(nd4j::LaunchContext * context, const void *input, const Nd4jLong *inShapeInfo, void *output) { + void static _softMaxDerivForVector(sd::LaunchContext * context, const void *input, const Nd4jLong *inShapeInfo, void *output) { const 
T* inBuff = reinterpret_cast(input); T* outBuff = reinterpret_cast(output); @@ -88,12 +88,12 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); - max = nd4j::math::nd4j_max(max, inBuff[offset]); + max = sd::math::nd4j_max(max, inBuff[offset]); } for (int i = 0; i < length; i++) { const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); - outBuff[offset] = nd4j::math::nd4j_exp(inBuff[offset] - max); + outBuff[offset] = sd::math::nd4j_exp(inBuff[offset] - max); sum += outBuff[offset]; } @@ -105,7 +105,7 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, } /////////////////////////////////////////////////////////////////// - void softmaxDerivative(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { + void softmaxDerivative(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { const int rank = input.rankOf(); int temp; @@ -124,7 +124,7 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, } /////////////////////////////////////////////////////////////////// -void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArray& output) { +void softMaxForVector(sd::LaunchContext * context, const NDArray& input, NDArray& output) { if(!input.isVector() || !output.isVector()) throw std::runtime_error("ops::helpers::softMaxForVector function: input and output arrays must be vectors !"); @@ -147,42 +147,42 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr if (inEWS == 1) { for (Nd4jLong i = 0; i < length; i++) - max = nd4j::math::nd4j_max(max, inBuff[i]); + max = sd::math::nd4j_max(max, inBuff[i]); PRAGMA_OMP_SIMD_SUM(sum) for (Nd4jLong i = 0; i < length; i++) { - outBuff[i] = nd4j::math::nd4j_exp(inBuff[i] - max); + outBuff[i] = sd::math::nd4j_exp(inBuff[i] 
- max); sum += outBuff[i]; } PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < length; i++) { outBuff[i] /= sum; - outBuff[i] = nd4j::math::nd4j_log(outBuff[i]); + outBuff[i] = sd::math::nd4j_log(outBuff[i]); } } else if (inEWS > 1) { PRAGMA_OMP_SIMD_MAX(max) for (Nd4jLong i = 0; i < length; i++) - max = nd4j::math::nd4j_max(max, inBuff[i * inEWS]); + max = sd::math::nd4j_max(max, inBuff[i * inEWS]); PRAGMA_OMP_SIMD_SUM(sum) for (Nd4jLong i = 0; i < length; i++) { - outBuff[i * inEWS] = nd4j::math::nd4j_exp(inBuff[i * inEWS] - max); + outBuff[i * inEWS] = sd::math::nd4j_exp(inBuff[i * inEWS] - max); sum += outBuff[i * inEWS]; } PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < length; i++) { outBuff[i * inEWS] /= sum; - outBuff[i * inEWS] = nd4j::math::nd4j_log(outBuff[i * inEWS]); + outBuff[i * inEWS] = sd::math::nd4j_log(outBuff[i * inEWS]); } } } /////////////////////////////////////////////////////////////////// - void logSoftMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArray& output) { + void logSoftMaxForVector(sd::LaunchContext * context, const NDArray& input, NDArray& output) { if(!input.isVector() || !output.isVector()) throw std::runtime_error("ops::helpers::logSoftMaxForVector function input and output arrays must be vectors !"); @@ -206,11 +206,11 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr #pragma omp simd reduction(max:max) for (uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); + max = sd::math::nd4j_max(max, inBuff[j]); #pragma omp simd reduction(+:sum) for (uint j = 0; j < tadLen; ++j) { - float temp = nd4j::math::nd4j_exp(inBuff[j] - max); + float temp = sd::math::nd4j_exp(inBuff[j] - max); outBuff[j] = temp; sum += temp; } @@ -237,11 +237,11 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr #pragma omp simd reduction(maxT:max) for (uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); + max = sd::math::nd4j_max(max, 
inBuff[j]); #pragma omp simd reduction(sumT:sum) for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[j] - max); + T temp = sd::math::nd4j_exp(inBuff[j] - max); outBuff[j] = temp; sum += temp; } @@ -257,7 +257,7 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr ////////////////////////////////////////////////////////////////////////// template -static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { +static void softmax_(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { const int rank = input.rankOf(); @@ -270,7 +270,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra } else if(input.isSameShapeStrict(output)) { - TadPack tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimension); + TadPack tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimension); Nd4jLong* tadShapeInfo = tadPack.primaryShapeInfo(); Nd4jLong* tadOffsets = tadPack.primaryOffsets(); const uint numOfSubArrs = tadPack.numberOfTads(); @@ -285,7 +285,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra else { uint inShapeInfoCast[MAX_RANK]; - bool canCast = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, inShapeInfoCast); + bool canCast = sd::DataTypeUtils::castShapeInfo(tadShapeInfo, inShapeInfoCast); auto offsets = new Nd4jLong[tadLen]; shape::calcOffsets(tadShapeInfo, offsets); @@ -299,10 +299,10 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra T sum = 0.f; for (uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); + max = sd::math::nd4j_max(max, inBuff[offsets[j]]); for (uint j = 0; j < tadLen; ++j) { - T temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); + T temp = sd::math::nd4j_exp(inBuff[offsets[j]] - max); outBuff[offsets[j]] = 
temp; sum += temp; } @@ -318,24 +318,24 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra } } else { - NDArray max = input.reduceAlongDimension(nd4j::reduce::Max, {dimension}, true); - input.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Subtract(), max, output, false); - output.applyTransform(nd4j::transform::Exp, output); - NDArray sum = output.reduceAlongDimension(nd4j::reduce::Sum, {dimension}, true); + NDArray max = input.reduceAlongDimension(sd::reduce::Max, {dimension}, true); + input.applyTrueBroadcast(sd::BroadcastOpsTuple::Subtract(), max, output, false); + output.applyTransform(sd::transform::Exp, output); + NDArray sum = output.reduceAlongDimension(sd::reduce::Sum, {dimension}, true); output /= sum; } } /////////////////////////////////////////////////////////////////// -void softmax(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { +void softmax(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { BUILD_SINGLE_SELECTOR(input.dataType(), softmax_, (context, input, output, dimension), FLOAT_TYPES); } ////////////////////////////////////////////////////////////////////////// -void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& alpha, NDArray& output) { +void prelu(sd::LaunchContext * context, const NDArray& input, const NDArray& alpha, NDArray& output) { const Nd4jLong inputLen = input.lengthOf(); const Nd4jLong* inputShapeInfo = input.getShapeInfo(); const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo(); @@ -356,7 +356,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a } ////////////////////////////////////////////////////////////////////////// -void preluBP(nd4j::LaunchContext * context, const NDArray& input, const NDArray& alpha, const NDArray& dLdO, NDArray& dLdI, NDArray& dLdA) { +void preluBP(sd::LaunchContext * context, const NDArray& input, const NDArray& alpha, const NDArray& 
dLdO, NDArray& dLdI, NDArray& dLdA) { const Nd4jLong inputLen = input.lengthOf(); const Nd4jLong* inputShapeInfo = input.getShapeInfo(); @@ -393,24 +393,24 @@ void preluBP(nd4j::LaunchContext * context, const NDArray& input, const NDArray& const_cast(input).applyLambda(routine, output); } - void thresholdRelu(nd4j::LaunchContext * context, NDArray const& input, double threshold, NDArray& output) { + void thresholdRelu(sd::LaunchContext * context, NDArray const& input, double threshold, NDArray& output) { BUILD_SINGLE_SELECTOR(input.dataType(), thresholdRelu_, (input, threshold, output), FLOAT_TYPES); } template - static void thresholdReluDerivative_(nd4j::LaunchContext * context, NDArray* input, double theta, NDArray* dLdO, NDArray* output) { + static void thresholdReluDerivative_(sd::LaunchContext * context, NDArray* input, double theta, NDArray* dLdO, NDArray* output) { auto derivative = LAMBDA_TT(_x, grO, theta) {if (_x > theta) return grO; else return static_cast(0); }; input->applyPairwiseLambda(*dLdO, derivative, *output); } - void thresholdReluDerivative(nd4j::LaunchContext * context, NDArray* input, double threshold, NDArray* dLdO, NDArray* output) { + void thresholdReluDerivative(sd::LaunchContext * context, NDArray* input, double threshold, NDArray* dLdO, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), thresholdReluDerivative_, (context, input, threshold, dLdO, output), FLOAT_TYPES); } /////////////////////////////////////////////////////////////////// - void logSoftmax(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { + void logSoftmax(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { const int rank = input.rankOf(); @@ -432,10 +432,10 @@ void preluBP(nd4j::LaunchContext * context, const NDArray& input, const NDArray& } } -BUILD_SINGLE_TEMPLATE(template void thresholdReluDerivative_, (nd4j::LaunchContext * context, NDArray* input, double threshold, NDArray* 
dLdO, NDArray* output), FLOAT_TYPES); -BUILD_SINGLE_TEMPLATE(template void softmax_, (nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension), FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template void thresholdReluDerivative_, (sd::LaunchContext * context, NDArray* input, double threshold, NDArray* dLdO, NDArray* output), FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template void softmax_, (sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension), FLOAT_TYPES); BUILD_SINGLE_TEMPLATE(template void logSoftMaxForVector_, (void *input, Nd4jLong *inShapeInfo, void *output, Nd4jLong *outShapeInfo), FLOAT_TYPES); - BUILD_SINGLE_TEMPLATE(template void _softMaxDerivForVector, (nd4j::LaunchContext * context, const void *input, const Nd4jLong *inShapeInfo, void *output), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void _softMaxDerivForVector, (sd::LaunchContext * context, const void *input, const Nd4jLong *inShapeInfo, void *output), FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index bfa1d5a32..68b8c6955 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #if defined(__GNUC__) @@ -39,7 +39,7 @@ #define align32 #endif -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -129,19 +129,19 @@ namespace nd4j { static void channel_atTheEnd_stride1_C(const Nd4jLong*& x_strides, const Nd4jLong*& bases, T* x, const T* b, T* z, const bool& inplace, const Nd4jLong& start, const Nd4jLong& stop, const Nd4jLong& inc) { size_t loop_count = (stop - start) / inc; - nd4j::CoordsState cst; - size_t offset = nd4j::init_coords(cst, start, bases, x_strides); + sd::CoordsState cst; + size_t offset = sd::init_coords(cst, start, bases, x_strides); if (!inplace) { for (size_t i = 0; i 
< loop_count; i++) { _add(&(x[offset]), b, &(z[offset]), inc); - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } else { for (size_t i = 0; i < loop_count; i++) { _add_inplace(&(x[offset]), b, inc); - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } } @@ -157,8 +157,8 @@ namespace nd4j { } else { size_t loop_count = (stop - start) / inc; - nd4j::ZipCoordsState cst; - nd4j::zip_size_t offset = nd4j::init_coords(cst, start, bases, x_strides, z_strides); + sd::ZipCoordsState cst; + sd::zip_size_t offset = sd::init_coords(cst, start, bases, x_strides, z_strides); Nd4jLong x_stride = ZIP_STRIDE1(cst, constRank - 1); Nd4jLong z_stride = ZIP_STRIDE2(cst, constRank - 1); @@ -166,7 +166,7 @@ namespace nd4j { /* bases are equal with different strides , but the last one is 1. So we can still vectorize it */ for (size_t i = 0; i < loop_count; i++) { _add(&(x[offset.first]), b, &(z[offset.second]), inc); - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } else { @@ -175,7 +175,7 @@ namespace nd4j { T* zz = &(z[offset.second]); for (size_t j = 0; j < inc; j++) zz[j * z_stride] = xx[j * x_stride] + b[j]; - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } } @@ -215,21 +215,21 @@ namespace nd4j { static void channel_NC_stride1_C(const Nd4jLong*& x_strides, const Nd4jLong*& bases, T* x, const T2* b, T* z, const bool& inplace, const Nd4jLong yStrideC, const Nd4jLong& start, const Nd4jLong& stop, const Nd4jLong& inc) { size_t loop_count = (stop - start) / inc; - nd4j::CoordsState cst; - size_t offset = nd4j::init_coords(cst, start, bases, x_strides); + sd::CoordsState cst; + size_t offset = sd::init_coords(cst, start, bases, x_strides); if (!inplace) { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[COORDS(cst, 1) * yStrideC]); _add_broadcast(&(x[offset]), yy, &(z[offset]), inc); - offset = nd4j::inc_coords(cst, offset); + offset = 
sd::inc_coords(cst, offset); } } else { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[COORDS(cst, 1) * yStrideC]); _add_broadcast_inplace(&(x[offset]), yy, inc); - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } } @@ -248,8 +248,8 @@ namespace nd4j { // (stop-start) % inc == 0 because we handled inside partitioning using the channel size size_t loop_count = (stop - start) / inc; - nd4j::ZipCoordsState cst; - nd4j::zip_size_t offset = nd4j::init_coords(cst, start, bases, x_strides, z_strides); + sd::ZipCoordsState cst; + sd::zip_size_t offset = sd::init_coords(cst, start, bases, x_strides, z_strides); Nd4jLong x_stride = ZIP_STRIDE1(cst, constRank - 1); Nd4jLong z_stride = ZIP_STRIDE2(cst, constRank - 1); if (same_order && z_stride == 1 && x_stride == 1) { @@ -257,7 +257,7 @@ namespace nd4j { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[ZIP_COORDS(cst, 1) * yStrideC]); _add_broadcast(&(x[offset.first]), yy, &(z[offset.second]), inc); - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } else { @@ -267,7 +267,7 @@ namespace nd4j { T yy = static_cast(b[ZIP_COORDS(cst, 1) * yStrideC]); for (size_t j = 0; j < inc; j++) zz[j * z_stride] = xx[j * x_stride] + yy; - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } } @@ -280,16 +280,16 @@ namespace nd4j { // (stop-start) % inc == 0 because we handled inside partitioning using the channel size size_t loop_count = (stop - start) / inc; - nd4j::CoordsState<1> cst; + sd::CoordsState<1> cst; //note: we had to manually pass index - size_t offset_p = nd4j::init_coords<2>(cst, start / inc, bases, x_strides); + size_t offset_p = sd::init_coords<2>(cst, start / inc, bases, x_strides); //partitioning was done using numHW, so we can increment from rank 2 if (inplaceOp) { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[COORDS(cst, 1) * yStrideC]); _add_broadcast_inplace(&(x[offset_p]), 
yy, inc); - offset_p = nd4j::inc_coords<2>(cst, offset_p); + offset_p = sd::inc_coords<2>(cst, offset_p); } } else { @@ -297,14 +297,14 @@ namespace nd4j { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[COORDS(cst, 1)]); _add_broadcast(&(x[offset_p]), yy, &(z[offset_p]), inc); - offset_p = nd4j::inc_coords<2>(cst, offset_p); + offset_p = sd::inc_coords<2>(cst, offset_p); } } else { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[COORDS(cst, 1) * yStrideC]); _add_broadcast(&(x[offset_p]), yy, &(z[offset_p]), inc); - offset_p = nd4j::inc_coords<2>(cst, offset_p); + offset_p = sd::inc_coords<2>(cst, offset_p); } } } @@ -316,20 +316,20 @@ namespace nd4j { { // (stop-start) % inc == 0 because we handled inside partitioning using the channel size size_t loop_count = (stop - start) / inc; - nd4j::CoordsState cst; - size_t offset_p = nd4j::init_coords(cst, start, bases, x_strides); + sd::CoordsState cst; + size_t offset_p = sd::init_coords(cst, start, bases, x_strides); if (!inplace) { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[COORDS(cst, b_index) * yStrideC]); _add_broadcast(&(x[offset_p]), yy, &(z[offset_p]), inc); - offset_p = nd4j::inc_coords(cst, offset_p); + offset_p = sd::inc_coords(cst, offset_p); } } else { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[COORDS(cst, b_index) * yStrideC]); _add_broadcast_inplace(&(x[offset_p]), yy, inc); - offset_p = nd4j::inc_coords(cst, offset_p); + offset_p = sd::inc_coords(cst, offset_p); } } } @@ -346,8 +346,8 @@ namespace nd4j { // (stop-start) % inc == 0 because we handled inside partitioning using the channel size size_t loop_count = (stop - start) / inc; - nd4j::ZipCoordsState cst; - nd4j::zip_size_t offset = nd4j::init_coords(cst, start, bases, x_strides, z_strides); + sd::ZipCoordsState cst; + sd::zip_size_t offset = sd::init_coords(cst, start, bases, x_strides, z_strides); Nd4jLong x_stride = ZIP_STRIDE1(cst, 0); Nd4jLong z_stride = ZIP_STRIDE2(cst, 0); 
if (same_order && z_stride == 1 && x_stride == 1) { @@ -355,7 +355,7 @@ namespace nd4j { for (size_t i = 0; i < loop_count; i++) { T yy = static_cast(b[ZIP_COORDS(cst, b_index) * yStrideC]); _add_broadcast(&(x[offset.first]), yy, &(z[offset.second]), inc); - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } else { @@ -365,7 +365,7 @@ namespace nd4j { T yy = static_cast(b[ZIP_COORDS(cst, b_index) * yStrideC]); for (size_t j = 0; j < inc; j++) zz[j * z_stride] = xx[j * x_stride] + yy; - offset = nd4j::inc_coords(cst, offset); + offset = sd::inc_coords(cst, offset); } } } @@ -418,7 +418,7 @@ namespace nd4j { //for rank>5 if (rank > 5) { const int channelDim = isNCHW ? 1 : input.rankOf() - 1; // second or last - const_cast(input).applyBroadcast(nd4j::broadcast::Add, { channelDim }, bias, output); + const_cast(input).applyBroadcast(sd::broadcast::Add, { channelDim }, bias, output); return; } @@ -482,7 +482,7 @@ namespace nd4j { if (isContinuous) { //we can choose other inc and index for that case //but for now lets choose all till the last one - uint32_t req_numThreads = nd4j::Environment::getInstance()->maxMasterThreads(); + uint32_t req_numThreads = sd::Environment::getInstance()->maxMasterThreads(); isContinuous = false; if (rank > 2) { if (req_numThreads < 2 || bases[rank - 1] >= req_numThreads) { @@ -582,7 +582,7 @@ namespace nd4j { if (order == 'c' && isContinuous) { //sometimes last dimension is too big and multithreading could suffer using unfair partitioning //so we will do it only when inc is smaller our value or multithreading turned off - uint32_t req_numThreads = nd4j::Environment::getInstance()->maxMasterThreads(); + uint32_t req_numThreads = sd::Environment::getInstance()->maxMasterThreads(); if (req_numThreads < 2 || numNC >= req_numThreads || inc <= 2 * 8196 || rank == 3) { inc = numHW; } @@ -635,7 +635,7 @@ namespace nd4j { } } ////////////////////////////////////////////////////////////////////////// - void 
addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { + void addBias(sd::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { // bias.rankOf() == 1 ? bias : bias.reshape(bias.ordering(), {bias.lengthOf()}) BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, output, isNCHW), FLOAT_TYPES, FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp index 5a22b02eb..078ebda10 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -59,8 +59,8 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; @@ -92,13 +92,13 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr } -void adjustHue(nd4j::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC) { +void adjustHue(sd::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), adjustHue_, (input, deltaScalarArr, output, dimC), FLOAT_TYPES); } /* template -static void 
adjust_hue_single_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { +static void adjust_hue_single_(sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { // we're 100% sure it's 3 const int numChannels = 3; int tuples = array->lengthOf() / numChannels; @@ -166,7 +166,7 @@ static void adjust_hue_single_(nd4j::LaunchContext * context, NDArray *array, ND } } -void adjust_hue_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { +void adjust_hue_(sd::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { auto xType = array->dataType(); float d = delta->e(0); @@ -188,7 +188,7 @@ void adjust_hue_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, } } -BUILD_SINGLE_TEMPLATE(template void adjust_hue_single_, (nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC);, FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template void adjust_hue_single_, (sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC);, FLOAT_TYPES); */ diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp index 594280ebe..c5c5cf9c6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp @@ -25,7 +25,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -58,8 +58,8 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); + auto packX = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; @@ -89,14 +89,14 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA } -void adjustSaturation(nd4j::LaunchContext* context, const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC) { +void adjustSaturation(sd::LaunchContext* context, const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), adjustSaturation_, (input, factorScalarArr, output, dimC), FLOAT_TYPES); } /* template -static void adjust_saturation_single_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { +static void adjust_saturation_single_(sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { // we're 100% sure it's 3 const int numChannels = 3; int tuples = array->lengthOf() / numChannels; @@ -114,7 +114,7 @@ static void adjust_saturation_single_(nd4j::LaunchContext * context, NDArray *ar T h, s, v; // Convert the RGB color to Hue/V-range. helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + s = sd::math::nd4j_min((T) 1.0f, sd::math::nd4j_max((T) 0.0f, s * delta)); // Convert the hue and v-range back into RGB. helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2); } @@ -143,7 +143,7 @@ static void adjust_saturation_single_(nd4j::LaunchContext * context, NDArray *ar T h, s, v; // Convert the RGB color to Hue/V-range. 
helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + s = sd::math::nd4j_min((T) 1.0f, sd::math::nd4j_max((T) 0.0f, s * delta)); // Convert the hue and v-range back into RGB. helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo); } @@ -153,7 +153,7 @@ static void adjust_saturation_single_(nd4j::LaunchContext * context, NDArray *ar } } -void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { +void adjust_saturation(sd::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { auto xType = array->dataType(); float d = delta->e(0); @@ -177,7 +177,7 @@ void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *o } } -BUILD_SINGLE_TEMPLATE(template void adjust_saturation_single_, (nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC), FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template void adjust_saturation_single_, (sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC), FLOAT_TYPES); */ } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp b/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp index f082cd248..36f716630 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp index c63dc3c1c..daaf4f71a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp @@ -18,14 +18,14 @@ // @author raver119@gmail.com // -#include +#include #include #include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { 
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index 32824684f..a0e6cf061 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -19,12 +19,12 @@ // -#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -77,7 +77,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* const auto varOffset = paramSameOffset ? meanOffset : shape::getIndexOffset(j, variance->getShapeInfo()); const auto meanVal = m[meanOffset]; - auto sigmaInvGam = static_cast(1) / nd4j::math::nd4j_sqrt(v[varOffset] + epsilon); + auto sigmaInvGam = static_cast(1) / sd::math::nd4j_sqrt(v[varOffset] + epsilon); if(g != nullptr) { const auto gammaOffset = paramSameOffset ? meanOffset : shape::getIndexOffset(j, gamma->getShapeInfo()); @@ -162,7 +162,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray const auto meanOffset = shape::getOffset(mean->getShapeInfo(), coords); const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), coords); - T sigmaInvGam = 1. / nd4j::math::nd4j_sqrt(v[varianceOffset] + epsilon); + T sigmaInvGam = 1. / sd::math::nd4j_sqrt(v[varianceOffset] + epsilon); if(g != nullptr) { const auto gammaOffset = paramSameOffset ? 
meanOffset : shape::getOffset(gamma->getShapeInfo(), coords); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index 5e80d12fb..ec06610b8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -19,12 +19,12 @@ // #include -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -112,7 +112,7 @@ static T betaIncCore(T a, T b, T x) { /////////////////////////////////////////////////////////////////// template -static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output) { +static void betaIncForArray(sd::LaunchContext * context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output) { int xLen = x.lengthOf(); @@ -126,12 +126,12 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con /////////////////////////////////////////////////////////////////// // overload betaInc for arrays, shapes of a, b and x must be the same !!! 
-void betaInc(nd4j::LaunchContext * context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output) { +void betaInc(sd::LaunchContext * context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output) { auto xType = a.dataType(); BUILD_SINGLE_SELECTOR(xType, betaIncForArray, (context, a, b, x, output), FLOAT_TYPES); } -BUILD_SINGLE_TEMPLATE(template void betaIncForArray, (nd4j::LaunchContext * context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output), FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template void betaIncForArray, (sd::LaunchContext * context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index 5573bb8f6..db6d27ffd 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -21,13 +21,13 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] template -void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { +void col2im_(sd::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { auto imBuff = output.bufferAsT(); auto colBuff = input.bufferAsT(); @@ -132,7 +132,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp } -void col2im(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { +void col2im(sd::LaunchContext & context, const NDArray& input, NDArray& output, 
const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { BUILD_SINGLE_SELECTOR(input.dataType(), col2im_, (context, input, output, sH, sW, pH, pW, iH, iW, dH, dW), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp index e5e51d38f..12961fe92 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compare_elem.cpp @@ -17,7 +17,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -25,8 +25,8 @@ namespace nd4j { auto length = shape::length(input->getShapeInfo()); int elementsPerThread = length / ELEMENT_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, elementsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); + int num_threads = sd::math::nd4j_max(1, elementsPerThread); + num_threads = sd::math::nd4j_min(num_threads, omp_get_max_threads()); Nd4jLong sumt = 0; if(isStrictlyIncreasing) { @@ -62,7 +62,7 @@ namespace nd4j { } - void compare_elem(nd4j::LaunchContext * context, NDArray *input, bool isStrictlyIncreasing, bool& output) { + void compare_elem(sd::LaunchContext * context, NDArray *input, bool isStrictlyIncreasing, bool& output) { auto xType = input->dataType(); BUILD_SINGLE_SELECTOR(xType, _compare_elem, (input, isStrictlyIncreasing, output), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp index 3177cca34..94e74cd84 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_0.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { 
namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_0, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp index cd9c00dc5..9820c1392 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_1.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_1, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp index 3b126d288..2a78f285f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_2.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_2, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp 
b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp index cca97a1ac..13757997a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_3.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_3, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp index 568492c08..ea3043eeb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_4.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_4, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp index 1491c9e1d..60c1ae906 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_5.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { 
BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_5, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp index 8517a39e9..6e33d5546 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_6.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_6, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp index e12190170..ef4a199fd 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_7.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_7, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp index 
c4ddd7066..71cd2ebb8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_8.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_8, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp index 38cf05787..e9db5c303 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/compilation_units/crop_and_resize_9.cpp @@ -21,7 +21,7 @@ #include #include "../crop_and_resize.hpp" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, (NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops), NUMERIC_TYPES_9, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp index 1bdf0a6ad..6f2b2886e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp @@ -22,16 +22,16 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// template static void concat_(const std::vector& inArrs, NDArray& output, const int axis) { - nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis); + 
sd::SpecialMethods::concatCpuGeneric(inArrs, output, axis); } - void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { + void concat(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp index 39449c7f8..685d80d2d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -43,7 +43,7 @@ namespace helpers { samediff::Threads::parallel_for(func, 0, lLen); } - void confusionFunctor(nd4j::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { + void confusionFunctor(sd::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { auto xType = output->dataType(); // weights can be null BUILD_SINGLE_SELECTOR(xType, _confusionFunctor, (labels, predictions, weights, output), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index 51ddc0369..c1dd5dd56 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -22,11 +22,11 @@ #include #include #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { @@ -258,7 +258,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void conv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int 
sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + static void conv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, oC] always @@ -319,7 +319,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + static void conv2dBP_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, oC] always @@ -365,7 +365,7 @@ namespace nd4j { if(gradW) { auto ctx = block.launchContext(); helpers::im2col(*ctx, *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] - nd4j::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, {2, 0, 1, 3}); // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC] + sd::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, {2, 0, 1, 3}); // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC] } // ----- calculation 
of gradB ----- // @@ -379,7 +379,7 @@ namespace nd4j { } //----- calculation of gradI -----// - nd4j::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2, 3, 1, 0, 4, 5}); // [kH, kW, iC, oC]/[oC, iC, kH, kW]] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW] + sd::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2, 3, 1, 0, 4, 5}); // [kH, kW, iC, oC]/[oC, iC, kH, kW]] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW] helpers::col2im(*block.launchContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW); // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] @@ -391,7 +391,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + static void depthwiseConv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, mC] always @@ -499,7 +499,7 @@ namespace nd4j { // ----- calculation of gradW and gradB ----- // helpers::im2col(*input->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] - nd4j::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, {{2,0,1,3},{iC,kH*kW,mC}}); // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC] + sd::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, 
{{2,0,1,3},{iC,kH*kW,mC}}); // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC] // ----- calculation of gradB ----- // if(gradB) { @@ -513,7 +513,7 @@ namespace nd4j { } //----- calculation of gradI -----// - nd4j::MmulHelper::tensorDot(weights, gradO, &columns, {{2,0,1,3},{iC,kH*kW,mC}}, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW] + sd::MmulHelper::tensorDot(weights, gradO, &columns, {{2,0,1,3},{iC,kH*kW,mC}}, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW] helpers::col2im(*input->getContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW); // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] if(!isNCHW) { @@ -524,7 +524,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void sconv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + static void sconv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weightsDepth [kH, kW, iC, mC] always @@ -782,7 +782,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void pooling2d_(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { + static void pooling2d_(sd::graph::Context& 
block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { // input is [bS, iC, iH, iW] // output is [bS, iC, oH, oW] T* out = output.bufferAsT(); @@ -832,13 +832,13 @@ namespace nd4j { wend = wstart + kWEff; if (hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)sd::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if (wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); if (hend > iH) - hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); if (wend > iW) - wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); hstart *= iStride2; hend *= iStride2; @@ -881,13 +881,13 @@ namespace nd4j { wend = wstart + kWEff; if (hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)sd::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if (wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); if (hend > iH) - hend -= dH * 
((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); if (wend > iW) - wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); hstart *= iStride2; hend *= iStride2; @@ -935,13 +935,13 @@ namespace nd4j { wend = wstart + kWEff; if (hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)sd::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if (wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); if (hend > iH) - hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); if (wend > iW) - wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); hstart *= iStride2; hend *= iStride2; @@ -952,9 +952,9 @@ namespace nd4j { for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + sum += sd::math::nd4j_pow(sd::math::nd4j_abs(pIn[kh + kw]), extraParam0); - sum = nd4j::math::nd4j_pow(sum, static_cast((T) 1.f) / extraParam0); + sum = 
sd::math::nd4j_pow(sum, static_cast((T) 1.f) / extraParam0); out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } @@ -973,7 +973,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void pooling3d_(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { + static void pooling3d_(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { // input is [bS, iC, iD, iH, iW] // output is [bS, iC, oD, oH, oW] T* out = output.bufferAsT(); @@ -1119,7 +1119,7 @@ namespace nd4j { sum += pIn[kd + kh + kw]; if (extraParam0 == 0) //Exclude padding - sum /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(iStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(iStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(iStep4)); //Accounts for dilation + sum /= sd::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(iStep2)) * sd::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(iStep3)) * sd::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(iStep4)); //Accounts for dilation else if (extraParam0 == 1) //Include padding sum /= kProd; @@ -1179,9 +1179,9 @@ namespace nd4j { for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + sum += sd::math::nd4j_pow(sd::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - 
sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + sum = sd::math::nd4j_pow(sum, (T) 1.f / extraParam0); out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum; } @@ -1202,7 +1202,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void pooling2dBP_(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { + static void pooling2dBP_(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { // input [bS, iC, iH, iW] // gradI [bS, iC, iH, iW] -> gradI is output in this function // gradO [bS, iC, oH, oW] @@ -1265,13 +1265,13 @@ namespace nd4j { wend = wstart + kWEff; if (hstart < 0) - hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)sd::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if (wstart < 0) - wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); if (hend > iH) - hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); if (wend > iW) - wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + wend -= dW * ((wend - iW + dW - 1) / dW); 
//(Nd4jLong)sd::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); sum = -DataTypeUtils::max(); valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; @@ -1343,16 +1343,16 @@ namespace nd4j { if (hstart < 0) hstart += dH * ((-hstart + dH - 1) / - dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + dH); // (Nd4jLong)sd::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if (wstart < 0) wstart += dW * ((-wstart + dW - 1) / - dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); + dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); if (hend > iH) hend -= dH * ((hend - iH + dH - 1) / - dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + dH); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); if (wend > iW) wend -= dW * ((wend - iW + dW - 1) / - dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); hstart *= gIStride2; hend *= gIStride2; @@ -1362,9 +1362,9 @@ namespace nd4j { valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; if ((int) extraParam0 == 0) //Exclude padding - valO /= static_cast(nd4j::math::nd4j_ceil( + valO /= static_cast(sd::math::nd4j_ceil( static_cast(hend - hstart) / static_cast(gIStep2))) * - static_cast(nd4j::math::nd4j_ceil( + static_cast(sd::math::nd4j_ceil( static_cast(wend - wstart) / static_cast(gIStep3))); //Accounts for dilation else if ((int) extraParam0 == 1) //Include padding @@ -1402,16 +1402,16 @@ namespace nd4j { if (hstart < 0) hstart += dH * ((-hstart + dH - 1) / - dH); // (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); + dH); // (Nd4jLong)sd::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if (wstart < 0) wstart += dW * ((-wstart + dW - 1) / - dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-wstart) / 
static_cast(dW)); + dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(-wstart) / static_cast(dW)); if (hend > iH) hend -= dH * ((hend - iH + dH - 1) / - dH); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); + dH); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(hend-iH) / static_cast(dH)); if (wend > iW) wend -= dW * ((wend - iW + dW - 1) / - dW); //(Nd4jLong)nd4j::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); + dW); //(Nd4jLong)sd::math::nd4j_ceil(static_cast(wend-iW) / static_cast(dW)); sum = static_cast(0.f); valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3]; @@ -1425,37 +1425,37 @@ namespace nd4j { for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - sum += nd4j::math::nd4j_pow( - nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); + sum += sd::math::nd4j_pow( + sd::math::nd4j_abs(pIn[kh + kw]), extraParam0); - valO *= nd4j::math::nd4j_pow(sum, + valO *= sd::math::nd4j_pow(sum, ((T) 1. - extraParam0) / extraParam0); for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) - pgI[kh + kw] += valO * nd4j::math::nd4j_pow( - nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * - nd4j::math::nd4j_sgn(pIn[kh + kw]); + pgI[kh + kw] += valO * sd::math::nd4j_pow( + sd::math::nd4j_abs(pIn[kh + kw]), extraParam0 - 1.f) * + sd::math::nd4j_sgn(pIn[kh + kw]); } else { for (Nd4jLong kh = hstart; kh < hend; kh += dH) for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow( - nd4j::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), + sum += sd::math::nd4j_pow( + sd::math::nd4j_abs(pIn[kh * iStride2 + kw * iStride3]), extraParam0); - valO *= nd4j::math::nd4j_pow(sum, + valO *= sd::math::nd4j_pow(sum, ((T) 1. 
- extraParam0) / extraParam0); for (Nd4jLong kh = hstart; kh < hend; kh += dH) { for (Nd4jLong kw = wstart; kw < wend; kw += dW) { const auto inVal = pIn[kh * iStride2 + kw * iStride3]; pgI[kh * gIStride2 + kw * gIStride3] += valO * - nd4j::math::nd4j_pow( - nd4j::math::nd4j_abs( + sd::math::nd4j_pow( + sd::math::nd4j_abs( inVal), extraParam0 - 1.f) * - nd4j::math::nd4j_sgn( + sd::math::nd4j_sgn( inVal); } } @@ -1476,7 +1476,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// template - static void pooling3dBP_(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { + static void pooling3dBP_(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { // input [bS, iC, iD, iH, iW] // gradI [bS, iC, iD, iH, iW] -> gradI is output in this function // gradO [bS, iC, oD, oH, oW] @@ -1664,7 +1664,7 @@ namespace nd4j { valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4]; if (extraParam0 == 0) //Exclude padding - valO /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(gIStep2)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(gIStep3)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(gIStep4)); //Accounts for dilation + valO /= sd::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(gIStep2)) * sd::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(gIStep3)) * sd::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(gIStep4)); //Accounts for dilation 
else if (extraParam0 == 1) //Include padding valO /= kProd; @@ -1731,27 +1731,27 @@ namespace nd4j { for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); + sum += sd::math::nd4j_pow(sd::math::nd4j_abs(pIn[kd + kh + kw]), extraParam0); - valO *= nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); + valO *= sd::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); for (Nd4jLong kd = dstart; kd < dend; kd += iStep2) for (Nd4jLong kh = hstart; kh < hend; kh += iStep3) for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) - pgI[kd + kh + kw] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd + kh + kw]),extraParam0 - (T) 1.f) * nd4j::math::nd4j_sgn(pIn[kd + kh + kw]); + pgI[kd + kh + kw] += valO * sd::math::nd4j_pow(sd::math::nd4j_abs(pIn[kd + kh + kw]),extraParam0 - (T) 1.f) * sd::math::nd4j_sgn(pIn[kd + kh + kw]); } else { for (Nd4jLong kd = dstart; kd < dend; kd += dD) for (Nd4jLong kh = hstart; kh < hend; kh += dH) for (Nd4jLong kw = wstart; kw < wend; kw += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); + sum += sd::math::nd4j_pow(sd::math::nd4j_abs(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0); - valO *= nd4j::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); + valO *= sd::math::nd4j_pow(sum, ((T) 1.f - extraParam0) / extraParam0); for (Nd4jLong kd = dstart; kd < dend; kd += dD) for (Nd4jLong kh = hstart; kh < hend; kh += dH) for (Nd4jLong kw = wstart; kw < wend; kw += dW) { const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4]; - pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(inVal), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(inVal); + pgI[kd * gIStride2 + kh * gIStride3 + kw * 
gIStride4] += valO * sd::math::nd4j_pow(sd::math::nd4j_abs(inVal), extraParam0 - 1.f) * sd::math::nd4j_sgn(inVal); } } } @@ -1772,52 +1772,52 @@ namespace nd4j { - void ConvolutionUtils::conv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + void ConvolutionUtils::conv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::conv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + void ConvolutionUtils::conv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, 
const int dH, const int dW, const int paddingMode, const int isNCHW) { + void ConvolutionUtils::depthwiseConv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::depthwiseConv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + void ConvolutionUtils::depthwiseConv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::sconv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { + void ConvolutionUtils::sconv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int 
pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), sconv2d_, (block, input, weightsDepth, weightsPoint, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::vol2col(nd4j::graph::Context& block, const NDArray& volume, NDArray& columns, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { + void ConvolutionUtils::vol2col(sd::graph::Context& block, const NDArray& volume, NDArray& columns, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { BUILD_SINGLE_SELECTOR(volume.dataType(), vol2col_, (volume, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES); } - void ConvolutionUtils::col2vol(nd4j::graph::Context& block, const NDArray& columns, NDArray& volume, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { + void ConvolutionUtils::col2vol(sd::graph::Context& block, const NDArray& columns, NDArray& volume, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { BUILD_SINGLE_SELECTOR(volume.dataType(), col2vol_, (columns, volume, sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES); } - void ConvolutionUtils::upsampling2d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) { + void ConvolutionUtils::upsampling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) { BUILD_SINGLE_SELECTOR(input.dataType(), upsampling2d_, (input, output, factorH, factorW, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::upsampling3d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int 
factorH, const int factorW, const bool isNCDHW) { + void ConvolutionUtils::upsampling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) { BUILD_SINGLE_SELECTOR(input.dataType(), upsampling3d_, (input, output, factorD, factorH, factorW, isNCDHW), FLOAT_TYPES); } - void ConvolutionUtils::upsampling2dBP(nd4j::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) { + void ConvolutionUtils::upsampling2dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) { BUILD_SINGLE_SELECTOR(gradO.dataType(), upsampling2dBP_, (gradO, gradI, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::upsampling3dBP(nd4j::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) { + void ConvolutionUtils::upsampling3dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) { BUILD_SINGLE_SELECTOR(gradO.dataType(), upsampling3dBP_, (gradO, gradI, isNCHW), FLOAT_TYPES); } - void ConvolutionUtils::pooling2d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) { + void ConvolutionUtils::pooling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) { BUILD_SINGLE_SELECTOR(input.dataType(), pooling2d_, (block, input, output, kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0), FLOAT_TYPES); } - void ConvolutionUtils::pooling3d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const 
int dH, const int dW, const int poolingMode, const int extraParam0) { + void ConvolutionUtils::pooling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { BUILD_SINGLE_SELECTOR(input.dataType(), pooling3d_, (block, input, output, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES); } - void ConvolutionUtils::pooling2dBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { + void ConvolutionUtils::pooling2dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { BUILD_SINGLE_SELECTOR(input.dataType(), pooling2dBP_, (block, input, gradO, gradI, kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0), FLOAT_TYPES); } - void ConvolutionUtils::pooling3dBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { + void ConvolutionUtils::pooling3dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { 
BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dBP_, (block, input, gradO, gradI, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp index 233699163..ab6503946 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.cpp @@ -36,7 +36,7 @@ limitations under the License. #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -53,7 +53,7 @@ namespace nd4j { // \@param crops - output image batch (4D with given type) // void - cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, + cropAndResizeFunctor(sd::LaunchContext * context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_, (images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp index 1f55378c0..c7d29c471 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -64,8 +64,8 @@ namespace nd4j { continue; } if (method == 0 /* bilinear */) { - const int topYIndex = nd4j::math::p_floor(inY); - const int bottomYIndex = nd4j::math::p_ceil(inY); + const int topYIndex = sd::math::p_floor(inY); + const int bottomYIndex = sd::math::p_ceil(inY); const float y_lerp = inY - topYIndex; for 
(auto x = 0; x < cropWidth; ++x) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp index 6a8523925..51af1840b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp @@ -23,11 +23,11 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { -void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { +void crossBatched(sd::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { auto _a = a->reshape(a->ordering(), {-1, 3}); auto _b = b->reshape(b->ordering(), {-1, 3}); auto _o = o->reshape(o->ordering(), {-1, 3}, false); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp index d3e524ff4..700e5b8dd 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -93,7 +93,7 @@ namespace helpers { } } - void _depthToSpace(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { + void _depthToSpace(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { auto xType = input->dataType(); BUILD_SINGLE_SELECTOR(xType, __depthToSpace, (input, output, block_size, isNHWC), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp index 2a51b92a6..37abaf559 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -38,7 +38,7 @@ static void diGamma_(const NDArray& x, NDArray& z) { 
samediff::Threads::parallel_for(func, 0, x.lengthOf()); } -void diGamma(nd4j::LaunchContext* context, const NDArray& x, NDArray& z) { +void diGamma(sd::LaunchContext* context, const NDArray& x, NDArray& z) { BUILD_SINGLE_SELECTOR(x.dataType(), diGamma_, (x, z), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp index f2f2033c1..670ad5322 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/diag.cpp @@ -18,10 +18,10 @@ // Created by GS on 4/6/2018. // -#include "ResultSet.h" +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -38,7 +38,7 @@ static void _diagFunctor(const NDArray* input, NDArray* output) { output->p(i * (inLength + 1), (*input).e(i)); } - void diagFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { + void diagFunctor(sd::LaunchContext * context, const NDArray* input, NDArray* output) { auto xType = input->dataType(); BUILD_SINGLE_SELECTOR(xType, _diagFunctor, (input, output), LIBND4J_TYPES); @@ -46,7 +46,7 @@ static void _diagFunctor(const NDArray* input, NDArray* output) { BUILD_SINGLE_TEMPLATE(template void _diagFunctor, (const NDArray* input, NDArray* output);, LIBND4J_TYPES); -void diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) { +void diagPartFunctor(sd::LaunchContext * context, NDArray const* input, NDArray* output) { const int outLen = output->lengthOf(); const int inLen = input->lengthOf(); int i(0), j(0); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp index b17d33db3..fbf071e28 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers 
{ @@ -90,7 +90,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const samediff::Threads::parallel_for(func, 0, bS, 1, 0, oH, 1); } -void dilation2d(nd4j::LaunchContext* context, NDArray *input, NDArray *weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { +void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), dilation2d_, (input, weights, output, sH, sW, pH, pW, dH, dW), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index e529ab84f..54981dea5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -19,19 +19,19 @@ // #include -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template static void dropoutSimple(NDArray const* input, NDArray* output, double probValue, int seed) { - nd4j::graph::RandomGenerator nodeRng(3019L, seed); + sd::graph::RandomGenerator nodeRng(3019L, seed); int inLen = input->lengthOf(); auto func = PRAGMA_THREADS_FOR { @@ -50,7 +50,7 @@ namespace helpers { template int dropOutFunctor_(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) { //NativeOps native; - //nd4j::graph::RandomGenerator nodeRng(seed); //static int dropOutFunctor_(nd4j::random::RandomBuffer* rng, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) { + //sd::graph::RandomGenerator nodeRng(seed); //static int dropOutFunctor_(sd::random::RandomBuffer* rng, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) { //NativeOps native; //native.reSeedBuffer(nullptr, 
(long)seed, rng); //if (newRng ) @@ -127,7 +127,7 @@ namespace helpers { // return ND4J_STATUS_BAD_RNG; //T probValueArr[] = {probValue, alpha, alpha1, beta}; //input->template applyRandom>(rng, nullptr, output, probValueArr); - nd4j::graph::RandomGenerator nodeRng(3019L, seed); + sd::graph::RandomGenerator nodeRng(3019L, seed); auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 3030b1255..2b6b4cd02 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -20,7 +20,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -183,7 +183,7 @@ namespace nd4j { outputList[1]->assign(indices); } - void dynamicPartitionFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList) { + void dynamicPartitionFunctor(sd::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList) { auto xType = input->dataType(); BUILD_SINGLE_SELECTOR(xType, _dynamicPartitionFunctor, (input, indices, outputList), LIBND4J_TYPES); @@ -194,19 +194,19 @@ namespace nd4j { throw std::runtime_error("Not umplemented yet"); } - int dynamicStitchFunctor(nd4j::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output){ + int dynamicStitchFunctor(sd::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output){ auto xType = inputs.at(0)->dataType(); BUILD_SINGLE_SELECTOR(xType, return _dynamicStitchFunctor, (inputs, indices, output), LIBND4J_TYPES); } - int dynamicStitchFunctorBP(nd4j::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray const* gradInput, std::vector& outputList) { + int dynamicStitchFunctorBP(sd::LaunchContext * context, std::vector const& 
inputs, std::vector const& indices, NDArray const* gradInput, std::vector& outputList) { auto xType = inputs.at(0)->dataType(); BUILD_SINGLE_SELECTOR(xType, return _dynamicStitchFunctorBP, (inputs, indices, gradInput, outputList), LIBND4J_TYPES); } - void dynamicPartitionFunctorBP(nd4j::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector const& inputGradientList, std::vector& outputList) { + void dynamicPartitionFunctorBP(sd::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector const& inputGradientList, std::vector& outputList) { auto xType = input->dataType(); BUILD_SINGLE_SELECTOR(xType, _dynamicPartitionFunctorBP, (input, indices, inputGradientList, outputList), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp index b2707ea5c..15ea569e8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -86,7 +86,7 @@ namespace helpers { } - void extractPatches(nd4j::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame){ + void extractPatches(sd::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame){ auto xType = images->dataType(); BUILD_SINGLE_SELECTOR(xType, _extractPatches, (images, output, sizeRow, sizeCol, stradeRow, stradeCol, rateRow, rateCol, theSame), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp index f18f48fac..d2c918da9 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/fake_quantization.cpp @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -47,7 +47,7 @@ namespace helpers { if (zeroPointFromMin > quantMaxF) { return static_cast(quantMax); } - return (uint16_t)nd4j::math::nd4j_round(zeroPointFromMin); + return (uint16_t)sd::math::nd4j_round(zeroPointFromMin); }(); // compute nudged min and max with computed nudged zero point *nudgedMin = (quantMinF - nudged_zero_point) * (*scale); @@ -102,7 +102,7 @@ namespace helpers { val = nudgedMax; // converse value with scale and shifted with nudged min val -= nudgedMin; - return (nd4j::math::nd4j_floor(val / scale + T(0.5f)) * scale + nudgedMin); + return (sd::math::nd4j_floor(val / scale + T(0.5f)) * scale + nudgedMin); }; input->applyLambda(fakeQuantizationWithMinMax, *output); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp index d43cd716f..aadd74298 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -50,7 +50,7 @@ namespace nd4j { } } - void flatten(nd4j::LaunchContext *context, std::vector &inputs, NDArray *output, char order) { + void flatten(sd::LaunchContext *context, std::vector &inputs, NDArray *output, char order) { BUILD_SINGLE_SELECTOR(output->dataType(), flatten_, (inputs, output, order), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp index ed844e84f..fb715a5e5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp @@ -21,15 +21,15 @@ #include #include #include -#include 
-#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////// -void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs) { +void gather(sd::LaunchContext * context, const NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs) { int axis = intArgs.size() > 0 ? intArgs[0] : 0; const int inputRank = input->rankOf(); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gradient.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gradient.cpp index f6756dd88..df5ee1afc 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gradient.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gradient.cpp @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -33,7 +33,7 @@ static void applyGradientDescent_(NDArray* input, NDArray* step, double weight, input->applyPairwiseLambda(*step, lambda, *output); } -void applyGradientDescent(nd4j::LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) { +void applyGradientDescent(sd::LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), applyGradientDescent_, (input, step, weight, output), FLOAT_TYPES); } BUILD_SINGLE_TEMPLATE(template void applyGradientDescent_, (NDArray* input, NDArray* step, double weight, NDArray* output), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp index 3db5e5373..b00036b81 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp @@ -24,18 +24,18 @@ // "Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation" -#include +#include 
#include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// -void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, +void gruCell(sd::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, NDArray* r, NDArray* u, NDArray* c, NDArray* h) { @@ -128,7 +128,7 @@ void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLa } ////////////////////////////////////////////////////////////////////////// -void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { +void gruTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { // x input [time, bS, iS] // hLast initial cell output (at time step = 0) [bS, nU] @@ -154,7 +154,7 @@ void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* } ////////////////////////////////////////////////////////////////////////// -void gruCellBP(nd4j::LaunchContext* context, +void gruCellBP(sd::LaunchContext* context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, const NDArray* dLdr, const NDArray* dLdu, const NDArray* dLdc, const NDArray* dLdh, diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp index fc6fc768b..10b6a27e0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -47,7 +47,7 @@ namespace nd4j { Nd4jLong 
distance = 0; auto lengthOf = x.lengthOf(); - int maxThreads = nd4j::math::nd4j_min(256, omp_get_max_threads()); + int maxThreads = sd::math::nd4j_min(256, omp_get_max_threads()); Nd4jLong intermediate[256]; // nullify temp values diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index 6ece88ae6..5893b2c88 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp index 8720b53d9..9fc6ddefb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -58,7 +58,7 @@ namespace nd4j { } } - void histogramHelper(nd4j::LaunchContext *context, NDArray &input, NDArray &output) { + void histogramHelper(sd::LaunchContext *context, NDArray &input, NDArray &output) { Nd4jLong numBins = output.lengthOf(); double min_val = input.reduceNumber(reduce::SameOps::Min).e(0); double max_val = input.reduceNumber(reduce::SameOps::Max).e(0); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp index 1ffb59824..9376e80bf 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogramFixedWidth.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -57,7 +57,7 @@ void histogramFixedWidth_(const NDArray& input, const NDArray& range, NDArray& o } } -void histogramFixedWidth(nd4j::LaunchContext * context, const 
NDArray& input, const NDArray& range, NDArray& output) { +void histogramFixedWidth(sd::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output) { BUILD_SINGLE_SELECTOR(input.dataType(), histogramFixedWidth_, (input, range, output), LIBND4J_TYPES); } BUILD_SINGLE_TEMPLATE(template void histogramFixedWidth_, (const NDArray& input, const NDArray& range, NDArray& output), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp index 43fa52d34..2129b4bee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp @@ -22,13 +22,13 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// template -static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { +static void im2col_(sd::LaunchContext & context, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { // input [bS, iC, iH, iW] is convoluted to output [bS, iC, kH, kW, oH, oW] @@ -129,7 +129,7 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra } -void im2col(nd4j::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { +void im2col(sd::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { BUILD_SINGLE_SELECTOR(im.dataType(), 
im2col_, (context, im, col, kH, kW, sH, sW, pH, pW, dH, dW, arrZeroPadVal), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_draw_bounding_boxes.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_draw_bounding_boxes.cpp index d9b018268..ee4faafb0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_draw_bounding_boxes.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_draw_bounding_boxes.cpp @@ -31,11 +31,11 @@ limitations under the License. // // @author sgazeos@gmail.com // -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { typedef std::vector> ColorTable_t; @@ -60,7 +60,7 @@ namespace helpers { return colorTable; } - void drawBoundingBoxesFunctor(nd4j::LaunchContext * context, NDArray* images, NDArray* boxes, NDArray* colors, NDArray* output) { + void drawBoundingBoxesFunctor(sd::LaunchContext * context, NDArray* images, NDArray* boxes, NDArray* colors, NDArray* output) { // images - batch of 3D images with BW (last dim = 1), RGB (last dim = 3) or RGBA (last dim = 4) channel set // boxes - batch of 2D bounds with last dim (y_start, x_start, y_end, x_end) to compute i and j as // floor((height - 1 ) * y_start) => rowStart, floor((height - 1) * y_end) => rowEnd @@ -95,13 +95,13 @@ namespace helpers { for (auto boxIndex = 0; boxIndex < numBoxes; ++boxIndex) { auto colorIndex = boxIndex % colorTable.size(); auto rowStart = Nd4jLong((height - 1) * boxes->t(batch, boxIndex, 0)); - auto rowStartBound = nd4j::math::nd4j_max(Nd4jLong(0), rowStart); + auto rowStartBound = sd::math::nd4j_max(Nd4jLong(0), rowStart); auto rowEnd = Nd4jLong((height - 1) * boxes->t(batch, boxIndex, 2)); - auto rowEndBound = nd4j::math::nd4j_min(Nd4jLong(height - 1), rowEnd); + auto rowEndBound = sd::math::nd4j_min(Nd4jLong(height - 1), rowEnd); auto colStart = Nd4jLong((width - 1) * boxes->t(batch, boxIndex, 1)); - auto colStartBound = nd4j::math::nd4j_max(Nd4jLong(0), 
colStart); + auto colStartBound = sd::math::nd4j_max(Nd4jLong(0), colStart); auto colEnd = Nd4jLong((width - 1) * boxes->t(batch, boxIndex, 3)); - auto colEndBound = nd4j::math::nd4j_min(Nd4jLong(width - 1), colEnd); + auto colEndBound = sd::math::nd4j_min(Nd4jLong(width - 1), colEnd); if (rowStart > rowEnd || colStart > colEnd) { nd4j_debug( diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index 18b52925a..2f0f00779 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -38,7 +38,7 @@ limitations under the License. #include #include "../cross.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -171,10 +171,10 @@ namespace helpers { for (auto k = start; k < stop; k++) { auto i = (outSize - k - 1); double const in = scaler(i, scale); - double const in_f = nd4j::math::nd4j_floor(in); - double const in_c = nd4j::math::nd4j_ceil(in); - interpolationData[i]._bottomIndex = nd4j::math::nd4j_max(static_cast(in_f), (Nd4jLong)0LL);//static_cast(in); - interpolationData[i]._topIndex = nd4j::math::nd4j_min(static_cast(in_c), inSize - 1); + double const in_f = sd::math::nd4j_floor(in); + double const in_c = sd::math::nd4j_ceil(in); + interpolationData[i]._bottomIndex = sd::math::nd4j_max(static_cast(in_f), (Nd4jLong)0LL);//static_cast(in); + interpolationData[i]._topIndex = sd::math::nd4j_min(static_cast(in_c), inSize - 1); interpolationData[i]._interpolarValue = in - in_f; } }; @@ -305,16 +305,16 @@ namespace helpers { auto func = PRAGMA_THREADS_FOR_2D { for (auto b = start_x; b < stop_x; b += inc_x) { for (auto y = start_y; y < stop_y; y += inc_y) { - auto posY = alignCorners ? 
static_cast(nd4j::math::p_round(scaler(y, st.heightScale))) : static_cast(nd4j::math::p_floor(scaler(y, st.heightScale))); - Nd4jLong inY = nd4j::math::nd4j_min(posY, inHeight - 1); + auto posY = alignCorners ? static_cast(sd::math::p_round(scaler(y, st.heightScale))) : static_cast(sd::math::p_floor(scaler(y, st.heightScale))); + Nd4jLong inY = sd::math::nd4j_min(posY, inHeight - 1); if (halfPixelCenter) { - inY = nd4j::math::nd4j_max(0LL, inY); + inY = sd::math::nd4j_max(0LL, inY); } for (Nd4jLong x = 0; x < outWidth; ++x) { - auto posX = alignCorners ? static_cast(nd4j::math::p_round(scaler(x, st.widthScale))) : static_cast(nd4j::math::p_floor(scaler(x, st.widthScale))); - Nd4jLong inX = nd4j::math::nd4j_min(posX,inWidth - 1); + auto posX = alignCorners ? static_cast(sd::math::p_round(scaler(x, st.widthScale))) : static_cast(sd::math::p_floor(scaler(x, st.widthScale))); + Nd4jLong inX = sd::math::nd4j_min(posX,inWidth - 1); if (halfPixelCenter) { - inX = nd4j::math::nd4j_max(0LL, inX); + inX = sd::math::nd4j_max(0LL, inX); } // copy pixel over all channels for (Nd4jLong e = 0; e < channels; e++) @@ -355,13 +355,13 @@ namespace helpers { // NUMERIC_TYPES, FLOAT_TYPES); // } - int resizeBilinearFunctor(nd4j::LaunchContext * context, NDArray const *images, int const width, int const height, + int resizeBilinearFunctor(sd::LaunchContext * context, NDArray const *images, int const width, int const height, bool const alignCorners, bool const halfPixelCenter, NDArray *output) { BUILD_DOUBLE_SELECTOR(images->dataType(), output->dataType(), return resizeBilinearFunctor_, (images, width, height, alignCorners, halfPixelCenter, output), NUMERIC_TYPES, FLOAT_TYPES); return Status::OK(); } - int resizeNeighborFunctor(nd4j::LaunchContext * context, NDArray const *images, int const width, int const height, + int resizeNeighborFunctor(sd::LaunchContext * context, NDArray const *images, int const width, int const height, bool const alignCorners, bool const halfPixelCenter, NDArray 
*output) { BUILD_SINGLE_SELECTOR(images->dataType(), return resizeNeighborFunctor_, (images, width, height, alignCorners, halfPixelCenter, output), LIBND4J_TYPES); } @@ -451,12 +451,12 @@ namespace helpers { } template - int resizeBicubicFunctor_(nd4j::LaunchContext * context, NDArray const* image, int width, int height, + int resizeBicubicFunctor_(sd::LaunchContext * context, NDArray const* image, int width, int height, bool preserveAspectRatio, bool antialias, NDArray* output) { return ND4J_STATUS_OK; } - int resizeBicubicFunctor(nd4j::LaunchContext * context, NDArray const* image, int width, int height, + int resizeBicubicFunctor(sd::LaunchContext * context, NDArray const* image, int width, int height, bool preserveAspectRatio, bool antialias, NDArray* output) { BUILD_SINGLE_SELECTOR(image->dataType(), return resizeBicubicFunctor_, (context, image, width, height, preserveAspectRatio, antialias, output), NUMERIC_TYPES); @@ -780,7 +780,7 @@ namespace helpers { // simplified bicubic resize without antialiasing // template - int resizeBicubicFunctorA_(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeBicubicFunctorA_(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, bool const halfPixelAlign, NDArray* output) { ImageResizerState st(alignCorners, halfPixelAlign); // align_corners, half_pixel_align int res = st.validateAndCreateOutput(image, width, height); @@ -789,7 +789,7 @@ namespace helpers { return res; } - int resizeBicubicFunctorA(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeBicubicFunctorA(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, bool const halfPixelAlign, NDArray* output) { BUILD_SINGLE_SELECTOR(image->dataType(), return resizeBicubicFunctorA_, (context, image, width, height, alignCorners, halfPixelAlign, output), 
NUMERIC_TYPES); } @@ -954,7 +954,7 @@ namespace helpers { } template - int resizeAreaFunctor_(nd4j::LaunchContext* context, NDArray const* image, int const width, int const height, + int resizeAreaFunctor_(sd::LaunchContext* context, NDArray const* image, int const width, int const height, bool const alignCorners, NDArray* output) { ImageResizerState st(alignCorners, false); // Create resize info auto res = st.validateAndCalculateOutputSize(image, width, height); @@ -988,13 +988,13 @@ namespace helpers { return res; } - int resizeAreaFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeAreaFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, NDArray* output) { BUILD_SINGLE_SELECTOR(image->dataType(), return resizeAreaFunctor_, (context, image, width, height, alignCorners, output), NUMERIC_TYPES); } // ------------------------------------------------------------------------------------------------------------------ // - int resizeFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, ImageResizeMethods method, bool preserveAspectRatio, bool antialias, NDArray* output) { switch (method) { case kResizeBilinear: return resizeBilinearFunctor(context, image, width, height, false, false, output); break; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp index bf5acd01e..c3ad42db3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp @@ -19,12 +19,12 @@ // #include -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -46,26 +46,26 @@ namespace helpers { 
std::vector selectedIndices(output->lengthOf(), 0); auto needToSuppressWithThreshold = [] (NDArray& boxes, int previousIndex, int nextIndex, T threshold) -> bool { if (previousIndex < 0 || nextIndex < 0) return true; - T minYPrev = nd4j::math::nd4j_min(boxes.t(previousIndex, 0), boxes.t(previousIndex, 2)); - T minXPrev = nd4j::math::nd4j_min(boxes.t(previousIndex, 1), boxes.t(previousIndex, 3)); - T maxYPrev = nd4j::math::nd4j_max(boxes.t(previousIndex, 0), boxes.t(previousIndex, 2)); - T maxXPrev = nd4j::math::nd4j_max(boxes.t(previousIndex, 1), boxes.t(previousIndex, 3)); - T minYNext = nd4j::math::nd4j_min(boxes.t(nextIndex, 0), boxes.t(nextIndex, 2)); - T minXNext = nd4j::math::nd4j_min(boxes.t(nextIndex, 1), boxes.t(nextIndex, 3)); - T maxYNext = nd4j::math::nd4j_max(boxes.t(nextIndex, 0), boxes.t(nextIndex, 2)); - T maxXNext = nd4j::math::nd4j_max(boxes.t(nextIndex, 1), boxes.t(nextIndex, 3)); + T minYPrev = sd::math::nd4j_min(boxes.t(previousIndex, 0), boxes.t(previousIndex, 2)); + T minXPrev = sd::math::nd4j_min(boxes.t(previousIndex, 1), boxes.t(previousIndex, 3)); + T maxYPrev = sd::math::nd4j_max(boxes.t(previousIndex, 0), boxes.t(previousIndex, 2)); + T maxXPrev = sd::math::nd4j_max(boxes.t(previousIndex, 1), boxes.t(previousIndex, 3)); + T minYNext = sd::math::nd4j_min(boxes.t(nextIndex, 0), boxes.t(nextIndex, 2)); + T minXNext = sd::math::nd4j_min(boxes.t(nextIndex, 1), boxes.t(nextIndex, 3)); + T maxYNext = sd::math::nd4j_max(boxes.t(nextIndex, 0), boxes.t(nextIndex, 2)); + T maxXNext = sd::math::nd4j_max(boxes.t(nextIndex, 1), boxes.t(nextIndex, 3)); T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); T areaNext = (maxYNext - minYNext) * (maxXNext - minXNext); if (areaNext <= T(0.f) || areaPrev <= T(0.f)) return false; - T minIntersectionY = nd4j::math::nd4j_max(minYPrev, minYNext); - T minIntersectionX = nd4j::math::nd4j_max(minXPrev, minXNext); - T maxIntersectionY = nd4j::math::nd4j_min(maxYPrev, maxYNext); - T maxIntersectionX = 
nd4j::math::nd4j_min(maxXPrev, maxXNext); + T minIntersectionY = sd::math::nd4j_max(minYPrev, minYNext); + T minIntersectionX = sd::math::nd4j_max(minXPrev, minXNext); + T maxIntersectionY = sd::math::nd4j_min(maxYPrev, maxYNext); + T maxIntersectionX = sd::math::nd4j_min(maxXPrev, maxXNext); T intersectionArea = - nd4j::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * - nd4j::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); + sd::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * + sd::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); T intersectionValue = intersectionArea / (areaPrev + areaNext - intersectionArea); return intersectionValue > threshold; @@ -140,7 +140,7 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template static Nd4jLong - nonMaxSuppressionGeneric_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int outputSize, + nonMaxSuppressionGeneric_(sd::LaunchContext* context, NDArray* boxes, NDArray* scores, int outputSize, float overlapThreshold, float scoreThreshold, NDArray* output, SimiliratyFunc f) { auto numBoxes = boxes->sizeAt(0); @@ -232,24 +232,24 @@ namespace helpers { } Nd4jLong - nonMaxSuppressionGeneric(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, + nonMaxSuppressionGeneric(sd::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output) { BUILD_DOUBLE_SELECTOR(boxes->dataType(), output == nullptr?DataType::INT32:output->dataType(), return nonMaxSuppressionGeneric_, (context, boxes, scores, maxSize, overlapThreshold, scoreThreshold, output, similiratyOverlaps), FLOAT_TYPES, INTEGER_TYPES); return 0; } Nd4jLong - nonMaxSuppressionV3(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, + nonMaxSuppressionV3(sd::LaunchContext* context, NDArray* 
boxes, NDArray* scores, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output) { BUILD_DOUBLE_SELECTOR(boxes->dataType(), output == nullptr?DataType::INT32:output->dataType(), return nonMaxSuppressionGeneric_, (context, boxes, scores, maxSize, overlapThreshold, scoreThreshold, output, similiratyV3), FLOAT_TYPES, INTEGER_TYPES); return 0; } - BUILD_DOUBLE_TEMPLATE(template Nd4jLong nonMaxSuppressionGeneric_, (nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, + BUILD_DOUBLE_TEMPLATE(template Nd4jLong nonMaxSuppressionGeneric_, (sd::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, float overlapThreshold, float scoreThreshold, NDArray* output, SimiliratyFunc similiratyFunc), FLOAT_TYPES, INTEGER_TYPES); void - nonMaxSuppression(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, + nonMaxSuppression(sd::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output) { BUILD_SINGLE_SELECTOR(boxes->dataType(), nonMaxSuppressionV2_, (boxes, scales, maxSize, overlapThreshold, scoreThreshold, output), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp index b98e7f026..46729fbb8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp @@ -24,7 +24,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -66,7 +66,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) { return; } -void transformRgbGrs(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { +void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { BUILD_SINGLE_SELECTOR(input.dataType(), rgbToGrs_, (input, output, dimC), 
NUMERIC_TYPES); } @@ -91,8 +91,8 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con return; } - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimC); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), dimC); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimC); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input.stridesOf()[dimC]; @@ -112,21 +112,21 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con template FORCEINLINE static void rgbYuv_(const NDArray& input, NDArray& output, const int dimC) { - auto op = nd4j::ops::helpers::rgbYuv; + auto op = sd::ops::helpers::rgbYuv; return rgbToFromYuv_(input, output, dimC, op); } -void transformRgbYuv(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { +void transformRgbYuv(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { BUILD_SINGLE_SELECTOR(input.dataType(), rgbYuv_, (input, output, dimC), FLOAT_TYPES); } template FORCEINLINE static void yuvRgb_(const NDArray& input, NDArray& output, const int dimC) { - auto op = nd4j::ops::helpers::yuvRgb; + auto op = sd::ops::helpers::yuvRgb; return rgbToFromYuv_(input, output, dimC, op); } -void transformYuvRgb(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { +void transformYuvRgb(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { BUILD_SINGLE_SELECTOR(input.dataType(), yuvRgb_, (input, output, dimC), FLOAT_TYPES); } @@ -149,8 +149,8 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); } else { - auto packX = 
nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; @@ -199,8 +199,8 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, samediff::Threads::parallel_for(func, 0, input->lengthOf(), 3); } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); const Nd4jLong numOfTads = packX.numberOfTads(); const Nd4jLong xDimCstride = input->stridesOf()[dimC]; @@ -231,13 +231,13 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output, template FORCEINLINE static void hsvRgb(const NDArray* input, NDArray* output, const int dimC) { - auto op = nd4j::ops::helpers::hsvToRgb; + auto op = sd::ops::helpers::hsvToRgb; return tripleTransformer(input, output, dimC, op); } template FORCEINLINE static void rgbHsv(const NDArray* input, NDArray* output, const int dimC) { - auto op = nd4j::ops::helpers::rgbToHsv; + auto op = sd::ops::helpers::rgbToHsv; return tripleTransformer(input, output, dimC, op); } @@ -266,19 +266,19 @@ FORCEINLINE static void yiqRgb(const NDArray* input, NDArray* output, const int -void transformHsvRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { +void transformHsvRgb(sd::LaunchContext* 
context, const NDArray* input, NDArray* output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), hsvRgb, (input, output, dimC), FLOAT_TYPES); } -void transformRgbHsv(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { +void transformRgbHsv(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), rgbHsv, (input, output, dimC), FLOAT_TYPES); } -void transformYiqRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { +void transformYiqRgb(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), yiqRgb, (input, output, dimC), FLOAT_TYPES); } -void transformRgbYiq(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { +void transformRgbYiq(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), rgbYiq, (input, output, dimC), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index 1fea8e4fe..5a4bb28cc 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -125,8 +125,8 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector //moving all dimensions (in sorted order) //to the back. 
//permuted version of the input shape info for setting up the tad problem - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), const_cast(dimensions.data()), dimensionsLength); - auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), const_cast(dimensions.data()), dimensionsLength); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), const_cast(dimensions.data()), dimensionsLength); + auto tadPackZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), const_cast(dimensions.data()), dimensionsLength); auto tadShapeShapeInfo = tadPack.primaryShapeInfo(); @@ -137,8 +137,8 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector int tads = tadPack.numberOfTads(); int tadsPerThread = tads / TAD_THRESHOLD; - int num_threads = nd4j::math::nd4j_max(1, tadsPerThread); - num_threads = nd4j::math::nd4j_min(num_threads, omp_get_max_threads()); + int num_threads = sd::math::nd4j_max(1, tadsPerThread); + num_threads = sd::math::nd4j_min(num_threads, omp_get_max_threads()); auto tadEWS = shape::elementWiseStride(tadShapeShapeInfo); auto zEWS = shape::elementWiseStride(tadPackZ.primaryShapeInfo()); @@ -200,7 +200,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector } -void ismax(nd4j::LaunchContext * context, const NDArray *input, NDArray *output, const std::vector& dimensions) { +void ismax(sd::LaunchContext * context, const NDArray *input, NDArray *output, const std::vector& dimensions) { BUILD_DOUBLE_SELECTOR(input->dataType(), output->dataType(), ismax_, (input, output, dimensions), LIBND4J_TYPES, LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp index 4db975ddf..b2a0e537f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/legacy_helper.cpp @@ -19,10 +19,10 @@ // #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -34,7 +34,7 @@ namespace helpers { theFirst->applyPairwiseLambda(*theSecond, functor, *theFirst); } - void reluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond) { + void reluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), reluDerivative__, (theFirst, theSecond), FLOAT_TYPES); } @@ -64,7 +64,7 @@ namespace helpers { */ } - void reluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void reluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), reluDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -77,7 +77,7 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void relu6Derivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void relu6Derivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), relu6Derivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -93,7 +93,7 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void leakyReluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { + void leakyReluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), leakyReluDerivative_, (theFirst, theSecond, theOutput, alpha), FLOAT_TYPES); } @@ -103,13 +103,13 @@ namespace helpers { const T alphaT = static_cast(alpha); auto functor = 
LAMBDA_TT(x, y, alphaT){ - return y * nd4j::math::nd4j_eluderivative(x, alphaT); + return y * sd::math::nd4j_eluderivative(x, alphaT); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void eluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { + void eluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), eluDerivative_, (theFirst, theSecond, theOutput, alpha), FLOAT_TYPES); } @@ -122,7 +122,7 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void seluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void seluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), seluDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -135,7 +135,7 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void cubeDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void cubeDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), cubeDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -149,7 +149,7 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void reduceNorm1(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void reduceNorm1(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), reduceNorm1_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -157,13 +157,13 @@ namespace helpers { template static void sigmCrossEntropy_(NDArray* logits, NDArray* labels, NDArray* 
output) { auto functor = LAMBDA_TT(x, y){ - return nd4j::math::nd4j_max(x, (T)0.f) - x * y + nd4j::math::nd4j_log((T)1.f + nd4j::math::nd4j_exp(-nd4j::math::nd4j_abs(x))); + return sd::math::nd4j_max(x, (T)0.f) - x * y + sd::math::nd4j_log((T)1.f + sd::math::nd4j_exp(-sd::math::nd4j_abs(x))); }; logits->applyPairwiseLambda(*labels, functor, *output); } - void sigmCrossEntropy(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { + void sigmCrossEntropy(sd::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropy_, (logits, labels, output), FLOAT_TYPES); } @@ -173,15 +173,15 @@ namespace helpers { // 1 - labels - 1 / (1 + exp(logits)) auto functor = LAMBDA_TT(x, y) { if(x <= 0) - return static_cast(1.) - y - static_cast(1.) / (static_cast(1.) + nd4j::math::nd4j_exp(x)); - auto e = nd4j::math::nd4j_exp(-x); + return static_cast(1.) - y - static_cast(1.) / (static_cast(1.) + sd::math::nd4j_exp(x)); + auto e = sd::math::nd4j_exp(-x); return static_cast(1.) - y - e / (static_cast(1.) 
+ e); }; logits->applyPairwiseLambda(*labels, functor, *output); } - void sigmCrossEntropyGrad(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { + void sigmCrossEntropyGrad(sd::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropyGrad_, (logits, labels, output), FLOAT_TYPES); } @@ -189,14 +189,14 @@ namespace helpers { template static void tanhDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T th = nd4j::math::nd4j_tanh(x); + T th = sd::math::nd4j_tanh(x); return y * ((T)1.0f - (th * th)); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void tanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void tanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), tanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -204,14 +204,14 @@ namespace helpers { template static void hardTanhDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T th = nd4j::math::nd4j_tanh(x); + T th = sd::math::nd4j_tanh(x); return y * simdOps::HardTanhDerivative::op(x, nullptr); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void hardTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void hardTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), hardTanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -224,51 +224,51 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void rationalTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void 
rationalTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), rationalTanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } template static void rectifiedTanhDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - return x > (T) 0.0f ? y * (nd4j::math::nd4j_tanhderivative(x)) : (T) 0.0f; + return x > (T) 0.0f ? y * (sd::math::nd4j_tanhderivative(x)) : (T) 0.0f; }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void rectifiedTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void rectifiedTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), rectifiedTanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } - // X f = (X) 1.0f + nd4j::math::nd4j_abs(d1); + // X f = (X) 1.0f + sd::math::nd4j_abs(d1); // return (X) d2 * ((X) 1.0f / (f * f)); template static void softSignDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T ss = (T)1.f + nd4j::math::nd4j_abs(x); + T ss = (T)1.f + sd::math::nd4j_abs(x); return y * ((T) 1.0f / (ss * ss)); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void softSignDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void softSignDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), softSignDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } template static void softPlusDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T p = nd4j::math::nd4j_pow(static_cast(M_E), x); + T p = sd::math::nd4j_pow(static_cast(M_E), x); return y * (p / (p + 1.)); }; 
input->applyPairwiseLambda(*epsilon, functor, *output); } - void softPlusDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void softPlusDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), softPlusDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } /// @@ -278,14 +278,14 @@ namespace helpers { template static void sigmoidDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T s = nd4j::math::nd4j_sigmoid(x); + T s = sd::math::nd4j_sigmoid(x); return y * (s * ((T) 1.0f - s)); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void sigmoidDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void sigmoidDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), sigmoidDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -298,7 +298,7 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void hardSigmoidDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void hardSigmoidDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), hardSigmoidDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -334,11 +334,11 @@ namespace helpers { output->applyTransform(transform::Log, *output); } - void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output) { + void logSumExp(sd::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, axis, output), FLOAT_TYPES); } - void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* 
subtrah, NDArray* axis, NDArray* output) { + void logSumExp(sd::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, subtrah, axis, output), FLOAT_TYPES); } @@ -351,15 +351,15 @@ static void weightedCrossEntropyWithLogitsFunctor_(NDArray const* targets, NDArr auto mainRoutineT1 = LAMBDA_TT(_x, _z, posWeight) { T targetWeight = (1. + (posWeight - (T)1.f) * _z); return (1. - _z) * _x + - targetWeight * (nd4j::math::nd4j_log((T)1.f + nd4j::math::nd4j_exp(-nd4j::math::nd4j_abs(_x))) + - nd4j::math::nd4j_max(-_x, T(0.f)) + targetWeight * (sd::math::nd4j_log((T)1.f + sd::math::nd4j_exp(-sd::math::nd4j_abs(_x))) + + sd::math::nd4j_max(-_x, T(0.f)) ); }; auto mainRoutineT2 = LAMBDA_TTT(_x, _z, _w) { return (((T)1.0 - _z) * _x) + - _w * (nd4j::math::nd4j_log(T(1.) + nd4j::math::nd4j_exp(-nd4j::math::nd4j_abs(_x))) + - nd4j::math::nd4j_max(-_x, T(0.f))); + _w * (sd::math::nd4j_log(T(1.) + sd::math::nd4j_exp(-sd::math::nd4j_abs(_x))) + + sd::math::nd4j_max(-_x, T(0.f))); }; @@ -377,7 +377,7 @@ static void weightedCrossEntropyWithLogitsFunctor_(NDArray const* targets, NDArr } } -void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) { +void weightedCrossEntropyWithLogitsFunctor(sd::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) { BUILD_SINGLE_SELECTOR(targets->dataType(), weightedCrossEntropyWithLogitsFunctor_, (targets, input, weights, output), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lgamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lgamma.cpp index 2978a9d45..3b71f7ce9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lgamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lgamma.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { 
namespace ops { namespace helpers { @@ -38,7 +38,7 @@ static void lgamma_(NDArray& x, NDArray& z) { x.applyLambda(lgammaProc, z); } -void lgamma(nd4j::LaunchContext* context, NDArray& x, NDArray& z) { +void lgamma(sd::LaunchContext* context, NDArray& x, NDArray& z) { BUILD_SINGLE_SELECTOR(x.dataType(), lgamma_, (x, z), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index 226e3ceed..31235d737 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -20,28 +20,28 @@ // #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template -static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* output, int depth, float bias, float alpha, float beta) { +static int lrnFunctor_(sd::graph::Context& block, NDArray* input, NDArray* output, int depth, float bias, float alpha, float beta) { nd4j_debug("MKL-DNN is not used for lrn!\n", 0); const int rank = input->rankOf(); - TadPack inTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {rank - 1}); + TadPack inTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {rank - 1}); TadPack outTadPack; if(shape::haveSameShapeAndStrides(input->getShapeInfo(), output->getShapeInfo())) outTadPack = inTadPack; else - outTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {rank - 1}); + outTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {rank - 1}); const Nd4jLong numOfTads = inTadPack.numberOfTads(); const Nd4jLong tadLen = input->sizeAt(-1); @@ -71,9 +71,9 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each 
squared sum in corresponding element of y array for (Nd4jLong j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint begin = sd::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); + const uint end = sd::math::nd4j_min(last, tadLen); if (j == 0) { for (uint s = begin; s < end; ++s) @@ -91,7 +91,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if (j != 0) prev = y[j]; - y[j] = x[j] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); + y[j] = x[j] / sd::math::nd4j_pow(tbias + alpha * prev, tbeta); } } }; @@ -109,9 +109,9 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array for (Nd4jLong j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint begin = sd::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); + const uint end = sd::math::nd4j_min(last, tadLen); if (j == 0) { for (uint s = begin; s < end; ++s) @@ -129,7 +129,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if (j != 0) prev = y[j * outTadEws]; - y[j * outTadEws] = x[j * inTadEws] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); + y[j * outTadEws] = x[j * inTadEws] / sd::math::nd4j_pow(tbias + alpha * prev, tbeta); } } }; @@ -139,9 +139,9 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out return Status::OK(); } -BUILD_SINGLE_TEMPLATE(template int lrnFunctor_, (nd4j::graph::Context& block, NDArray* input, NDArray* output, int depth, float bias, float alpha, float beta), FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template int lrnFunctor_, (sd::graph::Context& block, NDArray* input, NDArray* output, int depth, 
float bias, float alpha, float beta), FLOAT_TYPES); -int lrnFunctor(nd4j::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta) { +int lrnFunctor(sd::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta) { BUILD_SINGLE_SELECTOR(input->dataType(), return lrnFunctor_, (block, input, output, depth, bias, alpha, beta), FLOAT_TYPES); } @@ -151,13 +151,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c const int rank = input.rankOf(); - TadPack inTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {rank - 1}); + TadPack inTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {rank - 1}); TadPack gradITadPack; if(shape::haveSameShapeAndStrides(input.getShapeInfo(), gradI.getShapeInfo())) gradITadPack = inTadPack; else - gradITadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradI.getShapeInfo(), {rank - 1}); + gradITadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradI.getShapeInfo(), {rank - 1}); const Nd4jLong numOfTads = inTadPack.numberOfTads(); const Nd4jLong tadLen = input.sizeAt(-1); @@ -186,9 +186,9 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array for (Nd4jLong j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint begin = sd::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); + const uint end = sd::math::nd4j_min(last, tadLen); if (j == 0) { y[0] = 0; @@ -209,23 +209,23 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c Y prev = 0; // second loop calculates derivatives using 
information gained in first loop above for (Nd4jLong j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint begin = sd::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); + const uint end = sd::math::nd4j_min(last, tadLen); Y init = tbias + talpha * y[j]; if (j == 0) { for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); + factor[s] = sd::math::nd4j_pow(tbias + talpha * y[s], -tbeta - 1); prev = prev + x[s] * factor[s]; } y[0] = prev; } else if (begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + factor[end - 1] = sd::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); y[j] = prev + x[end - 1] * factor[end - 1]; } else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); + factor[end - 1] = sd::math::nd4j_pow(tbias + talpha * y[end - 1], -tbeta - 1); y[j] = prev + x[end - 1] * factor[end - 1] - x[begin - 1] * factor[begin - 1]; } else if (begin > 0 && last > tadLen) y[j] = prev - x[begin - 1] * factor[begin - 1]; @@ -254,9 +254,9 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array for (Nd4jLong j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint begin = sd::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); + const uint end = sd::math::nd4j_min(last, tadLen); if (j == 0) { y[0] = 0; @@ -281,24 +281,24 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c Y prev = 0; // second loop calculates derivatives using information gained in first 
loop above for (Nd4jLong j = 0; j < tadLen; ++j) { - const uint begin = nd4j::math::nd4j_max(0, j - depth); + const uint begin = sd::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; - const uint end = nd4j::math::nd4j_min(last, tadLen); + const uint end = sd::math::nd4j_min(last, tadLen); Y init = tbias + talpha * y[j * gradITadEws]; if (j == 0) { for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * y[s * gradITadEws], -tbeta - 1); + factor[s] = sd::math::nd4j_pow(tbias + talpha * y[s * gradITadEws], -tbeta - 1); prev = prev + x[s * inTadEws] * factor[s]; } y[0] = prev; } else if (begin == 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + factor[end - 1] = sd::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], -tbeta - 1); y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1]; } else if (begin > 0 && last <= tadLen) { - factor[end - 1] = nd4j::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], + factor[end - 1] = sd::math::nd4j_pow(tbias + talpha * y[(end - 1) * gradITadEws], -tbeta - 1); y[j * gradITadEws] = prev + x[(end - 1) * inTadEws] * factor[end - 1] - x[(begin - 1) * inTadEws] * factor[begin - 1]; @@ -323,7 +323,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c } -void lrnBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta) { +void lrnBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta) { BUILD_DOUBLE_SELECTOR(input.dataType(), gradO.dataType(), lrnBP_, (input, gradO, gradI, depth, bias, alpha, beta), FLOAT_TYPES, FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 
47c5c2a22..02d4c9855 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -27,22 +27,22 @@ #include -#include +#include #include #include #include #include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// -void lstmCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* ht_1, const NDArray* ct_1, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, +void lstmCell(sd::LaunchContext * context, const NDArray* xt, const NDArray* ht_1, const NDArray* ct_1, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, NDArray* ht, NDArray* ct, const std::vector& params) { // xt input [bS x nIn] @@ -126,7 +126,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); - h_[e] = nd4j::math::nd4j_tanh(c_[e]); + h_[e] = sd::math::nd4j_tanh(c_[e]); } }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstsq.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstsq.cpp index 4e9fb8fff..554486bbf 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstsq.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstsq.cpp @@ -17,18 +17,18 @@ // // @author GS // -#include -#include +#include +#include #include -#include -#include +#include +#include -#include "../lup.h" -#include "../triangular_solve.h" -#include "../lstsq.h" -#include "../qr.h" +#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -47,7 +47,7 @@ namespace helpers { } template - int leastSquaresSolveFunctor_(nd4j::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool 
const fast, NDArray* output) { + int leastSquaresSolveFunctor_(sd::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output) { NDArray::preparePrimaryUse({output}, {leftInput, rightInput}); if (fast) { // Cholesky decomposition approach // Equation for solve A^T * Ax = A^T * b, so @@ -99,7 +99,7 @@ namespace helpers { return Status::OK(); } - int leastSquaresSolveFunctor(nd4j::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output) { + int leastSquaresSolveFunctor(sd::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output) { BUILD_SINGLE_SELECTOR(leftInput->dataType(), return leastSquaresSolveFunctor_, (context, leftInput, rightInput, l2Regularizer, fast, output), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp index b9687532f..8466631da 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp @@ -19,13 +19,13 @@ // #include -#include -#include -#include +#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -156,8 +156,8 @@ namespace helpers { pivot = -1; //PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(pivot,pivotValue)) for(int rowCounter = i; rowCounter < rowNum; rowCounter++ ) { - if (nd4j::math::nd4j_abs(compoundMatrix.t(rowCounter, i)) > pivotValue) { - pivotValue = nd4j::math::nd4j_abs(compoundMatrix.t(rowCounter, i)); + if (sd::math::nd4j_abs(compoundMatrix.t(rowCounter, i)) > pivotValue) { + pivotValue = sd::math::nd4j_abs(compoundMatrix.t(rowCounter, i)); pivot = rowCounter; } } @@ -212,15 +212,15 @@ namespace helpers { auto rowNum = shape::sizeAt(compoundShape, 0); Nd4jLong xInitial[] = {column, 
column}; auto xInitialIndex = shape::getOffset(compoundShape, xInitial, 0); - auto maxValue = T(0); //nd4j::math::nd4j_abs(compoundBuffer[xInitialIndex]); + auto maxValue = T(0); //sd::math::nd4j_abs(compoundBuffer[xInitialIndex]); auto result = -1; //auto loop = PRAGMA_THREADS_FOR { auto start = column, stop = rowNum, increment = 1; for (auto rowCounter = start; rowCounter < stop; rowCounter++) { Nd4jLong xPos[] = {rowCounter, column}; auto xIndex = shape::getOffset(compoundShape, xPos, 0); - if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { - maxValue = nd4j::math::nd4j_max(maxValue, nd4j::math::nd4j_abs(compoundBuffer[xIndex])); + if (sd::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { + maxValue = sd::math::nd4j_max(maxValue, sd::math::nd4j_abs(compoundBuffer[xIndex])); result = rowCounter; } } @@ -353,7 +353,7 @@ namespace helpers { return Status::OK(); } - int determinant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + int determinant(sd::LaunchContext * context, NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return determinant_, (context, input, output), FLOAT_TYPES); } @@ -370,13 +370,13 @@ template } NDArray det = lup_(context, &matrix, (NDArray*)nullptr, (NDArray*)nullptr); if (det.e(0) != 0.f) - output->p(e, nd4j::math::nd4j_log(nd4j::math::nd4j_abs(det.t(0)))); + output->p(e, sd::math::nd4j_log(sd::math::nd4j_abs(det.t(0)))); } return ND4J_STATUS_OK; } - int logAbsDeterminant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + int logAbsDeterminant(sd::LaunchContext * context, NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return logAbsDeterminant_, (context, input, output), FLOAT_TYPES); } @@ -404,7 +404,7 @@ template T det = lup_(context, &matrix, &compound, &permutation).template e(0); // FIXME: and how this is going to work on float16? 
- if (nd4j::math::nd4j_abs(det) < T(0.000001)) { + if (sd::math::nd4j_abs(det) < T(0.000001)) { nd4j_printf("matrix_inverse: The matrix %i has no inverse due determinant is %lf. Quiting...\n", e, det); matrix.printIndexedBuffer("Wrong matrix"); return ND4J_STATUS_VALIDATION; @@ -423,8 +423,8 @@ template invertLowerMatrix(&lowerMatrix, &upperMatrix); - nd4j::MmulHelper::mmul(&matrix, &upperMatrix, &compound, 1.0, 0.0); - nd4j::MmulHelper::mmul(&compound, &permutation, &matrix, 1.0, 0.0); + sd::MmulHelper::mmul(&matrix, &upperMatrix, &compound, 1.0, 0.0); + sd::MmulHelper::mmul(&compound, &permutation, &matrix, 1.0, 0.0); for (int k = e * n2, row = 0; k < (e + 1) * n2; k++) { output->t(k) = matrix.template t(row++); } @@ -461,7 +461,7 @@ template } // FIXME: and how this is going to work on float16? - if (nd4j::math::nd4j_abs(det) < T(0.000001)) { + if (sd::math::nd4j_abs(det) < T(0.000001)) { nd4j_printf("matrix_inverse: The matrix %i has no inverse due determinant is %lf. Quiting...\n", e, det); matrix.printIndexedBuffer("Wrong matrix"); return ND4J_STATUS_VALIDATION; @@ -496,20 +496,20 @@ template return Status::OK(); } - int inverse(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + int inverse(sd::LaunchContext * context, NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return inverse_, (context, input, output), FLOAT_TYPES); } - int lowerInverseFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + int lowerInverseFunctor(sd::LaunchContext * context, NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return lowerInverse_, (context, input, output), FLOAT_TYPES); } - int upperInverseFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + int upperInverseFunctor(sd::LaunchContext * context, NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return upperInverse_, (context, input, output), FLOAT_TYPES); } template - static bool 
checkCholeskyInput_(nd4j::LaunchContext * context, NDArray const* input) { + static bool checkCholeskyInput_(sd::LaunchContext * context, NDArray const* input) { //std::unique_ptr matrix(NDArrayFactory::create_('c', {n, n}, input->dataType())); //, block.getWorkspace()); ResultSet lastMatrixList = input->allTensorsAlongDimension({input->rankOf() - 2, input->rankOf()-1}); for (size_t i = 0; i < lastMatrixList.size(); i++) { @@ -517,7 +517,7 @@ template // check for symmetric for (Nd4jLong r = 0; r < thisMatrix->rows(); r++) for (Nd4jLong c = 0; c < thisMatrix->columns(); c++) - if (nd4j::math::nd4j_abs(thisMatrix->e(r, c) - lastMatrixList.at(i)->e(c,r)) > DataTypeUtils::min()) return false; + if (sd::math::nd4j_abs(thisMatrix->e(r, c) - lastMatrixList.at(i)->e(c,r)) > DataTypeUtils::min()) return false; NDArray output = NDArrayFactory::create(0., context); if (ND4J_STATUS_OK != determinant(context, thisMatrix, &output)) return false; @@ -533,7 +533,7 @@ template return true; } - bool checkCholeskyInput(nd4j::LaunchContext * context, NDArray const* input) { + bool checkCholeskyInput(sd::LaunchContext * context, NDArray const* input) { BUILD_SINGLE_SELECTOR(input->dataType(), return checkCholeskyInput_, (context, input), FLOAT_TYPES); } @@ -568,7 +568,7 @@ template T diagonalSum = 0; for (Nd4jLong k = 0; k < col; ++k) diagonalSum += lowerMatrix->e(col, k) * lowerMatrix->e(col, k); - lowerMatrix->p(col, col, nd4j::math::nd4j_sqrt(matrix->e(col, col) - diagonalSum)); + lowerMatrix->p(col, col, sd::math::nd4j_sqrt(matrix->e(col, col) - diagonalSum)); //nd4j_printf("%i: ", col); //lowerMatrix->printIndexedBuffer("Lower matrix"); } @@ -580,7 +580,7 @@ template return ND4J_STATUS_OK; } - int cholesky(nd4j::LaunchContext * context, NDArray* input, NDArray* output, bool inplace) { + int cholesky(sd::LaunchContext * context, NDArray* input, NDArray* output, bool inplace) { BUILD_SINGLE_SELECTOR(input->dataType(), return cholesky_, (context, input, output, inplace), 
FLOAT_TYPES); } @@ -597,16 +597,16 @@ template for (Nd4jLong e = 0; e < totalCount; e++) { for (size_t i = 0; i < n; ++i) - output->t(e) += nd4j::math::nd4j_log(nd4j::math::nd4j_pow(matricies.at(e)->t(i, i), T(2))); + output->t(e) += sd::math::nd4j_log(sd::math::nd4j_pow(matricies.at(e)->t(i, i), T(2))); } return ND4J_STATUS_OK; } - int logdetFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + int logdetFunctor(sd::LaunchContext * context, NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return logdetFunctor_, (context, input, output), FLOAT_TYPES); } - int lup(nd4j::LaunchContext * context, NDArray* input, NDArray* compound, NDArray* permutation) { + int lup(sd::LaunchContext * context, NDArray* input, NDArray* compound, NDArray* permutation) { BUILD_DOUBLE_SELECTOR(input->dataType(), permutation->dataType(), lup_, (context, input, compound, permutation), FLOAT_NATIVE, INDEXING_TYPES); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp index 25605d77e..3372950f2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp @@ -18,11 +18,11 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include "ResultSet.h" +#include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -67,7 +67,7 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp } ////////////////////////////////////////////////////////////////////////// -void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) { +void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) { BUILD_SINGLE_SELECTOR(input.dataType(), matrixSetDiag_, (input, diagonal, output, 
zeroPad), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp index 53531dd17..d83f0dab9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp @@ -19,7 +19,7 @@ // #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -63,7 +63,7 @@ namespace helpers { } } - void matrixBandPart(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { + void matrixBandPart(sd::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { BUILD_SINGLE_SELECTOR(input->dataType(), matrixBandPart_, (input, output, lowerBand, upperBand), FLOAT_TYPES); } BUILD_SINGLE_TEMPLATE(template void matrixBandPart_, (NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index 8a2048263..3271dc110 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -18,12 +18,12 @@ // Created by GS on 3/21/2018. 
// -#include "ResultSet.h" +#include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -41,7 +41,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { nd4j_printf("matrix_diag_part: Input matrix has wrong shape.", ""); return ND4J_STATUS_VALIDATION; } - int lastDimension = nd4j::math::nd4j_min(input->sizeAt(-2), input->sizeAt(-1)); + int lastDimension = sd::math::nd4j_min(input->sizeAt(-2), input->sizeAt(-1)); // TODO: tune this properlys int lO = listOut.size(); @@ -56,7 +56,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { return Status::OK(); } - int matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { + int matrixDiagPart(sd::LaunchContext * context, const NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiagPart, (input, output), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/max_pooling.cpp b/libnd4j/include/ops/declarable/helpers/cpu/max_pooling.cpp index 6ebca9184..a458b5eff 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/max_pooling.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/max_pooling.cpp @@ -22,12 +22,12 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template - static void maxPoolingFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices) { + static void maxPoolingFunctor_(sd::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices) { int kY = params[0]; int kX = params[1]; @@ -72,7 +72,7 @@ namespace helpers { } - void maxPoolingFunctor(nd4j::LaunchContext * context, nd4j::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices) { + void maxPoolingFunctor(sd::LaunchContext * context, sd::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, 
NDArray* indices) { BUILD_SINGLE_SELECTOR(input->dataType(), maxPoolingFunctor_, (block, input, values, params, indices), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/meshgrid.cpp b/libnd4j/include/ops/declarable/helpers/cpu/meshgrid.cpp index a8a0d919d..336eacf20 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/meshgrid.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/meshgrid.cpp @@ -23,14 +23,14 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// -void meshgrid(nd4j::LaunchContext * context, const std::vector& inArrs, const std::vector& outArrs, const bool swapFirst2Dims) { +void meshgrid(sd::LaunchContext * context, const std::vector& inArrs, const std::vector& outArrs, const bool swapFirst2Dims) { const int rank = inArrs.size(); int inIndices[MAX_RANK]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/minimax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/minimax.cpp index 8d94d23ca..6174151d6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/minimax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/minimax.cpp @@ -19,11 +19,11 @@ // #ifndef __MIN_I_MAX_H_HELPERS__ #define __MIN_I_MAX_H_HELPERS__ -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -161,11 +161,11 @@ namespace helpers { } } - void minimumBPFunctor(nd4j::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { + void minimumBPFunctor(sd::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { BUILD_SINGLE_SELECTOR(x->dataType(), minimumBPFunctor_, (x, y, epsNext, gradX, gradY), NUMERIC_TYPES); } - void maximumBPFunctor(nd4j::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { + void maximumBPFunctor(sd::LaunchContext * 
context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { BUILD_SINGLE_SELECTOR(x->dataType(), maximumBPFunctor_, (x, y, epsNext, gradX, gradY), NUMERIC_TYPES); } BUILD_SINGLE_TEMPLATE(template void minimumBPFunctor_, (NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index 20d8bd34f..2730d9e88 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -19,12 +19,12 @@ // #include -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -47,7 +47,7 @@ namespace helpers { else { // rank greater than 1 std::vector lastDims({input->rankOf() - 1});// = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1}); - auto pack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(sortedVals.shapeInfo(), lastDims); + auto pack = sd::ConstantTadHelper::getInstance()->tadForDimensions(sortedVals.shapeInfo(), lastDims); SpecialMethods::sortTadGeneric(sortedVals.buffer(), sortedVals.shapeInfo(), lastDims.data(), lastDims.size(), pack.primaryShapeInfo(), pack.primaryOffsets(), reverse); @@ -65,7 +65,7 @@ namespace helpers { } } - void nthElementFunctor(nd4j::LaunchContext *launchContext, NDArray* input, Nd4jLong n, NDArray* output, bool reverse) { + void nthElementFunctor(sd::LaunchContext *launchContext, NDArray* input, Nd4jLong n, NDArray* output, bool reverse) { BUILD_SINGLE_SELECTOR(input->dataType(), nthElementFunctor_, (input, n, output, reverse), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index 71beed7f9..d3f7add49 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -23,7 +23,7 @@ #include #include "../one_hot.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -31,7 +31,7 @@ namespace nd4j { auto output = reinterpret_cast(voutput); auto indices = reinterpret_cast(vindices); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(zShapeInfo, {axis}); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(zShapeInfo, {axis}); auto iLen = static_cast(shape::length(iShapeInfo)); auto tLen = static_cast(shape::length(tadPack.primaryShapeInfo())); @@ -92,7 +92,7 @@ namespace nd4j { } } - void onehot(const nd4j::LaunchContext* context, const NDArray *indices, NDArray *output, const uint axis, const uint depth, const double on, const double off) { + void onehot(const sd::LaunchContext* context, const NDArray *indices, NDArray *output, const uint axis, const uint depth, const double on, const double off) { auto zType = output->dataType(); auto iType = indices->dataType(); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp index fa8061e54..3ffa4dd82 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/percentile.cpp @@ -19,10 +19,10 @@ // #include -#include -#include "ResultSet.h" +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -76,7 +76,7 @@ static void _percentile(const NDArray& input, NDArray& output, std::vector& } } - void percentile(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axises, const float q, const int interpolation) { + void percentile(sd::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axises, const float q, const int interpolation) { BUILD_SINGLE_SELECTOR(input.dataType(), _percentile, (input, output, axises, q, interpolation), 
LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index df80636ee..2c93cee08 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -20,10 +20,10 @@ #include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -49,7 +49,7 @@ static FORCEINLINE T getFactorial(const int n) { ////////////////////////////////////////////////////////////////////////// // implementation is based on serial representation written in terms of the Hurwitz zeta function as polygamma = (-1)^{n+1} * n! * zeta(n+1, x) template -static FORCEINLINE T polyGammaScalar(nd4j::LaunchContext * context, const int n, const T x) { +static FORCEINLINE T polyGammaScalar(sd::LaunchContext * context, const int n, const T x) { // if (n < 0) // throw("polyGamma function: n must be >= 0 !"); @@ -67,7 +67,7 @@ static FORCEINLINE T polyGammaScalar(nd4j::LaunchContext * context, const int n, ////////////////////////////////////////////////////////////////////////// // calculate polygamma function for arrays template -static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { +static void polyGamma_(sd::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { auto func = PRAGMA_THREADS_FOR { for (auto i = start; i < stop; i++) { @@ -83,11 +83,11 @@ static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const ND samediff::Threads::parallel_for(func, 0, x.lengthOf()); } - void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { + void polyGamma(sd::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { BUILD_SINGLE_SELECTOR(x.dataType(), polyGamma_, (context, n, x, output), FLOAT_TYPES); } -BUILD_SINGLE_TEMPLATE(template void 
polyGamma_, (nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output), FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template void polyGamma_, (sd::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp index 233d7d972..5307f841e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -116,11 +116,11 @@ namespace nd4j { prefix_(op, x->getBuffer(), x->getShapeInfo(), z->buffer(), z->shapeInfo(), exclusive, reverse); }; - void prefix(nd4j::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, bool exclusive, bool reverse) { + void prefix(sd::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, bool exclusive, bool reverse) { BUILD_SINGLE_SELECTOR(x->dataType(), prefix_, (op, x, z, exclusive, reverse), LIBND4J_TYPES); } - void prefix(nd4j::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, const std::vector& dims, bool exclusive, bool reverse) { + void prefix(sd::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, const std::vector& dims, bool exclusive, bool reverse) { BUILD_SINGLE_SELECTOR(x->dataType(), prefix_, (op, x, z, dims, exclusive, reverse), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/print_variable.cpp b/libnd4j/include/ops/declarable/helpers/cpu/print_variable.cpp index 293518be6..26a24a5af 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/print_variable.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/print_variable.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void print_special(LaunchContext &ctx, const NDArray &array, const 
std::string &message) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp index ce9f67551..2ea18a79d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp @@ -20,9 +20,9 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -123,7 +123,7 @@ namespace helpers { } - void qr(nd4j::LaunchContext* context, NDArray const* input, NDArray* outputQ, NDArray* outputR, bool const fullMatricies) { + void qr(sd::LaunchContext* context, NDArray const* input, NDArray* outputQ, NDArray* outputR, bool const fullMatricies) { BUILD_SINGLE_SELECTOR(input->dataType(), qr_, (input, outputQ, outputR, fullMatricies), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/random.cpp b/libnd4j/include/ops/declarable/helpers/cpu/random.cpp index e8f37f31c..b38101feb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/random.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/random.cpp @@ -22,12 +22,12 @@ //#include #include //#include -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -186,7 +186,7 @@ namespace helpers { auto nClassesPerSample = nSampleIndexInBatch * numOfClassX; for (Nd4jLong nClass = 0; nClass < numOfClassX; nClass += 1) { auto nIndex = nSamplesPerBatch + nClassesPerSample + nClass; - auto unifornLog = nd4j::math::nd4j_log(-nd4j::math::nd4j_log(rng.relativeT(nIndex, minVal, maxVal))); + auto unifornLog = sd::math::nd4j_log(-sd::math::nd4j_log(rng.relativeT(nIndex, minVal, maxVal))); Tx tValue = (xTad[nClass * xDimAstride] - unifornLog); if (tValue > Max) { Max = tValue; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp b/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp index 9fb2281b0..365465f64 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp @@ -23,7 +23,7 @@ #include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index a14fb89f9..e4349ac8a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -45,7 +45,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto samediff::Threads::parallel_for(func, 0, len); } - void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { + void range(sd::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { BUILD_SINGLE_SELECTOR(outVector.dataType(), _range, (start, delta, outVector), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 4c80e3bf2..3d17fb62a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -24,7 +24,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -38,7 +38,7 @@ inline void swap(T* arr, Nd4jLong from, Nd4jLong to) { // this legacy op is written by raver119@gmail.com template -static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *inShapeBuffer, void *voutArr, Nd4jLong *outShapeBuffer, int numOfElemsToReverse = 0) { +static void reverseArray(sd::LaunchContext * context, void *vinArr, Nd4jLong *inShapeBuffer, void *voutArr, Nd4jLong *outShapeBuffer, int numOfElemsToReverse = 0) { auto inArr = reinterpret_cast(vinArr); auto outArr = 
reinterpret_cast(voutArr); @@ -151,7 +151,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * /////////////////////////////////////////////////////////////////// template -static void reverseSequence_(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ +static void reverseSequence_(sd::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ int posOfNonUnityDim = -1; if(input->isVector() || shape::isLikeVector(input->getShapeInfo(), posOfNonUnityDim)) { @@ -188,12 +188,12 @@ static void reverseSequence_(nd4j::LaunchContext * context, const NDArray* input } } - void reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { + void reverseSequence(sd::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { BUILD_SINGLE_SELECTOR(input->dataType(), reverseSequence_, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); } ////////////////////////////////////////////////////////////////////////// -void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp) { +void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp) { // we need to reverse axis only if that's new op std::vector dimensions = isBackProp ? 
ShapeUtils::evalDimsToExclude(input->rankOf(), *intArgs) : *intArgs; @@ -210,8 +210,8 @@ void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* outpu } } -BUILD_SINGLE_TEMPLATE(template void reverseSequence_, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); -BUILD_SINGLE_TEMPLATE(template void reverseArray, (nd4j::LaunchContext * context, void *inArr, Nd4jLong *inShapeBuffer, void *outArr, Nd4jLong *outShapeBuffer, int numOfElemsToReverse), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void reverseSequence_, (sd::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void reverseArray, (sd::LaunchContext * context, void *inArr, Nd4jLong *inShapeBuffer, void *outArr, Nd4jLong *outShapeBuffer, int numOfElemsToReverse), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp index f61f1a1cf..278f3bcf5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -50,7 +50,7 @@ namespace helpers { auto _e0 = output->e(e); auto _e1 = output->e(sourceIndex); - //nd4j::math::nd4j_swap((*output)(e), (*output)(sourceIndex)); + //sd::math::nd4j_swap((*output)(e), (*output)(sourceIndex)); output->p(e, _e1); output->p(sourceIndex, _e0); } @@ -65,7 +65,7 @@ namespace helpers { auto _e0 = output->e(destinationIndex); auto _e1 = output->e(sourceIndex); - //nd4j::math::nd4j_swap((*output)(destinationIndex), (*output)(sourceIndex)); + //sd::math::nd4j_swap((*output)(destinationIndex), (*output)(sourceIndex)); output->p(destinationIndex, _e1); output->p(sourceIndex, _e0); } @@ -77,7 +77,7 @@ namespace helpers { 
auto _e0 = output->e(i); auto _e1 = output->e(i + remainShift); - //nd4j::math::nd4j_swap((*output)(i), (*output)(i + remainShift)); + //sd::math::nd4j_swap((*output)(i), (*output)(i + remainShift)); output->p(i, _e1); output->p(i + remainShift, _e0); @@ -85,7 +85,7 @@ namespace helpers { } } - void rollFunctorFull(nd4j::LaunchContext * context, NDArray* input, NDArray* output, std::vector const& shifts, std::vector const& axes, bool inplace){ + void rollFunctorFull(sd::LaunchContext * context, NDArray* input, NDArray* output, std::vector const& shifts, std::vector const& axes, bool inplace){ if (!inplace) output->assign(input); @@ -151,7 +151,7 @@ namespace helpers { } } - void rollFunctorLinear(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int shift, bool inplace){ + void rollFunctorLinear(sd::LaunchContext * context, NDArray* input, NDArray* output, int shift, bool inplace){ BUILD_SINGLE_SELECTOR(input->dataType(), rollFunctorLinear_, (input, output, shift, inplace), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 3f2c5d02f..bbbb9199e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -75,7 +75,7 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop BUILD_SINGLE_TEMPLATE(template void batchToSpace_, (const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// -void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight, const uint blockSize) { +void 
batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight, const uint blockSize) { // [bS*blockSize*blockSize, H/blockSize, W/blockSize, iC] is rearranged/permuted to [bS, oH, oW, iC] // oH = H - cropTop - cropBottom @@ -134,7 +134,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& BUILD_SINGLE_TEMPLATE(template void batchToSpaceND_, (const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// -void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) { +void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) { // 4D example, numOfSpatialDims = 2 - two spatial dimensions // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC] @@ -240,7 +240,7 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB BUILD_SINGLE_TEMPLATE(template void spaceToBatch_, (const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// -void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize) { +void spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize) { // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + 
padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC] @@ -333,7 +333,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra BUILD_SINGLE_TEMPLATE(template void spaceToBatchND_, (const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// -void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) { +void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) { // 4D example with two spatial dimensions // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC] @@ -426,13 +426,13 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND }; template - void _execute(nd4j::LaunchContext * context, void *vptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *vptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides) { + void _execute(sd::LaunchContext * context, void *vptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *vptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides) { auto ptrSpace = reinterpret_cast(vptrSpace); auto ptrBatch = reinterpret_cast(vptrBatch); SpaceToBatchHelper::run(ptrSpace, space_shape, space_strides, block_shape, pad_start, block_offsets, ptrBatch, batch_shape, batch_strides); }; - Nd4jStatus _spaceToBatch(nd4j::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector 
&internal_output_shape, Nd4jLong *block_shape, Nd4jLong *paddings) { + Nd4jStatus _spaceToBatch(sd::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *paddings) { auto in = input->reshape('c', internal_input_shape); auto out = output->reshape('c', internal_output_shape); switch (internal_block_dims) { @@ -456,7 +456,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND return Status::OK(); } - Nd4jStatus _batchToSpace(nd4j::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *crops) { + Nd4jStatus _batchToSpace(sd::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *crops) { auto in = input->reshape('c', internal_input_shape); auto out = output->reshape('c', internal_output_shape); switch (internal_block_dims) { @@ -488,7 +488,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND #define STB_BOOL (0, false),\ (1, true) - BUILD_TRIPLE_TEMPLATE(template void _execute, (nd4j::LaunchContext * context, void *ptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *ptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides), LIBND4J_TYPES, STB_DIM, STB_BOOL); + BUILD_TRIPLE_TEMPLATE(template void _execute, (sd::LaunchContext * context, void *ptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *ptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides), LIBND4J_TYPES, STB_DIM, 
STB_BOOL); #undef STB_BOOL #undef STB_DIM diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index 557d63fd3..32968b486 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -97,7 +97,7 @@ namespace helpers { } } - void _spaceTodepth(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { + void _spaceTodepth(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { BUILD_SINGLE_SELECTOR(input->dataType(), _spaceTodepth_, (input, output, block_size, isNHWC), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index 2de2b2d22..2d9250f9b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -64,13 +64,13 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int } /////////////////////////////////////////////////////////////////// -Nd4jLong checkIndices(nd4j::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis) { +Nd4jLong checkIndices(sd::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis) { BUILD_SINGLE_SELECTOR(indices.dataType(), return checkIndices_, (indices, output, axis), INDEXING_TYPES); } /////////////////////////////////////////////////////////////////// -void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { +void scatter(sd::LaunchContext *context, pairwise::Ops op, const 
NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { const int outRank = output.rankOf(); const int indRank = indices.rankOf(); @@ -87,7 +87,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind } }; - samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : sd::Environment::getInstance()->maxThreads()); } else { // outRank > 1 @@ -107,12 +107,12 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind } }; - samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : sd::Environment::getInstance()->maxThreads()); } } /////////////////////////////////////////////////////////////////// -void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { +void scatterND(sd::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { const Nd4jLong indLen = indices.lengthOf(); const int outRank = output.rankOf(); @@ -129,7 +129,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i } }; - samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : nd4j::Environment::getInstance()->maxThreads()); + samediff::Threads::parallel_tad(func, 0, indLen, 1, lock ? 1 : sd::Environment::getInstance()->maxThreads()); } else { std::vector dimsToExcludeInd = ShapeUtils::evalDimsToExclude(indRank, {indRank-1}); @@ -154,11 +154,11 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i } }; - samediff::Threads::parallel_tad(func, 0, indLen / indLastDim, 1, lock ? 
1 : nd4j::Environment::getInstance()->maxThreads()); + samediff::Threads::parallel_tad(func, 0, indLen / indLastDim, 1, lock ? 1 : sd::Environment::getInstance()->maxThreads()); } } -void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad) { +void scatterForLoss(sd::LaunchContext *context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad) { // shapes of indices and output must be the same // shape of indices should be the same as updates shape with last dimension excluded diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index 06833d6b3..e57264e66 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -20,11 +20,11 @@ // #include -#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -40,7 +40,7 @@ namespace helpers { for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // max - val = nd4j::math::nd4j_max(val, input->t(e)); + val = sd::math::nd4j_max(val, input->t(e)); } else { idx = indices->e(e); @@ -65,7 +65,7 @@ namespace helpers { if (indices->e(i) == idx) { for (Nd4jLong e = 0; e < maxT->lengthOf(); e++) { - maxT->t(e) = nd4j::math::nd4j_max(maxT->t(e), listOfTensors.at(i)->t(e)); + maxT->t(e) = sd::math::nd4j_max(maxT->t(e), listOfTensors.at(i)->t(e)); } } else { @@ -90,7 +90,7 @@ namespace helpers { for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // min - val = nd4j::math::nd4j_min(val, input->t(e)); + val = sd::math::nd4j_min(val, input->t(e)); } else { idx = indices->e(e); @@ -116,7 +116,7 @@ namespace helpers { if (indices->e(i) == idx) { for (Nd4jLong e = 0; e < minT->lengthOf(); e++) { - minT->p(e, nd4j::math::nd4j_min(minT->e(e), listOfTensors.at(i)->e(e))); + minT->p(e, 
sd::math::nd4j_min(minT->e(e), listOfTensors.at(i)->e(e))); } } else { @@ -291,27 +291,27 @@ namespace helpers { // static bool segmentIndicesValidate_(NDArray* indices, NDArray& aexpected, NDArray& anOutput) { // } - void segmentMaxFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { + void segmentMaxFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), segmentMaxFunctor_, (input, indices, output), LIBND4J_TYPES); } - void segmentMinFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { + void segmentMinFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), segmentMinFunctor_, (input, indices, output), LIBND4J_TYPES); } - void segmentMeanFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { + void segmentMeanFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), segmentMeanFunctor_, (input, indices, output), LIBND4J_TYPES); } - void segmentSumFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { + void segmentSumFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), segmentSumFunctor_, (input, indices, output), LIBND4J_TYPES); } - void segmentProdFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { + void segmentProdFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), segmentProdFunctor_, (input, indices, output), LIBND4J_TYPES); } - bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output) { + bool segmentIndicesValidate(sd::LaunchContext * 
context, NDArray* indices, NDArray& expected, NDArray& output) { auto val = indices->e(0); for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { output = indices->e(e); @@ -333,7 +333,7 @@ namespace helpers { // Unsorted segment ops // -------------------------------------------------------------------------------------------------------------- // - bool unsortedSegmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, Nd4jLong expected, Nd4jLong& output) { + bool unsortedSegmentIndicesValidate(sd::LaunchContext * context, NDArray* indices, Nd4jLong expected, Nd4jLong& output) { Nd4jLong val = indices->e(0); Nd4jLong maxInd = indices->argMax(); @@ -363,7 +363,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { T val = input->e(fi->second.at(0)); for (Nd4jLong idx = 1; idx < static_cast(fi->second.size()); ++idx) { - val = nd4j::math::nd4j_max(val, input->e(fi->second.at(idx))); + val = sd::math::nd4j_max(val, input->e(fi->second.at(idx))); } output->p(fi->first, val); } @@ -383,7 +383,7 @@ namespace helpers { for (Nd4jLong idx = 1; idx < static_cast(fi->second.size()); ++idx) { auto maxT = listOfTensors.at(fi->second.at(idx)); for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { - T val = nd4j::math::nd4j_max(maxT->e(e), outputT->e(e)); + T val = sd::math::nd4j_max(maxT->e(e), outputT->e(e)); outputT->p(e, val); } @@ -391,7 +391,7 @@ namespace helpers { } } } - void unsortedSegmentMaxFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentMaxFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), unsortedSegmentMaxFunctor_, (input, indices, numOfClasses, output), NUMERIC_TYPES); } BUILD_SINGLE_TEMPLATE(template void unsortedSegmentMaxFunctor_, (NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES); @@ -415,7 
+415,7 @@ namespace helpers { T val = input->t(fi->second.at(0)); for (size_t idx = 1; idx < fi->second.size(); ++idx) { - val = nd4j::math::nd4j_min(val, input->t(fi->second.at(idx))); + val = sd::math::nd4j_min(val, input->t(fi->second.at(idx))); } output->t(fi->first) = val; } @@ -436,7 +436,7 @@ namespace helpers { auto minT = listOfTensors.at(fi->second.at(idx)); for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { - outputT->t(e) = nd4j::math::nd4j_min(minT->t(e), outputT->t(e)); + outputT->t(e) = sd::math::nd4j_min(minT->t(e), outputT->t(e)); } } //outputT->assign(maxT); @@ -444,14 +444,14 @@ namespace helpers { } } - void unsortedSegmentMinFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentMinFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), unsortedSegmentMinFunctor_, (input, indices, numOfClasses, output), NUMERIC_TYPES); } BUILD_SINGLE_TEMPLATE(template void unsortedSegmentMinFunctor_, (NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES); - void unsortedSegmentMeanFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentMeanFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { MAP_IMPL> idxs;//(indices->lengthOf()); for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) idxs[indices->e(e)].push_back(e); @@ -493,7 +493,7 @@ namespace helpers { } } - void unsortedSegmentSumFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentSumFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { MAP_IMPL> idxs;//(indices->lengthOf()); for (Nd4jLong e = 0; e < 
indices->lengthOf(); ++e) idxs[indices->e(e)].push_back(e); @@ -569,12 +569,12 @@ namespace helpers { } } - void unsortedSegmentProdFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentProdFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), unsortedSegmentProdFunctor_, (input, indices, numOfClasses, output), NUMERIC_TYPES); } BUILD_SINGLE_TEMPLATE(template void unsortedSegmentProdFunctor_, (NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES); - void unsortedSegmentSqrtNFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentSqrtNFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { MAP_IMPL> idxs;//(indices->lengthOf()); for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) idxs[indices->e(e)].push_back(e); @@ -587,7 +587,7 @@ namespace helpers { for (size_t idx = 1; idx < fi->second.size(); ++idx) { sumValue += input->e(fi->second.at(idx)); } - output->p(fi->first, sumValue / nd4j::math::nd4j_sqrt(fi->second.size())); + output->p(fi->first, sumValue / sd::math::nd4j_sqrt(fi->second.size())); } } else { @@ -604,7 +604,7 @@ namespace helpers { *outputT += *current; } //outputT->assign(maxT); - (*outputT) /= nd4j::math::nd4j_sqrt(fi->second.size()); + (*outputT) /= sd::math::nd4j_sqrt(fi->second.size()); } } } @@ -616,7 +616,7 @@ namespace helpers { // // segment max template - int segmentMaxFunctorBP_(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMaxFunctorBP_(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { //int numOfClasses = gradOut->sizeAt(0); // if input is a vector: (as if in doc sample) 
auto tempRes = gradOut->dup(); @@ -627,7 +627,7 @@ namespace helpers { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) <= T(1.e-6)) + if (sd::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) <= T(1.e-6)) output->p(e, gradOut->e(classNum)); } }; @@ -652,7 +652,7 @@ namespace helpers { auto currentGradOut = listOfGradOuts.at(classNum); for (Nd4jLong e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) + if (sd::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) currentOut->p(e, currentGradOut->e(e)); } } @@ -664,20 +664,20 @@ namespace helpers { return ND4J_STATUS_OK; } - int segmentMaxFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMaxFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { BUILD_SINGLE_SELECTOR(output->dataType(), return segmentMaxFunctorBP_, (context, input, indices, gradOut, output), NUMERIC_TYPES); } - BUILD_SINGLE_TEMPLATE(template int segmentMaxFunctorBP_, (nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES); + BUILD_SINGLE_TEMPLATE(template int segmentMaxFunctorBP_, (sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES); // segmen min - int segmentMinFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMinFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { NDArray tempRes = gradOut->dup(); segmentMinFunctor(context, input, indices, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { auto 
classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) + if (sd::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) output->p(e, gradOut->e(classNum)); } }; @@ -704,7 +704,7 @@ namespace helpers { auto currentGradOut = listOfGradOuts.at(classNum); for (Nd4jLong e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) < + if (sd::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) < 1.e-5) currentOut->p(e, currentGradOut->e(e)); } @@ -717,7 +717,7 @@ namespace helpers { } // segmen mean - int segmentMeanFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMeanFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { int numClasses = output->sizeAt(0); MAP_IMPL classCount;//(numClasses); @@ -763,7 +763,7 @@ namespace helpers { return ND4J_STATUS_OK; } - int segmentSumFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentSumFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { // int numClasses = output->sizeAt(0); // if input is a vector: (as if in doc sample) Nd4jLong idx = indices->e(0); @@ -796,7 +796,7 @@ namespace helpers { return Status::OK(); } - int segmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentProdFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { auto tempRes = gradOut->dup(); segmentProdFunctor(context, input, indices, &tempRes); if (input->isVector()) { @@ -839,7 +839,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static int 
unsortedSegmentMaxFunctorBP_(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int unsortedSegmentMaxFunctorBP_(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { // int numOfClasses = gradOut->sizeAt(0); // if input is a vector: (as if in doc sample) auto tempRes = gradOut->dup(); @@ -848,7 +848,7 @@ namespace helpers { for (Nd4jLong e = 0; e < input->lengthOf(); ++e) { Nd4jLong classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) + if (sd::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) output->p(e, gradOut->e(classNum)); } } @@ -866,7 +866,7 @@ namespace helpers { NDArray* currentOut = listOfOutTensors.at(i); NDArray* currentGradOut = listOfGradOuts.at(classNum); for (int e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) < 1.e-5) + if (sd::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) < 1.e-5) currentOut->p(e, currentGradOut->e(e)); } } @@ -875,13 +875,13 @@ namespace helpers { return ND4J_STATUS_OK; } - int unsortedSegmentMaxFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentMaxFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { BUILD_SINGLE_SELECTOR(output->dataType(), return unsortedSegmentMaxFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES); } - BUILD_SINGLE_TEMPLATE(template int unsortedSegmentMaxFunctorBP_, (nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES); + BUILD_SINGLE_TEMPLATE(template int unsortedSegmentMaxFunctorBP_, (sd::LaunchContext * context, 
NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES); template - static int unsortedSegmentMinFunctorBP_(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int unsortedSegmentMinFunctorBP_(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { auto tempRes = gradOut->dup(); unsortedSegmentMinFunctor(context, input, indices, numOfClasses, &tempRes); if (input->isVector()) { @@ -889,7 +889,7 @@ namespace helpers { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); - if (nd4j::math::nd4j_abs(tempRes.t(classNum) - input->t(e)) < 1.e-6) + if (sd::math::nd4j_abs(tempRes.t(classNum) - input->t(e)) < 1.e-6) output->t(e) = gradOut->t(classNum); } }; @@ -912,7 +912,7 @@ namespace helpers { auto currentGradOut = listOfGradOuts.at(classNum); for (Nd4jLong e = 0; e < current->lengthOf(); e++) { - if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->t(e) - current->t(e)) < 1.e-6) + if (sd::math::nd4j_abs(listOfBPTensors.at(classNum)->t(e) - current->t(e)) < 1.e-6) currentOut->t(e) = currentGradOut->t(e); } } @@ -924,12 +924,12 @@ namespace helpers { return ND4J_STATUS_OK; } - int unsortedSegmentMinFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentMinFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { BUILD_SINGLE_SELECTOR(output->dataType(), return unsortedSegmentMinFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES); } - BUILD_SINGLE_TEMPLATE(template int unsortedSegmentMinFunctorBP_, (nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), 
NUMERIC_TYPES); + BUILD_SINGLE_TEMPLATE(template int unsortedSegmentMinFunctorBP_, (sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES); - int unsortedSegmentMeanFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentMeanFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { MAP_IMPL classCount;//(numClasses); @@ -966,7 +966,7 @@ namespace helpers { return ND4J_STATUS_OK; } - int unsortedSegmentSumFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentSumFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { // if input is a vector: (as if in doc sample) Nd4jLong idx = indices->e(0); @@ -998,7 +998,7 @@ namespace helpers { return Status::OK(); } - int unsortedSegmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentProdFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { auto tempRes = gradOut->dup(); unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); @@ -1039,7 +1039,7 @@ namespace helpers { } // template - int unsortedSegmentSqrtNFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentSqrtNFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { MAP_IMPL classCount;//(numClasses); for (Nd4jLong count = 0; count < numOfClasses; ++count) { @@ 
-1055,7 +1055,7 @@ namespace helpers { //auto func = PRAGMA_THREADS_FOR { for (Nd4jLong e = 0; e < indices->lengthOf(); e++) { auto classNum = indices->e(e); - output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); + output->p(e, gradOut->e(classNum) / sd::math::nd4j_sqrt(classCount[classNum])); } //}; @@ -1076,7 +1076,7 @@ namespace helpers { auto currentGradOut = listOfGradOuts.at(classNum); for (int e = 0; e < current->lengthOf(); e++) { - currentOut->p(e, currentGradOut->e(e) / nd4j::math::nd4j_sqrt(classCount[classNum])); + currentOut->p(e, currentGradOut->e(e) / sd::math::nd4j_sqrt(classCount[classNum])); } } //}; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp index c175fd96d..8e25c4690 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -37,7 +37,7 @@ namespace helpers { samediff::Threads::parallel_for(func, 0, maxIndex, 1, 0, input->lengthOf(), 1); } - void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { + void sequenceMask(sd::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { BUILD_DOUBLE_SELECTOR(input->dataType(), output->dataType(), sequenceMask_, (input, output, maxIndex), INTEGER_TYPES, LIBND4J_TYPES_EXTENDED); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index 3c3db8139..07cbca04e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -19,12 +19,12 @@ // #include -#include +#include #include #define HS_MAX_EXP 6.0f -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -172,7 +172,7 @@ namespace 
nd4j { // target is known in advance } else { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? -1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; @@ -247,7 +247,7 @@ namespace nd4j { // target is known in advance } else { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? -1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; @@ -276,7 +276,7 @@ namespace nd4j { int binarySearch(const int *haystack, const int needle, const int totalElements) { int firstIndex = 0; int lastIndex = totalElements - 1; - int halfIndex = nd4j::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); + int halfIndex = sd::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); while(haystack[halfIndex] != needle && firstIndex < lastIndex) { if (needle < haystack[halfIndex]) { @@ -284,7 +284,7 @@ namespace nd4j { } else if (needle > haystack[halfIndex]) { firstIndex = halfIndex + 1; } - halfIndex = nd4j::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); + halfIndex = sd::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); } return (haystack[halfIndex] == needle) ? 
halfIndex : -1; @@ -310,8 +310,8 @@ namespace nd4j { int irow = 0; unsigned long long randomValue = rv; for (int r = 0; r < nsRounds; r++) { - randomValue = nd4j::math::nd4j_abs(randomValue * (unsigned long long) 25214903917 + 11); - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + randomValue = sd::math::nd4j_abs(randomValue * (unsigned long long) 25214903917 + 11); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? -1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) @@ -401,7 +401,7 @@ namespace nd4j { // target is known in advance } else { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? -1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) @@ -526,7 +526,7 @@ namespace nd4j { // we're skipping rng on 0 step if (r != 0) { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? 
-1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/shift.cpp b/libnd4j/include/ops/declarable/helpers/cpu/shift.cpp index 4b54c7362..9dfeac2ec 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/shift.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/shift.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp index 63c7758dc..9a06975aa 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp @@ -17,9 +17,9 @@ // // @author GS // -#include -#include -#include +#include +#include +#include #include #include @@ -27,13 +27,13 @@ #include "../lup.h" #include "../solve.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // --------------------------------------------------------------------------------------------------------------------------------------- // template - static void adjointMatrix_(nd4j::LaunchContext* context, NDArray const* input, NDArray* output) { + static void adjointMatrix_(sd::LaunchContext* context, NDArray const* input, NDArray* output) { auto inputPart = input->allTensorsAlongDimension({-2, -1}); auto outputPart = output->allTensorsAlongDimension({-2, -1}); auto rows = input->sizeAt(-2); @@ -53,7 +53,7 @@ namespace helpers { // --------------------------------------------------------------------------------------------------------------------------------------- // template - static int solveFunctor_(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool const adjoint, NDArray* output) { + static int solveFunctor_(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool const adjoint, NDArray* output) { // stage 1: LU 
decomposition batched auto leftOutput = leftInput->ulike(); @@ -89,11 +89,11 @@ namespace helpers { } // --------------------------------------------------------------------------------------------------------------------------------------- // - int solveFunctor(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool const adjoint, NDArray* output) { + int solveFunctor(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool const adjoint, NDArray* output) { BUILD_SINGLE_SELECTOR(leftInput->dataType(), return solveFunctor_, (context, leftInput, rightInput, adjoint, output), FLOAT_TYPES); } // --------------------------------------------------------------------------------------------------------------------------------------- // - void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, NDArray* output) { + void adjointMatrix(sd::LaunchContext* context, NDArray const* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), adjointMatrix_, (context, input, output), FLOAT_TYPES); } // --------------------------------------------------------------------------------------------------------------------------------------- // diff --git a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp index b648c2b82..d138d9892 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -118,7 +118,7 @@ namespace helpers { samediff::Threads::parallel_for(func, 0, input.lengthOf()); } - void split(nd4j::LaunchContext* context, const NDArray& input, std::vector& outArrs, const int axis) { + void split(sd::LaunchContext* context, const NDArray& input, std::vector& outArrs, const int axis) { BUILD_SINGLE_SELECTOR(input.dataType(), split_, (input, outArrs, axis), LIBND4J_TYPES); } } diff --git 
a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index d2dd3bf30..ecd5ead2b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -21,11 +21,11 @@ // #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -46,7 +46,7 @@ static FORCEINLINE NDArray sigmoid(const NDArray& arr) { ////////////////////////////////////////////////////////////////////////// -void sruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { +void sruCell(sd::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { // x input [bS x inSize], bS - batch size, inSize - number of features // c0 previous cell state c [bS x inSize], that is at previous time step t-1 @@ -78,7 +78,7 @@ void sruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, ////////////////////////////////////////////////////////////////////////// -void sruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { +void sruTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { // x input [bS x inSize x time] // c0 initial cell state (at time step = 0) [bS x inSize], @@ -168,12 +168,12 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray for (Nd4jLong t = 0; t < time; ++t) { // evaluate sigmoids - T ft = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[1] + bF))); - T rt = (1.) / (1. + nd4j::math::nd4j_exp(-(pWiVal[2] + bR))); + T ft = (1.) / (1. + sd::math::nd4j_exp(-(pWiVal[1] + bF))); + T rt = (1.) / (1. 
+ sd::math::nd4j_exp(-(pWiVal[2] + bR))); cur = (cur - *pWiVal) * ft + *pWiVal; *pCtVal = cur; - T val = nd4j::math::nd4j_tanh(cur); + T val = sd::math::nd4j_tanh(cur); *pHtVal = (val * maskVal - *pIVal) * rt + *pIVal; pIVal += ncolsRev; @@ -268,10 +268,10 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr for (Nd4jLong t = 0; t < time; ++t) { // evaluate sigmoids - T ft = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 1) + bF))); - T rt = ((T) 1.) / ((T) 1. + nd4j::math::nd4j_exp(-(*(pWiVal + 2) + bR))); + T ft = ((T) 1.) / ((T) 1. + sd::math::nd4j_exp(-(*(pWiVal + 1) + bF))); + T rt = ((T) 1.) / ((T) 1. + sd::math::nd4j_exp(-(*(pWiVal + 2) + bR))); - T val = nd4j::math::nd4j_tanh(*pStateVal); + T val = sd::math::nd4j_tanh(*pStateVal); T prevVal = (t < time - 1) ? (*(pStateVal - ncolsRev)) : (*(pInit + col)); // grad wrt input *pGradInputVal = *pInGradHtVal - (*pInGradHtVal) * rt; @@ -314,10 +314,10 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr } -void sruBI(nd4j::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct) { +void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct) { BUILD_SINGLE_SELECTOR(x->dataType(), sruBI_, (x, w, b, c0, mask, ht, ct), FLOAT_TYPES); } -void sruBIBP(nd4j::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* inGradC0, const NDArray* inGradH, const NDArray* mask, NDArray* gradI, NDArray* gradW, NDArray* gradB, NDArray* gradC0) { +void sruBIBP(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* inGradC0, const NDArray* inGradH, const NDArray* mask, NDArray* gradI, NDArray* gradW, NDArray* gradB, NDArray* gradC0) { 
BUILD_SINGLE_SELECTOR(x->dataType(), sruBIBP_, (x, w, b, c0, ct, inGradC0, inGradH, mask, gradI, gradW, gradB, gradC0), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index a3d27702d..975f2aed6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -24,7 +24,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -57,7 +57,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c } } - void stack(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim) { + void stack(sd::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim) { BUILD_SINGLE_SELECTOR(outArr->dataType(), stack_, (inArrs, outArr, dim), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/svd.cpp b/libnd4j/include/ops/declarable/helpers/cpu/svd.cpp index 9d755f6b6..c4f99af3f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/svd.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/svd.cpp @@ -18,12 +18,12 @@ // @author Yurii Shyrma (iuriish@yahoo.com), created on 03.01.2018 // -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -972,7 +972,7 @@ static void svd_(const NDArray* x, const std::vector& outArrs, const b } } - void svd(nd4j::LaunchContext * context, const NDArray* x, const std::vector& outArrs, const bool fullUV, const bool calcUV, const int switchNum) { + void svd(sd::LaunchContext * context, const NDArray* x, const std::vector& outArrs, const bool fullUV, const bool calcUV, const int switchNum) { BUILD_SINGLE_SELECTOR(x->dataType(), svd_, (x, outArrs, fullUV, calcUV, switchNum), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/toggle_bits.cpp b/libnd4j/include/ops/declarable/helpers/cpu/toggle_bits.cpp 
index 481575297..67b4e0f77 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/toggle_bits.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/toggle_bits.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -33,7 +33,7 @@ namespace nd4j { in.applyLambda(lambda, out); } - void __toggle_bits(nd4j::LaunchContext * context, NDArray& in, NDArray& out) { + void __toggle_bits(sd::LaunchContext * context, NDArray& in, NDArray& out) { BUILD_SINGLE_SELECTOR(in.dataType(), toggle_bits__, (in, out), INTEGER_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index e50b18cd6..7e0b07da0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -20,10 +20,10 @@ #include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -138,7 +138,7 @@ namespace helpers { // ----------------------------------------------------------------------------------------------- // template - static int inTopKFunctor_(nd4j::LaunchContext* context, const NDArray* input, const NDArray* target, NDArray* result, const uint k) { + static int inTopKFunctor_(sd::LaunchContext* context, const NDArray* input, const NDArray* target, NDArray* result, const uint k) { std::vector shapeI(input->rankOf()); for (int i = 0; i < input->rankOf() - 1; i++) @@ -169,16 +169,16 @@ namespace helpers { } - int topKFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort) { + int topKFunctor(sd::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort) { BUILD_SINGLE_SELECTOR(input->dataType(), return topKFunctor_, (input, values, indices, k, needSort), NUMERIC_TYPES); } - int inTopKFunctor(nd4j::LaunchContext * context, const 
NDArray* input, const NDArray* target, NDArray* result, const uint k) { + int inTopKFunctor(sd::LaunchContext * context, const NDArray* input, const NDArray* target, NDArray* result, const uint k) { BUILD_SINGLE_SELECTOR(input->dataType(), return inTopKFunctor_, (context, input, target, result, k), NUMERIC_TYPES); } BUILD_SINGLE_TEMPLATE(template int topKFunctor_, (const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort), NUMERIC_TYPES); - BUILD_SINGLE_TEMPLATE(template int inTopKFunctor_, (nd4j::LaunchContext * context, const NDArray* input, const NDArray* target, NDArray* result, const uint k), NUMERIC_TYPES); + BUILD_SINGLE_TEMPLATE(template int inTopKFunctor_, (sd::LaunchContext * context, const NDArray* input, const NDArray* target, NDArray* result, const uint k), NUMERIC_TYPES); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index f0b3a3a25..fa3570879 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -23,20 +23,20 @@ #include #include #include -#include +#include #include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// template -static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal) { +static void triuBP_(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal) { auto dOdI = NDArray(&gradO); // dO/dI const_cast(input).fillAsTriangular(0, diagonal, dOdI.sizeAt(-1), dOdI, 'b'); @@ -54,7 +54,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N gradI.assign(dOdI * gradO); // chain rule: dLoss/dI = dO/dI * dLoss/dO } - void triuBP(nd4j::LaunchContext * context, const 
NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal) { + void triuBP(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal) { BUILD_SINGLE_SELECTOR(gradO.dataType(), triuBP_, (context, input, gradO, gradI, diagonal), LIBND4J_TYPES); } @@ -71,13 +71,13 @@ static void trace_(const NDArray& input, NDArray& output) { samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); } - void trace(nd4j::LaunchContext * context, const NDArray& input, NDArray& output) { + void trace(sd::LaunchContext * context, const NDArray& input, NDArray& output) { BUILD_SINGLE_SELECTOR(input.dataType(), trace_, (input, output), LIBND4J_TYPES); } ////////////////////////////////////////////////////////////////////////// template -void randomShuffle_(NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) { +void randomShuffle_(NDArray& input, NDArray& output, sd::graph::RandomGenerator& rng, const bool isInplace) { // check edge cases first int temp; @@ -163,7 +163,7 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::graph::RandomGenerato } - void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) { + void randomShuffle(sd::LaunchContext * context, NDArray& input, NDArray& output, sd::graph::RandomGenerator& rng, const bool isInplace) { BUILD_SINGLE_SELECTOR(input.dataType(), randomShuffle_, (input, output, rng, isInplace), LIBND4J_TYPES); } @@ -374,7 +374,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray // } // } -void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, NDArray const& padValue) { +void pad(sd::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, NDArray const& padValue) { BUILD_SINGLE_SELECTOR(input.dataType(), pad_, 
(mode, input, paddings, output, padValue), LIBND4J_TYPES); } @@ -528,7 +528,7 @@ static void recursiveLoopForPad_(const int mode, NDArray& input, const NDArray& */ //////////////////////////////////////////////////////////////////////// -void invertPermutation(nd4j::LaunchContext * context, const NDArray& input, NDArray& output) { +void invertPermutation(sd::LaunchContext * context, const NDArray& input, NDArray& output) { std::set uniqueElems; const int length = input.lengthOf(); @@ -558,7 +558,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { const int xRank = input.rankOf(); const int yRank = indices.rankOf(); const int zRank = output.rankOf(); - const int maxRank = nd4j::math::nd4j_max(yRank, nd4j::math::nd4j_max(xRank, zRank)); + const int maxRank = sd::math::nd4j_max(yRank, sd::math::nd4j_max(xRank, zRank)); const Nd4jLong zLen = output.lengthOf(); @@ -610,7 +610,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { } //////////////////////////////////////////////////////////////////////// -void gatherND(nd4j::LaunchContext * context, NDArray& input, NDArray& indices, NDArray& output) { +void gatherND(sd::LaunchContext * context, NDArray& input, NDArray& indices, NDArray& output) { BUILD_DOUBLE_SELECTOR(input.dataType(), indices.dataType(), gatherND_, (input, indices, output), LIBND4J_TYPES, INDEXING_TYPES); } @@ -641,7 +641,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con output->assign(scalarNDArray); } else { auto dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {axis}); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); auto tadArr = NDArray(reinterpret_cast(reinterpret_cast(input->getBuffer()) + tadPack.primaryOffsets()[indices->e(0)]), tadPack.primaryShapeInfo(), output->getContext()); 
output->assign(&tadArr); @@ -704,7 +704,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con } ////////////////////////////////////////////////////////////////////////// -void eye(nd4j::LaunchContext * context, NDArray& output) { +void eye(sd::LaunchContext * context, NDArray& output) { const int rank = output.rankOf(); auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); @@ -718,7 +718,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { } ////////////////////////////////////////////////////////////////////////// -void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updates, const std::vector* intArgs) { +void scatterUpdate(sd::LaunchContext * context, NDArray& input, NDArray& updates, const std::vector* intArgs) { int opCode = (*intArgs)[0]; int dimSize = (*intArgs)[1]; @@ -777,7 +777,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat ////////////////////////////////////////////////////////////////////////// -void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions) { +void scatterSimple(sd::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions) { // updates and indices have same length const Nd4jLong len = indices.lengthOf(); @@ -827,7 +827,7 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) samediff::Threads::parallel_for(func, 0, x->lengthOf()); } -void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { +void mergeMaxIndex(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), mergeMaxIndex_, (inArrs, output), LIBND4J_TYPES); } @@ -853,7 +853,7 @@ static void mergeMax_(const std::vector& inArrs, NDArray& output) { 
samediff::Threads::parallel_for(func, 0, x->lengthOf()); } -void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { +void mergeMax(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { BUILD_SINGLE_SELECTOR(output.dataType(), mergeMax_, (inArrs, output), LIBND4J_TYPES); } @@ -878,7 +878,7 @@ static void mergeAvg_(const std::vector& inArrs, NDArray& output) { samediff::Threads::parallel_for(func, 0, x->lengthOf()); } -void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { +void mergeAvg(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { BUILD_SINGLE_SELECTOR(output.dataType(), mergeAvg_, (inArrs, output), LIBND4J_TYPES); } @@ -902,7 +902,7 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { samediff::Threads::parallel_for(func, 0, x->lengthOf()); } - void mergeAdd(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + void mergeAdd(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { BUILD_SINGLE_SELECTOR(output.dataType(), mergeAdd_, (inArrs, output), LIBND4J_TYPES); } @@ -969,7 +969,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& } ////////////////////////////////////////////////////////////////////////// -void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { +void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { BUILD_SINGLE_SELECTOR(output.dataType(), clipByNorm_, (input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES); } @@ -983,7 +983,7 @@ void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output, template - static void clipByGlobalNorm_(std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, 
std::vector& outputs, bool isInplace) { + static void clipByGlobalNorm_(std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { T globalNorm = 0; //NDArrayFactory::create(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list])) // PRAGMA_OMP_PARALLEL_FOR_SIMD_REDUCTION(sumT : globalNorm) for (size_t i = 0; i < inputs.size(); i++) { @@ -992,8 +992,8 @@ void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output, globalNorm += l2norm.t(0) * l2norm.t(0); } - //globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr);// = nd4j::math::nd4j_sqrt(globalNorm); - auto normS = nd4j::math::nd4j_sqrt(globalNorm); + //globalNorm.applyTransform(transform::Sqrt, nullptr, nullptr);// = sd::math::nd4j_sqrt(globalNorm); + auto normS = sd::math::nd4j_sqrt(globalNorm); outputs[inputs.size()]->p(0, normS); const T factor = clipNorm / normS; @@ -1014,11 +1014,11 @@ void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output, } } } - void clipByGlobalNorm(nd4j::LaunchContext * context, std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { + void clipByGlobalNorm(sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { BUILD_SINGLE_SELECTOR(outputs[0]->dataType(), clipByGlobalNorm_, (inputs, clipNorm, workspace, outputs, isInplace), FLOAT_TYPES); } - BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector& outputs, bool isInplace), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// template @@ -1083,7 
+1083,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g } } - void clipByNormBP(nd4j::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm) { + void clipByNormBP(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm) { BUILD_SINGLE_SELECTOR(gradI.dataType(), clipByNormBP_, (input, gradO, gradI, dimensions, clipNorm), FLOAT_TYPES); } @@ -1126,7 +1126,7 @@ static void clipByAveraged_(NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { + void clipByAveraged(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { BUILD_SINGLE_SELECTOR(input.dataType(), clipByAveraged_, (input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES); } @@ -1151,7 +1151,7 @@ static void clipByAveraged_(NDArray& input, NDArray& output, const std::vector(routine, output); } - void clipByValue(nd4j::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) { + void clipByValue(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) { BUILD_SINGLE_SELECTOR(input.dataType(), clipByValue_, (input, leftBound, rightBound, output), FLOAT_TYPES); } @@ -1219,7 +1219,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o } } - void mirrorPad(nd4j::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode) { + void mirrorPad(sd::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode) { BUILD_SINGLE_SELECTOR(input.dataType(), mirrorPad_, (input, paddings, output, mode), LIBND4J_TYPES); } @@ -1234,7 +1234,7 @@ static void 
tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c const T* gradOBuff = reinterpret_cast(gradO.getBuffer()); const Nd4jLong gradILen = gradI.lengthOf(); const Nd4jLong gradOLen = gradO.lengthOf(); // gradOLen >= gradILen - const Nd4jLong gradIEWS = nd4j::math::nd4j_abs(gradI.ews()); + const Nd4jLong gradIEWS = sd::math::nd4j_abs(gradI.ews()); const Nd4jLong gradOEWS = gradO.ews(); // initial zeroing of gradI content @@ -1274,7 +1274,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c } } -void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps) { +void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps) { BUILD_SINGLE_SELECTOR(gradI.dataType(), tileBP_, (gradO, gradI, reps), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp index 04508dcf8..bcf406392 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp @@ -17,12 +17,12 @@ // // @author GS // -#include -#include +#include +#include #include #include "../triangular_solve.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { /* @@ -39,7 +39,7 @@ namespace helpers { * * */ template - static void lowerTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { + static void lowerTriangularSolve(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { auto rows = leftInput->rows(); auto cols = rightInput->columns(); //output->t(0,0) = rightInput->t(0,0) / leftInput->t(0,0); @@ -69,7 +69,7 @@ namespace helpers { * */ template - static void upperTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, 
NDArray* rightInput, bool adjoint, NDArray* output) { + static void upperTriangularSolve(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { auto rows = leftInput->rows(); auto cols = rightInput->columns(); for (Nd4jLong r = rows; r > 0; r--) { @@ -84,7 +84,7 @@ namespace helpers { } template - static int triangularSolveFunctor_(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output) { + static int triangularSolveFunctor_(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output) { auto leftPart = leftInput->allTensorsAlongDimension({-2, -1}); auto rightPart = rightInput->allTensorsAlongDimension({-2, -1}); auto outputPart = output->allTensorsAlongDimension({-2, -1}); @@ -105,7 +105,7 @@ namespace helpers { } template - static void adjointTriangularMatrix_(nd4j::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output) { + static void adjointTriangularMatrix_(sd::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output) { auto inputPart = input->allTensorsAlongDimension({-2, -1}); auto outputPart = output->allTensorsAlongDimension({-2, -1}); auto cols = input->sizeAt(-1); @@ -131,11 +131,11 @@ namespace helpers { samediff::Threads::parallel_tad(batchLoop, 0, inputPart.size(), 1); } - int triangularSolveFunctor(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output) { + int triangularSolveFunctor(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output) { BUILD_SINGLE_SELECTOR(leftInput->dataType(), return triangularSolveFunctor_, (context, leftInput, rightInput, lower, adjoint, output), FLOAT_NATIVE); } - void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output) { + void 
adjointMatrix(sd::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), adjointTriangularMatrix_, (context, input, lower, output), FLOAT_NATIVE); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp b/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp index 2dd936b09..ebdfc674b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -37,7 +37,7 @@ namespace helpers { } } - void adjustWeights(nd4j::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { + void adjustWeights(sd::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { BUILD_SINGLE_SELECTOR(output->dataType(), adjustWeights_, (input, weights, output, minLength, maxLength), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index 90ef634c1..d127fc166 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -58,7 +58,7 @@ static FORCEINLINE T zetaScalarSlow(const T x, const T q) { ////////////////////////////////////////////////////////////////////////// // calculate the Hurwitz zeta function for arrays template -static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray &z) { +static void zeta_(sd::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray &z) { //auto result = NDArray(&x, false, context); int xLen = x.lengthOf(); @@ -71,11 +71,11 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray 
samediff::Threads::parallel_for(func, 0, xLen); } -void zeta(nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z) { +void zeta(sd::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z) { BUILD_SINGLE_SELECTOR(x.dataType(), zeta_, (context, x, q, z), FLOAT_TYPES); } -BUILD_SINGLE_TEMPLATE(template void zeta_, (nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z), FLOAT_TYPES); +BUILD_SINGLE_TEMPLATE(template void zeta_, (sd::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z), FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/crop_and_resize.h b/libnd4j/include/ops/declarable/helpers/crop_and_resize.h index 3926dbfb0..cff96f93e 100644 --- a/libnd4j/include/ops/declarable/helpers/crop_and_resize.h +++ b/libnd4j/include/ops/declarable/helpers/crop_and_resize.h @@ -22,17 +22,17 @@ #ifndef SD_CROP_AND_RESIZE_H #define SD_CROP_AND_RESIZE_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template void cropAndResizeFunctor_(NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops); - void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const* images, NDArray const* boxes, NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops); + void cropAndResizeFunctor(sd::LaunchContext * context, NDArray const* images, NDArray const* boxes, NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 3ea80966b..bd1e2a61d 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -21,13 +21,13 @@ #include #include -namespace nd4j { +namespace sd { 
namespace ops { namespace helpers { -void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o); +void crossBatched(sd::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o); -void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { +void FORCEINLINE cross(sd::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { if (a->isR()) { auto a0 = a->e(0); @@ -56,7 +56,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND } } - void FORCEINLINE _crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { + void FORCEINLINE _crossBatched(sd::LaunchContext * context, NDArray *a, NDArray *b, NDArray *o) { auto a_ = a->reshape(a->ordering(), {-1, 3}); auto b_ = b->reshape(b->ordering(), {-1, 3}); auto o_ = o->reshape(o->ordering(), {-1, 3}, false); @@ -80,7 +80,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND samediff::Threads::parallel_tad(func, 0, tads); } - void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output); + void weightedCrossEntropyWithLogitsFunctor(sd::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/BarnesHutTsne.cu b/libnd4j/include/ops/declarable/helpers/cuda/BarnesHutTsne.cu index 4c746f244..71eef3386 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/BarnesHutTsne.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/BarnesHutTsne.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -233,13 +233,13 @@ namespace helpers { BUILD_SINGLE_TEMPLATE(template void 
barnes_edge_forces_, (const NDArray* rowP, NDArray const* colP, NDArray const* valP, int N, NDArray const* data, NDArray* output), FLOAT_TYPES); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// gains - run a function T((x + 2.) * nd4j::math::nd4j_sign(grad) != nd4j::math::nd4j_sign(eps)) + T(x * 0.8 * nd4j::math::nd4j_sign(grad) != nd4j::math::nd4j_sign(eps)); +// gains - run a function T((x + 2.) * sd::math::nd4j_sign(grad) != sd::math::nd4j_sign(eps)) + T(x * 0.8 * sd::math::nd4j_sign(grad) != sd::math::nd4j_sign(eps)); // for all members in input and put all in output // template void barnes_gains_(NDArray* input, NDArray* gradX, NDArray* epsilon, NDArray* output) { auto gainsInternal = LAMBDA_TTT(x, grad, eps) { - T res = nd4j::math::nd4j_sign(grad) != nd4j::math::nd4j_sign(eps) ? x + T(.2) : x * T(.8); + T res = sd::math::nd4j_sign(grad) != sd::math::nd4j_sign(eps) ? x + T(.2) : x * T(.8); if(res < .01) res = .01; return res; }; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu index 7bddb00fe..6f658c72e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu @@ -19,14 +19,14 @@ // @author raver119@gmail.com // -#include +#include #include -#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -79,7 +79,7 @@ linkage void preluCudaLauncher(const int blocksPerGrid, const int threadsPerBloc } /////////////////////////////////////////////////////////////////// -void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& alpha, NDArray& output) { +void prelu(sd::LaunchContext * context, const NDArray& input, const NDArray& alpha, NDArray& output) { PointersManager manager(context, "prelu"); @@ -147,7 +147,7 @@ __global__ linkage void 
preluBPCuda(const void *vIn, const Nd4jLong *inShapeI dLdI[dLdIOffset] = grO * alpha[alphaOffset]; - nd4j::math::atomics::nd4j_atomicAdd(&dLdA[dLdAOffset], static_cast(grO * xVal)); + sd::math::atomics::nd4j_atomicAdd(&dLdA[dLdAOffset], static_cast(grO * xVal)); } else dLdI[dLdIOffset] = grO; @@ -162,7 +162,7 @@ __host__ linkage void preluBPCudaLauncher(const int blocksPerGrid, const int thr } ////////////////////////////////////////////////////////////////////////// -void preluBP(nd4j::LaunchContext* context, const NDArray& input, const NDArray& alpha, const NDArray& dLdO, NDArray& dLdI, NDArray& dLdA) { +void preluBP(sd::LaunchContext* context, const NDArray& input, const NDArray& alpha, const NDArray& dLdO, NDArray& dLdI, NDArray& dLdA) { dLdA.nullify(); PointersManager manager(context, "preluBP"); @@ -211,7 +211,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo, const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo); - shmem[threadIdx.x] = (threadIdx.x != 0) ? x[xOffset] : nd4j::math::nd4j_max(x[xOffset], temp); // take into account max element evaluated on previous iteration and stored in temp + shmem[threadIdx.x] = (threadIdx.x != 0) ? x[xOffset] : sd::math::nd4j_max(x[xOffset], temp); // take into account max element evaluated on previous iteration and stored in temp } else shmem[threadIdx.x] = -DataTypeUtils::max(); // FIXME: what if T is unsigned ?? 
@@ -220,7 +220,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo, for (int s = blockDim.x / 2; s > 0; s /= 2) { if(threadIdx.x < s) - shmem[threadIdx.x] = nd4j::math::nd4j_max(shmem[threadIdx.x], shmem[threadIdx.x + s]); + shmem[threadIdx.x] = sd::math::nd4j_max(shmem[threadIdx.x], shmem[threadIdx.x + s]); __syncthreads(); } @@ -238,7 +238,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo, if(elemIdx < len) { const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo); const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo); - z[zOffset] = nd4j::math::nd4j_exp(x[xOffset] - max); + z[zOffset] = sd::math::nd4j_exp(x[xOffset] - max); shmem[threadIdx.x] = (threadIdx.x != 0) ? z[zOffset] : (z[zOffset] + temp); // take into account sum element evaluated on previous iteration and stored in temp } else @@ -302,7 +302,7 @@ static void softMaxCudaLauncher(const int blocksPerGrid, const int threadsPerBlo ////////////////////////////////////////////////////////////////////////// -void softmax(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { +void softmax(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { if(!input.isActualOnDeviceSide()) input.syncToDevice(); const int rank = input.rankOf(); @@ -321,8 +321,8 @@ void softmax(nd4j::LaunchContext * context, const NDArray& input, NDArray& outpu } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {dimension}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), {dimension}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {dimension}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), {dimension}); const int threadsPerBlock = MAX_NUM_THREADS / 4; const int blocksPerGrid = 
packZ.numberOfTads(); @@ -374,7 +374,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); - shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp + shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : sd::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp } else shmem[threadIdx.x] = -DataTypeUtils::max(); // FIXME: what if T is unsigned ?? @@ -383,7 +383,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape for (int s = blockDim.x / 2; s > 0; s /= 2) { if(threadIdx.x < s) - shmem[threadIdx.x] = nd4j::math::nd4j_max(shmem[threadIdx.x], shmem[threadIdx.x + s]); + shmem[threadIdx.x] = sd::math::nd4j_max(shmem[threadIdx.x], shmem[threadIdx.x + s]); __syncthreads(); } @@ -400,7 +400,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); - z[offset] = nd4j::math::nd4j_exp(x[offset] - max); + z[offset] = sd::math::nd4j_exp(x[offset] - max); shmem[threadIdx.x] = (threadIdx.x != 0) ? 
z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp } else @@ -422,7 +422,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx >= len) continue; const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); - z[offset] = nd4j::math::nd4j_log(z[offset] / shmem[0]); + z[offset] = sd::math::nd4j_log(z[offset] / shmem[0]); } } @@ -434,7 +434,7 @@ linkage void logSoftMaxForVectorCudaLauncher(const cudaStream_t* stream, const v } ////////////////////////////////////////////////////////////////////////// -void logSoftmax(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { +void logSoftmax(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { if(!input.isActualOnDeviceSide()) input.syncToDevice(); const int rank = input.rankOf(); @@ -493,7 +493,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); - shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp + shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : sd::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp } else shmem[threadIdx.x] = -DataTypeUtils::max(); // FIXME: what if T is unsigned ?? 
@@ -502,7 +502,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong for (int s = blockDim.x / 2; s > 0; s /= 2) { if(threadIdx.x < s) - shmem[threadIdx.x] = nd4j::math::nd4j_max(shmem[threadIdx.x], shmem[threadIdx.x + s]); + shmem[threadIdx.x] = sd::math::nd4j_max(shmem[threadIdx.x], shmem[threadIdx.x + s]); __syncthreads(); } @@ -519,7 +519,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); - z[offset] = nd4j::math::nd4j_exp(x[offset] - max); + z[offset] = sd::math::nd4j_exp(x[offset] - max); shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp } else @@ -554,7 +554,7 @@ linkage void softMaxDerivForVectorCudaLauncher(const cudaStream_t* stream, const } /////////////////////////////////////////////////////////////////// -void softmaxDerivative(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { +void softmaxDerivative(sd::LaunchContext * context, const NDArray& input, NDArray& output, const int dimension) { if(!input.isActualOnDeviceSide()) input.syncToDevice(); const int rank = input.rankOf(); @@ -590,7 +590,7 @@ void softmaxDerivative(nd4j::LaunchContext * context, const NDArray& input, NDAr const_cast(input).applyLambda(routine, output); } - void thresholdRelu(nd4j::LaunchContext * context, NDArray const& input, double threshold, NDArray& output) { + void thresholdRelu(sd::LaunchContext * context, NDArray const& input, double threshold, NDArray& output) { BUILD_SINGLE_SELECTOR(input.dataType(), thresholdRelu_, (input, threshold, output), FLOAT_TYPES); } @@ -601,7 +601,7 @@ void softmaxDerivative(nd4j::LaunchContext * context, const NDArray& input, NDAr input->applyPairwiseLambda(*dLdO, derivative, *output); } - 
void thresholdReluDerivative(nd4j::LaunchContext * context, NDArray* input, double threshold, NDArray* dLdO, NDArray* output) { + void thresholdReluDerivative(sd::LaunchContext * context, NDArray* input, double threshold, NDArray* dLdO, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), thresholdReluDerivative_, (input, threshold, dLdO, output), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu index 1c0bc1925..3fc5d42c9 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu @@ -20,9 +20,9 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -92,7 +92,7 @@ static void addBiasCudaLauncher(const int blocksPerGrid, const int threadsPerBlo } ////////////////////////////////////////////////////////////////////////// -void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { +void addBias(sd::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { PointersManager manager(block.launchContext(), "addBias"); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu index 5712887da..0b3681663 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu @@ -21,9 +21,9 @@ #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -79,10 +79,10 @@ static _CUDA_H void adjustHueCudaLauncher(const int blocksPerGrid, const int thr } //////////////////////////////////////////////////////////////////////// -void adjustHue(nd4j::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC) { +void 
adjustHue(sd::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); const Nd4jLong numOfTads = packX.numberOfTads(); @@ -166,15 +166,15 @@ static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShape } template -static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { +static void _adjust_hue_single(sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { // numChannels is always 3 auto tuples = array->lengthOf() / 3; if (isNHWC) { adjustHueSingleNHWCKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta); } else { // TODO: check this one - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); auto tadLength = shape::length(packX.primaryShapeInfo()); @@ -184,7 +184,7 @@ static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, ND template -static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { +static void 
_adjust_hue_batch(sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { auto xType = array->dataType(); // numChannels is always 3 @@ -195,8 +195,8 @@ static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDA BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES); } else { // TODO: check this one - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); auto tadLength = shape::length(packX.primaryShapeInfo()); @@ -204,7 +204,7 @@ static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDA } } -void _adjust_hue(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { +void _adjust_hue(sd::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { auto xType = array->dataType(); float d = delta->e(0); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu index b801765b2..f2da480cb 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu @@ -22,10 +22,10 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -81,10 +81,10 @@ static _CUDA_H void adjustSaturationCudaLauncher(const int blocksPerGrid, const } //////////////////////////////////////////////////////////////////////// -void adjustSaturation(nd4j::LaunchContext* context, const NDArray *input, const NDArray* 
factorScalarArr, NDArray *output, const int dimC) { +void adjustSaturation(sd::LaunchContext* context, const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); const Nd4jLong numOfTads = packX.numberOfTads(); @@ -117,7 +117,7 @@ static void _CUDA_G adjustSaturationSingleNHWCKernel(void *xBuffer, Nd4jLong *xS T h, s, v; // Convert the RGB color to Hue/V-range. helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + s = sd::math::nd4j_min((T) 1.0f, sd::math::nd4j_max((T) 0.0f, s * delta)); // Convert the hue and v-range back into RGB. helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2); @@ -150,22 +150,22 @@ static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xT T h, s, v; // Convert the RGB color to Hue/V-range. helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + s = sd::math::nd4j_min((T) 1.0f, sd::math::nd4j_max((T) 0.0f, s * delta)); // Convert the hue and v-range back into RGB. 
helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo); } } template -static void _adjust_saturation_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { +static void _adjust_saturation_single(sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { // numChannels is always 3 auto tuples = array->lengthOf() / 3; if (isNHWC) { adjustSaturationSingleNHWCKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta); } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); auto tadLength = shape::length(packX.primaryShapeInfo()); @@ -174,7 +174,7 @@ static void _adjust_saturation_single(nd4j::LaunchContext * context, NDArray *ar } template -static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { +static void _adjust_saturation_batch(sd::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { auto xType = array->dataType(); // numChannels is always 3 @@ -185,8 +185,8 @@ static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *arr BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES); } else { // TODO: check this one - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); + auto packX = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); auto tadLength = shape::length(packX.primaryShapeInfo()); @@ -194,7 +194,7 @@ static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *arr } } -void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { +void adjust_saturation(sd::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { auto xType = array->dataType(); float d = delta->e(0); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/axis.cu b/libnd4j/include/ops/declarable/helpers/cuda/axis.cu index 1236ae495..1dd00f688 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/axis.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/axis.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batched_gemm.cu b/libnd4j/include/ops/declarable/helpers/cuda/batched_gemm.cu index 99fbd33a8..3e3110b0e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/batched_gemm.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/batched_gemm.cu @@ -21,14 +21,14 @@ #include #include -#include -#include +#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -92,7 +92,7 @@ void bgemm(const std::vector& vA, const std::vector& vB, std pCbuffs[i] = pC[i]->getSpecialBuffer(); } - nd4j::LaunchContext* context = vA[0]->getContext(); + sd::LaunchContext* context = vA[0]->getContext(); PointersManager manager(context, "helpers::bgemm cuda"); const void** aBuffers = reinterpret_cast(manager.replicatePointer(pAbuffs.data(), bS * sizeof(void*))); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu 
b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu index eedbe1fdf..5e113ff2f 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu @@ -21,11 +21,11 @@ #include #include -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -67,7 +67,7 @@ namespace helpers { // const auto meanOffset = shape::getIndexOffset(i, meanShapeInfo); // const auto varianceOffset = shape::getIndexOffset(i, varianceShapeInfo); -// T sigmaInvGam = 1. / nd4j::math::nd4j_sqrt(variance[varianceOffset] + epsilon); +// T sigmaInvGam = 1. / sd::math::nd4j_sqrt(variance[varianceOffset] + epsilon); // if(gamma != nullptr) // sigmaInvGam *= gamma[shape::getIndexOffset(i, gammaShapeInfo)]; @@ -149,7 +149,7 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo const auto meanOffset = shape::getOffset(meanShapeInfo, coords); const auto varianceOffset = shape::getOffset(varianceShapeInfo, coords); - T sigmaInvGam = 1. / nd4j::math::nd4j_sqrt(variance[varianceOffset] + epsilon); + T sigmaInvGam = 1. 
/ sd::math::nd4j_sqrt(variance[varianceOffset] + epsilon); if(gamma != nullptr) { const auto gammaOffset = shape::getOffset(gammaShapeInfo, coords); @@ -201,8 +201,8 @@ void batchnorm(const NDArray* input, const NDArray* mean, const NDArray* varianc // std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), axes); - // auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimsToExclude); - // auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), dimsToExclude); + // auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimsToExclude); + // auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), dimsToExclude); // const int threadsPerBlock = MAX_NUM_THREADS / 2; // const int blocksPerGrid = (mean->lengthOf() + threadsPerBlock - 1) / threadsPerBlock; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu b/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu index 267ae21c2..f1407f9e8 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu @@ -19,11 +19,11 @@ // #include -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -170,7 +170,7 @@ static void betaIncForArrayCudaLauncher(const int blocksPerGrid, const int threa /////////////////////////////////////////////////////////////////// // overload betaInc for arrays, shapes of a, b and x must be the same !!! 
-void betaInc(nd4j::LaunchContext* context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output) { +void betaInc(sd::LaunchContext* context, const NDArray& a, const NDArray& b, const NDArray& x, NDArray& output) { const int threadsPerBlock = maxIter; const int blocksPerGrid = output.lengthOf(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu index f924fdb75..878ce3a6a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu @@ -20,9 +20,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -69,8 +69,8 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI const uint colHstart = (imH < kH) ? 0 : (imH - kH) / sH + 1; const uint colWstart = (imW < kW) ? 0 : (imW - kW) / sW + 1; - const uint colHend = nd4j::math::nd4j_min(imH / sH + 1, oH); - const uint colWend = nd4j::math::nd4j_min(imW / sW + 1, oW); + const uint colHend = sd::math::nd4j_min(imH / sH + 1, oH); + const uint colWend = sd::math::nd4j_min(imW / sW + 1, oW); T val = 0; @@ -140,10 +140,10 @@ __global__ static void col2imCuda2(const void *columns, void *image, const Nd4jL // compute the start and end of the output // These are the indexes for dimensions ??? in the 6d col matrix int w_col_start = (w_im < kWeff) ? 0 : (w_im - kWeff) / sW + 1; - int w_col_end = nd4j::math::nd4j_min(w_im / sW + 1, oW); + int w_col_end = sd::math::nd4j_min(w_im / sW + 1, oW); int h_col_start = (h_im < kHeff) ? 0 : (h_im - kHeff) / sH + 1; - int h_col_end = nd4j::math::nd4j_min(h_im / sH + 1, oH); + int h_col_end = sd::math::nd4j_min(h_im / sH + 1, oH); //Iterate over col entries in the 6d array... 
these are added up for (int colH = h_col_start; colH < h_col_end; colH += 1) { @@ -184,7 +184,7 @@ static void col2imCudaLauncher(const int blocksPerGrid, const int threadsPerBloc } ////////////////////////////////////////////////////////////////////////// -void col2im(nd4j::LaunchContext& context, const NDArray& col, NDArray& im, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { +void col2im(sd::LaunchContext& context, const NDArray& col, NDArray& im, const int sH, const int sW, const int pH, const int pW, const int iH, const int iW, const int dH, const int dW) { PointersManager manager(&context, "col2im"); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu b/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu index d2792b630..8d0bede62 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu @@ -16,7 +16,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -107,21 +107,21 @@ namespace nd4j { } template - static void _compare_elem(nd4j::LaunchContext * context, NDArray *input, bool isStrictlyIncreasing, bool& output) { + static void _compare_elem(sd::LaunchContext * context, NDArray *input, bool isStrictlyIncreasing, bool& output) { auto z = NDArrayFactory::create(false, context); const int numThreads = 256; - const int numBlocks = nd4j::math::nd4j_min(128, nd4j::math::nd4j_max(1, input->lengthOf() / numThreads)); + const int numBlocks = sd::math::nd4j_min(128, sd::math::nd4j_max(1, input->lengthOf() / numThreads)); comparator<<getCudaStream()>>>(input->specialBuffer(), input->specialShapeInfo(), input->lengthOf(), isStrictlyIncreasing, context->getReductionPointer(), reinterpret_cast(z.specialBuffer())); z.tickWriteDevice(); - nd4j::DebugHelper::checkErrorCode(context->getCudaStream(), "is_strictly_increasing"); + 
sd::DebugHelper::checkErrorCode(context->getCudaStream(), "is_strictly_increasing"); output = z.e(0); } - void compare_elem(nd4j::LaunchContext * context, NDArray *input, bool isStrictlyIncreasing, bool& output) { + void compare_elem(sd::LaunchContext * context, NDArray *input, bool isStrictlyIncreasing, bool& output) { auto xType = input->dataType(); input->syncToDevice(); @@ -129,7 +129,7 @@ namespace nd4j { } - BUILD_SINGLE_TEMPLATE(template void _compare_elem, (nd4j::LaunchContext * context, NDArray *A, bool isStrictlyIncreasing, bool& output);, LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void _compare_elem, (sd::LaunchContext * context, NDArray *A, bool isStrictlyIncreasing, bool& output);, LIBND4J_TYPES); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu index b455ff659..8ae46a19e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu @@ -23,13 +23,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -83,7 +83,7 @@ __host__ static void concatCudaLauncher(const int blocksPerGrid, const int threa BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, void* pVx, void* pxShapeInfo, void* vz, Nd4jLong* zShapeInfo, const int axis), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// -void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { +void concat(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { const int numOfInArrs = inArrs.size(); const auto sizeofT = output.sizeOfT(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu 
b/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu index 3738d7770..edb7538d4 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu @@ -19,12 +19,12 @@ // #include -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -65,20 +65,20 @@ namespace helpers { } template - void _confusionFunctor(nd4j::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { + void _confusionFunctor(sd::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { auto stream = context->getCudaStream(); - auto pack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), 1); + auto pack = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), 1); PointersManager manager(context, "helpers::confusion"); - Nd4jLong* labelsLongBuffer = labels->dataType() == nd4j::DataType::INT64?(Nd4jLong*)labels->specialBuffer():nullptr; - Nd4jLong* predictionLongBuffer = predictions->dataType() == nd4j::DataType::INT64?(Nd4jLong*)predictions->specialBuffer():nullptr; + Nd4jLong* labelsLongBuffer = labels->dataType() == sd::DataType::INT64?(Nd4jLong*)labels->specialBuffer():nullptr; + Nd4jLong* predictionLongBuffer = predictions->dataType() == sd::DataType::INT64?(Nd4jLong*)predictions->specialBuffer():nullptr; if (labelsLongBuffer == nullptr) { auto err = cudaMalloc(&labelsLongBuffer, labels->lengthOf() * sizeof(Nd4jLong)); if (err != 0) - throw nd4j::cuda_exception::build("Cannot allocate memory for labels long buffer", err); + throw sd::cuda_exception::build("Cannot allocate memory for labels long buffer", err); // copy with type conversion copyBuffers<<<256, 512, 1024, *stream>>>(labelsLongBuffer, labels->getSpecialBuffer(), labels->lengthOf()); } @@ -86,7 +86,7 @@ namespace helpers { if 
(predictionLongBuffer == nullptr) { auto err = cudaMalloc(&predictionLongBuffer, predictions->lengthOf() * sizeof(Nd4jLong)); if (err != 0) - throw nd4j::cuda_exception::build("Cannot allocate memory for predictions long buffer", err); + throw sd::cuda_exception::build("Cannot allocate memory for predictions long buffer", err); // copy with type conversion copyBuffers<<<256, 512, 1024, *stream>>>(predictionLongBuffer, predictions->getSpecialBuffer(), predictions->lengthOf()); } @@ -100,17 +100,17 @@ namespace helpers { if (predictionLongBuffer != predictions->getSpecialBuffer()) { cudaError_t err = cudaFree(predictionLongBuffer); if (err != 0) - throw nd4j::cuda_exception::build("Cannot deallocate memory for predictions long buffer", err); + throw sd::cuda_exception::build("Cannot deallocate memory for predictions long buffer", err); } if (labelsLongBuffer != labels->getSpecialBuffer()) { cudaError_t err = cudaFree(labelsLongBuffer); if (err != 0) - throw nd4j::cuda_exception::build("Cannot deallocate memory for labels long buffer", err); + throw sd::cuda_exception::build("Cannot deallocate memory for labels long buffer", err); } } - void confusionFunctor(nd4j::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { + void confusionFunctor(sd::LaunchContext * context, NDArray* labels, NDArray* predictions, NDArray* weights, NDArray* output) { auto xType = predictions->dataType(); auto zType = output->dataType(); // weights can be null NDArray::prepareSpecialUse({output}, {labels, predictions, weights}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu index 39732b024..d02e99987 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu @@ -24,12 +24,12 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include 
+#include -namespace nd4j { +namespace sd { namespace ops { ////////////////////////////////////////////////////////////////////////// @@ -99,7 +99,7 @@ static void vol2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlo } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::vol2col(nd4j::graph::Context& block, const NDArray& vol, NDArray& col, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { +void ConvolutionUtils::vol2col(sd::graph::Context& block, const NDArray& vol, NDArray& col, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { PointersManager manager(block.launchContext(), "vol2col"); @@ -161,9 +161,9 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape const uint colHstart = (imH < kH) ? 0 : (imH - kH) / sH + 1; const uint colWstart = (imW < kW) ? 
0 : (imW - kW) / sW + 1; - const uint colDend = nd4j::math::nd4j_min(imD / sD + 1, oD); - const uint colHend = nd4j::math::nd4j_min(imH / sH + 1, oH); - const uint colWend = nd4j::math::nd4j_min(imW / sW + 1, oW); + const uint colDend = sd::math::nd4j_min(imD / sD + 1, oD); + const uint colHend = sd::math::nd4j_min(imH / sH + 1, oH); + const uint colWend = sd::math::nd4j_min(imW / sW + 1, oW); T val = 0; @@ -200,7 +200,7 @@ static void col2volCudaLauncher(const int blocksPerGrid, const int threadsPerBlo } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::col2vol(nd4j::graph::Context& block, const NDArray& col, NDArray& vol, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { +void ConvolutionUtils::col2vol(sd::graph::Context& block, const NDArray& col, NDArray& vol, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) { PointersManager manager(block.launchContext(), "col2vol"); @@ -217,7 +217,7 @@ void ConvolutionUtils::col2vol(nd4j::graph::Context& block, const NDArray& col, ////////////////////////////////////////////////////////////////////////// template -static void conv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +static void conv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, oC] always @@ -275,13 +275,13 @@ static void conv2d_(nd4j::graph::Context& block, const 
NDArray* input, const NDA } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::conv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +void ConvolutionUtils::conv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } ////////////////////////////////////////////////////////////////////////// template -static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +static void depthwiseConv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, mC] always @@ -336,13 +336,13 @@ static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const 
int dH, const int dW, const int paddingMode, const int isNCHW) { +void ConvolutionUtils::depthwiseConv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } ////////////////////////////////////////////////////////////////////////// template -static void sconv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +static void sconv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weightsDepth [kH, kW, iC, mC] always @@ -381,7 +381,7 @@ static void sconv2d_(nd4j::graph::Context& block, const NDArray* input, const ND } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::sconv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +void ConvolutionUtils::sconv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias, 
NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), sconv2d_, (block, input, weightsDepth, weightsPoint, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } @@ -438,24 +438,24 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn int wend = wstart + kWEff; if(hstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -hstart / (Z)dH); + int f = sd::math::nd4j_ceil((Z) -hstart / (Z)dH); hstart += f * dH; } if(wstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -wstart / (Z) dW); + int f = sd::math::nd4j_ceil((Z) -wstart / (Z) dW); wstart += f * dW; } if(hend > iH){ - int f = nd4j::math::nd4j_ceil((Z) (hend-iH) / (Z) dH); + int f = sd::math::nd4j_ceil((Z) (hend-iH) / (Z) dH); hend -= f * dH; } if(wend > iW){ - int f = nd4j::math::nd4j_ceil((Z) (wend-iW) / (Z) dW); + int f = sd::math::nd4j_ceil((Z) (wend-iW) / (Z) dW); wend -= f * dW; } //Accounts for dilation - int pool_size = nd4j::math::nd4j_ceil((double) (hend-hstart) / (double) dH) * nd4j::math::nd4j_ceil((double) (wend-wstart) / (double) dW); + int pool_size = sd::math::nd4j_ceil((double) (hend-hstart) / (double) dH) * sd::math::nd4j_ceil((double) (wend-wstart) / (double) dW); Z sum = 0.0f; @@ -475,7 +475,7 @@ static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn ////////////////////////////////////////////////////////////////////////// template -static void avgPooling2dCudaLauncher(nd4j::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) { +static void avgPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const 
int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) { avgPooling2dCuda<<<512, 512, 4192, *block.getCudaStream()>>>(vx, vxShapeInfo, vz, vzShapeInfo, kH, kW, sH, sW, pH, pW, dH, dW, extraParam0); } @@ -533,24 +533,24 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape int wend = wstart + kWEff; if (hstart < 0) { - int f = nd4j::math::nd4j_ceil((Z) -hstart / (Z) dH); + int f = sd::math::nd4j_ceil((Z) -hstart / (Z) dH); hstart += f * dH; } if (wstart < 0) { - int f = nd4j::math::nd4j_ceil((Z) -wstart / (Z) dW); + int f = sd::math::nd4j_ceil((Z) -wstart / (Z) dW); wstart += f * dW; } if (hend > iH) { - int f = nd4j::math::nd4j_ceil((Z) (hend - iH) / (Z) dH); + int f = sd::math::nd4j_ceil((Z) (hend - iH) / (Z) dH); hend -= f * dH; } if (wend > iW) { - int f = nd4j::math::nd4j_ceil((Z) (wend - iW) / (Z) dW); + int f = sd::math::nd4j_ceil((Z) (wend - iW) / (Z) dW); wend -= f * dW; } //Accounts for dilation - int pool_size = nd4j::math::nd4j_ceil((double) (hend - hstart) / (double) dH) * - nd4j::math::nd4j_ceil((double) (wend - wstart) / (double) dW); + int pool_size = sd::math::nd4j_ceil((double) (hend - hstart) / (double) dH) * + sd::math::nd4j_ceil((double) (wend - wstart) / (double) dW); Z sum = 0.f; @@ -558,15 +558,15 @@ static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShape for (int h = hstart; h < hend; h += dH) for (int w = wstart; w < wend; w += dW) - sum += nd4j::math::nd4j_pow(static_cast(nd4j::math::nd4j_abs(inSlice[h * strideY + w * strideX])), extraParam0); + sum += sd::math::nd4j_pow(static_cast(sd::math::nd4j_abs(inSlice[h * strideY + w * strideX])), extraParam0); - z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = nd4j::math::nd4j_pow(sum, (Z) 1.0f / extraParam0); + z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = sd::math::nd4j_pow(sum, (Z) 1.0f / extraParam0); } } 
////////////////////////////////////////////////////////////////////////// template -static void pnormPooling2dCudaLauncher(nd4j::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) { +static void pnormPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) { pnormPooling2dCuda<<<512, 512, 4192, *block.getCudaStream()>>>(vx, vxShapeInfo, vz, vzShapeInfo, kH, kW, sH, sW, pH, pW, dH, dW, extraParam0); } @@ -624,25 +624,25 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeIn int wend = wstart + kWEff; if(hstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -hstart / (Z)dH); + int f = sd::math::nd4j_ceil((Z) -hstart / (Z)dH); hstart += f * dH; } if(wstart < 0){ - int f = nd4j::math::nd4j_ceil((Z) -wstart / (Z) dW); + int f = sd::math::nd4j_ceil((Z) -wstart / (Z) dW); wstart += f * dW; } if(hend > iH){ - int f = nd4j::math::nd4j_ceil((Z) (hend-iH) / (Z) dH); + int f = sd::math::nd4j_ceil((Z) (hend-iH) / (Z) dH); hend -= f * dH; } if(wend > iW){ - int f = nd4j::math::nd4j_ceil((Z) (wend-iW) / (Z) dW); + int f = sd::math::nd4j_ceil((Z) (wend-iW) / (Z) dW); wend -= f * dW; } //Accounts for dilation - int pool_size = nd4j::math::nd4j_ceil((double) (hend-hstart) / (double) dH) * nd4j::math::nd4j_ceil((double) (wend-wstart) / (double) dW); + int pool_size = sd::math::nd4j_ceil((double) (hend-hstart) / (double) dH) * sd::math::nd4j_ceil((double) (wend-wstart) / (double) dW); - Z max = -nd4j::DataTypeUtils::max(); + Z max = -sd::DataTypeUtils::max(); const X *inSlice = x + (n * strideB + c * strideC); @@ -660,12 +660,12 @@ static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong 
*xShapeIn ////////////////////////////////////////////////////////////////////////// template -static void maxPooling2dCudaLauncher(nd4j::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) { +static void maxPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) { maxPooling2dCuda<<<512, 512, 4192, *block.getCudaStream()>>>(vx, vxShapeInfo, vz, vzShapeInfo, kH, kW, sH, sW, pH, pW, dH, dW, extraParam0); } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::pooling2d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) { +void ConvolutionUtils::pooling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) { if(!input.isActualOnDeviceSide()) input.syncToDevice(); @@ -789,7 +789,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, uint a = (dend - dstart) / dD + ((dend - dstart) % dD == 0 ? 0 : 1); uint b = (hend - hstart) / dH + ((hend - hstart) % dH == 0 ? 0 : 1); uint c = (wend - wstart) / dW + ((wend - wstart) % dW == 0 ? 
0 : 1); - sum /= static_cast(a * b * c); // /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(dD)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(dH)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(dW)); //Accounts for dilation + sum /= static_cast(a * b * c); // /= sd::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(dD)) * sd::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(dH)) * sd::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(dW)); //Accounts for dilation } else if (extraParam0 == 1) //Include padding sum /= kProd; @@ -804,9 +804,9 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); + sum += sd::math::nd4j_pow(sd::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); - sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); + sum = sd::math::nd4j_pow(sum, (T) 1.f / extraParam0); z[zOffset] = sum; } @@ -829,7 +829,7 @@ static void pooling3dCudaLauncher(const int blocksPerGrid, const int threadsPerB } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::pooling3d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { +void ConvolutionUtils::pooling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, 
const int poolingMode, const int extraParam0) { PointersManager manager(block.launchContext(), "pooling3d"); @@ -923,7 +923,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf coords[2] = coord2; coords[3] = coord3; auto zOffset = shape::getOffset(zShapeInfo, coords); - nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], y[yOffset]); + sd::math::atomics::nd4j_atomicAdd(&z[zOffset], y[yOffset]); //z[zOffset] += y[yOffset]; } break; @@ -934,13 +934,13 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf T val = y[yOffset]; if (extraParam0 == 0) //Exclude padding - val /= nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(dH)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(dW)); //Accounts for dilation + val /= sd::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(dH)) * sd::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(dW)); //Accounts for dilation else if (extraParam0 == 1) //Include padding val /= kProd; for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); + sd::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); } break; @@ -952,15 +952,15 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); + sum += sd::math::nd4j_pow(sd::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); - val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + val *= sd::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) { for (coords[3] = wstart; 
coords[3] < wend; coords[3] += dW) { const auto xOffset = shape::getOffset(xShapeInfo, coords); const auto zOffset = shape::getOffset(zShapeInfo, coords); - nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset])); + sd::math::atomics::nd4j_atomicAdd(&z[zOffset], val * sd::math::nd4j_pow(sd::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * sd::math::nd4j_sgn(x[xOffset])); } } } @@ -984,7 +984,7 @@ static void pooling2dBPCudaLauncher(const int blocksPerGrid, const int threadsPe } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::pooling2dBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { +void ConvolutionUtils::pooling2dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { // initial zeroing of gradI gradI.nullify(); @@ -1092,7 +1092,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf coords[2] = coord2; coords[3] = coord3; coords[4] = coord4; - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], y[yOffset]); + sd::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], y[yOffset]); } break; @@ -1102,14 +1102,14 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf T val = y[yOffset]; if (extraParam0 == 0) //Exclude padding - val /= nd4j::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(dD)) * nd4j::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(dH)) * nd4j::math::nd4j_ceil(static_cast(wend - wstart) 
/ static_cast(dW)); //Accounts for dilation + val /= sd::math::nd4j_ceil(static_cast(dend - dstart) / static_cast(dD)) * sd::math::nd4j_ceil(static_cast(hend - hstart) / static_cast(dH)) * sd::math::nd4j_ceil(static_cast(wend - wstart) / static_cast(dW)); //Accounts for dilation else if (extraParam0 == 1) //Include padding val /= kProd; for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); + sd::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); } break; @@ -1122,16 +1122,16 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); + sum += sd::math::nd4j_pow(sd::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); - val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); + val *= sd::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) { for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) { for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) { const auto xOffset = shape::getOffset(xShapeInfo, coords); const auto zOffset = shape::getOffset(zShapeInfo, coords); - nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset])); + sd::math::atomics::nd4j_atomicAdd(&z[zOffset], val * sd::math::nd4j_pow(sd::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * sd::math::nd4j_sgn(x[xOffset])); } } } @@ -1156,7 +1156,7 @@ 
static void pooling3dBPCudaLauncher(const int blocksPerGrid, const int threadsPe } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::pooling3dBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { +void ConvolutionUtils::pooling3dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { // initial zeroing of gradI gradI.nullify(); @@ -1176,7 +1176,7 @@ void ConvolutionUtils::pooling3dBP(nd4j::graph::Context& block, const NDArray& i ////////////////////////////////////////////////////////////////////////// template -static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +static void conv2dBP_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, oC] always @@ -1220,7 +1220,7 @@ static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const N if(gradW) { auto ctx = block.launchContext(); helpers::im2col(*ctx, 
*input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] - nd4j::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, {2, 0, 1, 3}); // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC] + sd::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, {2, 0, 1, 3}); // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC] } // ----- calculation of gradB ----- // @@ -1234,7 +1234,7 @@ static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const N } //----- calculation of gradI -----// - nd4j::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2, 3, 1, 0, 4, 5}); // [kH, kW, iC, oC]/[oC, iC, kH, kW]] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW] + sd::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, {2, 3, 1, 0, 4, 5}); // [kH, kW, iC, oC]/[oC, iC, kH, kW]] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW] helpers::col2im(*block.launchContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW); // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] @@ -1245,7 +1245,7 @@ static void conv2dBP_(nd4j::graph::Context& block, const NDArray* input, const N } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::conv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +void ConvolutionUtils::conv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, 
const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } @@ -1303,7 +1303,7 @@ static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, con // ----- calculation of gradW and gradB ----- // helpers::im2col(*input->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext())); // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] - nd4j::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, {{2,0,1,3},{iC,kH*kW,mC}}); // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC] + sd::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, {{2,0,1,3},{iC,kH*kW,mC}}); // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC] // ----- calculation of gradB ----- // if(gradB) { @@ -1316,7 +1316,7 @@ static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, con } //----- calculation of gradI -----// - nd4j::MmulHelper::tensorDot(weights, gradO, &columns, {{2,0,1,3},{iC,kH*kW,mC}}, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW] + sd::MmulHelper::tensorDot(weights, gradO, &columns, {{2,0,1,3},{iC,kH*kW,mC}}, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW] helpers::col2im(*input->getContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW); // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] if(!isNCHW) { @@ -1326,7 +1326,7 @@ static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, con } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::depthwiseConv2dBP(nd4j::graph::Context& block, const NDArray* input, 
const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { +void ConvolutionUtils::depthwiseConv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW), FLOAT_TYPES); } @@ -1384,7 +1384,7 @@ static void upsampling2dCudaLauncher(const int blocksPerGrid, const int threadsP } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::upsampling2d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) { +void ConvolutionUtils::upsampling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) { PointersManager manager(block.launchContext(), "upsampling2d"); @@ -1453,7 +1453,7 @@ static void upsampling3dCudaLauncher(const int blocksPerGrid, const int threadsP } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::upsampling3d(nd4j::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) { +void ConvolutionUtils::upsampling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) { PointersManager manager(block.launchContext(), "upsampling3d"); @@ -1527,7 +1527,7 @@ 
static void upsampling2dBPCudaLauncher(const int blocksPerGrid, const int thread } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::upsampling2dBP(nd4j::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) { +void ConvolutionUtils::upsampling2dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) { PointersManager manager(block.launchContext(), "upsampling2d_bp"); @@ -1604,7 +1604,7 @@ static void upsampling3dBPCudaLauncher(const int blocksPerGrid, const int thread } ////////////////////////////////////////////////////////////////////////// -void ConvolutionUtils::upsampling3dBP(nd4j::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCDHW) { +void ConvolutionUtils::upsampling3dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCDHW) { PointersManager manager(block.launchContext(), "upsampling3d_bp"); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu index 1cd771b98..38e8d0cca 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu @@ -20,10 +20,10 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -102,7 +102,7 @@ __host__ static void crossCudaLauncher(const int blocksPerGrid, const int thread BUILD_SINGLE_TEMPLATE(template void crossCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo), NUMERIC_TYPES); -void crossBatched(nd4j::LaunchContext* context, NDArray *x, NDArray *y, NDArray *z) { +void crossBatched(sd::LaunchContext* context, NDArray *x, NDArray *y, NDArray *z) { const int threadsPerBlock = MAX_NUM_THREADS / 4; const 
int blocksPerGrid = (x->lengthOf() / x->sizeAt(-1) + threadsPerBlock - 1) / threadsPerBlock; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/d_t_s.cu b/libnd4j/include/ops/declarable/helpers/cuda/d_t_s.cu index b9aa4339b..35103d18b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/d_t_s.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/d_t_s.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -88,11 +88,11 @@ namespace helpers { template - static void __depthToSpace(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { + static void __depthToSpace(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { depthToSpaceKernel<<<512, 512, 1024, *context->getCudaStream()>>>(input->specialBuffer(), input->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), block_size, isNHWC); } - void _depthToSpace(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { + void _depthToSpace(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { auto xType = input->dataType(); NDArray::prepareSpecialUse({output}, {input}); @@ -101,7 +101,7 @@ namespace helpers { NDArray::registerSpecialUse({output}, {input}); } - BUILD_SINGLE_TEMPLATE(template void __depthToSpace, (nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC);, LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void __depthToSpace, (sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC);, LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/diGamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/diGamma.cu index 3edd59ecc..a6d06be17 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/diGamma.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/diGamma.cu @@ -20,9 +20,9 @@ // #include 
-#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -60,7 +60,7 @@ static void diGammaCudaLauncher(const int blocksPerGrid, const int threadsPerBlo } /////////////////////////////////////////////////////////////////// -void diGamma(nd4j::LaunchContext* context, const NDArray& x, NDArray& z) { +void diGamma(sd::LaunchContext* context, const NDArray& x, NDArray& z) { int threadsPerBlock = MAX_NUM_THREADS / 2; int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu index 265e47776..87fd2aa98 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu @@ -18,10 +18,10 @@ // Created by GS on 4/6/2018. // -#include "ResultSet.h" +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -90,7 +90,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha // Returns a batched matrix tensor with new batched diagonal values. 
// for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag template - static void _diagFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { + static void _diagFunctor(sd::LaunchContext * context, const NDArray* input, NDArray* output) { auto stream = context->getCudaStream(); auto inputLength = input->lengthOf(); dim3 launchDims(256, 512, 8192); @@ -101,18 +101,18 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // diagFunctor - caller for diag functor processor - void diagFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { + void diagFunctor(sd::LaunchContext * context, const NDArray* input, NDArray* output) { auto xType = input->dataType(); BUILD_SINGLE_SELECTOR(xType, _diagFunctor, (context, input, output), LIBND4J_TYPES); } - BUILD_SINGLE_TEMPLATE(template void _diagFunctor, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output);, LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void _diagFunctor, (sd::LaunchContext * context, const NDArray* input, NDArray* output);, LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // diagPartFunctor - caller for diag part functor kernel template - void _diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) { + void _diagPartFunctor(sd::LaunchContext * context, NDArray const* input, NDArray* output) { const int outLen = output->lengthOf(); const int inLen = input->lengthOf(); auto stream = context->getCudaStream(); @@ -126,7 +126,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // diagPartFunctor - caller for diag part functor processor - void diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output) { + void diagPartFunctor(sd::LaunchContext * context, NDArray const* input, NDArray* output) { auto zType = output->dataType(); BUILD_SINGLE_SELECTOR(zType, _diagPartFunctor, (context, input, output), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu index 92aa4c55a..b318465a7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu @@ -20,9 +20,9 @@ #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -113,7 +113,7 @@ static void dilation2dCudaLauncher(const int blocksPerGrid, const int threadsPer dilation2dCuda<<>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, sH, sW, pH, pW, dH, dW); } -void dilation2d(nd4j::LaunchContext* context, NDArray *input, NDArray *weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { +void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { PointersManager manager(context, "dilation2d"); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu index 9b2a42d8f..aee79caa7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu @@ -19,17 +19,17 @@ // #include -#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template - static __global__ void 
dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) { + static __global__ void dropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, sd::graph::RandomGenerator* nodeRng) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; auto step = blockDim.x * gridDim.x; T const* input = reinterpret_cast(inputBuf); @@ -46,18 +46,18 @@ namespace helpers { } template - static void dropoutSimple(nd4j::LaunchContext* context, NDArray const* input, NDArray* output, double probValue, int seed) { - nd4j::graph::RandomGenerator nodeRng(3019L, seed); + static void dropoutSimple(sd::LaunchContext* context, NDArray const* input, NDArray* output, double probValue, int seed) { + sd::graph::RandomGenerator nodeRng(3019L, seed); int inLen = input->lengthOf(); - nd4j::graph::RandomGenerator* dRandom; + sd::graph::RandomGenerator* dRandom; auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {input}); - auto err = cudaMalloc(&dRandom, sizeof(nd4j::graph::RandomGenerator)); + auto err = cudaMalloc(&dRandom, sizeof(sd::graph::RandomGenerator)); if (err) { throw cuda_exception::build("helpers::dropoutSimple: Cannot allocate device memory for random generator.", err); } - err = cudaMemcpy(dRandom, &nodeRng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice); + err = cudaMemcpy(dRandom, &nodeRng, sizeof(sd::graph::RandomGenerator), cudaMemcpyHostToDevice); if (err) { throw cuda_exception::build("helpers::dropoutSimple: Cannot set up device memory for random generator.", err); } @@ -165,7 +165,7 @@ namespace helpers { } template - static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, 
nd4j::graph::RandomGenerator* nodeRng) { + static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, sd::graph::RandomGenerator* nodeRng) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; auto step = blockDim.x * gridDim.x; T const* input = reinterpret_cast(inputBuf); @@ -178,15 +178,15 @@ namespace helpers { } } template - static void alphaDropoutSimple(nd4j::LaunchContext* context, NDArray const* input, NDArray* output, int seed, double probValue, double alpha, double alpha1, double beta) { - nd4j::graph::RandomGenerator nodeRng(3019L, seed), *dRandom; + static void alphaDropoutSimple(sd::LaunchContext* context, NDArray const* input, NDArray* output, int seed, double probValue, double alpha, double alpha1, double beta) { + sd::graph::RandomGenerator nodeRng(3019L, seed), *dRandom; auto stream = context->getCudaStream(); - auto err = cudaMalloc(&dRandom, sizeof(nd4j::graph::RandomGenerator)); + auto err = cudaMalloc(&dRandom, sizeof(sd::graph::RandomGenerator)); NDArray::prepareSpecialUse({output}, {input}); if (err) { throw cuda_exception::build("helpers::alphaDropoutSimple: Cannot allocate device memory for random generator.", err); } - err = cudaMemcpy(dRandom, &nodeRng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice); + err = cudaMemcpy(dRandom, &nodeRng, sizeof(sd::graph::RandomGenerator), cudaMemcpyHostToDevice); if (err) { throw cuda_exception::build("helpers::alphaDropoutSimple: Cannot set up device memory for random generator.", err); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu b/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu index c70283997..a80d838be 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace 
ops { namespace helpers { @@ -110,7 +110,7 @@ namespace nd4j { } template - static void _dynamicPartitionFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList) { + static void _dynamicPartitionFunctor(sd::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList) { std::vector> outputs(outputList.size()); int sourceDimsLen = input->rankOf() - indices->rankOf(); @@ -230,7 +230,7 @@ namespace nd4j { } template - static int _dynamicStitchFunctor(nd4j::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output){ + static int _dynamicStitchFunctor(sd::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output){ int inputSize = inputs.size(); @@ -307,7 +307,7 @@ namespace nd4j { } - void dynamicPartitionFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList) { + void dynamicPartitionFunctor(sd::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList) { auto xType = input->dataType(); auto yType = indices->dataType(); @@ -328,7 +328,7 @@ namespace nd4j { throw std::runtime_error("Not umplemented yet"); } - int dynamicStitchFunctor(nd4j::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output){ + int dynamicStitchFunctor(sd::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output){ auto xType = inputs.at(0)->dataType(); auto yType = indices.at(0)->dataType(); @@ -352,13 +352,13 @@ namespace nd4j { return Status::OK(); } - int dynamicStitchFunctorBP(nd4j::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray const* gradInput, std::vector& outputList) { + int dynamicStitchFunctorBP(sd::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray const* gradInput, std::vector& 
outputList) { auto xType = inputs.at(0)->dataType(); BUILD_SINGLE_SELECTOR(xType, return _dynamicStitchFunctorBP, (inputs, indices, gradInput, outputList), NUMERIC_TYPES); } - void dynamicPartitionFunctorBP(nd4j::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector const& inputGradientList, std::vector& outputList) { + void dynamicPartitionFunctorBP(sd::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector const& inputGradientList, std::vector& outputList) { auto xType = input->dataType(); BUILD_SINGLE_SELECTOR(xType, _dynamicPartitionFunctorBP, (input, indices, inputGradientList, outputList), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu b/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu index 9470f21be..3a0ea9240 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -92,7 +92,7 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template - static void _extractPatches(nd4j::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int strideRow, int strideCol, int rateRow, int rateCol, bool theSame){ + static void _extractPatches(sd::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int strideRow, int strideCol, int rateRow, int rateCol, bool theSame){ NDArray::prepareSpecialUse({output}, {images}); std::vector restDims({1, 2, 3}); // the first and the last dims // 3D matricies - 2D matricies of vectors (if last dim is greater than 1) @@ -114,8 +114,8 @@ namespace helpers { if 
(sizeCol * rateCol < 3) colCast = 0; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(images->getShapeInfo(), restDims.data(), restDims.size()); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), restDims.data(), restDims.size()); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(images->getShapeInfo(), restDims.data(), restDims.size()); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), restDims.data(), restDims.size()); int batchCount = packX.numberOfTads(); PointersManager manager(context, "helpers::extractPatches"); @@ -132,11 +132,11 @@ namespace helpers { manager.synchronize(); NDArray::registerSpecialUse({output}, {images}); } - BUILD_SINGLE_TEMPLATE(template void _extractPatches, (nd4j::LaunchContext * context, NDArray* input, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void _extractPatches, (sd::LaunchContext * context, NDArray* input, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame), LIBND4J_TYPES); - void extractPatches(nd4j::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame){ + void extractPatches(sd::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame){ auto xType = images->dataType(); BUILD_SINGLE_SELECTOR(xType, _extractPatches, (context, images, output, sizeRow, sizeCol, stradeRow, stradeCol, rateRow, rateCol, theSame), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu index cbdff509d..262b1fe3e 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/fake_quantization.cu @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -47,7 +47,7 @@ namespace helpers { if (zeroPointFromMin > quantMaxF) { return static_cast(quantMax); } - return nd4j::math::nd4j_round(zeroPointFromMin); + return sd::math::nd4j_round(zeroPointFromMin); }(); *nudgedMax = (quantMaxF - static_cast(nudgedZeroPoint)) * (*scale); *nudgedMin = (quantMinF - static_cast(nudgedZeroPoint)) * (*scale); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu index df4e25130..e8b5e83c0 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -44,7 +44,7 @@ namespace nd4j { } template - void flatten_(nd4j::LaunchContext *context, std::vector &inputs, NDArray *output, char order) { + void flatten_(sd::LaunchContext *context, std::vector &inputs, NDArray *output, char order) { PointersManager pm(context, "flatten"); std::vector hdBuffers(inputs.size()); @@ -72,7 +72,7 @@ namespace nd4j { pm.synchronize(); } - void flatten(nd4j::LaunchContext *context, std::vector &inputs, NDArray *output, char order) { + void flatten(sd::LaunchContext *context, std::vector &inputs, NDArray *output, char order) { // FIXME: we want NDArrayFactory::prepareSpecialUse here eventually for (auto v:inputs) v->syncToDevice(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu index f6d8acc77..03d2f35d8 100644 --- 
a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu @@ -21,10 +21,10 @@ #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -105,7 +105,7 @@ __host__ static void gatherCudaLauncher(const cudaStream_t *stream, const int nu } ////////////////////////////////////////////////////////////////////// -void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs) { +void gather(sd::LaunchContext * context, const NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs) { const int inputRank = input->rankOf(); const int numOfIntArgs = intArgs.size(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu index 6b3bf5135..21ab1ff98 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu @@ -23,13 +23,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { /////////////////////////////////////////////////////////////////// @@ -53,7 +53,7 @@ namespace nd4j { xRank = shape::rank(xShapeInfo); yRank = shape::rank(yShapeInfo); zRank = shape::rank(zShapeInfo); - maxRank = nd4j::math::nd4j_max(yRank, nd4j::math::nd4j_max(xRank, zRank)); + maxRank = sd::math::nd4j_max(yRank, sd::math::nd4j_max(xRank, zRank)); zLen = shape::length(zShapeInfo); yLastDim = yShapeInfo[yRank]; @@ -121,9 +121,9 @@ namespace nd4j { } /////////////////////////////////////////////////////////////////// - void gatherND(nd4j::LaunchContext * context, NDArray& input, NDArray& indices, NDArray& output) { + void gatherND(sd::LaunchContext * context, NDArray& input, NDArray& indices, NDArray& output) { - 
const int maxRank = nd4j::math::nd4j_max(indices.rankOf(), nd4j::math::nd4j_max(input.rankOf(), output.rankOf())); + const int maxRank = sd::math::nd4j_max(indices.rankOf(), sd::math::nd4j_max(input.rankOf(), output.rankOf())); const int threadsPerBlock = 256; const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gradient.cu b/libnd4j/include/ops/declarable/helpers/cuda/gradient.cu index a12b43973..f165d88b7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gradient.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gradient.cu @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -34,7 +34,7 @@ void applyGradientDescent_(LaunchContext* context, NDArray* input, NDArray* step input->applyPairwiseLambda(*step, lambda, *output); } -void applyGradientDescent(nd4j::LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) { +void applyGradientDescent(sd::LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), applyGradientDescent_, (context, input, step, weight, output), FLOAT_TYPES); } BUILD_SINGLE_TEMPLATE(template void applyGradientDescent_, (LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gru.cu b/libnd4j/include/ops/declarable/helpers/cuda/gru.cu index 82ab9d764..bd4e878e3 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gru.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gru.cu @@ -27,15 +27,15 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// -void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* 
hLast, const NDArray* W, const NDArray* Wc, +void gruCell(sd::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, NDArray* r, NDArray* u, NDArray* c, NDArray* h) { @@ -128,7 +128,7 @@ void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLa } ////////////////////////////////////////////////////////////////////////// -void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { +void gruTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { // x input [time, bS, iS] // hLast initial cell output (at time step = 0) [bS, nU] @@ -154,7 +154,7 @@ void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* } ////////////////////////////////////////////////////////////////////////// -void gruCellBP(nd4j::LaunchContext* context, +void gruCellBP(sd::LaunchContext* context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, const NDArray* dLdr, const NDArray* dLdu, const NDArray* dLdc, const NDArray* dLdh, diff --git a/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu b/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu index 9802ff231..f88ec6003 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -52,7 +52,7 @@ namespace nd4j { __syncthreads(); // now we accumulate values - auto numItems = nd4j::math::nd4j_min(blockDim.x, length); + auto numItems = sd::math::nd4j_min(blockDim.x, length); auto floorPow2 = numItems; if (floorPow2 & (floorPow2 - 1)) { @@ -77,7 +77,7 @@ namespace nd4j { // FIXME: do 
we really want atomicAdd on global memory here // and store them to output if (threadIdx.x == 0 && shared[0] > 0) - nd4j::math::atomics::nd4j_atomicAdd(&z[0], static_cast(shared[threadIdx.x])); + sd::math::atomics::nd4j_atomicAdd(&z[0], static_cast(shared[threadIdx.x])); } template diff --git a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu index ac56b69d5..1c4ca9152 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template diff --git a/libnd4j/include/ops/declarable/helpers/cuda/histogram.cu b/libnd4j/include/ops/declarable/helpers/cuda/histogram.cu index a4bcbb311..51af14fc4 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/histogram.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/histogram.cu @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -54,7 +54,7 @@ namespace nd4j { int idx = int((dx[e] - *min_val) / binSize); idx = math::nd4j_max(idx, 0); //atomicMax(&idx, 0);//atomicMax(&idx, 0); idx = math::nd4j_min(idx, int(numBins - 1)); //atomicMin(&idx, int(numBins - 1)); - nd4j::math::atomics::nd4j_atomicAdd(&bins[idx], (Z)1); + sd::math::atomics::nd4j_atomicAdd(&bins[idx], (Z)1); } __syncthreads(); // at this point all bins in shared memory are calculated, so we aggregate them now via threadfence trick @@ -108,9 +108,9 @@ namespace nd4j { } template - static void histogram_(nd4j::LaunchContext *context, void *xBuffer, Nd4jLong *xShapeInfo, Nd4jLong *dxShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong numBins, void* min_val, void* max_val) { + static void histogram_(sd::LaunchContext *context, void *xBuffer, Nd4jLong *xShapeInfo, Nd4jLong *dxShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong numBins, void* min_val, 
void* max_val) { int numThreads = 256; - int numBlocks = nd4j::math::nd4j_max(256, nd4j::math::nd4j_min(1, shape::length(xShapeInfo) / numThreads)); + int numBlocks = sd::math::nd4j_max(256, sd::math::nd4j_min(1, shape::length(xShapeInfo) / numThreads)); int workspaceSize = numBlocks * numBins; auto tmp = NDArrayFactory::create('c', {workspaceSize}); @@ -119,7 +119,7 @@ namespace nd4j { cudaStreamSynchronize(*context->getCudaStream()); } - void histogramHelper(nd4j::LaunchContext *context, NDArray &input, NDArray &output) { + void histogramHelper(sd::LaunchContext *context, NDArray &input, NDArray &output) { Nd4jLong numBins = output.lengthOf(); NDArray::registerSpecialUse({&output}, {&input}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu index 07d7bcd93..e39f9b438 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu @@ -19,10 +19,10 @@ // #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -66,7 +66,7 @@ __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong* else zIndex = static_cast((value - leftEdge) / binWidth); - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getIndexOffset(zIndex, zShapeInfo)], 1); + sd::math::atomics::nd4j_atomicAdd(&z[shape::getIndexOffset(zIndex, zShapeInfo)], 1); } } @@ -81,7 +81,7 @@ __host__ static void histogramFixedWidthCudaLauncher(const cudaStream_t *stream, } //////////////////////////////////////////////////////////////////////// -void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, const NDArray& range, NDArray& output) { +void histogramFixedWidth(sd::LaunchContext* context, const NDArray& input, const NDArray& range, NDArray& output) { // firstly initialize output with zeros output.nullify(); @@ -137,13 +137,13 
@@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // currInd = 0; // else if(value >= lastButOneEdge) // currInd = outputLength - 1; -// nd4j::math::atomics::nd4j_atomicAdd(&z[currInd], 1LL); +// sd::math::atomics::nd4j_atomicAdd(&z[currInd], 1LL); // } // } // template -// void histogramFixedWidth_(nd4j::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output) { +// void histogramFixedWidth_(sd::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output) { // const int nbins = output.lengthOf(); // auto stream = context->getCudaStream(); // // firstly initialize output with zeros @@ -191,10 +191,10 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // // } // } -// void histogramFixedWidth(nd4j::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output) { +// void histogramFixedWidth(sd::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output) { // BUILD_SINGLE_SELECTOR(input.dataType(), histogramFixedWidth_, (context, input, range, output), LIBND4J_TYPES); // } -// BUILD_SINGLE_TEMPLATE(template void histogramFixedWidth_, (nd4j::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output), LIBND4J_TYPES); +// BUILD_SINGLE_TEMPLATE(template void histogramFixedWidth_, (sd::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu index 62fcd0588..5bce6c1b7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu @@ -19,9 +19,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -80,12 +80,12 @@ __global__ static void im2colCuda(const void *image, void *columns, 
////////////////////////////////////////////////////////////////////////// template -static void im2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, nd4j::LaunchContext & context, const void *image, void *columns, const Nd4jLong *imShapeInfo, const Nd4jLong *colShapeInfo, int sH, int sW, int pH, int pW, int dH, int dW, double zeroPadVal) { +static void im2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, sd::LaunchContext & context, const void *image, void *columns, const Nd4jLong *imShapeInfo, const Nd4jLong *colShapeInfo, int sH, int sW, int pH, int pW, int dH, int dW, double zeroPadVal) { im2colCuda<<>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal); } ////////////////////////////////////////////////////////////////////////// -void im2col(nd4j::LaunchContext& context, const NDArray& image, NDArray& columns, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { +void im2col(sd::LaunchContext& context, const NDArray& image, NDArray& columns, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal) { PointersManager manager(&context, "im2col"); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_draw_bounding_boxes.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_draw_bounding_boxes.cu index 9b05f891a..9817471bb 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_draw_bounding_boxes.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_draw_bounding_boxes.cu @@ -17,10 +17,10 @@ // // @author sgazeos@gmail.com // -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -58,22 +58,22 @@ namespace helpers { // box with shape //auto internalBox = &boxes[b * colorSetSize * 4 + c * 4];//(*boxes)(b, {0})(c, {0});//internalBoxes->at(c); auto 
colorIndex = boxIndex % colorTableLen;//colorSet->at(c); -// auto rowStart = nd4j::math::nd4j_max(Nd4jLong (0), Nd4jLong ((height - 1) * internalBox[0])); -// auto rowEnd = nd4j::math::nd4j_min(Nd4jLong (height - 1), Nd4jLong ((height - 1) * internalBox[2])); -// auto colStart = nd4j::math::nd4j_max(Nd4jLong (0), Nd4jLong ((width - 1) * internalBox[1])); -// auto colEnd = nd4j::math::nd4j_min(Nd4jLong(width - 1), Nd4jLong ((width - 1) * internalBox[3])); +// auto rowStart = sd::math::nd4j_max(Nd4jLong (0), Nd4jLong ((height - 1) * internalBox[0])); +// auto rowEnd = sd::math::nd4j_min(Nd4jLong (height - 1), Nd4jLong ((height - 1) * internalBox[2])); +// auto colStart = sd::math::nd4j_max(Nd4jLong (0), Nd4jLong ((width - 1) * internalBox[1])); +// auto colEnd = sd::math::nd4j_min(Nd4jLong(width - 1), Nd4jLong ((width - 1) * internalBox[3])); Nd4jLong indices0[] = {batch, boxIndex, 0}; Nd4jLong indices1[] = {batch, boxIndex, 1}; Nd4jLong indices2[] = {batch, boxIndex, 2}; Nd4jLong indices3[] = {batch, boxIndex, 3}; auto rowStart = Nd4jLong ((height - 1) * boxes[shape::getOffset(boxesShape, indices0, 0)]); - auto rowStartBound = nd4j::math::nd4j_max(Nd4jLong (0), rowStart); + auto rowStartBound = sd::math::nd4j_max(Nd4jLong (0), rowStart); auto rowEnd = Nd4jLong ((height - 1) * boxes[shape::getOffset(boxesShape, indices2, 0)]); - auto rowEndBound = nd4j::math::nd4j_min(Nd4jLong (height - 1), rowEnd); + auto rowEndBound = sd::math::nd4j_min(Nd4jLong (height - 1), rowEnd); auto colStart = Nd4jLong ((width - 1) * boxes[shape::getOffset(boxesShape, indices1, 0)]); - auto colStartBound = nd4j::math::nd4j_max(Nd4jLong (0), colStart); + auto colStartBound = sd::math::nd4j_max(Nd4jLong (0), colStart); auto colEnd = Nd4jLong ((width - 1) * boxes[shape::getOffset(boxesShape, indices3, 0)]); - auto colEndBound = nd4j::math::nd4j_min(Nd4jLong(width - 1), colEnd); + auto colEndBound = sd::math::nd4j_min(Nd4jLong(width - 1), colEnd); if (rowStart > rowEnd || colStart > colEnd) { // 
printf("helpers::drawBoundingBoxesFunctor: Bounding box (%lld, %lld, %lld, %lld) is inverted " // "and will not be drawn\n", rowStart, colStart, rowEnd, colEnd); @@ -137,7 +137,7 @@ namespace helpers { } template - void drawBoundingBoxesH(nd4j::LaunchContext* context, NDArray const* images, NDArray const* boxes, NDArray const* colors, NDArray* output) { + void drawBoundingBoxesH(sd::LaunchContext* context, NDArray const* images, NDArray const* boxes, NDArray const* colors, NDArray* output) { auto batchSize = images->sizeAt(0); auto height = images->sizeAt(1); auto width = images->sizeAt(2); @@ -158,7 +158,7 @@ namespace helpers { outputBuf, output->specialShapeInfo(), batchSize, width, height, channels, boxSize, colorsTable.lengthOf()); } - void drawBoundingBoxesFunctor(nd4j::LaunchContext * context, NDArray* images, NDArray* boxes, NDArray* colors, NDArray* output) { + void drawBoundingBoxesFunctor(sd::LaunchContext * context, NDArray* images, NDArray* boxes, NDArray* colors, NDArray* output) { // images - batch of 3D images with BW (last dim = 1), RGB (last dim = 3) or RGBA (last dim = 4) channel set // boxes - batch of 2D bounds with last dim (y_start, x_start, y_end, x_end) to compute i and j as // floor((height - 1 ) * y_start) => rowStart, floor((height - 1) * y_end) => rowEnd @@ -171,7 +171,7 @@ namespace helpers { BUILD_SINGLE_SELECTOR(output->dataType(), drawBoundingBoxesH, (context, images, boxes, colors, output), FLOAT_TYPES); NDArray::registerSpecialUse({output}, {images, boxes, colors}); } - BUILD_SINGLE_TEMPLATE(template void drawBoundingBoxesH, (nd4j::LaunchContext* context, NDArray const* images, NDArray const* boxes, NDArray const* colors, NDArray* output), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void drawBoundingBoxesH, (sd::LaunchContext* context, NDArray const* images, NDArray const* boxes, NDArray const* colors, NDArray* output), FLOAT_TYPES); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu 
b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu index 6096f3a85..6a045bc8d 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu @@ -34,9 +34,9 @@ limitations under the License. // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -100,12 +100,12 @@ namespace helpers { for (Nd4jLong i = outSize - tid; i >= 0; i -= step) { double in = scaler(i, scale); // interpolationData[i].bottomIndex = static_cast(in); -// interpolationData[i].topIndex = nd4j::math::nd4j_min(interpolationData[i].bottomIndex + 1, inSize - 1); +// interpolationData[i].topIndex = sd::math::nd4j_min(interpolationData[i].bottomIndex + 1, inSize - 1); // interpolationData[i].interpolarValue = in - interpolationData[i].bottomIndex; - double const in_f = nd4j::math::p_floor(in); - double const in_c = nd4j::math::p_ceil(in); - interpolationData[i].bottomIndex = nd4j::math::nd4j_max(static_cast(in_f), (Nd4jLong)0LL);//static_cast(in); - interpolationData[i].topIndex = nd4j::math::nd4j_min(static_cast(in_c), inSize - 1); + double const in_f = sd::math::p_floor(in); + double const in_c = sd::math::p_ceil(in); + interpolationData[i].bottomIndex = sd::math::nd4j_max(static_cast(in_f), (Nd4jLong)0LL);//static_cast(in); + interpolationData[i].topIndex = sd::math::nd4j_min(static_cast(in_c), inSize - 1); interpolationData[i].interpolarValue = in - in_f; if (channels) { @@ -117,7 +117,7 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // resize image with bilinear interpolation algorithm // - static void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, + static void resizeImage(sd::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong 
inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong channels, BilinearInterpolationData* xs_, BilinearInterpolationData* ys_, @@ -162,7 +162,7 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // resize image with template - static void resizeImage_(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, + static void resizeImage_(sd::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong channels, BilinearInterpolationData* xs_, BilinearInterpolationData* ys_, @@ -187,7 +187,7 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template - static int resizeBilinearFunctor_(nd4j::LaunchContext* context, NDArray const* images, int const width, + static int resizeBilinearFunctor_(sd::LaunchContext* context, NDArray const* images, int const width, int const height, bool const alignCorners, bool const halfPixelCenter, NDArray* output) { const Nd4jLong batchSize = images->sizeAt(0); const Nd4jLong inHeight = images->sizeAt(1); @@ -263,19 +263,19 @@ namespace helpers { { auto b = blockIdx.x; for (int y = threadIdx.x; y < outHeight; y += blockDim.x) { - auto posY = alignCorners ? static_cast(nd4j::math::p_round(halfPixelCenters?((float)y + 0.5f) * heightScale:(float)y * heightScale)) : static_cast(nd4j::math::p_floor( + auto posY = alignCorners ? 
static_cast(sd::math::p_round(halfPixelCenters?((float)y + 0.5f) * heightScale:(float)y * heightScale)) : static_cast(sd::math::p_floor( halfPixelCenters?((float)y + 0.5f) * heightScale:(float)y * heightScale)); - Nd4jLong inY = nd4j::math::nd4j_min(posY, inHeight - 1); + Nd4jLong inY = sd::math::nd4j_min(posY, inHeight - 1); if (halfPixelCenters) { - inY = nd4j::math::nd4j_max(0LL, inY); + inY = sd::math::nd4j_max(0LL, inY); } for (int x = threadIdx.y; x < outWidth; x += blockDim.y) { - auto posX = alignCorners ? static_cast(nd4j::math::p_round(halfPixelCenters?((float)x + 0.5f) * widthScale:(float)x * widthScale)) : static_cast(nd4j::math::p_floor( + auto posX = alignCorners ? static_cast(sd::math::p_round(halfPixelCenters?((float)x + 0.5f) * widthScale:(float)x * widthScale)) : static_cast(sd::math::p_floor( halfPixelCenters?((float)x + 0.5f) * widthScale:(float)x * widthScale)); - Nd4jLong inX = nd4j::math::nd4j_min(posX, inWidth - 1); + Nd4jLong inX = sd::math::nd4j_min(posX, inWidth - 1); if (halfPixelCenters) { - inX = nd4j::math::nd4j_max(0LL, inX); + inX = sd::math::nd4j_max(0LL, inX); } auto start = blockIdx.z * blockDim.z + threadIdx.z; @@ -298,7 +298,7 @@ namespace helpers { // resizeNeighborFunctor - main algorithm by nearest neighbor // template - int resizeNeighborFunctor_(nd4j::LaunchContext* context, NDArray const* images, int const width, int const height, + int resizeNeighborFunctor_(sd::LaunchContext* context, NDArray const* images, int const width, int const height, bool const alignCorners, bool const halfPixelCenters, NDArray* output) { const Nd4jLong batchSize = images->sizeAt(0); const Nd4jLong inHeight = images->sizeAt(1); @@ -339,7 +339,7 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // resizeImage - resize bilinear algorithm caller // - void resizeImage(nd4j::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong 
inHeight, + void resizeImage(sd::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong channels, BilinearInterpolationData* xs_, BilinearInterpolationData* ys_, NDArray* output) { BUILD_DOUBLE_SELECTOR(images->dataType(), output->dataType(), @@ -347,28 +347,28 @@ namespace helpers { xs_, ys_, output), NUMERIC_TYPES, FLOAT_TYPES); } - BUILD_DOUBLE_TEMPLATE(template void resizeImage_,(nd4j::LaunchContext* context, NDArray const* images, + BUILD_DOUBLE_TEMPLATE(template void resizeImage_,(sd::LaunchContext* context, NDArray const* images, Nd4jLong batchSize, Nd4jLong inHeight, Nd4jLong inWidth, Nd4jLong outHeight, Nd4jLong outWidth, Nd4jLong channels, BilinearInterpolationData* xs_, BilinearInterpolationData* ys_, NDArray* output), NUMERIC_TYPES, FLOAT_TYPES); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int resizeBilinearFunctor(nd4j::LaunchContext* context, NDArray const* images, int width, int height, + int resizeBilinearFunctor(sd::LaunchContext* context, NDArray const* images, int width, int height, bool const alignCorners, bool const halfPixelCenter, NDArray* output) { BUILD_DOUBLE_SELECTOR(images->dataType(), output->dataType(), return resizeBilinearFunctor_, (context, images, width, height, alignCorners, halfPixelCenter, output), NUMERIC_TYPES, FLOAT_TYPES); } -// BUILD_SINGLE_TEMPLATE(template int resizeBilinearFunctor_, (nd4j::LaunchContext* context, +// BUILD_SINGLE_TEMPLATE(template int resizeBilinearFunctor_, (sd::LaunchContext* context, // NDArray const* images, int const width, int const height, bool const alignCorners, // bool const halfPixelCenter, NDArray* output), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int resizeNeighborFunctor(nd4j::LaunchContext* context, NDArray const* 
images, int const width, int const height, + int resizeNeighborFunctor(sd::LaunchContext* context, NDArray const* images, int const width, int const height, bool const alignCorners, bool const halfPixelCenter, NDArray* output) { BUILD_SINGLE_SELECTOR(images->dataType(), return resizeNeighborFunctor_, (context, images, width, height, alignCorners, halfPixelCenter, output), LIBND4J_TYPES); } -// BUILD_SINGLE_TEMPLATE(template int resizeNeighborFunctor_, (nd4j::LaunchContext* context, NDArray const* images, +// BUILD_SINGLE_TEMPLATE(template int resizeNeighborFunctor_, (sd::LaunchContext* context, NDArray const* images, // int width, int height, bool const alignCorners, bool const halfPixelCenter, NDArray* output), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -915,17 +915,17 @@ namespace helpers { } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template - int resizeBicubicFunctor_(nd4j::LaunchContext * context, NDArray const* image, int width, int height, + int resizeBicubicFunctor_(sd::LaunchContext * context, NDArray const* image, int width, int height, bool preserveAspectRatio, bool antialias, NDArray* output) { return Status::OK(); } - int resizeBicubicFunctor(nd4j::LaunchContext * context, NDArray const* image, int width, int height, + int resizeBicubicFunctor(sd::LaunchContext * context, NDArray const* image, int width, int height, bool preserveAspectRatio, bool antialias, NDArray* output) { BUILD_SINGLE_SELECTOR(image->dataType(), return resizeBicubicFunctor_, (context, image, width, height, preserveAspectRatio, antialias, output), NUMERIC_TYPES); } - BUILD_SINGLE_TEMPLATE(template int resizeBicubicFunctor_, (nd4j::LaunchContext * context, NDArray const* image, int width, int height, + BUILD_SINGLE_TEMPLATE(template int resizeBicubicFunctor_, (sd::LaunchContext * context, NDArray const* image, 
int width, int height, bool preserveAspectRatio, bool antialias, NDArray* output), NUMERIC_TYPES); // ------------------------------------------------------------------------------------------------------------------ // struct CachedInterpolation { @@ -1124,7 +1124,7 @@ namespace helpers { } // ------------------------------------------------------------------------------------------------------------------ // template - int resizeAreaFunctor_(nd4j::LaunchContext* context, NDArray const* image, int const width, int const height, + int resizeAreaFunctor_(sd::LaunchContext* context, NDArray const* image, int const width, int const height, bool const alignCorners, NDArray* output) { ImageResizerState st(alignCorners, false); // Create resize info @@ -1144,7 +1144,7 @@ namespace helpers { return res; } - int resizeAreaFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeAreaFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, NDArray* output) { BUILD_SINGLE_SELECTOR(image->dataType(), return resizeAreaFunctor_, (context, image, width, height, alignCorners, output), NUMERIC_TYPES); } @@ -1153,7 +1153,7 @@ namespace helpers { // simplified bicubic resize without antialiasing // template - int resizeBicubicFunctorA_(nd4j::LaunchContext * context, NDArray const* image, int width, int height, + int resizeBicubicFunctorA_(sd::LaunchContext * context, NDArray const* image, int width, int height, bool const alignCorners, bool const halfPixelCenters, NDArray* output) { ImageResizerState st(alignCorners, halfPixelCenters); // align_corners, half_pixel_align @@ -1166,16 +1166,16 @@ namespace helpers { return res; } - int resizeBicubicFunctorA(nd4j::LaunchContext * context, NDArray const* image, int width, int height, + int resizeBicubicFunctorA(sd::LaunchContext * context, NDArray const* image, int width, int height, bool const alignCorners, bool const 
halfPixelCenters, NDArray* output) { BUILD_SINGLE_SELECTOR(image->dataType(), return resizeBicubicFunctorA_, (context, image, width, height, alignCorners, halfPixelCenters, output), NUMERIC_TYPES); } - BUILD_SINGLE_TEMPLATE(template int resizeBicubicFunctorA_, (nd4j::LaunchContext * context, + BUILD_SINGLE_TEMPLATE(template int resizeBicubicFunctorA_, (sd::LaunchContext * context, NDArray const* image, int width, int height, bool const alignCorners, bool const halfPixelCenters, NDArray* output), NUMERIC_TYPES); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - int resizeFunctor(nd4j::LaunchContext * context, NDArray const* image, int width, int height, + int resizeFunctor(sd::LaunchContext * context, NDArray const* image, int width, int height, ImageResizeMethods method, bool preserveAspectRatio, bool antialias, NDArray* output) { switch (method) { case kResizeBilinear: return resizeBilinearFunctor(context, image, width, height, false, false, output); break; @@ -1240,8 +1240,8 @@ namespace helpers { } if (method == 0 /* bilinear */) { - const int topYIndex = nd4j::math::p_floor(inY); - const int bottomYIndex = nd4j::math::p_ceil(inY); + const int topYIndex = sd::math::p_floor(inY); + const int bottomYIndex = sd::math::p_ceil(inY); const float y_lerp = inY - topYIndex; for (int x = 0; x < cropWidth; ++x) { @@ -1326,7 +1326,7 @@ namespace helpers { // crops - output (4D tensor - [batch, outWidth, outHeight, pixels]) // template - void cropAndResizeFunctor_(nd4j::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices, + void cropAndResizeFunctor_(sd::LaunchContext* context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { const int batchSize = images->sizeAt(0); const int imageHeight = images->sizeAt(1); @@ -1354,13 +1354,13 @@ namespace helpers { } 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void cropAndResizeFunctor(nd4j::LaunchContext * context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { + void cropAndResizeFunctor(sd::LaunchContext * context, NDArray const *images, NDArray const *boxes, NDArray const *indices, NDArray const *cropSize, int method, double extrapolationVal, NDArray *crops) { BUILD_TRIPLE_SELECTOR(images->dataType(), boxes->dataType(), indices->dataType(), cropAndResizeFunctor_, (context, images, boxes, indices, cropSize, method, extrapolationVal, crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES); // } BUILD_TRIPLE_TEMPLATE(template void cropAndResizeFunctor_, - (nd4j::LaunchContext * context, NDArray const* images, NDArray const* boxes, NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops), + (sd::LaunchContext * context, NDArray const* images, NDArray const* boxes, NDArray const* indices, NDArray const* cropSize, int method, double extrapolationVal, NDArray* crops), NUMERIC_TYPES, FLOAT_TYPES, INTEGER_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu index be2a71e8c..5ed534cb6 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu @@ -19,12 +19,12 @@ // #include -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -50,14 +50,14 @@ namespace helpers { // we have rectangle with given max values. 
Compute vexes of rectangle first - T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); - T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); - T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); - T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); - T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); - T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); - T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); - T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + T minYPrev = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T minXPrev = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T maxYPrev = sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T maxXPrev = sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T minYNext = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T minXNext = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + T maxYNext = sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T maxXNext = 
sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); // compute areas for comparation T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); @@ -67,13 +67,13 @@ namespace helpers { if (areaNext <= T(0.f) || areaPrev <= T(0.f)) return false; // compute intersection of rectangles - T minIntersectionY = nd4j::math::nd4j_max(minYPrev, minYNext); - T minIntersectionX = nd4j::math::nd4j_max(minXPrev, minXNext); - T maxIntersectionY = nd4j::math::nd4j_min(maxYPrev, maxYNext); - T maxIntersectionX = nd4j::math::nd4j_min(maxXPrev, maxXNext); + T minIntersectionY = sd::math::nd4j_max(minYPrev, minYNext); + T minIntersectionX = sd::math::nd4j_max(minXPrev, minXNext); + T maxIntersectionY = sd::math::nd4j_min(maxYPrev, maxYNext); + T maxIntersectionX = sd::math::nd4j_min(maxXPrev, maxXNext); T intersectionArea = - nd4j::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * - nd4j::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); + sd::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * + sd::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); T intersectionValue = intersectionArea / (areaPrev + areaNext - intersectionArea); // final check return intersectionValue > threshold; @@ -92,14 +92,14 @@ namespace helpers { // we have rectangle with given max values. 
Compute vexes of rectangle first - T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); - T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); - T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); - T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); - T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); - T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); - T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); - T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + T minYPrev = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T minXPrev = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T maxYPrev = sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T maxXPrev = sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T minYNext = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T minXNext = sd::math::nd4j_min(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + T maxYNext = sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T maxXNext = 
sd::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); // compute areas for comparation T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); @@ -109,13 +109,13 @@ namespace helpers { if (areaNext <= T(0.f) || areaPrev <= T(0.f)) return false; // compute intersection of rectangles - T minIntersectionY = nd4j::math::nd4j_max(minYPrev, minYNext); - T minIntersectionX = nd4j::math::nd4j_max(minXPrev, minXNext); - T maxIntersectionY = nd4j::math::nd4j_min(maxYPrev, maxYNext); - T maxIntersectionX = nd4j::math::nd4j_min(maxXPrev, maxXNext); + T minIntersectionY = sd::math::nd4j_max(minYPrev, minYNext); + T minIntersectionX = sd::math::nd4j_max(minXPrev, minXNext); + T maxIntersectionY = sd::math::nd4j_min(maxYPrev, maxYNext); + T maxIntersectionX = sd::math::nd4j_min(maxXPrev, maxXNext); T intersectionArea = - nd4j::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * - nd4j::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); + sd::math::nd4j_max(T(maxIntersectionY - minIntersectionY), T(0.0f)) * + sd::math::nd4j_max(T(maxIntersectionX - minIntersectionX), T(0.0f)); T intersectionValue = intersectionArea / (areaPrev + areaNext - intersectionArea); // final check return intersectionValue; @@ -185,7 +185,7 @@ namespace helpers { // nonMaxSuppressionV2 algorithm - given from TF NonMaxSuppressionV2 implementation // template - static void nonMaxSuppressionV2_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, double scoreThreshold, NDArray* output) { + static void nonMaxSuppressionV2_(sd::LaunchContext* context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, double scoreThreshold, NDArray* output) { auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {boxes, scales}); std::unique_ptr indices(NDArrayFactory::create_('c', {scales->lengthOf()})); // - 1, scales->lengthOf()); //, scales->getContext()); @@ 
-335,7 +335,7 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template static Nd4jLong - nonMaxSuppressionGeneric_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int outputSize, + nonMaxSuppressionGeneric_(sd::LaunchContext* context, NDArray* boxes, NDArray* scores, int outputSize, double overlapThreshold, double scoreThreshold, NDArray* output, bool simple) { auto stream = context->getCudaStream(); if (output) @@ -386,14 +386,14 @@ namespace helpers { return res; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void nonMaxSuppression(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, double scoreThreshold, NDArray* output) { + void nonMaxSuppression(sd::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, double scoreThreshold, NDArray* output) { BUILD_DOUBLE_SELECTOR(boxes->dataType(), output->dataType(), nonMaxSuppressionV2_, (context, boxes, scales, maxSize, threshold, scoreThreshold, output), FLOAT_TYPES, INDEXING_TYPES); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - Nd4jLong nonMaxSuppressionGeneric(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, double scoreThreshold, NDArray* output) { + Nd4jLong nonMaxSuppressionGeneric(sd::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, double scoreThreshold, NDArray* output) { BUILD_DOUBLE_SELECTOR(boxes->dataType(), output ? 
output->dataType():DataType::INT32, return nonMaxSuppressionGeneric_, (context, boxes, scales, maxSize, threshold, scoreThreshold, output, true), FLOAT_TYPES, INDEXING_TYPES); @@ -401,7 +401,7 @@ namespace helpers { } Nd4jLong - nonMaxSuppressionV3(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, + nonMaxSuppressionV3(sd::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output) { BUILD_DOUBLE_SELECTOR(boxes->dataType(), output ? output->dataType():DataType::INT32, return nonMaxSuppressionGeneric_, (context, boxes, scores, maxSize, overlapThreshold, scoreThreshold, output, false), diff --git a/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu b/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu index 35393c48c..11ec1f46e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu @@ -19,14 +19,14 @@ // @author Oleh Semeniv (oleg.semeniv@gmail.com) // -#include +#include #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -67,10 +67,10 @@ linkage void rgbToYuvCudaLauncher(const int blocksPerGrid, const int threadsPerB } /////////////////////////////////////////////////////////////////// -void transformRgbYuv(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { +void transformRgbYuv(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), { dimC }); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), { dimC }); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), { dimC }); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), 
{ dimC }); const Nd4jLong numOfTads = packX.numberOfTads(); @@ -122,10 +122,10 @@ linkage void yuvToRgbCudaLauncher(const int blocksPerGrid, const int threadsPerB } /////////////////////////////////////////////////////////////////// -void transformYuvRgb(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { +void transformYuvRgb(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), { dimC }); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), { dimC }); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), { dimC }); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), { dimC }); const Nd4jLong numOfTads = packX.numberOfTads(); @@ -191,7 +191,7 @@ linkage void rgbToGrsCudaLauncher(const int blocksPerGrid, const int threadsPerB } /////////////////////////////////////////////////////////////////// -void transformRgbGrs(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { +void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) { PointersManager manager(context, "rgbToGrs"); @@ -285,10 +285,10 @@ static _CUDA_H void rgbToHsvCudaLauncher(const int blocksPerGrid, const int thre } /////////////////////////////////////////////////////////////////// -void transformHsvRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { +void transformHsvRgb(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + auto packX = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); const Nd4jLong numOfTads = packX.numberOfTads(); @@ -305,9 +305,9 @@ void transformHsvRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray } /////////////////////////////////////////////////////////////////// -void transformRgbHsv(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); +void transformRgbHsv(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); const Nd4jLong numOfTads = packX.numberOfTads(); @@ -388,9 +388,9 @@ __global__ void tripleTransformerCuda(const void *vx, const Nd4jLong *xShapeInfo template -static void rgbYiq(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); +static void rgbYiq(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); NDArray::prepareSpecialUse({output}, {input}); return tripleTransformerCuda<<<256, 256, 8192, *context->getCudaStream()>>>(input->getSpecialBuffer(), input->getSpecialShapeInfo(), 
packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), output->specialShapeInfo(), packZ.platformShapeInfo(), packZ.platformOffsets(), dimC, 1, packZ.numberOfTads()); @@ -398,20 +398,20 @@ static void rgbYiq(nd4j::LaunchContext* context, const NDArray* input, NDArray* } template -FORCEINLINE static void yiqRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); +FORCEINLINE static void yiqRgb(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimC); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimC); NDArray::prepareSpecialUse({output}, {input}); return tripleTransformerCuda<<<256, 256, 8192, *context->getCudaStream()>>>(input->getSpecialBuffer(), input->getSpecialShapeInfo(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), output->specialShapeInfo(), packZ.platformShapeInfo(), packZ.platformOffsets(), dimC, 2, packZ.numberOfTads()); NDArray::registerSpecialUse({output}, {input}); } -void transformYiqRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { +void transformYiqRgb(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), yiqRgb, (context, input, output, dimC), FLOAT_TYPES); } -void transformRgbYiq(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { +void transformRgbYiq(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC) { BUILD_SINGLE_SELECTOR(input->dataType(), rgbYiq, (context, input, output, dimC), FLOAT_TYPES); } diff --git 
a/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu b/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu index bf6a943fa..27f4f35f2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu @@ -24,16 +24,16 @@ #include #include #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template -static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector& dimensions) { +static void ismax_(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector& dimensions) { auto stream = context->getCudaStream(); auto xRank = input->rankOf(); @@ -61,7 +61,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray* int dimensionLength = dimensions.size(); std::vector copy(dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size()); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size()); // we launch legacy IndexMax op, to get indices of max values along dimension auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions); @@ -76,7 +76,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray* } -void ismax(nd4j::LaunchContext * context, const NDArray *input, NDArray *output, const std::vector& dimensions) { +void ismax(sd::LaunchContext * context, const NDArray *input, NDArray *output, const std::vector& dimensions) { NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), ismax_, (context, input, output, dimensions), LIBND4J_TYPES); @@ -84,7 +84,7 @@ void ismax(nd4j::LaunchContext * context, const NDArray *input, NDArray *output, NDArray::registerSpecialUse({output}, {input}); } -BUILD_SINGLE_TEMPLATE(template void ismax_, 
(nd4j::LaunchContext * context, const NDArray *input, NDArray *output, const std::vector& dimensions), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void ismax_, (sd::LaunchContext * context, const NDArray *input, NDArray *output, const std::vector& dimensions), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu index a3d24111a..ab65ed96b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/relu.cu @@ -20,10 +20,10 @@ #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -36,7 +36,7 @@ namespace nd4j { theFirst->applyPairwiseLambda(*theSecond, functor, *theFirst); } - void reluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond) { + void reluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), reluDerivative__, (theFirst, theSecond), FLOAT_TYPES); } @@ -49,7 +49,7 @@ namespace nd4j { input->applyPairwiseLambda(*epsilon, functor, *output); } - void reluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void reluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), reluDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -62,7 +62,7 @@ namespace nd4j { input->applyPairwiseLambda(*epsilon, functor, *output); } - void relu6Derivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void relu6Derivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), relu6Derivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -78,7 +78,7 @@ 
namespace nd4j { input->applyPairwiseLambda(*epsilon, functor, *output); } - void leakyReluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { + void leakyReluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), leakyReluDerivative_, (theFirst, theSecond, theOutput, alpha), FLOAT_TYPES); } @@ -88,13 +88,13 @@ namespace nd4j { const T alphaT = static_cast(alpha); auto functor = LAMBDA_TT(x, y, alphaT){ - return y * nd4j::math::nd4j_eluderivative(x, alphaT); + return y * sd::math::nd4j_eluderivative(x, alphaT); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void eluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { + void eluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), eluDerivative_, (theFirst, theSecond, theOutput, alpha), FLOAT_TYPES); } @@ -107,7 +107,7 @@ namespace nd4j { input->applyPairwiseLambda(*epsilon, functor, *output); } - void seluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void seluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), seluDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu index afd07cd48..56a57614f 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy/tanh.cu @@ -19,25 +19,25 @@ // #include -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { 
namespace helpers { //////////////////////////////////////////////////////////////////////// template linkage void tanhDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T th = nd4j::math::nd4j_tanh(x); + T th = sd::math::nd4j_tanh(x); return y * ((T)1.0f - (th * th)); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void tanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void tanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), tanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -45,14 +45,14 @@ namespace nd4j { template linkage void hardTanhDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T th = nd4j::math::nd4j_tanh(x); + T th = sd::math::nd4j_tanh(x); return y * simdOps::HardTanhDerivative::op(x, nullptr); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void hardTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void hardTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), hardTanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -65,20 +65,20 @@ namespace nd4j { input->applyPairwiseLambda(*epsilon, functor, *output); } - void rationalTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void rationalTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), rationalTanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } template linkage void rectifiedTanhDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, 
y){ - return x > (T) 0.0f ? y * (nd4j::math::nd4j_tanhderivative(x)) : (T) 0.0f; + return x > (T) 0.0f ? y * (sd::math::nd4j_tanhderivative(x)) : (T) 0.0f; }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void rectifiedTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void rectifiedTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), rectifiedTanhDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu index fb4a94abb..181050919 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/legacy_helper.cu @@ -19,11 +19,11 @@ // #include -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -38,7 +38,7 @@ namespace helpers { } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void cubeDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void cubeDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), cubeDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -54,7 +54,7 @@ namespace helpers { } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void reduceNorm1(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void reduceNorm1(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), reduceNorm1_, (theFirst, theSecond, 
theOutput), FLOAT_TYPES); } @@ -63,14 +63,14 @@ namespace helpers { template linkage void sigmCrossEntropy_(NDArray* logits, NDArray* labels, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - return nd4j::math::nd4j_max(x, (T)0.f) - x * y + nd4j::math::nd4j_log((T)1.f + nd4j::math::nd4j_exp(-nd4j::math::nd4j_abs(x))); + return sd::math::nd4j_max(x, (T)0.f) - x * y + sd::math::nd4j_log((T)1.f + sd::math::nd4j_exp(-sd::math::nd4j_abs(x))); }; logits->applyPairwiseLambda(*labels, functor, *output); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void sigmCrossEntropy(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { + void sigmCrossEntropy(sd::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropy_, (logits, labels, output), FLOAT_TYPES); } @@ -81,26 +81,26 @@ namespace helpers { // 1 - labels - 1 / (1 + exp(logits)) auto functor = LAMBDA_TT(x, y) { if(x <= 0) - return static_cast(1.) - y - static_cast(1.) / (static_cast(1.) + nd4j::math::nd4j_exp(x)); - auto e = nd4j::math::nd4j_exp(-x); + return static_cast(1.) - y - static_cast(1.) / (static_cast(1.) + sd::math::nd4j_exp(x)); + auto e = sd::math::nd4j_exp(-x); return static_cast(1.) - y - e / (static_cast(1.) 
+ e); }; logits->applyPairwiseLambda(*labels, functor, *output); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void sigmCrossEntropyGrad(nd4j::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { + void sigmCrossEntropyGrad(sd::LaunchContext * context, NDArray* logits, NDArray* labels, NDArray* output) { BUILD_SINGLE_SELECTOR(logits->dataType(), sigmCrossEntropyGrad_, (logits, labels, output), FLOAT_TYPES); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // X f = (X) 1.0f + nd4j::math::nd4j_abs(d1); + // X f = (X) 1.0f + sd::math::nd4j_abs(d1); // return (X) d2 * ((X) 1.0f / (f * f)); // template linkage void softSignDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T ss = (T)1.f + nd4j::math::nd4j_abs(x); + T ss = (T)1.f + sd::math::nd4j_abs(x); return y * ((T) 1.0f / (ss * ss)); }; @@ -108,7 +108,7 @@ namespace helpers { } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void softSignDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void softSignDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), softSignDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -116,14 +116,14 @@ namespace helpers { template linkage void softPlusDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T p = nd4j::math::nd4j_pow(static_cast(M_E), x); + T p = sd::math::nd4j_pow(static_cast(M_E), x); return y * (p / (p + 1.)); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void softPlusDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* 
theOutput) { + void softPlusDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), softPlusDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -134,14 +134,14 @@ namespace helpers { template linkage void sigmoidDerivative_(NDArray* input, NDArray* epsilon, NDArray* output) { auto functor = LAMBDA_TT(x, y){ - T s = nd4j::math::nd4j_sigmoid(x); + T s = sd::math::nd4j_sigmoid(x); return y * (s * ((T) 1.0f - s)); }; input->applyPairwiseLambda(*epsilon, functor, *output); } - void sigmoidDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void sigmoidDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), sigmoidDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -154,7 +154,7 @@ namespace helpers { input->applyPairwiseLambda(*epsilon, functor, *output); } - void hardSigmoidDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { + void hardSigmoidDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput) { BUILD_SINGLE_SELECTOR(theFirst->dataType(), hardSigmoidDerivative_, (theFirst, theSecond, theOutput), FLOAT_TYPES); } @@ -192,12 +192,12 @@ namespace helpers { } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output) { + void logSumExp(sd::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, axis, output), FLOAT_TYPES); } 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output) { + void logSumExp(sd::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), logSumExp_, (input, subtrah, axis, output), FLOAT_TYPES); } @@ -210,15 +210,15 @@ namespace helpers { auto mainRoutineT1 = LAMBDA_TT(_x, _z, posWeight) { T targetWeight = (1. + (posWeight - (T)1.f) * _z); return (1. - _z) * _x + - targetWeight * (nd4j::math::nd4j_log((T)1.f + nd4j::math::nd4j_exp(-nd4j::math::nd4j_abs(_x))) + - nd4j::math::nd4j_max(-_x, T(0.f)) + targetWeight * (sd::math::nd4j_log((T)1.f + sd::math::nd4j_exp(-sd::math::nd4j_abs(_x))) + + sd::math::nd4j_max(-_x, T(0.f)) ); }; auto mainRoutineT2 = LAMBDA_TTT(_x, _z, _w) { return (((T)1.0 - _z) * _x) + - _w * (nd4j::math::nd4j_log(T(1.) + nd4j::math::nd4j_exp(-nd4j::math::nd4j_abs(_x))) + - nd4j::math::nd4j_max(-_x, T(0.f))); + _w * (sd::math::nd4j_log(T(1.) 
+ sd::math::nd4j_exp(-sd::math::nd4j_abs(_x))) + + sd::math::nd4j_max(-_x, T(0.f))); }; @@ -236,7 +236,7 @@ namespace helpers { } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) { + void weightedCrossEntropyWithLogitsFunctor(sd::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output) { NDArray::prepareSpecialUse({output}, {targets, input, weights}); BUILD_SINGLE_SELECTOR(targets->dataType(), weightedCrossEntropyWithLogitsFunctor_, (targets, input, weights, output), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu index ea9901f0a..2de455d0f 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lgamma.cu @@ -23,7 +23,7 @@ //#include //#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -39,7 +39,7 @@ void lgamma_(NDArray& x, NDArray& z) { x.applyLambda(lgammaProc, z); } -void lgamma(nd4j::LaunchContext* context, NDArray& x, NDArray& z) { +void lgamma(sd::LaunchContext* context, NDArray& x, NDArray& z) { BUILD_SINGLE_SELECTOR(x.dataType(), lgamma_, (x, z), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu index ca2bda4a4..57bb205a9 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lrn.cu @@ -19,10 +19,10 @@ // #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -50,15 +50,15 @@ namespace helpers { shared[threadIdx.x] = x[threadIdx.x * xEws]; __syncthreads(); - const uint begin = 
nd4j::math::nd4j_max(0, threadIdx.x - depth); + const uint begin = sd::math::nd4j_max(0, threadIdx.x - depth); const uint last = depth + threadIdx.x + 1; - const uint end = nd4j::math::nd4j_min(last, tadLength); + const uint end = sd::math::nd4j_min(last, tadLength); T prev = 0.; for (int s = begin; s < end; s++) prev = prev + shared[s] * shared[s]; - z[threadIdx.x * zEws] = shared[threadIdx.x] / nd4j::math::nd4j_pow(tbias + alpha * prev, tbeta); + z[threadIdx.x * zEws] = shared[threadIdx.x] / sd::math::nd4j_pow(tbias + alpha * prev, tbeta); } } @@ -85,9 +85,9 @@ namespace helpers { auto x = reinterpret_cast(vx) + xTadOffsets[i]; auto z = reinterpret_cast(vz) + zTadOffsets[i]; - const uint begin = nd4j::math::nd4j_max(0, threadIdx.x - depth); + const uint begin = sd::math::nd4j_max(0, threadIdx.x - depth); const uint last = depth + threadIdx.x + 1; - const uint end = nd4j::math::nd4j_min(last, tadLength); + const uint end = sd::math::nd4j_min(last, tadLength); // load everything into shared memory sharedX[threadIdx.x] = x[threadIdx.x * xEws]; @@ -104,7 +104,7 @@ namespace helpers { Z prev = 0.f; for (uint s = begin; s < end; ++s) { - factor[s] = nd4j::math::nd4j_pow(tbias + talpha * sharedY[s], -tbeta - 1); + factor[s] = sd::math::nd4j_pow(tbias + talpha * sharedY[s], -tbeta - 1); prev = prev + sharedX[s] * factor[s]; } @@ -114,13 +114,13 @@ namespace helpers { template - static void lrnBP_(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta) { + static void lrnBP_(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta) { auto rank = input.rankOf(); auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), {rank - 1}); auto packZ = ConstantTadHelper::getInstance()->tadForDimensions(gradI.getShapeInfo(), {rank - 1}); const auto 
tadLength = shape::length(packX.primaryShapeInfo()); - const int numBlocks = nd4j::math::nd4j_min(1024, packX.numberOfTads()); + const int numBlocks = sd::math::nd4j_min(1024, packX.numberOfTads()); const int numThreads = tadLength; if (tadLength > 1024 || tadLength < 1) @@ -132,7 +132,7 @@ namespace helpers { gradI *= gradO; } - void lrnBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta) { + void lrnBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta) { input.syncToDevice(); gradO.syncToDevice(); @@ -142,13 +142,13 @@ namespace helpers { } template - static void lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta) { + static void lrnFunctor_(sd::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta) { auto rank = input->rankOf(); auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input->shapeInfo(), {rank - 1}); auto packZ = ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), {rank - 1}); const auto tadLength = shape::length(packX.primaryShapeInfo()); - const int numBlocks = nd4j::math::nd4j_min(1024, packX.numberOfTads()); + const int numBlocks = sd::math::nd4j_min(1024, packX.numberOfTads()); const int numThreads = tadLength; if (tadLength > 1024 || tadLength < 1) @@ -157,7 +157,7 @@ namespace helpers { lrnKernel<<getCudaStream()>>>(input->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), packX.numberOfTads(), tadLength, depth, bias, alpha, beta); } - int lrnFunctor(nd4j::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta) { + int 
lrnFunctor(sd::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta) { input->syncToDevice(); BUILD_SINGLE_SELECTOR(input->dataType(), lrnFunctor_, (block, input, output, depth, bias, alpha, beta), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lstm.cu b/libnd4j/include/ops/declarable/helpers/cuda/lstm.cu index 2204c9189..af0c413d6 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lstm.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lstm.cu @@ -34,14 +34,14 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// -void lstmCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* ht_1, const NDArray* ct_1, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, +void lstmCell(sd::LaunchContext * context, const NDArray* xt, const NDArray* ht_1, const NDArray* ct_1, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, NDArray* ht, NDArray* ct, const std::vector& params) { // xt input [bS x nIn] diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lstsq.cu b/libnd4j/include/ops/declarable/helpers/cuda/lstsq.cu index e29325084..a3b029c0b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lstsq.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lstsq.cu @@ -17,18 +17,18 @@ // // @author GS // -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include -#include "../triangular_solve.h" -#include "../lup.h" -#include "../qr.h" -#include "../lstsq.h" +#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -47,7 +47,7 @@ namespace helpers { } template - static void fillRegularizer(nd4j::LaunchContext* context, NDArray& ioMatrix, double const value) { + static void 
fillRegularizer(sd::LaunchContext* context, NDArray& ioMatrix, double const value) { auto lastDimsTads = ConstantTadHelper::getInstance()->tadForDimensions(ioMatrix.shapeInfo(), {-2, -1}); auto stream = context->getCudaStream(); auto rows = ioMatrix.sizeAt(-2); @@ -57,7 +57,7 @@ namespace helpers { } template - int leastSquaresSolveFunctor_(nd4j::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output) { + int leastSquaresSolveFunctor_(sd::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output) { if (fast) { // Cholesky decomposition approach // Equation for solve A^T * Ax = A^T * b, so // 1. Computing A2: @@ -106,7 +106,7 @@ namespace helpers { return Status::OK(); } - int leastSquaresSolveFunctor(nd4j::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output) { + int leastSquaresSolveFunctor(sd::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output) { BUILD_SINGLE_SELECTOR(leftInput->dataType(), return leastSquaresSolveFunctor_, (context, leftInput, rightInput, l2Regularizer, fast, output), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu index 3e8def28a..7630694e1 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu @@ -19,17 +19,17 @@ // #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include //#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -401,7 +401,7 @@ namespace helpers { } } else { - NDArray permutVector('c', {n}, nd4j::DataType::INT32, context); + NDArray 
permutVector('c', {n}, sd::DataType::INT32, context); int* permutationBuf = permutVector.dataBuffer()->specialAsT(); status = cusolverDnDgetrf( cusolverH, @@ -548,14 +548,14 @@ namespace helpers { auto rowNum = shape::sizeAt(compoundShape, 0); Nd4jLong xInitial[] = {column, column}; auto xInitialIndex = shape::getOffset(compoundShape, xInitial, 0); - auto maxValue = T(0); //nd4j::math::nd4j_abs(compoundBuffer[xInitialIndex]); + auto maxValue = T(0); //sd::math::nd4j_abs(compoundBuffer[xInitialIndex]); auto result = -1LL; for (auto rowCounter = column; rowCounter < rowNum; rowCounter++) { Nd4jLong xPos[] = {rowCounter, column}; auto xIndex = shape::getOffset(compoundShape, xPos, 0); - if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { - maxValue = nd4j::math::nd4j_max(maxValue, nd4j::math::nd4j_abs(compoundBuffer[xIndex])); + if (sd::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { + maxValue = sd::math::nd4j_max(maxValue, sd::math::nd4j_abs(compoundBuffer[xIndex])); result = rowCounter; } } @@ -603,7 +603,7 @@ namespace helpers { output->assign(input); // fill up output tensor with zeros // output->tickWriteDevice(); - permutationVectors->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), iota, *permutationVectors, true, nullptr); + permutationVectors->applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), iota, *permutationVectors, true, nullptr); // permutationVectors->tickWriteDevice(); auto tads = ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), {-2, -1}); auto permutaionTads = ConstantTadHelper::getInstance()->tadForDimensions(output->shapeInfo(), {-1}); @@ -621,7 +621,7 @@ namespace helpers { } // ------------------------------------------------------------------------------------------------------------------ // template - static int determinant_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + static int determinant_(sd::LaunchContext *context, NDArray *input, NDArray *output) { Nd4jLong n = 
input->sizeAt(-1); Nd4jLong n2 = n * n; std::vector dims(); @@ -659,7 +659,7 @@ namespace helpers { return Status::OK(); } - int determinant(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + int determinant(sd::LaunchContext *context, NDArray *input, NDArray *output) { NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), return determinant_, (context, input, output), FLOAT_NATIVE); NDArray::registerSpecialUse({output}, {input}); @@ -708,7 +708,7 @@ namespace helpers { return ND4J_STATUS_OK; } - int logAbsDeterminant(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + int logAbsDeterminant(sd::LaunchContext *context, NDArray *input, NDArray *output) { NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), return logAbsDeterminant_, (context, input, output), FLOAT_NATIVE); NDArray::registerSpecialUse({output}, {input}); @@ -747,7 +747,7 @@ namespace helpers { } template - static int inverse_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + static int inverse_(sd::LaunchContext *context, NDArray *input, NDArray *output) { auto n = input->sizeAt(-1); auto n2 = n * n; auto dtype = DataTypeUtils::fromT(); //input->dataType(); @@ -758,10 +758,10 @@ namespace helpers { NDArray lower = NDArrayFactory::create('c', {n, n}, dtype, context); NDArray compound = NDArrayFactory::create('c', {n, n}, dtype, context); NDArray permutation = NDArrayFactory::create('c', {n, n}, dtype, context); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {input->rankOf() - 2, input->rankOf() - 1}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {output->rankOf() - 2, output->rankOf() - 1}); auto stream = 
context->getCudaStream(); @@ -787,7 +787,7 @@ namespace helpers { // compound.printIndexedBuffer("Lower Inverted"); // matrix.tickWriteDevice(); // compound.tickWriteDevice(); - nd4j::MmulHelper::mmul(&matrix, &compound, &upper, 1.0, 0.0); + sd::MmulHelper::mmul(&matrix, &compound, &upper, 1.0, 0.0); upper.tickWriteDevice(); // upper.printIndexedBuffer("Full inverted"); returnMatrix <<<1, n2, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n); @@ -795,13 +795,13 @@ namespace helpers { return Status::OK(); } - int inverse(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + int inverse(sd::LaunchContext *context, NDArray *input, NDArray *output) { NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), return inverse_, (context, input, output), FLOAT_NATIVE); NDArray::registerSpecialUse({output}, {input}); } - bool checkCholeskyInput(nd4j::LaunchContext *context, NDArray const *input) { + bool checkCholeskyInput(sd::LaunchContext *context, NDArray const *input) { return true; } @@ -848,7 +848,7 @@ namespace helpers { throw cuda_exception::build("helpers::cholesky_: Cannot create solver handle", status); } F **dArrayBatch = nullptr; - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempOutput.getShapeInfo(), + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempOutput.getShapeInfo(), {tempOutput.rankOf() - 2, tempOutput.rankOf() - 1}); const Nd4jLong batchSize = packX.numberOfTads(); @@ -933,12 +933,12 @@ namespace helpers { return Status::OK(); } - int cholesky(nd4j::LaunchContext *context, NDArray *input, NDArray *output, bool inplace) { + int cholesky(sd::LaunchContext *context, NDArray *input, NDArray *output, bool inplace) { // BUILD_SINGLE_SELECTOR(input->dataType(), return cholesky_, (context, input, output, inplace), FLOAT_TYPES); return cholesky_(context, input, output, inplace); } // 
BUILD_SINGLE_TEMPLATE(template int cholesky_, (LaunchContext* context, NDArray* input, NDArray* output, bool inplace), FLOAT_TYPES); - BUILD_SINGLE_TEMPLATE(template int inverse_, (nd4j::LaunchContext * context, NDArray * input, NDArray * output), + BUILD_SINGLE_TEMPLATE(template int inverse_, (sd::LaunchContext * context, NDArray * input, NDArray * output), FLOAT_NATIVE); template @@ -968,7 +968,7 @@ namespace helpers { } template - int logdetFunctor_(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + int logdetFunctor_(sd::LaunchContext *context, NDArray *input, NDArray *output) { NDArray::prepareSpecialUse({output}, {input}); auto n2 = input->sizeAt(-1) * input->sizeAt(-2); auto stream = context->getCudaStream(); @@ -979,7 +979,7 @@ namespace helpers { auto outputBuf = output->dataBuffer()->specialAsT(); //reinterpret_cast(output->specialBuffer()); // + e * n2; // + e * n2; auto inputBuf = tempOutput.dataBuffer()->specialAsT(); //reinterpret_cast(tempOutput.specialBuffer()); output->nullify(); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempOutput.getShapeInfo(), + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempOutput.getShapeInfo(), {tempOutput.rankOf() - 2, tempOutput.rankOf() - 1}); logDetKernel <<<128, 512, 256, *stream>>>(inputBuf, tempOutput.specialShapeInfo(), @@ -990,7 +990,7 @@ namespace helpers { return Status::OK(); } - int logdetFunctor(nd4j::LaunchContext *context, NDArray *input, NDArray *output) { + int logdetFunctor(sd::LaunchContext *context, NDArray *input, NDArray *output) { BUILD_SINGLE_SELECTOR(output->dataType(), return logdetFunctor_, (context, input, output), FLOAT_NATIVE); } @@ -1003,7 +1003,7 @@ namespace helpers { } // BUILD_SINGLE_TEMPLATE(template int logdetFunctor_, -// (nd4j::LaunchContext * context, NDArray * input, NDArray * output), FLOAT_NATIVE); +// (sd::LaunchContext * context, NDArray * input, NDArray * output), FLOAT_NATIVE); } } } diff --git 
a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu index a3c754cf5..c771d12ff 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu @@ -18,11 +18,11 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include "ResultSet.h" +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -82,7 +82,7 @@ static void matrixSetDiagCudaLauncher(const int blocksPerGrid, const int threads } /////////////////////////////////////////////////////////////////// -void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) { +void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad) { const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu index 3b7ea9551..3c1305391 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu @@ -18,12 +18,12 @@ // @author George A. 
Shulinok // #include -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -83,15 +83,15 @@ namespace helpers { // matrixBandPart_ - main algorithm caller // template - void matrixBandPart_(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { + void matrixBandPart_(sd::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { dim3 launchDims(256, 512, 8192); auto stream = context->getCudaStream(); std::vector lastDims({input->rankOf() - 2, input->rankOf() - 1}); std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(input->rankOf(), lastDims); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), lastDims); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), lastDims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), lastDims); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), lastDims); const Nd4jLong numTads = packX.numberOfTads(); @@ -103,10 +103,10 @@ namespace helpers { } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - void matrixBandPart(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { + void matrixBandPart(sd::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand) { BUILD_SINGLE_SELECTOR(input->dataType(), matrixBandPart_, (context, input, output, lowerBand, upperBand), FLOAT_TYPES); } - BUILD_SINGLE_TEMPLATE(template void matrixBandPart_, (nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong 
lowerBand, Nd4jLong upperBand), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void matrixBandPart_, (sd::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand), FLOAT_TYPES); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu index 5a95eeb83..7d78d0323 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu @@ -18,16 +18,16 @@ // Created by GS on 3/21/2018. // -#include "ResultSet.h" +#include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -54,7 +54,7 @@ namespace helpers { // for detailed explanations please take a look on web page: https://www.tensorflow.org/api_docs/python/tf/matrix_set_diag // template - int _matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { + int _matrixDiagPart(sd::LaunchContext * context, const NDArray* input, NDArray* output) { auto stream = context->getCudaStream(); auto listOut = output->allTensorsAlongDimension({output->rankOf() - 1}); auto listDiag = input->allTensorsAlongDimension({input->rankOf() - 2, input->rankOf() - 1}); @@ -63,7 +63,7 @@ namespace helpers { nd4j_printf("matrix_diag_part: Input matrix has wrong shape.", ""); return ND4J_STATUS_VALIDATION; } - Nd4jLong lastDimension = nd4j::math::nd4j_min(input->sizeAt(-2), input->sizeAt(-1)); + Nd4jLong lastDimension = sd::math::nd4j_min(input->sizeAt(-2), input->sizeAt(-1)); std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(output->rankOf(), {output->rankOf() - 1}); const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension}); @@ -71,8 +71,8 @@ namespace helpers { //tadOnlyInputShapeInfo, 
tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets; std::vector outputDims({output->rankOf() - 1}); std::vector inputDims({input->rankOf() - 2, input->rankOf() - 1}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), inputDims); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), outputDims); if (!output->isActualOnDeviceSide()) @@ -91,11 +91,11 @@ namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // caller for _matrixDiagPart // - int matrixDiagPart(nd4j::LaunchContext * context, const NDArray* input, NDArray* output) { + int matrixDiagPart(sd::LaunchContext * context, const NDArray* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), return _matrixDiagPart, (context, input, output), LIBND4J_TYPES); } - BUILD_SINGLE_TEMPLATE(template int _matrixDiagPart, (nd4j::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template int _matrixDiagPart, (sd::LaunchContext * context, const NDArray* input, NDArray* output), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu b/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu index aa129ee8e..b809647c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -38,7 +38,7 @@ namespace helpers { } template - static void maxPoolingFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices) { + 
static void maxPoolingFunctor_(sd::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices) { int kY = params[0]; int kX = params[1]; @@ -85,9 +85,9 @@ namespace helpers { } } - void maxPoolingFunctor(nd4j::LaunchContext * context, nd4j::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices) { + void maxPoolingFunctor(sd::LaunchContext * context, sd::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices) { NDArray::prepareSpecialUse({values, indices}, {input}); - auto yType = indices == nullptr ? nd4j::DataType::INT64 : indices->dataType(); + auto yType = indices == nullptr ? sd::DataType::INT64 : indices->dataType(); BUILD_DOUBLE_SELECTOR(input->dataType(), yType, maxPoolingFunctor_, (block, input, values, params, indices), FLOAT_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({values, indices}, {input}); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/maximum.cu b/libnd4j/include/ops/declarable/helpers/cuda/maximum.cu index b93563de2..c4c8783ff 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/maximum.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/maximum.cu @@ -18,12 +18,12 @@ // @author sgazeos@gmail.com // -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -94,7 +94,7 @@ namespace nd4j { } } - void maximumBPFunctor(nd4j::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { + void maximumBPFunctor(sd::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { NDArray::prepareSpecialUse({gradX, gradY}, {x, y, epsNext}); BUILD_SINGLE_SELECTOR(x->dataType(), maximumBPFunctor_, (x, y, epsNext, gradX, gradY), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/merge.cu b/libnd4j/include/ops/declarable/helpers/cuda/merge.cu 
index 14fda24ec..b448fbd35 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/merge.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/merge.cu @@ -23,13 +23,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// @@ -60,7 +60,7 @@ namespace nd4j { } template - static void mergeMaxIndex_(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + static void mergeMaxIndex_(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { std::vector inBuffers(inArrs.size()); std::vector inShapes(inArrs.size()); @@ -80,7 +80,7 @@ namespace nd4j { manager.synchronize(); } - void mergeMaxIndex(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + void mergeMaxIndex(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({&output}, {}); for (auto v:inArrs) v->syncToDevice(); @@ -116,7 +116,7 @@ namespace nd4j { } template - static void mergeMax_(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + static void mergeMax_(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { std::vector inBuffers(inArrs.size()); std::vector inShapes(inArrs.size()); @@ -136,7 +136,7 @@ namespace nd4j { manager.synchronize(); } - void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + void mergeMax(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({&output}, {}); for (auto v:inArrs) v->syncToDevice(); @@ -168,7 +168,7 @@ namespace nd4j { } template - static void mergeAvg_(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + static void mergeAvg_(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { std::vector 
inBuffers(inArrs.size()); std::vector inShapes(inArrs.size()); @@ -188,7 +188,7 @@ namespace nd4j { manager.synchronize(); } - void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + void mergeAvg(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({&output}, {}); for (auto v:inArrs) v->syncToDevice(); @@ -221,7 +221,7 @@ namespace nd4j { } template - static void mergeAdd_(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + static void mergeAdd_(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { std::vector inBuffers(inArrs.size()); std::vector inShapes(inArrs.size()); @@ -240,9 +240,9 @@ namespace nd4j { manager.synchronize(); } - BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output), NUMERIC_TYPES); + BUILD_SINGLE_TEMPLATE(template void mergeAdd_, (sd::LaunchContext * context, const std::vector& inArrs, NDArray& output), NUMERIC_TYPES); - void mergeAdd(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output) { + void mergeAdd(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output) { NDArray::prepareSpecialUse({&output}, {}); for (auto v:inArrs) v->syncToDevice(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu b/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu index 399447c9a..53570a0ba 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu @@ -25,7 +25,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -78,7 +78,7 @@ namespace helpers { } template - static void meshgrid_(nd4j::LaunchContext * context, const std::vector& inArrs, const std::vector& outArrs, const bool swapFirst2Dims) { + static void meshgrid_(sd::LaunchContext * context, const std::vector& inArrs, const 
std::vector& outArrs, const bool swapFirst2Dims) { const int rank = inArrs.size(); int inIndices[MAX_RANK]; std::iota(inIndices, inIndices + rank, 0); @@ -134,7 +134,7 @@ namespace helpers { } ////////////////////////////////////////////////////////////////////////// - void meshgrid(nd4j::LaunchContext * context, const std::vector& inArrs, const std::vector& outArrs, const bool swapFirst2Dims) { + void meshgrid(sd::LaunchContext * context, const std::vector& inArrs, const std::vector& outArrs, const bool swapFirst2Dims) { BUILD_SINGLE_SELECTOR(inArrs.at(0)->dataType(), meshgrid_, (context, inArrs, outArrs, swapFirst2Dims), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/minimum.cu b/libnd4j/include/ops/declarable/helpers/cuda/minimum.cu index 90142091f..b43bb418e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/minimum.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/minimum.cu @@ -19,11 +19,11 @@ // #ifndef __MIN_I_MAX_H_HELPERS__ #define __MIN_I_MAX_H_HELPERS__ -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -95,7 +95,7 @@ namespace nd4j { } - void minimumBPFunctor(nd4j::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { + void minimumBPFunctor(sd::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY) { NDArray::prepareSpecialUse({gradX, gradY}, {x, y, epsNext}); BUILD_SINGLE_SELECTOR(x->dataType(), minimumBPFunctor_, (x, y, epsNext, gradX, gradY), NUMERIC_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu index bafbde2e2..4f26ef397 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu @@ -19,13 +19,13 @@ // #include -#include -#include -#include -#include +#include +#include +#include 
+#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -50,7 +50,7 @@ namespace helpers { } template - void nthElementFunctor_(nd4j::LaunchContext * context, NDArray* input, Nd4jLong n, NDArray* output, bool reverse) { + void nthElementFunctor_(sd::LaunchContext * context, NDArray* input, Nd4jLong n, NDArray* output, bool reverse) { NDArray::prepareSpecialUse({output}, {input}); NDArray sortedVals(*input); @@ -66,7 +66,7 @@ namespace helpers { else { // rank greater than 1 std::vector lastDims({input->rankOf() - 1});// = ShapeUtils::evalDimsToExclude(input->rankOf(), {input->rankOf() - 1}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(sortedVals.getShapeInfo(), lastDims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(sortedVals.getShapeInfo(), lastDims); auto pTadShape = packX.specialShapeInfo(); auto pTadShapeH = packX.primaryShapeInfo(); @@ -79,7 +79,7 @@ namespace helpers { } NDArray::registerSpecialUse({output}, {input}); } - void nthElementFunctor(nd4j::LaunchContext * context, NDArray* input, Nd4jLong n, NDArray* output, bool reverse) { + void nthElementFunctor(sd::LaunchContext * context, NDArray* input, Nd4jLong n, NDArray* output, bool reverse) { BUILD_SINGLE_SELECTOR(input->dataType(), nthElementFunctor_, (context, input, n, output, reverse), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu index c0d1d95dc..f1b87c1aa 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu @@ -23,14 +23,14 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -86,7 +86,7 @@ static void onehotCudaLauncher(const int blocksPerGrid, const int threadsPerBloc } 
/////////////////////////////////////////////////////////////////// -void onehot(const nd4j::LaunchContext* context, const NDArray *indices, NDArray *output, const uint axis, const uint depth, const double on, const double off) { +void onehot(const sd::LaunchContext* context, const NDArray *indices, NDArray *output, const uint axis, const uint depth, const double on, const double off) { const auto xType = indices->dataType(); const auto zType = output->dataType(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu index aede6243a..fc4d96ce0 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu @@ -23,13 +23,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { /////////////////////////////////////////////////////////////////// @@ -126,7 +126,7 @@ namespace nd4j { } /////////////////////////////////////////////////////////////////// - void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) { + void pad(sd::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, const NDArray& padValue) { PointersManager manager(context, "pad"); @@ -232,7 +232,7 @@ namespace nd4j { } template - static void mirrorPad_(nd4j::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode) { + static void mirrorPad_(sd::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode) { // mode: 0 - REFLECT, else - SYMMETRIC const int reflBorder = (bool)mode ? 
1 : 0; const int rank = input.rankOf(); @@ -248,16 +248,16 @@ namespace nd4j { const Nd4jLong len = 2*(inLen-1) + leftSide + reflBorder; mirrorPadLinearKernel<<<256, 512, 256, *stream>>>(input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), leftSide, leftSideCorrected, inLen, len, outLen); - nd4j::DebugHelper::checkErrorCode(stream, "helpers::mirrorPadLinearKernel(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "helpers::mirrorPadLinearKernel(...) failed"); } else { mirrorPadKernel<<<256, 256, 8192, *stream>>>(input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), outLen, paddings.getSpecialBuffer(), paddings.getSpecialShapeInfo(), reflBorder); - nd4j::DebugHelper::checkErrorCode(stream, "helpers::mirrorPadKernel(...) failed"); + sd::DebugHelper::checkErrorCode(stream, "helpers::mirrorPadKernel(...) failed"); } NDArray::registerSpecialUse({&output}, {&input, &paddings}); } - void mirrorPad(nd4j::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode) { + void mirrorPad(sd::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode) { BUILD_DOUBLE_SELECTOR(input.dataType(), paddings.dataType(), mirrorPad_, (context, input, paddings, output, mode), LIBND4J_TYPES, INDEXING_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu index 79c9024f5..ebb067251 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu @@ -20,12 +20,12 @@ // #include -#include +#include #include #include -#include "ResultSet.h" +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -84,7 +84,7 @@ namespace helpers { template - static void _percentile(nd4j::LaunchContext * context, const NDArray& input, 
NDArray& output, std::vector& axis, const float q, const int interpolation) { + static void _percentile(sd::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axis, const float q, const int interpolation) { const int inputRank = input.rankOf(); if(axis.empty()) @@ -116,10 +116,10 @@ namespace helpers { percentileKernel<<<256, 512, 1024, *context->getCudaStream()>>>(tempArray.specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), packX.numberOfTads(), tadLength, output.specialBuffer(), output.specialShapeInfo(), output.lengthOf(), position); - nd4j::DebugHelper::checkErrorCode(context->getCudaStream(), "percentile"); + sd::DebugHelper::checkErrorCode(context->getCudaStream(), "percentile"); } - void percentile(nd4j::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axises, const float q, const int interpolation) { + void percentile(sd::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axises, const float q, const int interpolation) { NDArray::prepareSpecialUse({&output}, {&input}); BUILD_SINGLE_SELECTOR(input.dataType(), _percentile, (context, input, output, axises, q, interpolation), LIBND4J_TYPES); @@ -127,7 +127,7 @@ namespace helpers { NDArray::registerSpecialUse({&output}, {&input}); } - BUILD_SINGLE_TEMPLATE(template void _percentile, (nd4j::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axises, const float q, const int interpolation), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void _percentile, (sd::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axises, const float q, const int interpolation), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu index 6a62246a3..2f96d96e7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu @@ 
-20,9 +20,9 @@ #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -83,7 +83,7 @@ static void polyGammaCudaLauncher(const int blocksPerGrid, const int threadsPerB } /////////////////////////////////////////////////////////////////// -void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) { +void polyGamma(sd::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) { NDArray::prepareSpecialUse({&z}, {&n, &x}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu index 52dd8b815..3d1fd104a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu @@ -19,12 +19,12 @@ // #include -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -146,10 +146,10 @@ static void prefixPerBlockCudaLauncher(const int blocksPerGrid, const int thread } /////////////////////////////////////////////////////////////////// -void prefix(nd4j::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, const std::vector& dims, bool exclusive, bool reverse) { +void prefix(sd::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, const std::vector& dims, bool exclusive, bool reverse) { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); const Nd4jLong numTads = packX.numberOfTads(); const Nd4jLong tadLen = x->lengthOf() / numTads; @@ -168,7 +168,7 @@ void prefix(nd4j::LaunchContext * context, scalar::Ops 
op, const NDArray* x, NDA } /////////////////////////////////////////////////////////////////// -void prefix(nd4j::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, bool exclusive, bool reverse) { +void prefix(sd::LaunchContext * context, scalar::Ops op, const NDArray* x, NDArray* z, bool exclusive, bool reverse) { prefix(context, op, x, z, {}, exclusive, reverse); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/print_variable.cu b/libnd4j/include/ops/declarable/helpers/cuda/print_variable.cu index 88d2b5937..a518ddd72 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/print_variable.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/print_variable.cu @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template diff --git a/libnd4j/include/ops/declarable/helpers/cuda/qr.cu b/libnd4j/include/ops/declarable/helpers/cuda/qr.cu index bb3ed0124..d0d5fddd5 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/qr.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/qr.cu @@ -18,10 +18,10 @@ // @author George A. 
Shulinok // #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -170,7 +170,7 @@ namespace helpers { NDArray::registerSpecialUse({outputQ, outputR}, {input}); } - void qr(nd4j::LaunchContext* context, NDArray const* input, NDArray* outputQ, NDArray* outputR, bool const fullMatricies) { + void qr(sd::LaunchContext* context, NDArray const* input, NDArray* outputQ, NDArray* outputR, bool const fullMatricies) { BUILD_SINGLE_SELECTOR(input->dataType(), qr_, (context, input, outputQ, outputR, fullMatricies), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/random.cu b/libnd4j/include/ops/declarable/helpers/cuda/random.cu index 1e290bc56..59f22d878 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/random.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/random.cu @@ -24,13 +24,13 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -291,7 +291,7 @@ __global__ static void fillMultiNomialCuda_(graph::RandomGenerator* devRng, cons for (Nd4jLong nClass = 0; nClass < numOfClassX; nClass++) { Nd4jLong nIndex = nSamplesPerBatch + nClassPerSamples + nClass; - X tValue = (xTad[nClass * xDimAstride] - nd4j::math::nd4j_log(-nd4j::math::nd4j_log(devRng->relativeT(nIndex, minVal, maxVal)))); + X tValue = (xTad[nClass * xDimAstride] - sd::math::nd4j_log(-sd::math::nd4j_log(devRng->relativeT(nIndex, minVal, maxVal)))); if (tValue > Max) { Max = tValue; arg = nClass; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/random_crop.cu b/libnd4j/include/ops/declarable/helpers/cuda/random_crop.cu index d2c83d0f4..0489103f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/random_crop.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/random_crop.cu @@ -23,7 +23,7 @@ #include #include #include -namespace nd4j { +namespace sd { namespace ops 
{ namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/range.cu b/libnd4j/include/ops/declarable/helpers/cuda/range.cu index 3a5504905..668518d82 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/range.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/range.cu @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -38,11 +38,11 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// // be careful: outVector must have c-order and ews = 1 !!! template - static void _range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { + static void _range(sd::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { global_range<<<512, 512, 2048, *context->getCudaStream()>>>(outVector.getSpecialBuffer(), outVector.lengthOf(), start.e(0), delta.e(0)); } - void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { + void range(sd::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector) { NDArray::prepareSpecialUse({&outVector}, {&start, &delta}); BUILD_SINGLE_SELECTOR(outVector.dataType(), _range, (context, start, delta, outVector), LIBND4J_TYPES); NDArray::registerSpecialUse({&outVector}, {&start, &delta}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu index 15335d57e..793d90f91 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu @@ -21,12 +21,12 @@ #include #include #include -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -141,13 +141,13 @@ namespace helpers { } template - static void reverseTad(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, 
Nd4jLong *inputTadShape, Nd4jLong *inputTadOffsets, Nd4jLong *outputTadShape, Nd4jLong *outputTadOffsets, uint64_t tadLength) { + static void reverseTad(sd::LaunchContext * context, const NDArray* input, NDArray* output, Nd4jLong *inputTadShape, Nd4jLong *inputTadOffsets, Nd4jLong *outputTadShape, Nd4jLong *outputTadOffsets, uint64_t tadLength) { auto stream = context->getCudaStream(); reverseTadKernel<<<256, 512, 8192, *stream>>>(input->getSpecialBuffer(), input->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), inputTadShape, inputTadOffsets, outputTadShape, outputTadOffsets, input->lengthOf(), tadLength, input->lengthOf() / tadLength); } template - static void reverseArray(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, Nd4jLong numOfElemsToReverse) { + static void reverseArray(sd::LaunchContext * context, const NDArray* input, NDArray* output, Nd4jLong numOfElemsToReverse) { auto stream = context->getCudaStream(); Nd4jLong numOfReverse = numOfElemsToReverse; if (numOfElemsToReverse == 0) @@ -159,7 +159,7 @@ namespace helpers { /////////////////////////////////////////////////////////////////// template - static void reverseSequence_(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ + static void reverseSequence_(sd::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ int posOfNonUnityDim = -1; seqLengths->syncToHost(); auto stream = context->getCudaStream(); @@ -198,7 +198,7 @@ namespace helpers { } } - void reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { + void reverseSequence(sd::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { NDArray::prepareSpecialUse({output}, {input, seqLengths}); // if 
op isn't inplace - copy original data into output array @@ -210,12 +210,12 @@ namespace helpers { } ////////////////////////////////////////////////////////////////////////// - void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp) { + void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp) { // we need to reverse axis only if that's new op std::vector dimensions = isBackProp ? ShapeUtils::evalDimsToExclude(input->rankOf(), *intArgs) : *intArgs; std::vector axis = ShapeUtils::evalDimsToExclude(input->rankOf(), dimensions); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); @@ -230,7 +230,7 @@ namespace helpers { NDArray::registerSpecialUse({output}, {input}); } -BUILD_SINGLE_TEMPLATE(template void reverseArray, (nd4j::LaunchContext * context, const NDArray *inArr, NDArray *outArr, Nd4jLong numOfElemsToReverse), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void reverseArray, (sd::LaunchContext * context, const NDArray *inArr, NDArray *outArr, Nd4jLong numOfElemsToReverse), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/roll.cu b/libnd4j/include/ops/declarable/helpers/cuda/roll.cu index bc53946d3..d014b9115 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/roll.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/roll.cu @@ -22,7 +22,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -308,7 +308,7 @@ namespace helpers { } } - void rollFunctorFull(nd4j::LaunchContext * 
context, NDArray* input, NDArray* output, std::vector const& shifts, std::vector const& axes, bool inplace){ + void rollFunctorFull(sd::LaunchContext * context, NDArray* input, NDArray* output, std::vector const& shifts, std::vector const& axes, bool inplace){ input->syncToDevice(); BUILD_SINGLE_SELECTOR(input->dataType(), rollFunctorFull_, (input, output, shifts, axes, inplace), LIBND4J_TYPES); @@ -316,7 +316,7 @@ namespace helpers { output->tickWriteDevice(); } - void rollFunctorLinear(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int shift, bool inplace){ + void rollFunctorLinear(sd::LaunchContext * context, NDArray* input, NDArray* output, int shift, bool inplace){ input->syncToDevice(); BUILD_SINGLE_SELECTOR(input->dataType(), rollFunctorLinear_, (input, output, shift, inplace), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu index f3bee349b..da61b0e48 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu @@ -20,9 +20,9 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -84,7 +84,7 @@ static void batchToSpaceCudaLauncher(const int blocksPerGrid, const int threadsP BUILD_SINGLE_TEMPLATE(template void batchToSpaceCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint cropBottom, const uint cropLeft), LIBND4J_TYPES); /////////////////////////////////////////////////////////////////// -void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight, const uint blockSize) { +void batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& output, const uint 
cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight, const uint blockSize) { // [bS*blockSize*blockSize, H/blockSize, W/blockSize, iC] is rearranged/permuted to [bS, oH, oW, iC] // oH = H - cropTop - cropBottom @@ -181,7 +181,7 @@ static void batchToSpaceNDCudaLauncher(const int blocksPerGrid, const int thread BUILD_DOUBLE_TEMPLATE(template void batchToSpaceNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES); ////////////////////////////////////////////////////////////////////////// -void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) { +void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) { // 4D example, numOfSpatialDims = 2 - two spatial dimensions // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC] @@ -309,7 +309,7 @@ static void spaceToBatchCudaLauncher(const int blocksPerGrid, const int threadsP BUILD_SINGLE_TEMPLATE(template void spaceToBatchCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight), LIBND4J_TYPES); /////////////////////////////////////////////////////////////////// -void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize) { +void 
spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize) { // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC] @@ -420,7 +420,7 @@ static void spaceToBatchNDCudaLauncher(const int blocksPerGrid, const int thread BUILD_DOUBLE_TEMPLATE(template void spaceToBatchNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES); ////////////////////////////////////////////////////////////////////////// -void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) { +void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) { // 4D example with two spatial dimensions // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC] @@ -523,13 +523,13 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND }; template - void _execute(nd4j::LaunchContext * context, void *vptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *vptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides) { + void _execute(sd::LaunchContext * context, void *vptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *vptrBatch, const 
Nd4jLong *batch_shape, const Nd4jLong *batch_strides) { auto ptrSpace = reinterpret_cast(vptrSpace); auto ptrBatch = reinterpret_cast(vptrBatch); SpaceToBatchHelper::run(ptrSpace, space_shape, space_strides, block_shape, pad_start, block_offsets, ptrBatch, batch_shape, batch_strides); }; - Nd4jStatus _batchToSpace(nd4j::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *crops) { + Nd4jStatus _batchToSpace(sd::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *crops) { return Status::OK(); } @@ -542,7 +542,7 @@ void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const ND #define STB_BOOL (0, false),\ (1, true) - BUILD_TRIPLE_TEMPLATE(template void _execute, (nd4j::LaunchContext * context, void *ptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *ptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides), LIBND4J_TYPES, STB_DIM, STB_BOOL); + BUILD_TRIPLE_TEMPLATE(template void _execute, (sd::LaunchContext * context, void *ptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *ptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides), LIBND4J_TYPES, STB_DIM, STB_BOOL); #undef STB_BOOL #undef STB_DIM diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_d.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_d.cu index 1e02a54ba..a5ae42e78 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_d.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_d.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { 
namespace helpers { template @@ -90,17 +90,17 @@ namespace helpers { } template - static void _spaceTodepth_(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { + static void _spaceTodepth_(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { spaceToDepthKernel<<<512, 512, 1024, *context->getCudaStream()>>>(input->specialBuffer(), input->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), block_size, isNHWC); } - void _spaceTodepth(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { + void _spaceTodepth(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC) { NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), _spaceTodepth_, (context, input, output, block_size, isNHWC), LIBND4J_TYPES); NDArray::registerSpecialUse({output}, {input}); } - BUILD_SINGLE_TEMPLATE(template void _spaceTodepth_, (nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void _spaceTodepth_, (sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu index ec85efddf..4e7360004 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu @@ -22,12 +22,12 @@ #include #include #include -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -63,13 +63,13 @@ __global__ static void checkIndicesCuda(const void *vx, const Nd4jLong *xShapeIn if(currentInd >= shape::sizeAt(zShapeInfo, axis == -1 ? 
xCoords[xRank-1] : axis)) { printf("checkIndices cuda: out of range element %lld at index %lld \n", currentInd, i); - nd4j::math::atomics::nd4j_atomicAdd(&numOfBadIndxPerBlock, 1); + sd::math::atomics::nd4j_atomicAdd(&numOfBadIndxPerBlock, 1); } } __syncthreads(); if (threadIdx.x == 0 && numOfBadIndxPerBlock != 0) - nd4j::math::atomics::nd4j_atomicAdd(y, numOfBadIndxPerBlock); + sd::math::atomics::nd4j_atomicAdd(y, numOfBadIndxPerBlock); } /////////////////////////////////////////////////////////////////// @@ -82,7 +82,7 @@ static void checkIndicesCudaLauncher(const int blocksPerGrid, const int threadsP /////////////////////////////////////////////////////////////////// -Nd4jLong checkIndices(nd4j::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis) { +Nd4jLong checkIndices(sd::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis) { const int threadsPerBlock = MAX_NUM_THREADS / 2; const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; @@ -93,7 +93,7 @@ Nd4jLong checkIndices(nd4j::LaunchContext *context, const NDArray& indices, cons PointersManager manager(context, "scatterNDcheckIndices"); // scalar, initial value = 0 - NDArray numOfBadIndx(nd4j::DataType::INT64, context, true); + NDArray numOfBadIndx(sd::DataType::INT64, context, true); NDArray::prepareSpecialUse({&numOfBadIndx}, {&indices}); BUILD_SINGLE_SELECTOR(xType, checkIndicesCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), reinterpret_cast(numOfBadIndx.getSpecialBuffer()), output.getSpecialShapeInfo(), axis), INDEXING_TYPES); @@ -334,7 +334,7 @@ static void scatterCudaLauncher(const int blocksPerGrid, const int threadsPerBlo /////////////////////////////////////////////////////////////////// -void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& 
output, const bool lock) { +void scatter(sd::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { const auto xType = indices.dataType(); const auto yType = updates.dataType(); @@ -596,7 +596,7 @@ static void scatterNDCudaLauncher(const int blocksPerGrid, const int threadsPerB } /////////////////////////////////////////////////////////////////// -void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { +void scatterND(sd::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { const int xRank = indices.rankOf(); const int yRank = updates.rankOf(); @@ -670,7 +670,7 @@ static void scatterForLossCudaLauncher(const int blocksPerGrid, const int thread } /////////////////////////////////////////////////////////////////// -void scatterForLoss(nd4j::LaunchContext* context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad) { +void scatterForLoss(sd::LaunchContext* context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad) { // shapes of indices and output must be the same // shape of indices should be the same as updates shape with last dimension excluded, for example if updates is {a,b,c} then indices should be {a,b} @@ -736,13 +736,13 @@ __global__ static void scatterLockCuda(const int opCode, std::vector yTadDims(sizeOfUpdDims); std::iota(yTadDims.begin(), yTadDims.end(), 0); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), ShapeUtils::evalDimsToExclude(updates.rankOf(), yTadDims)); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), ShapeUtils::evalDimsToExclude(updates.rankOf(), yTadDims)); 
+ auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims); const Nd4jLong zTadLen = shape::length(packZ.primaryShapeInfo()); const Nd4jLong yTadLen = shape::length(packY.primaryShapeInfo()); - const auto threadsPerBlock = nd4j::math::nd4j_max(32, nd4j::math::nd4j_min(zTadLen, 1024)); + const auto threadsPerBlock = sd::math::nd4j_max(32, sd::math::nd4j_min(zTadLen, 1024)); const auto blocksPerGrid = indices.lengthOf(); const auto xType = indices.dataType(); @@ -959,12 +959,12 @@ __global__ static void scatterLockCuda(const int opCode, template - void scatter_(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { + void scatter_(sd::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { std::vector dims = {0}; auto inverted = ShapeUtils::evalDimsToExclude(output.rankOf(), dims); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), inverted); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), inverted); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), inverted); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), inverted); auto psX = packX.specialShapeInfo(); auto psY = packY.specialShapeInfo(); @@ -981,7 +981,7 @@ __global__ static void scatterLockCuda(const int opCode, if (tadLengthX != tadLengthY) throw std::runtime_error("scatter: Lengths of TADs must be equal"); - auto blockSize = nd4j::math::nd4j_max(32, nd4j::math::nd4j_min(tadLengthX, 1024)); + auto blockSize = sd::math::nd4j_max(32, sd::math::nd4j_min(tadLengthX, 1024)); if (lock) scatterCuda<<<512, blockSize, 1024, *context->getCudaStream()>>>(op, indices.lengthOf(), output.getSpecialBuffer(), psX, poX, updates.getSpecialBuffer(), psY, poY, 
reinterpret_cast(indices.getSpecialBuffer()), tadLengthX, tadLengthY); @@ -1016,9 +1016,9 @@ const int xLastDim = indices.sizeAt(-1); zTadDims[i] = zRank - 1 - j; } - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(indices.getShapeInfo(), {xRank - 1}); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), yTadDims); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(indices.getShapeInfo(), {xRank - 1}); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), yTadDims); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims); const int threadsPerBlock = MAX_NUM_THREADS / 4; const int blocksPerGrid = packZ.numberOfTads(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu index 37a465144..277a1f587 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu @@ -23,13 +23,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -48,7 +48,7 @@ namespace nd4j { template - void scatterSimple_(nd4j::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions) { + void scatterSimple_(sd::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions) { auto dims = ShapeUtils::evalDimsToExclude(input.rankOf(), dimensions); auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dims); @@ -61,7 +61,7 @@ namespace nd4j { } - void 
scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions) { + void scatterSimple(sd::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions) { auto xType = input.dataType(); auto yType = indices.dataType(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu index 1ad55a111..748a2e6a3 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu @@ -23,13 +23,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { /////////////////////////////////////////////////////////////////// @@ -104,7 +104,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// - void scatterUpdate(nd4j::LaunchContext* context, NDArray& input, NDArray& updates, const std::vector* intArgs) { + void scatterUpdate(sd::LaunchContext* context, NDArray& input, NDArray& updates, const std::vector* intArgs) { const int opCode = (*intArgs)[0]; const int numOfDims = (*intArgs)[1]; @@ -117,7 +117,7 @@ namespace nd4j { auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), tadDimensions); auto packY = ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), tadDimensions); - NDArray indices(const_cast(intArgs->data()) + numOfDims + 3, 'c', {numOfInd}, nd4j::DataType::INT32, context); + NDArray indices(const_cast(intArgs->data()) + numOfDims + 3, 'c', {numOfInd}, sd::DataType::INT32, context); PointersManager manager(context, "scatterUpdate"); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment.cu 
b/libnd4j/include/ops/declarable/helpers/cuda/segment.cu index 4aa5c762d..796dd6a1e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment.cu @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -39,7 +39,7 @@ namespace helpers { return true; } - bool segmentIndicesValidate(nd4j::LaunchContext* context , NDArray* indices, NDArray& expected, NDArray& output) { + bool segmentIndicesValidate(sd::LaunchContext* context , NDArray* indices, NDArray& expected, NDArray& output) { BUILD_DOUBLE_SELECTOR(output.dataType(), indices->dataType(), return segmentIndicesValidate_, (indices, expected, output), NUMERIC_TYPES, INDEXING_TYPES); } @@ -59,14 +59,14 @@ namespace helpers { auto start = threadIdx.x + blockIdx.x * blockDim.x; auto step = gridDim.x * blockDim.x; for (int e = start; e < len && onlyTrue; e += step) { - nd4j::math::atomics::nd4j_atomicMax(found, indices[e]); + sd::math::atomics::nd4j_atomicMax(found, indices[e]); if (expected < *found) onlyTrue = false; } } template - static bool unsortedSegmentIndicesValidate_(nd4j::LaunchContext* context , NDArray* indices, Nd4jLong expected, Nd4jLong& output) { + static bool unsortedSegmentIndicesValidate_(sd::LaunchContext* context , NDArray* indices, Nd4jLong expected, Nd4jLong& output) { output = expected; I found = output; I exp = expected; @@ -81,7 +81,7 @@ namespace helpers { return expected == output; } - bool unsortedSegmentIndicesValidate(nd4j::LaunchContext* context , NDArray* indices, Nd4jLong expected, Nd4jLong& output) { + bool unsortedSegmentIndicesValidate(sd::LaunchContext* context , NDArray* indices, Nd4jLong expected, Nd4jLong& output) { BUILD_SINGLE_SELECTOR(indices->dataType(), return unsortedSegmentIndicesValidate_, (context, indices, expected, output), INDEXING_TYPES); } @@ -105,8 +105,8 @@ 
namespace helpers { for (auto j = tid; j < idxLen; j += step) { auto pos = idxBuf[j]; - nd4j::math::atomics::nd4j_atomicMin(&classesRangesStart[pos], (int)j); - nd4j::math::atomics::nd4j_atomicAdd(&classesRangesLenghts[pos], 1); + sd::math::atomics::nd4j_atomicMin(&classesRangesStart[pos], (int)j); + sd::math::atomics::nd4j_atomicAdd(&classesRangesLenghts[pos], 1); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu index 9585642dd..208bf764b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu @@ -21,14 +21,14 @@ #include #include -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -70,7 +70,7 @@ namespace nd4j { for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputShape); - nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } } // -------------------------------------------------------------------------------------------------------------- // @@ -108,7 +108,7 @@ namespace nd4j { auto xIndex = shape::getIndexOffset(e, inputShape); auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment) { - nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } } } @@ -142,7 +142,7 @@ namespace nd4j { for (auto e = threadIdx.x; e < len; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); //z[zIndex] = x[xIndex]; } } @@ -151,7 +151,7 @@ namespace nd4j { auto xIndex = shape::getIndexOffset(e, inputTads); 
auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment]) - nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } } } @@ -185,8 +185,8 @@ namespace nd4j { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -196,7 +196,7 @@ namespace nd4j { NDArray::registerSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); } // -------------------------------------------------------------------------------------------------------------- // - void segmentMaxFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + void segmentMaxFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentMaxFunctor_, (context, input, indices, output), NUMERIC_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices}); @@ -204,7 +204,7 @@ namespace nd4j { // -------------------------------------------------------------------------------------------------------------- // template - static void unsortedSegmentMaxFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + static void 
unsortedSegmentMaxFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); // NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); output->assign(DataTypeUtils::infOrMax()); @@ -212,7 +212,7 @@ namespace nd4j { NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); // NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), row, classes); +// classes.applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), row, classes); classesRangesBegs.assign(indices->lengthOf()); classesRangesLens.assign(0); dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); @@ -226,8 +226,8 @@ namespace nd4j { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -239,7 +239,7 @@ namespace nd4j { } // -------------------------------------------------------------------------------------------------------------- // - void unsortedSegmentMaxFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentMaxFunctor(sd::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { 
NDArray::prepareSpecialUse({output}, {input, indices}); output->nullify(); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMaxFunctor_, (context, input, indices, numOfClasses, output), NUMERIC_TYPES, INDEXING_TYPES); @@ -283,7 +283,7 @@ namespace nd4j { auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); - if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { + if (sd::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { z[zOffset] = gradOut[gradOffsetO]; } } @@ -326,14 +326,14 @@ namespace nd4j { T* outGrad = gradOut + gradOutOffsets[segment]; for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - if (nd4j::math::nd4j_abs(in[e] - current[e]) <= T(1.e-6)) + if (sd::math::nd4j_abs(in[e] - current[e]) <= T(1.e-6)) currentOut[e] = outGrad[e]; } } } // -------------------------------------------------------------------------------------------------------------- // template - int segmentMaxFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMaxFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { //int numOfClasses = gradOut->sizeAt(0); // if input is a vector: (as if in doc sample) auto stream = context->getCudaStream(); @@ -349,10 +349,10 @@ namespace nd4j { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto 
packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -372,7 +372,7 @@ namespace nd4j { return Status::OK(); } // -------------------------------------------------------------------------------------------------------------- // - int segmentMaxFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMaxFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMaxFunctorBP_, (context, input, indices, gradOut, output), FLOAT_TYPES, INDEXING_TYPES); @@ -381,7 +381,7 @@ namespace nd4j { // -------------------------------------------------------------------------------------------------------------- // template - static int unsortedSegmentMaxFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int unsortedSegmentMaxFunctorBP_(sd::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { //int numOfClasses = gradOut->sizeAt(0); // if input is a vector: (as if in doc sample) auto stream = context->getCudaStream(); @@ -397,10 +397,10 @@ namespace nd4j { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto 
packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -420,7 +420,7 @@ namespace nd4j { return Status::OK(); } // -------------------------------------------------------------------------------------------------------------- // - int unsortedSegmentMaxFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentMaxFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMaxFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices, gradOut}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu index 
dc958f79c..fa8882190 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // -------------------------------------------------------------------------------------------------------------- // @@ -67,7 +67,7 @@ namespace helpers { for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputShape); if (lengths[segment]) - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / lengths[segment])); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / lengths[segment])); } } // -------------------------------------------------------------------------------------------------------------- // @@ -108,7 +108,7 @@ namespace helpers { auto xIndex = shape::getIndexOffset(e, inputShape); auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/T(lengths[segment]))); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/T(lengths[segment]))); } } } @@ -139,7 +139,7 @@ namespace helpers { for (auto e = threadIdx.x; e < len; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); } } else { @@ -147,7 +147,7 @@ namespace helpers { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment]) - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], 
T(x[xIndex]/lengths[segment])); } } } @@ -174,8 +174,8 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -186,7 +186,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void segmentMeanFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + void segmentMeanFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), segmentMeanFunctor_, (context, input, indices, output), NUMERIC_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices}); @@ -194,14 +194,14 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static void unsortedSegmentMeanFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + static void unsortedSegmentMeanFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); // NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); NDArray classesRangesBegs = 
NDArrayFactory::create('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); // NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); +// classes.applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), &row, &classes); classesRangesBegs.assign(indices->lengthOf()); classesRangesLens.assign(0); dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); @@ -216,8 +216,8 @@ namespace helpers { else { output->assign(0); std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -228,7 +228,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void unsortedSegmentMeanFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentMeanFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMeanFunctor_, (context, input, indices, numOfClasses, output), NUMERIC_TYPES, INDEXING_TYPES); @@ -310,7 +310,7 @@ namespace helpers { // 
-------------------------------------------------------------------------------------------------------------- // // backrop for mean template - int segmentMeanFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMeanFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); auto numClasses = indices->e(indices->lengthOf() - 1) + 1; @@ -333,10 +333,10 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); -// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); +// auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -354,7 +354,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // // segmen mean bp main - int segmentMeanFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* 
indices, NDArray* gradOut, NDArray* output) { + int segmentMeanFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMeanFunctorBP_, (context, input, indices, gradOut, output), FLOAT_TYPES, INDEXING_TYPES); @@ -363,7 +363,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static int unsortedSegmentMeanFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int unsortedSegmentMeanFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); auto numClasses = indices->e(indices->lengthOf() - 1) + 1; @@ -386,10 +386,10 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); -// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); +// auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -406,7 +406,7 @@ namespace helpers { return Status::OK(); } // -------------------------------------------------------------------------------------------------------------- // - int unsortedSegmentMeanFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentMeanFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMeanFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices, gradOut}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu index 506cfaa41..b83d37567 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // -------------------------------------------------------------------------------------------------------------- // @@ -68,7 +68,7 @@ namespace helpers { for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputShape); - nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } } @@ -111,7 +111,7 @@ namespace helpers { auto 
xIndex = shape::getIndexOffset(e, inputShape); auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment) { - nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } } } @@ -142,7 +142,7 @@ namespace helpers { for (auto e = threadIdx.x; e < len; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } } else { @@ -150,7 +150,7 @@ namespace helpers { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); // if (lengths[indices[idx]]) - nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } } } @@ -176,8 +176,8 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -189,7 +189,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void segmentMinFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + void segmentMinFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { 
NDArray::prepareSpecialUse({output}, {input, indices}); output->nullify(); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentMinFunctor_, (context, input, indices, output), NUMERIC_TYPES, INDEXING_TYPES); @@ -199,13 +199,13 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static void unsortedSegmentMinFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + static void unsortedSegmentMinFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); // NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); // NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); +// classes.applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), &row, &classes); output->assign(DataTypeUtils::infOrMax()); classesRangesBegs.assign(indices->lengthOf()); classesRangesLens.assign(0); @@ -221,8 +221,8 @@ namespace helpers { else { output->assign(DataTypeUtils::max()); std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = 
packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -234,7 +234,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void unsortedSegmentMinFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentMinFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); output->nullify(); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMinFunctor_, (context, input, indices, numOfClasses, output), @@ -276,7 +276,7 @@ namespace helpers { auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); - if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { + if (sd::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { z[zOffset] = gradOut[gradOffsetO]; } } @@ -319,7 +319,7 @@ namespace helpers { T* outGrad = gradOut + gradOutOffsets[segment]; for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - if (nd4j::math::nd4j_abs(in[e] - current[e]) <= T(1.e-6)) + if (sd::math::nd4j_abs(in[e] - current[e]) <= T(1.e-6)) currentOut[e] = outGrad[e]; } } @@ -327,7 +327,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - int segmentMinFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMinFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { //int numOfClasses = gradOut->sizeAt(0); // if input is a vector: (as if in doc sample) auto stream = context->getCudaStream(); @@ -344,10 +344,10 @@ namespace helpers { } else { std::vector 
dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -368,7 +368,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // // segmen min - int segmentMinFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentMinFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMinFunctorBP_, (context, input, indices, gradOut, output), FLOAT_TYPES, INDEXING_TYPES); @@ -376,7 +376,7 @@ namespace helpers { } template - static int unsortedSegmentMinFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int 
unsortedSegmentMinFunctorBP_(sd::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { //int numOfClasses = gradOut->sizeAt(0); // if input is a vector: (as if in doc sample) auto stream = context->getCudaStream(); @@ -392,10 +392,10 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -415,7 +415,7 @@ namespace helpers { return Status::OK(); } // -------------------------------------------------------------------------------------------------------------- // - int unsortedSegmentMinFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentMinFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, 
gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMinFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices, gradOut}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu index 613aa548b..baec75b9e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // -------------------------------------------------------------------------------------------------------------- // @@ -59,7 +59,7 @@ namespace helpers { } for (auto e = start + threadIdx.x; e < finish; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputShape); - nd4j::math::atomics::nd4j_atomicMul(&z[segment], x[xIndex]); + sd::math::atomics::nd4j_atomicMul(&z[segment], x[xIndex]); } } @@ -84,7 +84,7 @@ namespace helpers { if (lengths[segment] == 0) { continue; } - nd4j::math::atomics::nd4j_atomicMul(&output[zIndex], input[xIndex]); + sd::math::atomics::nd4j_atomicMul(&output[zIndex], input[xIndex]); } } // -------------------------------------------------------------------------------------------------------------- // @@ -112,14 +112,14 @@ namespace helpers { for (auto e = threadIdx.x; e < len; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } } } // -------------------------------------------------------------------------------------------------------------- // template - static void 
segmentProdFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { + static void segmentProdFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { auto stream = context->getCudaStream(); Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); @@ -138,8 +138,8 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -149,7 +149,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void segmentProdFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + void segmentProdFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), segmentProdFunctor_, (context, input, indices, output), NUMERIC_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices}); @@ -157,13 +157,13 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static void unsortedSegmentProdFunctor_(nd4j::LaunchContext* context, NDArray* input, 
NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + static void unsortedSegmentProdFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); // NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); // NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); +// classes.applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), &row, &classes); classesRangesBegs.assign(indices->lengthOf()); classesRangesLens.assign(0); dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); @@ -181,8 +181,8 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -193,7 +193,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void unsortedSegmentProdFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentProdFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* 
output) { NDArray::prepareSpecialUse({output}, {input, indices}); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentProdFunctor_, (context, input, indices, numOfClasses, output), NUMERIC_TYPES, INDEXING_TYPES); @@ -283,7 +283,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - int segmentProdFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentProdFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { auto stream = context->getCudaStream(); NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); segmentProdFunctor_(context, input, indices, &tempRes); @@ -297,10 +297,10 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); 
Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -322,7 +322,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // - int segmentProdFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentProdFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentProdFunctorBP_, (context, input, indices, gradOut, output), FLOAT_TYPES, INDEXING_TYPES); @@ -332,7 +332,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static int unsortedSegmentProdFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int unsortedSegmentProdFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); @@ -347,10 +347,10 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -371,7 +371,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - int unsortedSegmentProdFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentProdFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentProdFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices, gradOut}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu index 4daa0f4ab..7d85a0ea6 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // -------------------------------------------------------------------------------------------------------------- // @@ -51,7 
+51,7 @@ namespace helpers { if (lengths[segment] == 0) continue; auto xIndex = shape::getIndexOffset(idx, inputShape); - nd4j::math::atomics::nd4j_atomicAdd(&output[zIndex], input[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); + sd::math::atomics::nd4j_atomicAdd(&output[zIndex], input[xIndex] / sd::math::nd4j_sqrt(lengths[segment])); } } // -------------------------------------------------------------------------------------------------------------- // @@ -77,19 +77,19 @@ namespace helpers { for (auto e = threadIdx.x; e < len; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / sd::math::nd4j_sqrt(lengths[segment])); } } } // -------------------------------------------------------------------------------------------------------------- // template - static void unsortedSegmentSqrtNFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + static void unsortedSegmentSqrtNFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); // NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); // NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); +// classes.applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), &row, &classes); classesRangesBegs.assign(indices->lengthOf()); classesRangesLens.assign(0); // dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); @@ -108,8 +108,8 @@ namespace helpers { else { 
output->nullify(); std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -121,7 +121,7 @@ namespace helpers { } } // -------------------------------------------------------------------------------------------------------------- // - void unsortedSegmentSqrtNFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentSqrtNFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentSqrtNFunctor_, (context, input, indices, numOfClasses, output), FLOAT_TYPES, INDEXING_TYPES); @@ -203,7 +203,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static int unsortedSegmentSqrtNFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int unsortedSegmentSqrtNFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); auto numClasses = 
indices->e(indices->lengthOf() - 1) + 1; @@ -226,10 +226,10 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); -// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); +// auto packGradIn = sd::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -247,7 +247,7 @@ namespace helpers { return Status::OK(); } // -------------------------------------------------------------------------------------------------------------- // - int unsortedSegmentSqrtNFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentSqrtNFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentSqrtNFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INDEXING_TYPES); 
NDArray::registerSpecialUse({output}, {input, indices, gradOut}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu index cf4ddd942..6e1e3fca8 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // -------------------------------------------------------------------------------------------------------------- // @@ -70,7 +70,7 @@ namespace helpers { for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputShape); - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } // -------------------------------------------------------------------------------------------------------------- // @@ -112,7 +112,7 @@ namespace helpers { auto xIndex = shape::getIndexOffset(e, inputShape); auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } } @@ -143,7 +143,7 @@ namespace helpers { for (auto e = threadIdx.x; e < len; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); + sd::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } else { @@ -151,7 +151,7 @@ namespace helpers { auto xIndex = shape::getIndexOffset(e, inputTads); auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[indices[idx]]) - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); + 
sd::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } } @@ -159,7 +159,7 @@ namespace helpers { // -------------------------------------------------------------------------------------------------------------- // template - static void segmentSumFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { + static void segmentSumFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { auto stream = context->getCudaStream(); Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); @@ -178,8 +178,8 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -189,7 +189,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void segmentSumFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + void segmentSumFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); output->nullify(); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentSumFunctor_, (context, input, indices, output), NUMERIC_TYPES, INDEXING_TYPES); @@ -198,13 +198,13 @@ namespace helpers { // 
-------------------------------------------------------------------------------------------------------------- // template - static void unsortedSegmentSumFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + static void unsortedSegmentSumFunctor_(sd::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); // NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); // NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); +// classes.applyTrueBroadcast(sd::BroadcastOpsTuple::Assign(), &row, &classes); classesRangesBegs.assign(indices->lengthOf()); classesRangesLens.assign(0); dim3 dims(numOfClasses, indices->lengthOf(), (numOfClasses + 1) * 64); @@ -219,8 +219,8 @@ namespace helpers { else { output->assign(0); std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -231,7 +231,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - void 
unsortedSegmentSumFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + void unsortedSegmentSumFunctor(sd::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices}); output->nullify(); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentSumFunctor_, (context, input, indices, numOfClasses, output), @@ -315,7 +315,7 @@ namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // template - int segmentSumFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentSumFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); if (input->isVector()) { @@ -327,9 +327,9 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -348,7 +348,7 @@ 
namespace helpers { } // -------------------------------------------------------------------------------------------------------------- // - int segmentSumFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + int segmentSumFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentSumFunctorBP_, (context, input, indices, gradOut, output), FLOAT_TYPES, INDEXING_TYPES); @@ -356,7 +356,7 @@ namespace helpers { } template - static int unsortedSegmentSumFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + static int unsortedSegmentSumFunctorBP_(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); if (input->isVector()) { @@ -368,9 +368,9 @@ namespace helpers { } else { std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradOut = sd::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); Nd4jLong* inputTads = packX.specialShapeInfo(); Nd4jLong* inputTadOffsets = 
packX.specialOffsets(); Nd4jLong* outputTads = packZ.specialShapeInfo(); @@ -388,7 +388,7 @@ namespace helpers { return Status::OK(); } // -------------------------------------------------------------------------------------------------------------- // - int unsortedSegmentSumFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + int unsortedSegmentSumFunctorBP(sd::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentSumFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INDEXING_TYPES); NDArray::registerSpecialUse({output}, {input, indices, gradOut}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu index 6b33a384e..b06797753 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -54,11 +54,11 @@ namespace helpers { NDArray::registerSpecialUse({output}, {input}); } - void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { + void sequenceMask(sd::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { BUILD_DOUBLE_SELECTOR(input->dataType(), output->dataType(), sequenceMask_, (context, input, output, maxIndex), INTEGER_TYPES, LIBND4J_TYPES_EXTENDED); } - BUILD_DOUBLE_TEMPLATE(template void sequenceMask_, (nd4j::LaunchContext* context, NDArray* input, NDArray* output, int maxIndex), INTEGER_TYPES, LIBND4J_TYPES_EXTENDED); + BUILD_DOUBLE_TEMPLATE(template void sequenceMask_, (sd::LaunchContext* context, NDArray* 
input, NDArray* output, int maxIndex), INTEGER_TYPES, LIBND4J_TYPES_EXTENDED); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu b/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu index 24116ec63..f85a855b7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sg_cb.cu @@ -19,12 +19,12 @@ // #include -#include -#include +#include +#include #define HS_MAX_EXP 6.0f -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -129,7 +129,7 @@ namespace nd4j { int binarySearch(const int *haystack, const int needle, const int totalElements) { int firstIndex = 0; int lastIndex = totalElements - 1; - int halfIndex = nd4j::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); + int halfIndex = sd::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); while(haystack[halfIndex] != needle && firstIndex < lastIndex) { if (needle < haystack[halfIndex]) { @@ -137,7 +137,7 @@ namespace nd4j { } else if (needle > haystack[halfIndex]) { firstIndex = halfIndex + 1; } - halfIndex = nd4j::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); + halfIndex = sd::math::nd4j_floor((lastIndex + firstIndex) / (float) 2); } return (haystack[halfIndex] == needle) ? halfIndex : -1; @@ -198,7 +198,7 @@ namespace nd4j { // target is known in advance } else { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? 
-1 : negTableV.e(idx); if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; @@ -306,7 +306,7 @@ namespace nd4j { // target is known in advance } else { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? -1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) @@ -470,7 +470,7 @@ namespace nd4j { // target is known in advance } else { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? -1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; @@ -672,7 +672,7 @@ namespace nd4j { // we're skipping rng on 0 step if (r != 0) { randomValue = randomValue * (unsigned long long) 25214903917 + 11; - auto idx = nd4j::math::nd4j_abs((randomValue >> 16) % negLength); + auto idx = sd::math::nd4j_abs((randomValue >> 16) % negLength); irow = idx >= negLength ? 
-1 : static_cast(negTable[idx]); if (irow < 0 || irow >= vocabSize) irow = randomValue % (vocabSize - 1) + 1; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/shift.cu b/libnd4j/include/ops/declarable/helpers/cuda/shift.cu index 8ba3d40ce..c69285ef2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/shift.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/shift.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template diff --git a/libnd4j/include/ops/declarable/helpers/cuda/solve.cu b/libnd4j/include/ops/declarable/helpers/cuda/solve.cu index 6437b80bd..74823483e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/solve.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/solve.cu @@ -18,18 +18,18 @@ // @author GS // -#include -#include -#include -#include +#include +#include +#include +#include #include -#include +#include #include "../triangular_solve.h" #include "../lup.h" #include "../solve.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -63,7 +63,7 @@ namespace nd4j { } template - static int solveFunctor_(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, + static int solveFunctor_(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { NDArray::prepareSpecialUse({output}, {leftInput, rightInput}); // stage 1: LU decomposition batched @@ -94,7 +94,7 @@ namespace nd4j { return Status::OK(); } - int solveFunctor(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { + int solveFunctor(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { BUILD_SINGLE_SELECTOR(leftInput->dataType(), return solveFunctor_, (context, leftInput, rightInput, adjoint, output), FLOAT_TYPES); } @@ -118,7 +118,7 @@ namespace nd4j { } template - static void adjointMatrix_(nd4j::LaunchContext* context, NDArray const* input, 
NDArray* output) { + static void adjointMatrix_(sd::LaunchContext* context, NDArray const* input, NDArray* output) { NDArray::prepareSpecialUse({output}, {input}); auto inputTads = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {-2, -1}); auto outputTads = ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {-2, -1}); @@ -131,7 +131,7 @@ namespace nd4j { NDArray::registerSpecialUse({output}, {input}); } - void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, NDArray* output) { + void adjointMatrix(sd::LaunchContext* context, NDArray const* input, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), adjointMatrix_, (context, input, output), FLOAT_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/split.cu b/libnd4j/include/ops/declarable/helpers/cuda/split.cu index fa6b46539..5631492ce 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/split.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/split.cu @@ -24,13 +24,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -82,7 +82,7 @@ __host__ static void splitCudaLauncher(const int blocksPerGrid, const int thread BUILD_SINGLE_TEMPLATE(template void splitCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, void* pVz, const Nd4jLong* zTadShapeInfo, const int axis), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// -void split(nd4j::LaunchContext* context, const NDArray& input, std::vector& outArrs, const int axis) { +void split(sd::LaunchContext* context, const NDArray& input, std::vector& outArrs, const int axis) { const int numOfSubArrs = outArrs.size(); const auto sizeofT = input.sizeOfT(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu 
b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu index 76530269c..0ded43593 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu @@ -21,11 +21,11 @@ // #include -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -45,7 +45,7 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// -void sruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { +void sruCell(sd::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { // x input [bS x inSize], bS - batch size, inSize - number of features // c0 previous cell state c [bS x inSize], that is at previous time step t-1 @@ -76,7 +76,7 @@ void sruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, } ////////////////////////////////////////////////////////////////////////// -void sruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { +void sruTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c) { // x input [bS x inSize x time] // c0 initial cell state (at time step = 0) [bS x inSize], @@ -189,12 +189,12 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo, for (uint t = 0; t < time; ++t) { // evaluate sigmoids - T ft = (1.f)/(1.f + nd4j::math::nd4j_exp(-(wi[wiOffset1] + bF))); - T rt = (1.f)/(1.f + nd4j::math::nd4j_exp(-(wi[wiOffset2] + bR))); + T ft = (1.f)/(1.f + sd::math::nd4j_exp(-(wi[wiOffset1] + bF))); + T rt = (1.f)/(1.f + sd::math::nd4j_exp(-(wi[wiOffset2] + bR))); c0Val = (c0Val - wi[wiOffset0]) * ft + wi[wiOffset0]; ct[ctOffset] = c0Val; - T val = 
nd4j::math::nd4j_tanh(c0Val); + T val = sd::math::nd4j_tanh(c0Val); T xVal = x[xOffset]; ht[htOffset] = (val * maskVal - xVal) * rt + xVal; @@ -232,7 +232,7 @@ static void sruBICudaLauncher(const int blocksPerGrid, const int threadsPerBlock } ////////////////////////////////////////////////////////////////////////// -void sruBI(nd4j::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct) { +void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct) { // x = x * mask if(mask) @@ -408,10 +408,10 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI for (uint t = 0; t < time; ++t) { // evaluate sigmoids - T ft = (1.f)/(1.f + nd4j::math::nd4j_exp(-(wi[wiOffset1] + bF))); - T rt = (1.f)/(1.f + nd4j::math::nd4j_exp(-(wi[wiOffset2] + bR))); + T ft = (1.f)/(1.f + sd::math::nd4j_exp(-(wi[wiOffset1] + bF))); + T rt = (1.f)/(1.f + sd::math::nd4j_exp(-(wi[wiOffset2] + bR))); - T val = nd4j::math::nd4j_tanh(ct[ctOffset]); + T val = sd::math::nd4j_tanh(ct[ctOffset]); T prevVal; if(t < time-1) @@ -491,7 +491,7 @@ static void sruBIBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlo BUILD_SINGLE_TEMPLATE(template void sruBIBPCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vwi, const Nd4jLong* wiShapeInfo, const void* vb, const Nd4jLong* bShapeInfo, const void* vc0, const Nd4jLong* c0ShapeInfo, const void* vmask, const Nd4jLong* maskShapeInfo, const void* vct, const Nd4jLong* ctShapeInfo, const void* vgradHt, const Nd4jLong* gradHtShapeInfo, const void* vgradCt, const Nd4jLong* gradCtShapeInfo, void* vgradI, const Nd4jLong* gradIShapeInfo, void* vgradWi, const Nd4jLong* gradWiShapeInfo, void* vgradB, const Nd4jLong* gradBShapeInfo, void* vgradC0, 
const Nd4jLong* gradC0ShapeInfo), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// -void sruBIBP(nd4j::LaunchContext* context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, +void sruBIBP(sd::LaunchContext* context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* gradCt, const NDArray* gradHt, const NDArray* mask, NDArray* gradI, NDArray* gradW, NDArray* gradB, NDArray* gradC0) { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/stack.cu b/libnd4j/include/ops/declarable/helpers/cuda/stack.cu index e88f5ade8..875327c3c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/stack.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/stack.cu @@ -21,12 +21,12 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -57,7 +57,7 @@ namespace helpers { /////////////////////////////////////////////////////////////////// template - static void stack_(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim) { + static void stack_(sd::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim) { const bool scalarCase = inArrs[0]->isScalar(); @@ -87,7 +87,7 @@ namespace helpers { } else { std::vector axis = ShapeUtils::evalDimsToExclude(outArr->rankOf(), {dim}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(outArr->getShapeInfo(), axis); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(outArr->getShapeInfo(), axis); stackKernel<<getCudaStream()>>>((void**)dInBuffers, (void**)dInShapeInfo, inputList.size(), inArrs[0]->lengthOf(), outArr->specialBuffer(), nullptr, packZ.specialShapeInfo(), packZ.specialOffsets()); } manager.synchronize(); @@ -99,11 +99,11 @@ namespace helpers { NDArray::registerSpecialUse({}, {v}); 
} - void stack(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim) { + void stack(sd::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim) { BUILD_SINGLE_SELECTOR(outArr->dataType(), stack_, (context, inArrs, outArr, dim), LIBND4J_TYPES); } - BUILD_SINGLE_TEMPLATE(template void stack_ , (nd4j::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void stack_ , (sd::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim), LIBND4J_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/svd.cu b/libnd4j/include/ops/declarable/helpers/cuda/svd.cu index 4d1b18eef..00885aa72 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/svd.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/svd.cu @@ -18,15 +18,15 @@ // @author Yurii Shyrma (iuriish@yahoo.com) // -#include +#include #include #include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -100,7 +100,7 @@ static void inverseColumnSignCudaLauncher(const int blocksPerGrid, const int thr BUILD_SINGLE_TEMPLATE(template void inverseColumnSignCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t* stream, void* vu, const Nd4jLong* uShapeInfo, void* vv, const Nd4jLong* vShapeInfo), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// -static void svdQR(nd4j::LaunchContext* context, const NDArray* A, NDArray* S, NDArray* U, NDArray* VT, const bool fullUV, const bool calcUV) { +static void svdQR(sd::LaunchContext* context, const NDArray* A, NDArray* S, NDArray* U, NDArray* VT, const bool fullUV, const bool calcUV) { // since cusa api cusolverDnDgesvd/cusolverDnSgesvd have following constrain on input matrix A: A_rows >= A_columns && A_order = 'f' // we make this function 
to have deal with 2 valid cases only: @@ -266,7 +266,7 @@ static void svdQR(nd4j::LaunchContext* context, const NDArray* A, NDArray* S, ND } ////////////////////////////////////////////////////////////////////////// -static void svdJcb(nd4j::LaunchContext* context, const NDArray* A, NDArray* S, NDArray* U, NDArray* V, const bool fullUV, const bool calcUV) { +static void svdJcb(sd::LaunchContext* context, const NDArray* A, NDArray* S, NDArray* U, NDArray* V, const bool fullUV, const bool calcUV) { // A [m, n] // S [n] @@ -455,7 +455,7 @@ static void svdJcb(nd4j::LaunchContext* context, const NDArray* A, NDArray* S, N } ////////////////////////////////////////////////////////////////////////// -static void svdBatched(nd4j::LaunchContext* context, const NDArray* A, NDArray* S, NDArray* U, NDArray* V, const bool fullUV, const bool calcUV) { +static void svdBatched(sd::LaunchContext* context, const NDArray* A, NDArray* S, NDArray* U, NDArray* V, const bool fullUV, const bool calcUV) { // A [..., m, n] // S [..., n] @@ -628,7 +628,7 @@ static void svdBatched(nd4j::LaunchContext* context, const NDArray* A, NDArray* } //////////////////////////////////////////////////////////////////// -void svd(nd4j::LaunchContext* context, const NDArray* x, const std::vector& outArrs, const bool fullUV, const bool calcUV, const int switchNum) { +void svd(sd::LaunchContext* context, const NDArray* x, const std::vector& outArrs, const bool fullUV, const bool calcUV, const int switchNum) { NDArray* S = outArrs[0]; NDArray* U = outArrs[1]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/toggle_bits.cu b/libnd4j/include/ops/declarable/helpers/cuda/toggle_bits.cu index bc1171efe..2138e1188 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/toggle_bits.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/toggle_bits.cu @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -34,7 +34,7 @@ namespace nd4j { } 
BUILD_SINGLE_TEMPLATE(template void toggle_bits__, (NDArray &in, NDArray &out), INTEGER_TYPES); - void __toggle_bits(nd4j::LaunchContext * context, NDArray& in, NDArray& out) { + void __toggle_bits(sd::LaunchContext * context, NDArray& in, NDArray& out) { BUILD_SINGLE_SELECTOR(in.dataType(), toggle_bits__, (in, out), INTEGER_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu b/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu index 520a6115d..b344f570e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu @@ -19,10 +19,10 @@ // #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -87,11 +87,11 @@ static void inTopKCudaLauncher(const int blocksPerGrid, const int threadsPerBloc } /////////////////////////////////////////////////////////////////// -int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, const NDArray* targets, NDArray* output, const uint k) { +int inTopKFunctor(sd::LaunchContext * context, const NDArray* predictions, const NDArray* targets, NDArray* output, const uint k) { PointersManager manager(context, "in_top_k"); - const auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(predictions->getShapeInfo(), {1}); + const auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(predictions->getShapeInfo(), {1}); const int threadsPerBlock = MAX_NUM_THREADS; const int blocksPerGrid = static_cast(packX.numberOfTads()); @@ -241,7 +241,7 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con template - static int topKFunctor_(nd4j::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort) { + static int topKFunctor_(sd::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort) { auto packX = 
ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {input->rankOf() - 1}); auto packI = ConstantTadHelper::getInstance()->tadForDimensions(indices->shapeInfo(), {input->rankOf() - 1}); @@ -266,7 +266,7 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con return Status::OK(); } - int topKFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort) { + int topKFunctor(sd::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort) { input->syncToDevice(); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), topKFunctor_, (context, input, values, indices, k, needSort), LIBND4J_TYPES, INDEXING_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu index 764b6abbf..2c4299a90 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu @@ -24,13 +24,13 @@ #include #include #include -#include +#include #include #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -71,7 +71,7 @@ __host__ static void invertPermutationCudaLauncher(const int blocksPerGrid, cons } //////////////////////////////////////////////////////////////////////// -void invertPermutation(nd4j::LaunchContext* context, const NDArray& input, NDArray& output) { +void invertPermutation(sd::LaunchContext* context, const NDArray& input, NDArray& output) { const int threadsPerBlock = MAX_NUM_THREADS; const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; @@ -153,7 +153,7 @@ static void traceCudaLauncher(const int blocksPerGrid, const int threadsPerBlock /////////////////////////////////////////////////////////////////// -void trace(nd4j::LaunchContext* context, const NDArray& input, 
NDArray& output) { +void trace(sd::LaunchContext* context, const NDArray& input, NDArray& output) { PointersManager manager(context, "trace"); @@ -217,7 +217,7 @@ static void triuBPCudaLauncher(const int blocksPerGrid, const int threadsPerBloc } /////////////////////////////////////////////////////////////////// -void triuBP(nd4j::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal) { +void triuBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal) { const int threadsPerBlock = MAX_NUM_THREADS / 4; const int blocksPerGrid = (gradO.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; @@ -283,9 +283,9 @@ static void tileBPCudaLauncher(const int blocksPerGrid, const int threadsPerBloc ////////////////////////////////////////////////////////////////////////// -void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps) { +void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps) { - NDArray memBuff('c', gradO.getShapeAsVector(), nd4j::DataType::INT64, context); // empty auxiliary array for storing device memory which will be used in kernel calculations + NDArray memBuff('c', gradO.getShapeAsVector(), sd::DataType::INT64, context); // empty auxiliary array for storing device memory which will be used in kernel calculations const int threadsPerBlock = MAX_NUM_THREADS / 4; const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; @@ -528,7 +528,7 @@ static void clipByNormBPCudaLauncher(const int blocksPerGrid, const int threadsP BUILD_DOUBLE_TEMPLATE(template void clipByNormBPCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, const Nd4jLong* xTadOffsets, const void *vy, const Nd4jLong *yShapeInfo, const 
Nd4jLong* yTadOffsets, void *vz, const Nd4jLong *zShapeInfo, const Nd4jLong* zTadOffsets, void* vreducBuff, const double clipNormVal), FLOAT_TYPES, FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// -void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm) { +void clipByNormBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm) { PointersManager manager(context, "clipByNormBP"); @@ -564,7 +564,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } template - static __global__ void swapShuffleKernel(T* input, Nd4jLong* shape, Nd4jLong firstDim, nd4j::graph::RandomGenerator* rng) { + static __global__ void swapShuffleKernel(T* input, Nd4jLong* shape, Nd4jLong firstDim, sd::graph::RandomGenerator* rng) { auto tid = blockIdx.x * blockDim.x; auto step = blockDim.x * gridDim.x; @@ -582,7 +582,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } } template - static __global__ void fillShuffleKernel(T* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, Nd4jLong firstDim, int* indices, nd4j::graph::RandomGenerator* rng) { + static __global__ void fillShuffleKernel(T* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, Nd4jLong firstDim, int* indices, sd::graph::RandomGenerator* rng) { // PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) auto tid = blockIdx.x * blockDim.x; @@ -602,7 +602,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } ////////////////////////////////////////////////////////////////////////// template - void randomShuffle_(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) 
{ + void randomShuffle_(sd::LaunchContext * context, NDArray& input, NDArray& output, sd::graph::RandomGenerator& rng, const bool isInplace) { // check edge cases first int temp; @@ -616,9 +616,9 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr else if (input.isVector() || shape::isLikeVector(input.getShapeInfo(), temp)) { // apply Fisher-Yates shuffle - nd4j::graph::RandomGenerator* dRandom = nullptr; - cudaMalloc(&dRandom, sizeof(nd4j::graph::RandomGenerator)); - cudaMemcpy(dRandom, &rng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice); + sd::graph::RandomGenerator* dRandom = nullptr; + cudaMalloc(&dRandom, sizeof(sd::graph::RandomGenerator)); + cudaMemcpy(dRandom, &rng, sizeof(sd::graph::RandomGenerator), cudaMemcpyHostToDevice); T* inputBuf = reinterpret_cast(input.specialBuffer()); if(isInplace) { swapShuffleKernel<<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), firstDim, dRandom); @@ -679,15 +679,15 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } - void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) { + void randomShuffle(sd::LaunchContext * context, NDArray& input, NDArray& output, sd::graph::RandomGenerator& rng, const bool isInplace) { BUILD_SINGLE_SELECTOR(input.dataType(), randomShuffle_, (context, input, output, rng, isInplace), LIBND4J_TYPES); } - BUILD_SINGLE_TEMPLATE(template void randomShuffle_, (nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void randomShuffle_, (sd::LaunchContext * context, NDArray& input, NDArray& output, sd::graph::RandomGenerator& rng, const bool isInplace), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// - void eye(nd4j::LaunchContext * context, NDArray& output) { + void 
eye(sd::LaunchContext * context, NDArray& output) { output.setIdentity(); } @@ -743,7 +743,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr ////////////////////////////////////////////////////////////////////////// template - static void clipByNorm_(nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, NDArray const& clipNormA, const bool isInplace) { + static void clipByNorm_(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, NDArray const& clipNormA, const bool isInplace) { const int rank = input.rankOf(); auto norm2 = input.reduceAlongDimension(reduce::Norm2, dimensions); clipNormA.syncToHost(); @@ -762,8 +762,8 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(rank, dimensions); const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input.getShapeInfo(), dimsToExclude); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); - //auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), dimsToExclude); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); + //auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), dimsToExclude); T* inputBuffer = reinterpret_cast(input.specialBuffer()); T* norm2buf = reinterpret_cast(norm2.specialBuffer()); @@ -785,8 +785,8 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(rank, dimensions); const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input.getShapeInfo(), dimsToExclude); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); - auto packZ = 
nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), dimensions); T* inputBuffer = reinterpret_cast(input.specialBuffer()); T* norm2buf = reinterpret_cast(norm2.specialBuffer()); T* outputBuffer = reinterpret_cast(output.specialBuffer()); @@ -796,14 +796,14 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } } - void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { + void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { BUILD_SINGLE_SELECTOR(output.dataType(), clipByNorm_, (context, input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES); } - BUILD_SINGLE_TEMPLATE(template void clipByNorm_, (nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void clipByNorm_, (sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES); template - void clipByGlobalNorm_(nd4j::LaunchContext * context, std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { + void clipByGlobalNorm_(sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { NDArray globalNorm = NDArrayFactory::create(0, inputs[0]->getContext()); //sqrt(sum([l2norm(t)**2 for t in t_list])) for (auto i = 0; i < inputs.size(); i++) { @@ -812,7 +812,7 @@ void 
clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr globalNorm += l2norm * l2norm; } - globalNorm.applyTransform(transform::Sqrt, globalNorm); // = nd4j::math::nd4j_sqrt(globalNorm); + globalNorm.applyTransform(transform::Sqrt, globalNorm); // = sd::math::nd4j_sqrt(globalNorm); outputs[inputs.size()]->p(0, globalNorm); globalNorm.syncToHost(); const T factor = static_cast(clipNorm) / globalNorm.e(0); @@ -833,16 +833,16 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } } - void clipByGlobalNorm(nd4j::LaunchContext * context, std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { + void clipByGlobalNorm(sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace) { BUILD_SINGLE_SELECTOR(outputs[0]->dataType(), clipByGlobalNorm_, (context, inputs, clipNorm, workspace, outputs, isInplace), FLOAT_TYPES); } - BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (nd4j::LaunchContext * context, std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector& outputs, bool isInplace), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void clipByGlobalNorm_, (sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace), FLOAT_TYPES); ////////////////////////////////////////////////////////////////////////// template - static void clipByAveraged_(nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { + static void clipByAveraged_(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { auto cn = clipNorm.e(0); if (dimensions.size() == 0) { // all-reduce @@ -877,11 +877,11 @@ void 
clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } } - void clipByAveraged(nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { + void clipByAveraged(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace) { BUILD_SINGLE_SELECTOR(input.dataType(), clipByAveraged_, (context, input, output, dimensions, clipNorm, isInplace), FLOAT_TYPES); } - BUILD_SINGLE_TEMPLATE(template void clipByAveraged_, (nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void clipByAveraged_, (sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace), FLOAT_TYPES); /* if (d1 > params[1]) @@ -923,7 +923,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } template - static void clipByValue_(nd4j::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) { + static void clipByValue_(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) { auto stream = context->getCudaStream(); if (!input.isActualOnDeviceSide()) input.syncToDevice(); @@ -932,11 +932,11 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr NDArray::registerSpecialUse({&output}, {&input}); } - void clipByValue(nd4j::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) { + void clipByValue(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output) { BUILD_SINGLE_SELECTOR(input.dataType(), clipByValue_, (context, input, leftBound, rightBound, output), FLOAT_TYPES); } - 
BUILD_SINGLE_TEMPLATE(template void clipByValue_, (nd4j::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output);, FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void clipByValue_, (sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output);, FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/triangular_solve.cu b/libnd4j/include/ops/declarable/helpers/cuda/triangular_solve.cu index 6f5fe6b8c..e34fd11f8 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/triangular_solve.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/triangular_solve.cu @@ -18,13 +18,13 @@ // @author GS // -#include -#include +#include +#include #include -#include +#include #include "../triangular_solve.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { /* @@ -138,7 +138,7 @@ namespace nd4j { } template - static int triangularSolveFunctor_(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, + static int triangularSolveFunctor_(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output) { NDArray::prepareSpecialUse({output}, {leftInput, rightInput}); auto leftTads = ConstantTadHelper::getInstance()->tadForDimensions(leftInput->getShapeInfo(), {-2, -1}); @@ -161,7 +161,7 @@ namespace nd4j { } - int triangularSolveFunctor(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output) { + int triangularSolveFunctor(sd::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output) { BUILD_SINGLE_SELECTOR(leftInput->dataType(), return triangularSolveFunctor_, (context, leftInput, rightInput, lower, adjoint, output), FLOAT_NATIVE); } @@ -207,7 +207,7 @@ namespace nd4j { } template - static void adjointTriangularMatrix_(nd4j::LaunchContext* context, NDArray const* input, bool const lower, + static void 
adjointTriangularMatrix_(sd::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output) { auto inputTads = ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {-2, -1}); @@ -225,7 +225,7 @@ namespace nd4j { } } - void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output) { + void adjointMatrix(sd::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output) { BUILD_SINGLE_SELECTOR(input->dataType(), adjointTriangularMatrix_, (context, input, lower, output), FLOAT_NATIVE); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/weights.cu b/libnd4j/include/ops/declarable/helpers/cuda/weights.cu index 622732d7d..b543fa1c2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/weights.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/weights.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -48,13 +48,13 @@ namespace helpers { Nd4jLong yOffset = shape::getIndexOffset(e, weightsShape); //atomicAdd(); //*reinterpret_cast(outputBuffer) += reinterpret_cast(weightsBuffer)[yOffset]; - nd4j::math::atomics::nd4j_atomicAdd(reinterpret_cast(outputBuffer), reinterpret_cast(weightsBuffer)[yOffset]); //output->p(val, output->e(val) + 1); + sd::math::atomics::nd4j_atomicAdd(reinterpret_cast(outputBuffer), reinterpret_cast(weightsBuffer)[yOffset]); //output->p(val, output->e(val) + 1); // atomicAdd(reinterpret_cast(outputBuffer), reinterpret_cast(weightsBuffer)[yOffset]); //output->p(val, output->e(val) + 1); } else { //*reinterpret_cast(outputBuffer) += int(1); //printf("outputBuffer[0] = %d\n", static_cast(*(reinterpret_cast(outputBuffer)))); - nd4j::math::atomics::nd4j_atomicAdd(reinterpret_cast(outputBuffer), T(1)); //output->p(val, output->e(val) + 1); + sd::math::atomics::nd4j_atomicAdd(reinterpret_cast(outputBuffer), T(1)); //output->p(val, output->e(val) + 1); // atomicAdd(reinterpret_cast(outputBuffer), 
int(1)); //output->p(val, output->e(val) + 1); // printf("outputBuffer[%ld] = %d\n", zOffset, static_cast(*(reinterpret_cast(outputBuffer) + zOffset))); } @@ -92,7 +92,7 @@ namespace helpers { } template - static void adjustWeights_(nd4j::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { + static void adjustWeights_(sd::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { // for (int e = 0; e < input->lengthOf(); e++) { // int val = input->e(e); // if (val < maxLength) { @@ -109,11 +109,11 @@ namespace helpers { output->specialBuffer(), output->specialShapeInfo(), minLength, maxLength); } - void adjustWeights(nd4j::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { + void adjustWeights(sd::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { BUILD_SINGLE_SELECTOR(output->dataType(), adjustWeights_, (context, input, weights, output, minLength, maxLength), GENERIC_NUMERIC_TYPES); } - BUILD_SINGLE_TEMPLATE(template void adjustWeights_, (nd4j::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength), GENERIC_NUMERIC_TYPES); + BUILD_SINGLE_TEMPLATE(template void adjustWeights_, (sd::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength), GENERIC_NUMERIC_TYPES); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu index ada547ac3..43f0ee8d1 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -61,7 +61,7 @@ static void zetaCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, 
zetaCuda<<>>(vx, xShapeInfo, vq, qShapeInfo, vz, zShapeInfo); } -void zeta(nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z) { +void zeta(sd::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& z) { if(!x.isActualOnDeviceSide()) x.syncToDevice(); if(!q.isActualOnDeviceSide()) q.syncToDevice(); diff --git a/libnd4j/include/ops/declarable/helpers/d_t_s.h b/libnd4j/include/ops/declarable/helpers/d_t_s.h index b27e6d8af..20c11ec24 100644 --- a/libnd4j/include/ops/declarable/helpers/d_t_s.h +++ b/libnd4j/include/ops/declarable/helpers/d_t_s.h @@ -19,13 +19,13 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void _depthToSpace(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC); + void _depthToSpace(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/diag.h b/libnd4j/include/ops/declarable/helpers/diag.h index 48c2e1f71..af84eec01 100644 --- a/libnd4j/include/ops/declarable/helpers/diag.h +++ b/libnd4j/include/ops/declarable/helpers/diag.h @@ -19,15 +19,15 @@ // #ifndef __DIAG_H_HELPERS__ #define __DIAG_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void diagFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output); - void diagPartFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray* output); + void diagFunctor(sd::LaunchContext * context, NDArray const* input, NDArray* output); + void diagPartFunctor(sd::LaunchContext * context, NDArray const* input, NDArray* output); } } diff --git a/libnd4j/include/ops/declarable/helpers/dilation2d.h b/libnd4j/include/ops/declarable/helpers/dilation2d.h index d6908c0f8..a26fe10f1 100644 --- a/libnd4j/include/ops/declarable/helpers/dilation2d.h +++ 
b/libnd4j/include/ops/declarable/helpers/dilation2d.h @@ -20,15 +20,15 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////// -void dilation2d(nd4j::LaunchContext* context, NDArray *input, NDArray *weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW); +void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, NDArray *output, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW); ////////////////////////////////////////////////////////////////////// -FORCEINLINE Nd4jStatus outputSize(nd4j::LaunchContext * context, const int inSize, const int k, const int d, const int s, bool isSameMode, int *outSize, int *padding_before, int *padding_after) { +FORCEINLINE Nd4jStatus outputSize(sd::LaunchContext * context, const int inSize, const int k, const int d, const int s, bool isSameMode, int *outSize, int *padding_before, int *padding_after) { if (s <= 0) return Status::THROW("Dilation2D: Stride must be > 0"); @@ -38,7 +38,7 @@ FORCEINLINE Nd4jStatus outputSize(nd4j::LaunchContext * context, const int inSiz int kEff = (k - 1) * d + 1; if (isSameMode) { *outSize = (inSize + s - 1) / s; - const int padding_needed = nd4j::math::nd4j_max(0, (*outSize - 1) * s + kEff -inSize); + const int padding_needed = sd::math::nd4j_max(0, (*outSize - 1) * s + kEff -inSize); *padding_before = padding_needed / 2; *padding_after = padding_needed - *padding_before; @@ -54,7 +54,7 @@ FORCEINLINE Nd4jStatus outputSize(nd4j::LaunchContext * context, const int inSiz } ////////////////////////////////////////////////////////////////////// -FORCEINLINE Nd4jStatus dilation_hw(nd4j::LaunchContext * context, Nd4jLong *in, Nd4jLong *wh, std::vector &strides, std::vector &rates, bool isSameMode, int *sH, int *sW, int *pH, int *pW, int *dH, int *dW, int *oH, int *oW) { +FORCEINLINE Nd4jStatus 
dilation_hw(sd::LaunchContext * context, Nd4jLong *in, Nd4jLong *wh, std::vector &strides, std::vector &rates, bool isSameMode, int *sH, int *sW, int *pH, int *pW, int *dH, int *dW, int *oH, int *oW) { const int iH = shape::sizeAt(in, 1); const int iW = shape::sizeAt(in, 2); const int iC = shape::sizeAt(in, 3); diff --git a/libnd4j/include/ops/declarable/helpers/dropout.h b/libnd4j/include/ops/declarable/helpers/dropout.h index 1825efbd9..052b68f33 100644 --- a/libnd4j/include/ops/declarable/helpers/dropout.h +++ b/libnd4j/include/ops/declarable/helpers/dropout.h @@ -19,11 +19,11 @@ // #ifndef __DROP_OUT_HELPERS__ #define __DROP_OUT_HELPERS__ -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/dynamic.h b/libnd4j/include/ops/declarable/helpers/dynamic.h index 64656a2bc..29452cb8f 100644 --- a/libnd4j/include/ops/declarable/helpers/dynamic.h +++ b/libnd4j/include/ops/declarable/helpers/dynamic.h @@ -20,20 +20,20 @@ #ifndef __DYNAMIC_H_HELPERS__ #define __DYNAMIC_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void dynamicPartitionFunctor(nd4j::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList); + void dynamicPartitionFunctor(sd::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector& outputList); - int dynamicStitchFunctor(nd4j::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output); + int dynamicStitchFunctor(sd::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray* output); - void dynamicPartitionFunctorBP(nd4j::LaunchContext * context, NDArray const* input, NDArray const* indices, std::vector const& gradientInputList, std::vector& outputList); + void dynamicPartitionFunctorBP(sd::LaunchContext * context, NDArray const* 
input, NDArray const* indices, std::vector const& gradientInputList, std::vector& outputList); - int dynamicStitchFunctorBP(nd4j::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray const* gradientInput, std::vector& outputList); + int dynamicStitchFunctorBP(sd::LaunchContext * context, std::vector const& inputs, std::vector const& indices, NDArray const* gradientInput, std::vector& outputList); } } } diff --git a/libnd4j/include/ops/declarable/helpers/extract_patches.h b/libnd4j/include/ops/declarable/helpers/extract_patches.h index 2b0d46a90..63d5e94f4 100644 --- a/libnd4j/include/ops/declarable/helpers/extract_patches.h +++ b/libnd4j/include/ops/declarable/helpers/extract_patches.h @@ -19,14 +19,14 @@ // #ifndef __EXTRACT_PATCHES_H_HELPERS__ #define __EXTRACT_PATCHES_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void extractPatches(nd4j::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame); + void extractPatches(sd::LaunchContext * context, NDArray* images, NDArray* output, int sizeRow, int sizeCol, int stradeRow, int stradeCol, int rateRow, int rateCol, bool theSame); } } diff --git a/libnd4j/include/ops/declarable/helpers/fake_quantization.h b/libnd4j/include/ops/declarable/helpers/fake_quantization.h index cadd8be7c..b5f4dff00 100644 --- a/libnd4j/include/ops/declarable/helpers/fake_quantization.h +++ b/libnd4j/include/ops/declarable/helpers/fake_quantization.h @@ -19,10 +19,10 @@ // #ifndef __FAKE_QUANTIZATION_H_HELPERS__ #define __FAKE_QUANTIZATION_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/flatten.h b/libnd4j/include/ops/declarable/helpers/flatten.h index 0513e45ea..da6253dfa 100644 --- 
a/libnd4j/include/ops/declarable/helpers/flatten.h +++ b/libnd4j/include/ops/declarable/helpers/flatten.h @@ -22,15 +22,15 @@ #define DEV_TESTS_FLATTEN_H #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////// -void flatten(nd4j::LaunchContext *context, std::vector &inputs, NDArray *output, char order); +void flatten(sd::LaunchContext *context, std::vector &inputs, NDArray *output, char order); ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/gammaMathFunc.h b/libnd4j/include/ops/declarable/helpers/gammaMathFunc.h index 2ad540409..2f99f3777 100644 --- a/libnd4j/include/ops/declarable/helpers/gammaMathFunc.h +++ b/libnd4j/include/ops/declarable/helpers/gammaMathFunc.h @@ -23,17 +23,17 @@ #define LIBND4J_GAMMAMATHFUNC_H #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // calculate the digamma function for each element for array - void diGamma(nd4j::LaunchContext* context, const NDArray& x, NDArray& z); + void diGamma(sd::LaunchContext* context, const NDArray& x, NDArray& z); // calculate the polygamma function - void polyGamma(nd4j::LaunchContext* context, const NDArray& n, const NDArray& x, NDArray& z); + void polyGamma(sd::LaunchContext* context, const NDArray& n, const NDArray& x, NDArray& z); // calculate the digamma function for one element // implementation is based on serial representation written in terms of the Hurwitz zeta function as polygamma = (-1)^{n+1} * n! 
* zeta(n+1, x) @@ -47,7 +47,7 @@ namespace helpers { if(x == xInt) // integer return DataTypeUtils::infOrMax(); else - return diGammaScalar(1 - x) - M_PI / nd4j::math::nd4j_tan(M_PI * x); // use reflection formula psi(1-x) = psi(x) + pi*cot(pi*x) + return diGammaScalar(1 - x) - M_PI / sd::math::nd4j_tan(M_PI * x); // use reflection formula psi(1-x) = psi(x) + pi*cot(pi*x) } // positive integer @@ -61,7 +61,7 @@ namespace helpers { // positive half-integer if(x - xInt == 0.5 && xInt <= 20) { // psi(n+0.5) = -Euler_Mascheroni_const - 2*ln(2) + sum_from_k=1_to_n( 2/(2*k-1) ) , for n = 1,2,3,...inf, we use this formula only for n <= 20 to avoid time consuming sum calculation for bigger n - T result = -0.577215664901532 - 2 * nd4j::math::nd4j_log(2); + T result = -0.577215664901532 - 2 * sd::math::nd4j_log(2); for (uint i = 1; i <= xInt; ++i) { result += static_cast(2) / (2*i - 1); } @@ -78,7 +78,7 @@ namespace helpers { // psi(x) = log(x) - 1/(2*x) - 1/(12*x^2) + 1/(120*x^4) - 1/(252*x^6) + 1/(240*x^8) - 5/(660*x^10) + 691/(32760*x^12) - 1/(12*x^14) + ... if(x >= (sizeof(T) > 4 ? 
1.e16 : 1.e8)) // if x is too big take into account only log(x) - return nd4j::math::nd4j_log(x); + return sd::math::nd4j_log(x); // coefficients used in truncated asymptotic expansion formula const T coeffs[7] = {-(T)1/12, (T)1/120, -(T)1/252, (T)1/240, -(T)5/660, (T)691/32760, -(T)1/12}; @@ -89,7 +89,7 @@ namespace helpers { for (int i = 6; i >= 0; --i) result = (result + coeffs[i]) * x2Inv; - return result + nd4j::math::nd4j_log(x) - static_cast(0.5) / x; + return result + sd::math::nd4j_log(x) - static_cast(0.5) / x; } } diff --git a/libnd4j/include/ops/declarable/helpers/gather.h b/libnd4j/include/ops/declarable/helpers/gather.h index b2eca1bc0..f38708385 100644 --- a/libnd4j/include/ops/declarable/helpers/gather.h +++ b/libnd4j/include/ops/declarable/helpers/gather.h @@ -21,13 +21,13 @@ #ifndef LIBND4J_GATHER_H #define LIBND4J_GATHER_H -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs); + void gather(sd::LaunchContext * context, const NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs); } } diff --git a/libnd4j/include/ops/declarable/helpers/gradient.h b/libnd4j/include/ops/declarable/helpers/gradient.h index 3850338e1..583396cf3 100644 --- a/libnd4j/include/ops/declarable/helpers/gradient.h +++ b/libnd4j/include/ops/declarable/helpers/gradient.h @@ -19,17 +19,17 @@ // #ifndef __GRADIENT_H_HELPERS__ #define __GRADIENT_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { /* * applyGradientDescent: calculate z = x - y * w. 
* */ - void applyGradientDescent(nd4j::LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output); + void applyGradientDescent(sd::LaunchContext* context, NDArray* input, NDArray* step, double weight, NDArray* output); } } diff --git a/libnd4j/include/ops/declarable/helpers/gru.h b/libnd4j/include/ops/declarable/helpers/gru.h index 87e1786fb..3fecfa71b 100644 --- a/libnd4j/include/ops/declarable/helpers/gru.h +++ b/libnd4j/include/ops/declarable/helpers/gru.h @@ -23,17 +23,17 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wru, const NDArray* Wc, + void gruCell(sd::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wru, const NDArray* Wc, const NDArray* bru, const NDArray* bc, NDArray* r, NDArray* u, NDArray* c, NDArray* h); - void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h); + void gruTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h); - void gruCellBP(nd4j::LaunchContext* context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, const NDArray* dLdr, const NDArray* dLdu, const NDArray* dLdc, const NDArray* dLdh, NDArray* dLdx, NDArray* dLdhLast, NDArray* dLdW, NDArray* dLdWc, NDArray* dLdb, NDArray* dLdbc); + void gruCellBP(sd::LaunchContext* context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, const NDArray* dLdr, const NDArray* dLdu, const NDArray* dLdc, const NDArray* dLdh, NDArray* dLdx, NDArray* dLdhLast, NDArray* dLdW, NDArray* dLdWc, NDArray* dLdb, NDArray* dLdbc); } } diff --git a/libnd4j/include/ops/declarable/helpers/hamming.h 
b/libnd4j/include/ops/declarable/helpers/hamming.h index 3ca3a6933..6450d7882 100644 --- a/libnd4j/include/ops/declarable/helpers/hamming.h +++ b/libnd4j/include/ops/declarable/helpers/hamming.h @@ -21,7 +21,7 @@ #ifndef SD_HAMMING_H #define SD_HAMMING_H -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void hamming(LaunchContext *context, NDArray &x, NDArray &y, NDArray &output); diff --git a/libnd4j/include/ops/declarable/helpers/hashcode.h b/libnd4j/include/ops/declarable/helpers/hashcode.h index a7fa5bc83..730249d1a 100644 --- a/libnd4j/include/ops/declarable/helpers/hashcode.h +++ b/libnd4j/include/ops/declarable/helpers/hashcode.h @@ -23,7 +23,7 @@ #include "helpers.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template diff --git a/libnd4j/include/ops/declarable/helpers/helpers.h b/libnd4j/include/ops/declarable/helpers/helpers.h index f3aebc7b7..c36387e6e 100644 --- a/libnd4j/include/ops/declarable/helpers/helpers.h +++ b/libnd4j/include/ops/declarable/helpers/helpers.h @@ -21,17 +21,17 @@ #ifndef LIBND4J_OPS_HELPERS_H #define LIBND4J_OPS_HELPERS_H -#include -#include +#include +#include #include #include #include #include -#include +#include #include #include -#include -#include +#include +#include #ifdef __CUDACC__ #include @@ -42,7 +42,7 @@ #include #include -#include +#include #endif // CUDACC diff --git a/libnd4j/include/ops/declarable/helpers/histogram.h b/libnd4j/include/ops/declarable/helpers/histogram.h index b6556599f..b9738ef07 100644 --- a/libnd4j/include/ops/declarable/helpers/histogram.h +++ b/libnd4j/include/ops/declarable/helpers/histogram.h @@ -21,12 +21,12 @@ #ifndef LIBND4J_HISTOGRAM_H #define LIBND4J_HISTOGRAM_H -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void histogramHelper(nd4j::LaunchContext *context, NDArray &input, NDArray &output); + void histogramHelper(sd::LaunchContext *context, NDArray &input, NDArray &output); } } } diff --git 
a/libnd4j/include/ops/declarable/helpers/histogramFixedWidth.h b/libnd4j/include/ops/declarable/helpers/histogramFixedWidth.h index 17fc84e27..40ba6ffec 100644 --- a/libnd4j/include/ops/declarable/helpers/histogramFixedWidth.h +++ b/libnd4j/include/ops/declarable/helpers/histogramFixedWidth.h @@ -23,11 +23,11 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { -void histogramFixedWidth(nd4j::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output); +void histogramFixedWidth(sd::LaunchContext * context, const NDArray& input, const NDArray& range, NDArray& output); } diff --git a/libnd4j/include/ops/declarable/helpers/im2col.h b/libnd4j/include/ops/declarable/helpers/im2col.h index f484c9bc4..6b61535f9 100644 --- a/libnd4j/include/ops/declarable/helpers/im2col.h +++ b/libnd4j/include/ops/declarable/helpers/im2col.h @@ -23,11 +23,11 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - ND4J_EXPORT void im2col(nd4j::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal); + ND4J_EXPORT void im2col(sd::LaunchContext & context, const NDArray& im, NDArray& col, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const NDArray& arrZeroPadVal); } } } diff --git a/libnd4j/include/ops/declarable/helpers/image_draw_bounding_boxes.h b/libnd4j/include/ops/declarable/helpers/image_draw_bounding_boxes.h index dd61d9532..758a02e31 100644 --- a/libnd4j/include/ops/declarable/helpers/image_draw_bounding_boxes.h +++ b/libnd4j/include/ops/declarable/helpers/image_draw_bounding_boxes.h @@ -19,14 +19,14 @@ // #ifndef __IMAGE_DRAW_BOUNDING_BOXES_H_HELPERS__ #define __IMAGE_DRAW_BOUNDING_BOXES_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { 
namespace helpers { - void drawBoundingBoxesFunctor(nd4j::LaunchContext * context, NDArray* images, NDArray* boxes, NDArray* colors, NDArray* output); + void drawBoundingBoxesFunctor(sd::LaunchContext * context, NDArray* images, NDArray* boxes, NDArray* colors, NDArray* output); } } diff --git a/libnd4j/include/ops/declarable/helpers/image_resize.h b/libnd4j/include/ops/declarable/helpers/image_resize.h index decac3db9..c11e94ed4 100644 --- a/libnd4j/include/ops/declarable/helpers/image_resize.h +++ b/libnd4j/include/ops/declarable/helpers/image_resize.h @@ -20,10 +20,10 @@ // #ifndef __IMAGE_RESIZE_HELPERS__ #define __IMAGE_RESIZE_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -37,18 +37,18 @@ namespace helpers { kResizeArea }; - int resizeBilinearFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeBilinearFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, bool const halfPixelCenter, NDArray* output); - int resizeNeighborFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeNeighborFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, bool const halfPixelCenter, NDArray* output); - int resizeBicubicFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeBicubicFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool preserveAspectRatio, bool antialias, NDArray* output); - int resizeBicubicFunctorA(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeBicubicFunctorA(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, bool const halfPixelAlign, NDArray* 
output); - int resizeAreaFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeAreaFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, bool const alignCorners, NDArray* output); - int resizeFunctor(nd4j::LaunchContext * context, NDArray const* image, int const width, int const height, + int resizeFunctor(sd::LaunchContext * context, NDArray const* image, int const width, int const height, ImageResizeMethods method, bool preserveAspectRatio, bool antialias, NDArray* output); } } diff --git a/libnd4j/include/ops/declarable/helpers/image_suppression.h b/libnd4j/include/ops/declarable/helpers/image_suppression.h index 4013b72b3..a8d2027b8 100644 --- a/libnd4j/include/ops/declarable/helpers/image_suppression.h +++ b/libnd4j/include/ops/declarable/helpers/image_suppression.h @@ -19,18 +19,18 @@ // #ifndef __IMAGE_SUPPRESSION_H_HELPERS__ #define __IMAGE_SUPPRESSION_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void nonMaxSuppression(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, + void nonMaxSuppression(sd::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output); - Nd4jLong nonMaxSuppressionV3(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, + Nd4jLong nonMaxSuppressionV3(sd::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output); - Nd4jLong nonMaxSuppressionGeneric(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, + Nd4jLong nonMaxSuppressionGeneric(sd::LaunchContext* context, NDArray* boxes, NDArray* scores, int maxSize, double overlapThreshold, double scoreThreshold, NDArray* output); } diff --git a/libnd4j/include/ops/declarable/helpers/imagesHelpers.h 
b/libnd4j/include/ops/declarable/helpers/imagesHelpers.h index 0ae8ba072..3a1666c7a 100644 --- a/libnd4j/include/ops/declarable/helpers/imagesHelpers.h +++ b/libnd4j/include/ops/declarable/helpers/imagesHelpers.h @@ -24,25 +24,25 @@ #ifndef LIBND4J_HELPERS_IMAGES_H #define LIBND4J_HELPERS_IMAGES_H -#include -#include -#include +#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void transformRgbGrs(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC); + void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC); - void transformHsvRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); + void transformHsvRgb(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); - void transformRgbHsv(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); - void transformYuvRgb(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC); - void transformRgbYuv(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC); + void transformRgbHsv(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); + void transformYuvRgb(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC); + void transformRgbYuv(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC); - void transformYiqRgb(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); + void transformYiqRgb(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); - void transformRgbYiq(nd4j::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); + void transformRgbYiq(sd::LaunchContext* context, const NDArray* input, NDArray* output, const int dimC); } } } diff --git 
a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp index a75298af6..2f574a52e 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/choose.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/choose.cpp @@ -19,21 +19,21 @@ // #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template - static nd4j::NDArray* processCondition_(int mode,nd4j::NDArray *arg, nd4j::NDArray *comp, nd4j::NDArray& compScalar); + static sd::NDArray* processCondition_(int mode,sd::NDArray *arg, sd::NDArray *comp, sd::NDArray& compScalar); template static T processElementCondition(int mode,T d1,T d2); template - nd4j::NDArray* processCondition_(int mode,nd4j::NDArray *arg, nd4j::NDArray *comp, nd4j::NDArray *output, nd4j::NDArray *numResult, nd4j::NDArray& compScalar) { + sd::NDArray* processCondition_(int mode,sd::NDArray *arg, sd::NDArray *comp, sd::NDArray *output, sd::NDArray *numResult, sd::NDArray& compScalar) { //Convert to straight ndarray based on input @@ -42,8 +42,8 @@ namespace helpers { if (comp->isScalar()) { //Other input for compare could be an ndarray or a secondary scalar //for comparison -// nd4j::NDArray arg1 = *arg; -// nd4j::NDArray comp1 = *comp; +// sd::NDArray arg1 = *arg; +// sd::NDArray comp1 = *comp; for (Nd4jLong i = 0; i < arg->lengthOf(); i++) { T result2 = processElementCondition(mode, arg->e(i), comp->e(0)); if(result2 > static_cast(0)) { @@ -56,7 +56,7 @@ namespace helpers { // REQUIRE_TRUE(comp.isSameShape(arg)); //Other input for compare could be an ndarray or a secondary scalar //for comparison - nd4j::NDArray arg1 = *arg; + sd::NDArray arg1 = *arg; for (Nd4jLong i = 0; i < arg->lengthOf(); i++) { T result2 = processElementCondition(mode, arg->e(i), comp->e(i)); if(result2 > static_cast(0)) { @@ -69,7 +69,7 @@ namespace helpers { } else { - // nd4j::NDArray arg1 = *arg; + // sd::NDArray arg1 = *arg; //Other input for compare could be 
an ndarray or a secondary scalar //for comparison for (Nd4jLong i = 0; i < arg->lengthOf(); i++) { @@ -88,7 +88,7 @@ namespace helpers { return output; } - nd4j::NDArray* processCondition(nd4j::LaunchContext * context, int mode,nd4j::NDArray *arg, nd4j::NDArray *comp, nd4j::NDArray *output, nd4j::NDArray *numResult, nd4j::NDArray& compScalar) { + sd::NDArray* processCondition(sd::LaunchContext * context, int mode,sd::NDArray *arg, sd::NDArray *comp, sd::NDArray *output, sd::NDArray *numResult, sd::NDArray& compScalar) { arg->syncToHost(); if (comp != nullptr) @@ -118,7 +118,7 @@ namespace helpers { compScalar.syncToDevice(); } - BUILD_SINGLE_TEMPLATE(template NDArray* processCondition_, (int mode,nd4j::NDArray *arg, nd4j::NDArray *comp, nd4j::NDArray *output, nd4j::NDArray *numResult, nd4j::NDArray& compScalar), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template NDArray* processCondition_, (int mode,sd::NDArray *arg, sd::NDArray *comp, sd::NDArray *output, sd::NDArray *numResult, sd::NDArray& compScalar), FLOAT_TYPES); template T processElementCondition(int mode,T d1,T d2) { @@ -129,7 +129,7 @@ namespace helpers { } - void chooseFunctorArray(nd4j::LaunchContext * context, NDArray* arg, NDArray* comp, int mode, NDArray* result, NDArray* numResults) { + void chooseFunctorArray(sd::LaunchContext * context, NDArray* arg, NDArray* comp, int mode, NDArray* result, NDArray* numResults) { if(arg->isScalar() || comp->isScalar()) { if(arg->isScalar()) { processCondition(context, mode,comp,nullptr,result,numResults, *arg); @@ -144,7 +144,7 @@ namespace helpers { } } - void chooseFunctorScalar(nd4j::LaunchContext * context, NDArray* arg, double scalar, int mode, NDArray* result, NDArray* numResults) { + void chooseFunctorScalar(sd::LaunchContext * context, NDArray* arg, double scalar, int mode, NDArray* result, NDArray* numResults) { auto scalarA = NDArrayFactory::create(scalar); processCondition(context, mode, arg, nullptr,result, numResults, scalarA); } diff --git 
a/libnd4j/include/ops/declarable/helpers/impl/knn_mindistance.cpp b/libnd4j/include/ops/declarable/helpers/impl/knn_mindistance.cpp index 71711832d..d3880c730 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/knn_mindistance.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/knn_mindistance.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -41,13 +41,13 @@ namespace nd4j { T h = high[e]; if (!(l <= p || h <= p)) { if (p < l) - res += nd4j::math::nd4j_pow((p - o), po); + res += sd::math::nd4j_pow((p - o), po); else - res += nd4j::math::nd4j_pow((p - h), po); + res += sd::math::nd4j_pow((p - h), po); } } - output[0] = nd4j::math::nd4j_pow(res, (T) 0.5f); + output[0] = sd::math::nd4j_pow(res, (T) 0.5f); } void knn_mindistance(const NDArray &input, const NDArray &lowest, const NDArray &highest, NDArray &output) { diff --git a/libnd4j/include/ops/declarable/helpers/impl/listdiff.cpp b/libnd4j/include/ops/declarable/helpers/impl/listdiff.cpp index c840f6960..6d937dc0f 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/listdiff.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/listdiff.cpp @@ -22,7 +22,7 @@ #include //#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -39,7 +39,7 @@ namespace helpers { return saved; } - Nd4jLong listDiffCount(nd4j::LaunchContext * context, NDArray* values, NDArray* keep) { + Nd4jLong listDiffCount(sd::LaunchContext * context, NDArray* values, NDArray* keep) { auto xType = values->dataType(); NDArray::preparePrimaryUse({},{values, keep}); @@ -71,7 +71,7 @@ namespace helpers { if (saved.size() == 0) { -// if (nd4j::ops::conditionHelper(__FILE__, __LINE__, false, 0, "ListDiff: search returned no results") != 0) +// if (sd::ops::conditionHelper(__FILE__, __LINE__, false, 0, "ListDiff: search returned no results") != 0) nd4j_printf("ListDiff: search returned no results", ""); throw std::invalid_argument("Op validation 
failed"); } else { @@ -95,7 +95,7 @@ namespace helpers { return Status::OK(); } - int listDiffFunctor(nd4j::LaunchContext * context, NDArray* values, NDArray* keep, NDArray* output1, NDArray* output2) { + int listDiffFunctor(sd::LaunchContext * context, NDArray* values, NDArray* keep, NDArray* output1, NDArray* output2) { auto xType = values->dataType(); NDArray::preparePrimaryUse({output1, output2}, {values, keep}); diff --git a/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp b/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp index d80b0d323..4ab585e26 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/lstm.cpp @@ -27,15 +27,15 @@ #include -#include +#include #include #include #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -102,7 +102,7 @@ namespace nd4j { ////////////////////////////////////////////////////////////////////////// - void lstmTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* c0, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, + void lstmTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* c0, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, NDArray* h, NDArray* c, const std::vector& params) { // x input [time x bS x nIn] diff --git a/libnd4j/include/ops/declarable/helpers/impl/lstmLayer.cpp b/libnd4j/include/ops/declarable/helpers/impl/lstmLayer.cpp index e83b5875d..435a3e32d 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/lstmLayer.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/lstmLayer.cpp @@ -34,9 +34,9 @@ // #include // #include // #include -// #include +// #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/impl/multiUnique.cpp 
b/libnd4j/include/ops/declarable/helpers/impl/multiUnique.cpp index cfa2fe53a..9d8962cda 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/multiUnique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/multiUnique.cpp @@ -21,18 +21,18 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - bool multiUnique(std::vector const& inputList, nd4j::memory::Workspace *workspace) { + bool multiUnique(std::vector const& inputList, sd::memory::Workspace *workspace) { Nd4jLong length = 0; std::vector reshaped(inputList.size()); int pos = 0; Nd4jLong axis = 0; Context cContext(1); for (auto array: inputList) { - if (array->dataType() != nd4j::DataType::INT32) + if (array->dataType() != sd::DataType::INT32) throw std::runtime_error("multiUnique: this op support INT32 data type only."); reshaped[pos] = array->reshape(array->ordering(), {-1}); @@ -41,16 +41,16 @@ namespace helpers { length += array->lengthOf(); pos++; } - NDArray arrayFull('c', {length}, nd4j::DataType::INT32); + NDArray arrayFull('c', {length}, sd::DataType::INT32); cContext.setOutputArray(0, &arrayFull); cContext.setIArguments(&axis, 1); - nd4j::ops::concat opConcat; + sd::ops::concat opConcat; auto cResult = opConcat.execute(&cContext); if (Status::OK() != cResult) throw std::runtime_error("multiUnique: cannot execute concat op properly."); - nd4j::ops::unique opUnique; + sd::ops::unique opUnique; auto uResult = opUnique.evaluate({&arrayFull}); if (Status::OK() != uResult->status()) throw std::runtime_error("multiUnique: cannot execute unique op properly."); diff --git a/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp b/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp index 3c65f740d..f910f07ed 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp @@ -24,13 +24,13 @@ 
#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// -void rnnCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* hPrev, NDArray* ht) { +void rnnCell(sd::LaunchContext * context, const NDArray* xt, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* hPrev, NDArray* ht) { // xt input [bS x iS] // Wx input-to-hidden weights, [iS x nU] @@ -46,7 +46,7 @@ void rnnCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* Wx } ////////////////////////////////////////////////////////////////////////// -void rnnTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* h0, const NDArray* maxTimeStep, NDArray* h, NDArray* hFinal) { +void rnnTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* h0, const NDArray* maxTimeStep, NDArray* h, NDArray* hFinal) { // x input [time x bS x iS] // Wx input-to-hidden weights, [iS x nU] diff --git a/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp b/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp index a6e971cf2..7a99b4ecc 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 3a8af9e3d..c67b713c2 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -19,11 +19,11 @@ // #include -#include +#include #include #include -namespace nd4j { +namespace sd { 
namespace ops { namespace helpers { @@ -43,7 +43,7 @@ namespace helpers { return count; } - Nd4jLong uniqueCount(nd4j::LaunchContext * context, NDArray* input) { + Nd4jLong uniqueCount(sd::LaunchContext * context, NDArray* input) { BUILD_SINGLE_SELECTOR(input->dataType(), return uniqueCount_, (input), LIBND4J_TYPES); } @@ -86,7 +86,7 @@ namespace helpers { return Status::OK(); } - Nd4jStatus uniqueFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* values, NDArray* indices, NDArray* counts) { + Nd4jStatus uniqueFunctor(sd::LaunchContext * context, NDArray* input, NDArray* values, NDArray* indices, NDArray* counts) { input->syncToHost(); values->syncToHost(); indices->syncToHost(); diff --git a/libnd4j/include/ops/declarable/helpers/impl/where.cpp b/libnd4j/include/ops/declarable/helpers/impl/where.cpp index ed4fe3c97..f73bcae9a 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/where.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/where.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template @@ -49,7 +49,7 @@ namespace nd4j { } BUILD_SINGLE_TEMPLATE(template void __where,(NDArray &condition, NDArray& output, memory::Workspace *workspace), LIBND4J_TYPES); - void _where(nd4j::LaunchContext * context, NDArray &condition, NDArray& output, memory::Workspace *workspace) { + void _where(sd::LaunchContext * context, NDArray &condition, NDArray& output, memory::Workspace *workspace) { condition.syncToHost(); BUILD_SINGLE_SELECTOR(output.dataType(), __where, (condition, output, workspace), LIBND4J_TYPES); output.syncToDevice(); diff --git a/libnd4j/include/ops/declarable/helpers/ismax.h b/libnd4j/include/ops/declarable/helpers/ismax.h index 831cd592f..6052b3624 100644 --- a/libnd4j/include/ops/declarable/helpers/ismax.h +++ b/libnd4j/include/ops/declarable/helpers/ismax.h @@ -24,11 +24,11 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void 
ismax(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector& dimensions); + void ismax(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector& dimensions); } } diff --git a/libnd4j/include/ops/declarable/helpers/knn.h b/libnd4j/include/ops/declarable/helpers/knn.h index a2de9c71c..3a3494a12 100644 --- a/libnd4j/include/ops/declarable/helpers/knn.h +++ b/libnd4j/include/ops/declarable/helpers/knn.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void knn_mindistance(const NDArray &input, const NDArray &lowest, const NDArray &highest, NDArray &output); diff --git a/libnd4j/include/ops/declarable/helpers/legacy_helpers.h b/libnd4j/include/ops/declarable/helpers/legacy_helpers.h index dfe338864..e3191425d 100644 --- a/libnd4j/include/ops/declarable/helpers/legacy_helpers.h +++ b/libnd4j/include/ops/declarable/helpers/legacy_helpers.h @@ -19,9 +19,9 @@ // #ifndef __H_LEGACY_HELPERS__ #define __H_LEGACY_HELPERS__ -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { /* @@ -43,27 +43,27 @@ namespace helpers { FORCEINLINE void sigmoidDerivative(NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); FORCEINLINE void hardSigmoidDerivative(NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); */ - void reluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond); - void reluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void relu6Derivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void leakyReluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha); - void eluDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha); - void seluDerivative(nd4j::LaunchContext * 
context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void cubeDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void reduceNorm1(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void sigmCrossEntropy(nd4j::LaunchContext * context, NDArray* logits, NDArray* lablels, NDArray* theOutput); - void sigmCrossEntropyGrad(nd4j::LaunchContext * context, NDArray* logits, NDArray* lablels, NDArray* theOutput); - void tanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void hardTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void rationalTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void rectifiedTanhDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void softSignDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void softPlusDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void sigmoidDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void hardSigmoidDerivative(nd4j::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); - void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output); - void logSumExp(nd4j::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output); - void weightedCrossEntropyWithLogitsFunctor(nd4j::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output); + void reluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond); + void reluDerivative(sd::LaunchContext * context, NDArray* theFirst, 
NDArray* theSecond, NDArray* theOutput); + void relu6Derivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void leakyReluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha); + void eluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput, const float alpha); + void seluDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void cubeDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void reduceNorm1(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void sigmCrossEntropy(sd::LaunchContext * context, NDArray* logits, NDArray* lablels, NDArray* theOutput); + void sigmCrossEntropyGrad(sd::LaunchContext * context, NDArray* logits, NDArray* lablels, NDArray* theOutput); + void tanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void hardTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void rationalTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void rectifiedTanhDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void softSignDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void softPlusDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void sigmoidDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void hardSigmoidDerivative(sd::LaunchContext * context, NDArray* theFirst, NDArray* theSecond, NDArray* theOutput); + void logSumExp(sd::LaunchContext * context, NDArray* input, NDArray* axis, NDArray* output); + void 
logSumExp(sd::LaunchContext * context, NDArray* input, NDArray* subtrah, NDArray* axis, NDArray* output); + void weightedCrossEntropyWithLogitsFunctor(sd::LaunchContext * context, NDArray const* targets, NDArray const* input, NDArray const* weights, NDArray* output); } } } diff --git a/libnd4j/include/ops/declarable/helpers/lgamma.h b/libnd4j/include/ops/declarable/helpers/lgamma.h index 48bcf1d73..184e33556 100644 --- a/libnd4j/include/ops/declarable/helpers/lgamma.h +++ b/libnd4j/include/ops/declarable/helpers/lgamma.h @@ -23,14 +23,14 @@ #define __LIBND4J_L_GAMMA__H__ #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // calculate the digamma function for each element for array - void lgamma(nd4j::LaunchContext* context, NDArray& x, NDArray& z); + void lgamma(sd::LaunchContext* context, NDArray& x, NDArray& z); } } diff --git a/libnd4j/include/ops/declarable/helpers/listdiff.h b/libnd4j/include/ops/declarable/helpers/listdiff.h index 99e83f5af..227eccac8 100644 --- a/libnd4j/include/ops/declarable/helpers/listdiff.h +++ b/libnd4j/include/ops/declarable/helpers/listdiff.h @@ -20,14 +20,14 @@ #ifndef __LIST_DIFF_HELPERS__ #define __LIST_DIFF_HELPERS__ //#include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - int listDiffFunctor(nd4j::LaunchContext * context, NDArray* values, NDArray* keep, NDArray* output1, NDArray* output2); - Nd4jLong listDiffCount(nd4j::LaunchContext * context, NDArray* values, NDArray* keep); + int listDiffFunctor(sd::LaunchContext * context, NDArray* values, NDArray* keep, NDArray* output1, NDArray* output2); + Nd4jLong listDiffCount(sd::LaunchContext * context, NDArray* values, NDArray* keep); } } } diff --git a/libnd4j/include/ops/declarable/helpers/lrn.h b/libnd4j/include/ops/declarable/helpers/lrn.h index bbec42586..f8c9089c7 100644 --- a/libnd4j/include/ops/declarable/helpers/lrn.h +++ 
b/libnd4j/include/ops/declarable/helpers/lrn.h @@ -19,17 +19,17 @@ // #ifndef __LRN_H_HELPERS__ #define __LRN_H_HELPERS__ -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - int lrnFunctor(nd4j::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta); + int lrnFunctor(sd::graph::Context& block, NDArray* input, NDArray* output, int depth, double bias, double alpha, double beta); - void lrnBP(nd4j::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta); + void lrnBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int depth, const float bias, const float alpha, const float beta); } } diff --git a/libnd4j/include/ops/declarable/helpers/lstm.h b/libnd4j/include/ops/declarable/helpers/lstm.h index 9c0df2fa5..6eb5886f3 100644 --- a/libnd4j/include/ops/declarable/helpers/lstm.h +++ b/libnd4j/include/ops/declarable/helpers/lstm.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { @@ -59,10 +59,10 @@ namespace helpers { } } - void lstmCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* ht_1, const NDArray* ct_1, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, + void lstmCell(sd::LaunchContext * context, const NDArray* xt, const NDArray* ht_1, const NDArray* ct_1, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, NDArray* ht, NDArray* ct, const std::vector& params); - void lstmTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* c0, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, + void lstmTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* 
c0, const NDArray* Wx, const NDArray* Wh, const NDArray* Wc, const NDArray* Wp, const NDArray* b, NDArray* h, NDArray* c, const std::vector& params); void lstmBlockCell(const NDArray* xt, const NDArray* cLast, const NDArray* yLast, diff --git a/libnd4j/include/ops/declarable/helpers/lstmBlock.h b/libnd4j/include/ops/declarable/helpers/lstmBlock.h index ff30fe29d..7df9bb795 100644 --- a/libnd4j/include/ops/declarable/helpers/lstmBlock.h +++ b/libnd4j/include/ops/declarable/helpers/lstmBlock.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/lstmLayer.h b/libnd4j/include/ops/declarable/helpers/lstmLayer.h index a52c2c0e5..dfa9268b4 100644 --- a/libnd4j/include/ops/declarable/helpers/lstmLayer.h +++ b/libnd4j/include/ops/declarable/helpers/lstmLayer.h @@ -24,7 +24,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/lstsq.h b/libnd4j/include/ops/declarable/helpers/lstsq.h index c6e87125d..9cc629383 100644 --- a/libnd4j/include/ops/declarable/helpers/lstsq.h +++ b/libnd4j/include/ops/declarable/helpers/lstsq.h @@ -19,14 +19,15 @@ // #ifndef __LST_SQ_SOLVE__H_HELPERS__ #define __LST_SQ_SOLVE__H_HELPERS__ -#include -#include -namespace nd4j { +#include +#include + +namespace sd { namespace ops { namespace helpers { - int leastSquaresSolveFunctor(nd4j::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output); + int leastSquaresSolveFunctor(sd::LaunchContext* context, NDArray const* leftInput, NDArray const* rightInput, double const l2Regularizer, bool const fast, NDArray* output); } } } diff --git a/libnd4j/include/ops/declarable/helpers/lup.h b/libnd4j/include/ops/declarable/helpers/lup.h index 26f187374..1e58c2e3f 100644 --- a/libnd4j/include/ops/declarable/helpers/lup.h +++ 
b/libnd4j/include/ops/declarable/helpers/lup.h @@ -19,25 +19,25 @@ // #ifndef __LUP_H_HELPERS__ #define __LUP_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - int lup(nd4j::LaunchContext* context, NDArray* input, NDArray* lu, NDArray* permutation); - void lu(nd4j::LaunchContext *context, NDArray* input, NDArray* output, NDArray* permutation); - int determinant(nd4j::LaunchContext * context, NDArray* input, NDArray* output); - int logAbsDeterminant(nd4j::LaunchContext * context, NDArray* input, NDArray* output); + int lup(sd::LaunchContext* context, NDArray* input, NDArray* lu, NDArray* permutation); + void lu(sd::LaunchContext *context, NDArray* input, NDArray* output, NDArray* permutation); + int determinant(sd::LaunchContext * context, NDArray* input, NDArray* output); + int logAbsDeterminant(sd::LaunchContext * context, NDArray* input, NDArray* output); - int inverse(nd4j::LaunchContext * context, NDArray* input, NDArray* output); - int upperInverseFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* output); - int lowerInverseFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* output); + int inverse(sd::LaunchContext * context, NDArray* input, NDArray* output); + int upperInverseFunctor(sd::LaunchContext* context, NDArray* input, NDArray* output); + int lowerInverseFunctor(sd::LaunchContext* context, NDArray* input, NDArray* output); - bool checkCholeskyInput(nd4j::LaunchContext * context, NDArray const* input); - int cholesky(nd4j::LaunchContext * context, NDArray* input, NDArray* output, bool inplace = false); - int logdetFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* output); + bool checkCholeskyInput(sd::LaunchContext * context, NDArray const* input); + int cholesky(sd::LaunchContext * context, NDArray* input, NDArray* output, bool inplace = false); + int logdetFunctor(sd::LaunchContext * context, NDArray* input, NDArray* output); } } } diff 
--git a/libnd4j/include/ops/declarable/helpers/matmul.h b/libnd4j/include/ops/declarable/helpers/matmul.h index 2e7cce13f..2cce162d8 100644 --- a/libnd4j/include/ops/declarable/helpers/matmul.h +++ b/libnd4j/include/ops/declarable/helpers/matmul.h @@ -21,13 +21,13 @@ #ifndef LIBND4J_HELPERS_MATMUL_H #define LIBND4J_HELPERS_MATMUL_H -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void _matmul(nd4j::LaunchContext * context, NDArray *A, NDArray *B, NDArray *C, int transA, int transB, double alpha = 1., double beta = 0.); + void _matmul(sd::LaunchContext * context, NDArray *A, NDArray *B, NDArray *C, int transA, int transB, double alpha = 1., double beta = 0.); } } } diff --git a/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h b/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h index fb7d57d18..332c3134b 100644 --- a/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h +++ b/libnd4j/include/ops/declarable/helpers/matrixSetDiag.h @@ -22,13 +22,13 @@ #define LIBND4J_MATRIXSETDIAG_H #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void matrixSetDiag(nd4j::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad); + void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArray& diagonal, NDArray& output, const bool zeroPad); } } diff --git a/libnd4j/include/ops/declarable/helpers/matrix_band.h b/libnd4j/include/ops/declarable/helpers/matrix_band.h index f63ae6680..f997e4d56 100644 --- a/libnd4j/include/ops/declarable/helpers/matrix_band.h +++ b/libnd4j/include/ops/declarable/helpers/matrix_band.h @@ -19,14 +19,14 @@ // #ifndef __MATRIX_BAND_H_HELPERS__ #define __MATRIX_BAND_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void matrixBandPart(nd4j::LaunchContext * context, NDArray* input, 
NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand); + void matrixBandPart(sd::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong lowerBand, Nd4jLong upperBand); } diff --git a/libnd4j/include/ops/declarable/helpers/matrix_diag_part.h b/libnd4j/include/ops/declarable/helpers/matrix_diag_part.h index 43ab84091..fd25c636c 100644 --- a/libnd4j/include/ops/declarable/helpers/matrix_diag_part.h +++ b/libnd4j/include/ops/declarable/helpers/matrix_diag_part.h @@ -19,14 +19,14 @@ // #ifndef __MATRIX_DIAG_PART_HELPERS__ #define __MATRIX_DIAG_PART_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - int matrixDiagPart(nd4j::LaunchContext * context, NDArray const* input, NDArray* output); + int matrixDiagPart(sd::LaunchContext * context, NDArray const* input, NDArray* output); } } diff --git a/libnd4j/include/ops/declarable/helpers/max_pooling.h b/libnd4j/include/ops/declarable/helpers/max_pooling.h index fda2560bf..a3750798b 100644 --- a/libnd4j/include/ops/declarable/helpers/max_pooling.h +++ b/libnd4j/include/ops/declarable/helpers/max_pooling.h @@ -19,15 +19,15 @@ // #ifndef __MAX_POOLING_HELPERS__ #define __MAX_POOLING_HELPERS__ -#include -#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void maxPoolingFunctor(nd4j::LaunchContext * context, nd4j::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices); + void maxPoolingFunctor(sd::LaunchContext * context, sd::graph::Context& block, NDArray* input, NDArray* values, std::vector const& params, NDArray* indices); } } } diff --git a/libnd4j/include/ops/declarable/helpers/meshgrid.h b/libnd4j/include/ops/declarable/helpers/meshgrid.h index e39e54e0a..e6c385029 100644 --- a/libnd4j/include/ops/declarable/helpers/meshgrid.h +++ b/libnd4j/include/ops/declarable/helpers/meshgrid.h @@ -23,11 +23,11 @@ #include -namespace nd4j { +namespace 
sd { namespace ops { namespace helpers { - void meshgrid(nd4j::LaunchContext * context, const std::vector& inArrs, const std::vector& outArrs, const bool swapFirst2Dims); + void meshgrid(sd::LaunchContext * context, const std::vector& inArrs, const std::vector& outArrs, const bool swapFirst2Dims); } } diff --git a/libnd4j/include/ops/declarable/helpers/minimax.h b/libnd4j/include/ops/declarable/helpers/minimax.h index 6cb2f60b6..f619a20f6 100644 --- a/libnd4j/include/ops/declarable/helpers/minimax.h +++ b/libnd4j/include/ops/declarable/helpers/minimax.h @@ -19,15 +19,15 @@ // #ifndef __MIN_I_MAX_H_HELPERS__ #define __MIN_I_MAX_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void minimumBPFunctor(nd4j::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY); - void maximumBPFunctor(nd4j::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY); + void minimumBPFunctor(sd::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY); + void maximumBPFunctor(sd::LaunchContext * context, NDArray* x, NDArray* y, NDArray* epsNext, NDArray* gradX, NDArray* gradY); } } diff --git a/libnd4j/include/ops/declarable/helpers/multiUnique.h b/libnd4j/include/ops/declarable/helpers/multiUnique.h index 12fa6db10..3119901c1 100644 --- a/libnd4j/include/ops/declarable/helpers/multiUnique.h +++ b/libnd4j/include/ops/declarable/helpers/multiUnique.h @@ -19,14 +19,14 @@ // #ifndef __MULTI_UNIQUE_H_HELPERS__ #define __MULTI_UNIQUE_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - ND4J_EXPORT bool multiUnique(std::vector const& inputList, nd4j::memory::Workspace* workspace = nullptr); + ND4J_EXPORT bool multiUnique(std::vector const& inputList, sd::memory::Workspace* workspace = nullptr); } } diff --git 
a/libnd4j/include/ops/declarable/helpers/nth_element.h b/libnd4j/include/ops/declarable/helpers/nth_element.h index 486c7489f..1a2c28719 100644 --- a/libnd4j/include/ops/declarable/helpers/nth_element.h +++ b/libnd4j/include/ops/declarable/helpers/nth_element.h @@ -19,14 +19,14 @@ // #ifndef __NTH_ELEMENT__H_HELPERS__ #define __NTH_ELEMENT__H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void nthElementFunctor(nd4j::LaunchContext * context, NDArray* input, Nd4jLong n, NDArray* output, bool reverse); + void nthElementFunctor(sd::LaunchContext * context, NDArray* input, Nd4jLong n, NDArray* output, bool reverse); } } diff --git a/libnd4j/include/ops/declarable/helpers/one_hot.h b/libnd4j/include/ops/declarable/helpers/one_hot.h index f91ed8cc9..1fefd7c44 100644 --- a/libnd4j/include/ops/declarable/helpers/one_hot.h +++ b/libnd4j/include/ops/declarable/helpers/one_hot.h @@ -21,14 +21,14 @@ #ifndef DEV_TESTS_ONE_HOT_H #define DEV_TESTS_ONE_HOT_H -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void onehot(const nd4j::LaunchContext* context, const NDArray *indices, NDArray *output, const uint axis, const uint depth, const double on, const double off); + void onehot(const sd::LaunchContext* context, const NDArray *indices, NDArray *output, const uint axis, const uint depth, const double on, const double off); } } diff --git a/libnd4j/include/ops/declarable/helpers/percentile.h b/libnd4j/include/ops/declarable/helpers/percentile.h index a2859c9b6..5eddb42b2 100644 --- a/libnd4j/include/ops/declarable/helpers/percentile.h +++ b/libnd4j/include/ops/declarable/helpers/percentile.h @@ -22,13 +22,13 @@ #define LIBND4J_PERCENTILE_H #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void percentile(nd4j::LaunchContext * context, const NDArray& input, NDArray& 
output, std::vector& axises, const float q, const int interpolation); + void percentile(sd::LaunchContext * context, const NDArray& input, NDArray& output, std::vector& axises, const float q, const int interpolation); } diff --git a/libnd4j/include/ops/declarable/helpers/prefix.h b/libnd4j/include/ops/declarable/helpers/prefix.h index 50b692623..757c5c94f 100644 --- a/libnd4j/include/ops/declarable/helpers/prefix.h +++ b/libnd4j/include/ops/declarable/helpers/prefix.h @@ -21,20 +21,20 @@ #ifndef LIBND4J_PREFIX_HELPER_H #define LIBND4J_PREFIX_HELPER_H -#include +#include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // template - // void prefix(nd4j::LaunchContext * context, nd4j::scalar::Ops op, void* x, Nd4jLong *xShapeInfo, void* z, Nd4jLong* zShapeInfo, bool exclusive, bool reverse); + // void prefix(sd::LaunchContext * context, sd::scalar::Ops op, void* x, Nd4jLong *xShapeInfo, void* z, Nd4jLong* zShapeInfo, bool exclusive, bool reverse); - void prefix(nd4j::LaunchContext* context, nd4j::scalar::Ops op, const NDArray* x, NDArray* z, bool exclusive, bool reverse); + void prefix(sd::LaunchContext* context, sd::scalar::Ops op, const NDArray* x, NDArray* z, bool exclusive, bool reverse); - void prefix(nd4j::LaunchContext* context, nd4j::scalar::Ops op, const NDArray* x, NDArray* z, const std::vector& dims, bool exclusive, bool reverse); + void prefix(sd::LaunchContext* context, sd::scalar::Ops op, const NDArray* x, NDArray* z, const std::vector& dims, bool exclusive, bool reverse); } } } diff --git a/libnd4j/include/ops/declarable/helpers/print_variable.h b/libnd4j/include/ops/declarable/helpers/print_variable.h index 3521e38b9..46cf4ee01 100644 --- a/libnd4j/include/ops/declarable/helpers/print_variable.h +++ b/libnd4j/include/ops/declarable/helpers/print_variable.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void print_special(LaunchContext &ctx, 
const NDArray &array, const std::string &message = {}); diff --git a/libnd4j/include/ops/declarable/helpers/qr.h b/libnd4j/include/ops/declarable/helpers/qr.h index 76236235e..05de6ca40 100644 --- a/libnd4j/include/ops/declarable/helpers/qr.h +++ b/libnd4j/include/ops/declarable/helpers/qr.h @@ -19,14 +19,14 @@ // #ifndef __QR__H_HELPERS__ #define __QR__H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void qr(nd4j::LaunchContext * context, NDArray const* input, NDArray* outputQ, NDArray* outputR, bool const fullMatricies); + void qr(sd::LaunchContext * context, NDArray const* input, NDArray* outputQ, NDArray* outputR, bool const fullMatricies); } diff --git a/libnd4j/include/ops/declarable/helpers/random.h b/libnd4j/include/ops/declarable/helpers/random.h index c97aae118..5ee75e141 100644 --- a/libnd4j/include/ops/declarable/helpers/random.h +++ b/libnd4j/include/ops/declarable/helpers/random.h @@ -22,12 +22,12 @@ // #ifndef __RANDOM_HELPERS__ #define __RANDOM_HELPERS__ -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/random_crop.h b/libnd4j/include/ops/declarable/helpers/random_crop.h index dba950210..f4d36a850 100644 --- a/libnd4j/include/ops/declarable/helpers/random_crop.h +++ b/libnd4j/include/ops/declarable/helpers/random_crop.h @@ -19,12 +19,12 @@ // #ifndef __RANDOM_CROP_HELPERS__ #define __RANDOM_CROP_HELPERS__ -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { diff --git a/libnd4j/include/ops/declarable/helpers/range.h b/libnd4j/include/ops/declarable/helpers/range.h index 8381e2a2c..13155fd70 100644 --- a/libnd4j/include/ops/declarable/helpers/range.h +++ b/libnd4j/include/ops/declarable/helpers/range.h @@ -23,12 +23,12 @@ #include -namespace nd4j { +namespace sd { namespace 
ops { namespace helpers { // be careful: outVector must have c-order and ews = 1 !!! - void range(nd4j::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector); + void range(sd::LaunchContext * context, const NDArray& start, const NDArray& delta, NDArray& outVector); } } diff --git a/libnd4j/include/ops/declarable/helpers/reverse.h b/libnd4j/include/ops/declarable/helpers/reverse.h index 28045cbb6..d85d017ba 100644 --- a/libnd4j/include/ops/declarable/helpers/reverse.h +++ b/libnd4j/include/ops/declarable/helpers/reverse.h @@ -23,13 +23,13 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim); + void reverseSequence(sd::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim); - void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp); + void reverse(sd::LaunchContext * context, const NDArray* input, NDArray* output, const std::vector* intArgs, bool isBackProp); diff --git a/libnd4j/include/ops/declarable/helpers/rnn.h b/libnd4j/include/ops/declarable/helpers/rnn.h index cb0d6d6e4..32f49fe2e 100644 --- a/libnd4j/include/ops/declarable/helpers/rnn.h +++ b/libnd4j/include/ops/declarable/helpers/rnn.h @@ -23,14 +23,14 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void rnnCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* ht_1, NDArray* ht); + void rnnCell(sd::LaunchContext * context, const NDArray* xt, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* ht_1, NDArray* ht); - void rnnTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* Wx, const NDArray* Wh, const NDArray* b, 
const NDArray* h0, const NDArray* maxTimeStep, NDArray* h, NDArray* hFinal); + void rnnTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* h0, const NDArray* maxTimeStep, NDArray* h, NDArray* hFinal); } } diff --git a/libnd4j/include/ops/declarable/helpers/roll.h b/libnd4j/include/ops/declarable/helpers/roll.h index b20367c0d..3e637dbc4 100644 --- a/libnd4j/include/ops/declarable/helpers/roll.h +++ b/libnd4j/include/ops/declarable/helpers/roll.h @@ -21,12 +21,12 @@ #include #ifndef __HELPERS__ROLL__H__ #define __HELPERS__ROLL__H__ -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void rollFunctorLinear(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int shift, bool inplace = false); + void rollFunctorLinear(sd::LaunchContext * context, NDArray* input, NDArray* output, int shift, bool inplace = false); - void rollFunctorFull(nd4j::LaunchContext * context, NDArray* input, NDArray* output, std::vector const& shifts, std::vector const& axes, bool inplace = false); + void rollFunctorFull(sd::LaunchContext * context, NDArray* input, NDArray* output, std::vector const& shifts, std::vector const& axes, bool inplace = false); } } } diff --git a/libnd4j/include/ops/declarable/helpers/s_t_b.h b/libnd4j/include/ops/declarable/helpers/s_t_b.h index 35e57a1e8..1147f05ab 100644 --- a/libnd4j/include/ops/declarable/helpers/s_t_b.h +++ b/libnd4j/include/ops/declarable/helpers/s_t_b.h @@ -23,27 +23,27 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight, const uint blockSize); + void batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& output, const uint cropBottom, const uint cropTop, const uint cropLeft, const uint cropRight, const uint blockSize); - 
void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize); + void spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize); - void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output); + void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output); - void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output); + void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output); /* // this method MUST be platform-specific template - void _execute(nd4j::LaunchContext * context, void *ptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *ptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides); + void _execute(sd::LaunchContext * context, void *ptrSpace, const Nd4jLong *space_shape, const Nd4jLong *space_strides, const Nd4jLong *block_shape, const Nd4jLong *pad_start, const Nd4jLong *block_offsets, void *ptrBatch, const Nd4jLong *batch_shape, const Nd4jLong *batch_strides); template - FORCEINLINE void _prepare(nd4j::LaunchContext * context, NDArray * space, NDArray *batch, const Nd4jLong block_array[NUM_BLOCK_DIMS], const Nd4jLong padding_array[NUM_BLOCK_DIMS * 2]) { + FORCEINLINE void _prepare(sd::LaunchContext * context, NDArray * space, NDArray *batch, const Nd4jLong block_array[NUM_BLOCK_DIMS], const Nd4jLong padding_array[NUM_BLOCK_DIMS * 2]) { Nd4jLong pad_start[NUM_BLOCK_DIMS]; Nd4jLong 
block_shape[NUM_BLOCK_DIMS]; @@ -83,9 +83,9 @@ namespace helpers { } }; - Nd4jStatus _spaceToBatch(nd4j::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *paddings); + Nd4jStatus _spaceToBatch(sd::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *paddings); - Nd4jStatus _batchToSpace(nd4j::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *crops); + Nd4jStatus _batchToSpace(sd::LaunchContext * context, int internal_block_dims, NDArray *input, NDArray *output, std::vector &internal_input_shape, std::vector &internal_output_shape, Nd4jLong *block_shape, Nd4jLong *crops); */ } } diff --git a/libnd4j/include/ops/declarable/helpers/s_t_d.h b/libnd4j/include/ops/declarable/helpers/s_t_d.h index 64764935e..6dbc64f21 100644 --- a/libnd4j/include/ops/declarable/helpers/s_t_d.h +++ b/libnd4j/include/ops/declarable/helpers/s_t_d.h @@ -19,12 +19,12 @@ // #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void _spaceTodepth(nd4j::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC); + void _spaceTodepth(sd::LaunchContext * context, NDArray *input, NDArray *output, int block_size, bool isNHWC); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/scatter.h b/libnd4j/include/ops/declarable/helpers/scatter.h index b470285ff..6e456ff97 100644 --- a/libnd4j/include/ops/declarable/helpers/scatter.h +++ b/libnd4j/include/ops/declarable/helpers/scatter.h @@ -21,18 +21,18 @@ #ifndef DEV_TESTS_SCATTER_H #define DEV_TESTS_SCATTER_H -#include +#include -namespace nd4j { 
+namespace sd { namespace ops { namespace helpers { - void scatter(nd4j::LaunchContext* context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock); + void scatter(sd::LaunchContext* context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock); - void scatterND(nd4j::LaunchContext* context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock); + void scatterND(sd::LaunchContext* context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock); - void scatterForLoss(nd4j::LaunchContext* context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad); + void scatterForLoss(sd::LaunchContext* context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad); - Nd4jLong checkIndices(nd4j::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis = -1); + Nd4jLong checkIndices(sd::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis = -1); } } } diff --git a/libnd4j/include/ops/declarable/helpers/segment.h b/libnd4j/include/ops/declarable/helpers/segment.h index 8064d69c6..2433313ff 100644 --- a/libnd4j/include/ops/declarable/helpers/segment.h +++ b/libnd4j/include/ops/declarable/helpers/segment.h @@ -21,60 +21,60 @@ // #ifndef __SEGMENT_HELPERS__ #define __SEGMENT_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output); + bool segmentIndicesValidate(sd::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output); - bool unsortedSegmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, Nd4jLong numOfClasses, Nd4jLong& output); + bool 
unsortedSegmentIndicesValidate(sd::LaunchContext * context, NDArray* indices, Nd4jLong numOfClasses, Nd4jLong& output); - void segmentMaxFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); + void segmentMaxFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); - void segmentMinFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); + void segmentMinFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); - void segmentMeanFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); + void segmentMeanFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); - void segmentSumFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); + void segmentSumFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); - void segmentProdFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); + void segmentProdFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* output); - void unsortedSegmentSqrtNFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); + void unsortedSegmentSqrtNFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); - void unsortedSegmentMaxFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); + void unsortedSegmentMaxFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); - void unsortedSegmentMinFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); + void unsortedSegmentMinFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong 
numOfClasses, NDArray* output); - void unsortedSegmentMeanFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); + void unsortedSegmentMeanFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); - void unsortedSegmentSumFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); + void unsortedSegmentSumFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); - void unsortedSegmentProdFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); + void unsortedSegmentProdFunctor(sd::LaunchContext * context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output); - int segmentMaxFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); + int segmentMaxFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); - int segmentMinFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); + int segmentMinFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); - int segmentMeanFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); + int segmentMeanFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); - int segmentSumFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); + int segmentSumFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); - int segmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* 
output); + int segmentProdFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output); - int unsortedSegmentSqrtNFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); + int unsortedSegmentSqrtNFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); - int unsortedSegmentMaxFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); + int unsortedSegmentMaxFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); - int unsortedSegmentMinFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); + int unsortedSegmentMinFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); - int unsortedSegmentMeanFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); + int unsortedSegmentMeanFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); - int unsortedSegmentSumFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); + int unsortedSegmentSumFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); - int unsortedSegmentProdFunctorBP(nd4j::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output); + int unsortedSegmentProdFunctorBP(sd::LaunchContext * context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong 
numOfClasses, NDArray* output); } } diff --git a/libnd4j/include/ops/declarable/helpers/segment_common.h b/libnd4j/include/ops/declarable/helpers/segment_common.h index 2d0ec5b8b..b0a92b8b3 100644 --- a/libnd4j/include/ops/declarable/helpers/segment_common.h +++ b/libnd4j/include/ops/declarable/helpers/segment_common.h @@ -21,10 +21,10 @@ // #ifndef __SEGMENT_COMMON_HELPERS__ #define __SEGMENT_COMMON_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void fillUpSegments(NDArray* indices, Nd4jLong numClasses, NDArray& classesRangesBegs, NDArray& classesRangesLens); diff --git a/libnd4j/include/ops/declarable/helpers/sequence_mask.h b/libnd4j/include/ops/declarable/helpers/sequence_mask.h index 9711b7cde..491640b9e 100644 --- a/libnd4j/include/ops/declarable/helpers/sequence_mask.h +++ b/libnd4j/include/ops/declarable/helpers/sequence_mask.h @@ -19,14 +19,14 @@ // #ifndef __SEQUENCE_MASK_HELPERS__ #define __SEQUENCE_MASK_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex); + void sequenceMask(sd::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex); } } diff --git a/libnd4j/include/ops/declarable/helpers/sg_cb.h b/libnd4j/include/ops/declarable/helpers/sg_cb.h index 8aa616fa6..6b0824a81 100644 --- a/libnd4j/include/ops/declarable/helpers/sg_cb.h +++ b/libnd4j/include/ops/declarable/helpers/sg_cb.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_SG_CB_H #define DEV_TESTS_SG_CB_H -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void skipgram(NDArray &syn0, NDArray &syn1, NDArray &syn1Neg, NDArray &expTable, NDArray &negTable, NDArray &target, NDArray &ngStarter, int nsRounds, NDArray &indices, NDArray &codes, NDArray &alpha, NDArray &randomValue, NDArray 
&inferenceVector, const bool preciseMode, const int numWorkers); diff --git a/libnd4j/include/ops/declarable/helpers/shift.h b/libnd4j/include/ops/declarable/helpers/shift.h index e07a0e992..f1b21741c 100644 --- a/libnd4j/include/ops/declarable/helpers/shift.h +++ b/libnd4j/include/ops/declarable/helpers/shift.h @@ -21,11 +21,11 @@ #ifndef DEV_TESTS_SHIFT_H #define DEV_TESTS_SHIFT_H -#include +#include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void rshift_bits(LaunchContext* launchContext, NDArray &x, NDArray &z, uint32_t shift); diff --git a/libnd4j/include/ops/declarable/helpers/solve.h b/libnd4j/include/ops/declarable/helpers/solve.h index d097fa217..17234f313 100644 --- a/libnd4j/include/ops/declarable/helpers/solve.h +++ b/libnd4j/include/ops/declarable/helpers/solve.h @@ -19,15 +19,15 @@ // #ifndef __SOLVE__H_HELPERS__ #define __SOLVE__H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - int solveFunctor(nd4j::LaunchContext* context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output); - void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, NDArray* output); + int solveFunctor(sd::LaunchContext* context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output); + void adjointMatrix(sd::LaunchContext* context, NDArray const* input, NDArray* output); } } } diff --git a/libnd4j/include/ops/declarable/helpers/sparse_to_dense.h b/libnd4j/include/ops/declarable/helpers/sparse_to_dense.h index 8d00639de..541621257 100644 --- a/libnd4j/include/ops/declarable/helpers/sparse_to_dense.h +++ b/libnd4j/include/ops/declarable/helpers/sparse_to_dense.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { void compat_sparse_to_dense(const NDArray &values, const NDArray &indices, NDArray *def, NDArray &output); diff --git 
a/libnd4j/include/ops/declarable/helpers/sru.h b/libnd4j/include/ops/declarable/helpers/sru.h index 3a89ccd66..639247278 100644 --- a/libnd4j/include/ops/declarable/helpers/sru.h +++ b/libnd4j/include/ops/declarable/helpers/sru.h @@ -23,18 +23,18 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void sruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c); + void sruCell(sd::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c); - void sruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c); + void sruTimeLoop(sd::LaunchContext * context, const NDArray* x, const NDArray* c0, const NDArray* w, const NDArray* b, NDArray* h, NDArray* c); - void sruBI(nd4j::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct); + void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* mask, NDArray* ht, NDArray* ct); - void sruBIBP(nd4j::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* inGradC0, const NDArray* inGradH, const NDArray* mask, + void sruBIBP(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDArray* b, const NDArray* c0, const NDArray* ct, const NDArray* inGradC0, const NDArray* inGradH, const NDArray* mask, NDArray* gradI, NDArray* gradWeights, NDArray* gradB, NDArray* gradC0); } } diff --git a/libnd4j/include/ops/declarable/helpers/stack.h b/libnd4j/include/ops/declarable/helpers/stack.h index 49115ef8e..d1915a9ba 100644 --- a/libnd4j/include/ops/declarable/helpers/stack.h +++ b/libnd4j/include/ops/declarable/helpers/stack.h @@ -22,13 +22,13 @@ #define LIBND4J_STACK_H 
#include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { -void stack(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim); +void stack(sd::LaunchContext * context, const std::vector& inArrs, NDArray* outArr, const int dim); } diff --git a/libnd4j/include/ops/declarable/helpers/svd.h b/libnd4j/include/ops/declarable/helpers/svd.h index 42689c319..027807191 100644 --- a/libnd4j/include/ops/declarable/helpers/svd.h +++ b/libnd4j/include/ops/declarable/helpers/svd.h @@ -22,15 +22,15 @@ #define LIBND4J_SVD_HELPER_H #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { ////////////////////////////////////////////////////////////////////////// // svd operation, this function is not method of SVD class, it is standalone function -void svd(nd4j::LaunchContext* context, const NDArray* x, const std::vector& outArrs, const bool fullUV, const bool calcUV, const int switchNum); +void svd(sd::LaunchContext* context, const NDArray* x, const std::vector& outArrs, const bool fullUV, const bool calcUV, const int switchNum); } diff --git a/libnd4j/include/ops/declarable/helpers/toggle_bits.h b/libnd4j/include/ops/declarable/helpers/toggle_bits.h index bf3481e2c..6d8ffe44a 100644 --- a/libnd4j/include/ops/declarable/helpers/toggle_bits.h +++ b/libnd4j/include/ops/declarable/helpers/toggle_bits.h @@ -23,13 +23,13 @@ #ifndef DEV_TESTS_TOGGLE_BITS_H #define DEV_TESTS_TOGGLE_BITS_H -namespace nd4j { +namespace sd { namespace ops { namespace helpers { template - static void toggle_bits__(nd4j::LaunchContext * context, NDArray& in, NDArray& out); + static void toggle_bits__(sd::LaunchContext * context, NDArray& in, NDArray& out); - void __toggle_bits(nd4j::LaunchContext * context, NDArray& in, NDArray& out); + void __toggle_bits(sd::LaunchContext * context, NDArray& in, NDArray& out); } } } diff --git 
a/libnd4j/include/ops/declarable/helpers/top_k.h b/libnd4j/include/ops/declarable/helpers/top_k.h index 5ce7c93fb..6a459f925 100644 --- a/libnd4j/include/ops/declarable/helpers/top_k.h +++ b/libnd4j/include/ops/declarable/helpers/top_k.h @@ -19,16 +19,16 @@ // #ifndef __TOP_K_HELPERS__ #define __TOP_K_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - int topKFunctor(nd4j::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort); + int topKFunctor(sd::LaunchContext * context, const NDArray* input, NDArray* values, NDArray* indices, const uint k, bool needSort); - int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, const NDArray* targets, NDArray* output, const uint k); + int inTopKFunctor(sd::LaunchContext * context, const NDArray* predictions, const NDArray* targets, NDArray* output, const uint k); } } diff --git a/libnd4j/include/ops/declarable/helpers/transforms.h b/libnd4j/include/ops/declarable/helpers/transforms.h index 8ceb9c6f8..317f40a99 100644 --- a/libnd4j/include/ops/declarable/helpers/transforms.h +++ b/libnd4j/include/ops/declarable/helpers/transforms.h @@ -25,56 +25,56 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void triuBP(nd4j::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal); + void triuBP(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int diagonal); - void trace(nd4j::LaunchContext * context, const NDArray& input, NDArray& output); + void trace(sd::LaunchContext * context, const NDArray& input, NDArray& output); - void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace); + void randomShuffle(sd::LaunchContext * context, NDArray& input, NDArray& output, 
sd::graph::RandomGenerator& rng, const bool isInplace); // auxiliary function which serves for recursion purpose and is used in pad operation // void recursiveLoopForPad(const int mode, NDArray& input, const NDArray& paddings, NDArray& output, std::vector dimensions, int dim, int inIdx, int outIdx, NDArray& padValue); - void pad(nd4j::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, NDArray const& padValue); + void pad(sd::LaunchContext * context, const int mode, const NDArray& input, const NDArray& paddings, NDArray& output, NDArray const& padValue); - void invertPermutation(nd4j::LaunchContext * context, const NDArray& input, NDArray& output); + void invertPermutation(sd::LaunchContext * context, const NDArray& input, NDArray& output); - void gatherND(nd4j::LaunchContext * context, NDArray& input, NDArray& indices, NDArray& output); + void gatherND(sd::LaunchContext * context, NDArray& input, NDArray& indices, NDArray& output); - void gather(nd4j::LaunchContext * context, NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs); + void gather(sd::LaunchContext * context, NDArray* input, const NDArray* indices, NDArray* output, const std::vector& intArgs); - void eye(nd4j::LaunchContext * context, NDArray& output); + void eye(sd::LaunchContext * context, NDArray& output); - void scatterUpdate(nd4j::LaunchContext * context, NDArray& operand, NDArray& updates, const std::vector* intArgs); + void scatterUpdate(sd::LaunchContext * context, NDArray& operand, NDArray& updates, const std::vector* intArgs); - void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions); + void scatterSimple(sd::LaunchContext * context, const int opId, NDArray& input, const NDArray& updates, const NDArray& indices, const std::vector& dimensions); - void mergeMaxIndex(nd4j::LaunchContext * context, const 
std::vector& inArrs, NDArray& output); + void mergeMaxIndex(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output); - void mergeMax(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output); + void mergeMax(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output); - void mergeAvg(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output); + void mergeAvg(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output); - void mergeAdd(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output); + void mergeAdd(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output); - void clipByNorm(nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace); - void clipByGlobalNorm(nd4j::LaunchContext * context, std::vector const& inputs, double clipNorm, nd4j::memory::Workspace* workspace, std::vector& outputs, bool isInplace); + void clipByNorm(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace); + void clipByGlobalNorm(sd::LaunchContext * context, std::vector const& inputs, double clipNorm, sd::memory::Workspace* workspace, std::vector& outputs, bool isInplace); - void clipByNormBP(nd4j::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm); + void clipByNormBP(sd::LaunchContext * context, const NDArray& input, const NDArray& gradO, NDArray& gradI /*output*/, const std::vector& dimensions, const NDArray& clipNorm); - void clipByAveraged(nd4j::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace); - void clipByValue(nd4j::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output); + void 
clipByAveraged(sd::LaunchContext * context, NDArray& input, NDArray& output, const std::vector& dimensions, const NDArray& clipNorm, const bool isInplace); + void clipByValue(sd::LaunchContext * context, NDArray& input, double leftBound, double rightBound, NDArray& output); - void mirrorPad(nd4j::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode); + void mirrorPad(sd::LaunchContext * context, const NDArray& input, const NDArray& paddings, NDArray& output, const int mode); - void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis); + void concat(sd::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis); - void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps); + void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps); - void split(nd4j::LaunchContext* context, const NDArray& input, std::vector& outArrs, const int axis); + void split(sd::LaunchContext* context, const NDArray& input, std::vector& outArrs, const int axis); } } } diff --git a/libnd4j/include/ops/declarable/helpers/triangular_solve.h b/libnd4j/include/ops/declarable/helpers/triangular_solve.h index a40a3e144..73965f8c5 100644 --- a/libnd4j/include/ops/declarable/helpers/triangular_solve.h +++ b/libnd4j/include/ops/declarable/helpers/triangular_solve.h @@ -19,15 +19,15 @@ // #ifndef __TRIANGULAR_SOLVE__H_HELPERS__ #define __TRIANGULAR_SOLVE__H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - int triangularSolveFunctor(nd4j::LaunchContext* context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output); - void adjointMatrix(nd4j::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output); + int 
triangularSolveFunctor(sd::LaunchContext* context, NDArray* leftInput, NDArray* rightInput, bool lower, bool adjoint, NDArray* output); + void adjointMatrix(sd::LaunchContext* context, NDArray const* input, bool const lower, NDArray* output); } } } diff --git a/libnd4j/include/ops/declarable/helpers/unique.h b/libnd4j/include/ops/declarable/helpers/unique.h index 8116d7ec9..8898be585 100644 --- a/libnd4j/include/ops/declarable/helpers/unique.h +++ b/libnd4j/include/ops/declarable/helpers/unique.h @@ -20,16 +20,16 @@ #ifndef __UNIQUE_H_HELPERS__ #define __UNIQUE_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - Nd4jLong uniqueCount(nd4j::LaunchContext * context, NDArray* input); + Nd4jLong uniqueCount(sd::LaunchContext * context, NDArray* input); - Nd4jStatus uniqueFunctor(nd4j::LaunchContext * context, NDArray* input, NDArray* values, NDArray* indices, NDArray* counts); + Nd4jStatus uniqueFunctor(sd::LaunchContext * context, NDArray* input, NDArray* values, NDArray* indices, NDArray* counts); } } diff --git a/libnd4j/include/ops/declarable/helpers/weights.h b/libnd4j/include/ops/declarable/helpers/weights.h index 367c85b5e..66246b641 100644 --- a/libnd4j/include/ops/declarable/helpers/weights.h +++ b/libnd4j/include/ops/declarable/helpers/weights.h @@ -19,14 +19,14 @@ // #ifndef __WEIGHTS_H_HELPERS__ #define __WEIGHTS_H_HELPERS__ -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void adjustWeights(nd4j::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength); + void adjustWeights(sd::LaunchContext * context, NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength); } } diff --git a/libnd4j/include/ops/declarable/helpers/where.h b/libnd4j/include/ops/declarable/helpers/where.h index 09df7ebc9..2c9588462 100644 --- 
a/libnd4j/include/ops/declarable/helpers/where.h +++ b/libnd4j/include/ops/declarable/helpers/where.h @@ -23,10 +23,10 @@ #include -namespace nd4j { +namespace sd { namespace ops { namespace helpers { - void _where(nd4j::LaunchContext * context, NDArray &condition, NDArray& output, memory::Workspace *workspace); + void _where(sd::LaunchContext * context, NDArray &condition, NDArray& output, memory::Workspace *workspace); } } } diff --git a/libnd4j/include/ops/declarable/helpers/zeta.h b/libnd4j/include/ops/declarable/helpers/zeta.h index 1f4a02747..7aee45f1c 100644 --- a/libnd4j/include/ops/declarable/helpers/zeta.h +++ b/libnd4j/include/ops/declarable/helpers/zeta.h @@ -22,15 +22,15 @@ #define LIBND4J_ZETA_H #include -#include "NDArray.h" +#include "array/NDArray.h" -namespace nd4j { +namespace sd { namespace ops { namespace helpers { // calculate the Hurwitz zeta function for arrays - void zeta(nd4j::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& output); + void zeta(sd::LaunchContext * context, const NDArray& x, const NDArray& q, NDArray& output); diff --git a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp index ea4838f01..00079f9ae 100644 --- a/libnd4j/include/ops/declarable/impl/BooleanOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BooleanOp.cpp @@ -21,9 +21,9 @@ #include "ops/declarable/BooleanOp.h" #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { BooleanOp::BooleanOp(const char *name, int numInputs, bool scalar) : DeclarableOp::DeclarableOp(name, numInputs, scalar) { // @@ -32,11 +32,11 @@ namespace nd4j { /** * Output shape of any BooleanOp is ALWAYS scalar */ - ShapeList *BooleanOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *BooleanOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { return SHAPELIST(ConstantShapeHelper::getInstance()->scalarShapeInfo(DataType::BOOL)); } 
- bool BooleanOp::verify(nd4j::graph::Context &block) { + bool BooleanOp::verify(sd::graph::Context &block) { // check if scalar or not // validation? @@ -81,7 +81,7 @@ namespace nd4j { return true; } - Nd4jStatus nd4j::ops::BooleanOp::execute(Context* block) { + Nd4jStatus sd::ops::BooleanOp::execute(Context* block) { // basic validation: ensure inputs are set REQUIRE_OK(this->validateNonEmptyInput(*block)); @@ -115,7 +115,7 @@ namespace nd4j { return ND4J_STATUS_KERNEL_FAILURE; } - bool BooleanOp::verify(const std::vector &args) { + bool BooleanOp::verify(const std::vector &args) { VariableSpace variableSpace; int cnt = -1; diff --git a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp index 7d696c8ef..eb691b84d 100644 --- a/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/BroadcastableOp.cpp @@ -18,24 +18,24 @@ // Created by raver on 6/6/2018. // -#include -#include +#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { BroadcastableOp::BroadcastableOp(const char *name, int numTArgs, int numIArgs) : DeclarableCustomOp::DeclarableCustomOp(2, 1, name, false, numTArgs, numIArgs) { // } - ShapeList *BroadcastableOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *BroadcastableOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto shapeList = SHAPELIST(); auto x = inputShape->at(0); auto y = inputShape->at(1); auto outputs = _descriptor->getOutputTypesForOutput(0); - nd4j::DataType dtype = block.dataType(0); - if (block.dataType(0) != nd4j::DataType::BOOL && !(outputs.size() == 1 && outputs[0] == nd4j::DataType::BOOL)) { + sd::DataType dtype = block.dataType(0); + if (block.dataType(0) != sd::DataType::BOOL && !(outputs.size() == 1 && outputs[0] == sd::DataType::BOOL)) { if (Environment::getInstance()->isExperimentalBuild()) { if (shape::length(y) > 
shape::length(x)) { dtype = DataTypeUtils::pickPairwiseResultType(y, x); @@ -46,7 +46,7 @@ namespace nd4j { dtype = ArrayOptions::dataType(x); } } else - dtype = nd4j::DataType::BOOL; + dtype = sd::DataType::BOOL; if(shape::isEmpty(x) || shape::isEmpty(y)) { // this is edge case, [3, 4] + [] = [] diff --git a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp index 1fd57c867..d6227af0c 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableCustomOp.cpp @@ -21,9 +21,9 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { - DeclarableCustomOp::DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs) : nd4j::ops::DeclarableOp(numInputs, numOutputs, opName, allowsInplace, tArgs, iArgs) { + DeclarableCustomOp::DeclarableCustomOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs) : sd::ops::DeclarableOp(numInputs, numOutputs, opName, allowsInplace, tArgs, iArgs) { // } } diff --git a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp index 624d6dbef..774917d70 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableListOp.cpp @@ -24,7 +24,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { DeclarableListOp::DeclarableListOp(int numInputs, int numOutputs, const char* opName, int tArgs, int iArgs) : DeclarableOp::DeclarableOp(numInputs, numOutputs, opName, false, tArgs, iArgs) { // This kind of operations work with sets: NDArrayList @@ -44,14 +44,14 @@ namespace nd4j { * @param block * @return */ - ShapeList* DeclarableListOp::calculateOutputShape(ShapeList* inputShape, nd4j::graph::Context& block) { + ShapeList* DeclarableListOp::calculateOutputShape(ShapeList* inputShape, 
sd::graph::Context& block) { // TODO: ensure this method isn't ever called auto newShape = ConstantShapeHelper::getInstance()->createShapeInfo(block.dataType(), 'c', {1, 1}); return SHAPELIST(newShape); } - nd4j::NDArray* nd4j::ops::DeclarableListOp::getZ(Context& block, int inputId) { + sd::NDArray* sd::ops::DeclarableListOp::getZ(Context& block, int inputId) { //nd4j_printf("wow\n",""); return nullptr; } diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index 25c25fc2d..493834a4e 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -19,17 +19,17 @@ // #include -#include +#include #include -#include +#include #include -#include +#include #include #include #include #include -namespace nd4j { +namespace sd { namespace ops { Nd4jStatus conditionHelper(const char *file, int line, int condition, int argNumber, const char *format, ...) { if (!condition) { @@ -97,7 +97,7 @@ namespace nd4j { } - nd4j::NDArray* nd4j::ops::DeclarableOp::getZ(Context& ctx, int inputId) { + sd::NDArray* sd::ops::DeclarableOp::getZ(Context& ctx, int inputId) { NDArray* z = nullptr; if (ctx.isFastPath()) { @@ -141,7 +141,7 @@ namespace nd4j { return z; } - int nd4j::ops::DeclarableOp::prepareOutputs(Context &ctx) { + int sd::ops::DeclarableOp::prepareOutputs(Context &ctx) { auto workspace = ctx.getWorkspace(); GraphProfile *prof = nullptr; NodeProfile *node = nullptr; @@ -324,7 +324,7 @@ namespace nd4j { //checking out data type equality if (ArrayOptions::dataType(out) != ArrayOptions::dataType(shape)) { std::string msg = "Provided array [" + StringUtils::valueToString(pair.second) + "] has unexpected data type"; - throw nd4j::datatype_exception::build(msg, ArrayOptions::dataType(out), ArrayOptions::dataType(shape)); + throw sd::datatype_exception::build(msg, ArrayOptions::dataType(out), ArrayOptions::dataType(shape)); } */ } @@ -368,15 +368,15 @@ 
namespace nd4j { } } - void nd4j::ops::DeclarableOp::storeResult(Context &block, int outputNumber, NDArray* array) { + void sd::ops::DeclarableOp::storeResult(Context &block, int outputNumber, NDArray* array) { this->storeResult(block, outputNumber, *array); } - void nd4j::ops::DeclarableOp::storeResult(nd4j::graph::Context &ctx, int outputNumber, NDArray& array) { + void sd::ops::DeclarableOp::storeResult(sd::graph::Context &ctx, int outputNumber, NDArray& array) { ctx.pushNDArrayToVariableSpace(ctx.nodeId(), outputNumber, &array, !ctx.isInplace()); } - bool nd4j::ops::DeclarableOp::allocateResult(Context& block, Nd4jLong* shape) { + bool sd::ops::DeclarableOp::allocateResult(Context& block, Nd4jLong* shape) { auto var = block.variable(block.getNodeId(), 0); auto workspace = block.getWorkspace(); @@ -404,7 +404,7 @@ namespace nd4j { } - bool nd4j::ops::DeclarableOp::allocateResult(Context& block, std::initializer_list& shape, char order) { + bool sd::ops::DeclarableOp::allocateResult(Context& block, std::initializer_list& shape, char order) { auto var = block.variable(block.getNodeId(), 0); auto workspace = block.getWorkspace(); @@ -421,7 +421,7 @@ namespace nd4j { return true; } - Nd4jStatus nd4j::ops::DeclarableOp::validateDataTypes(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateDataTypes(Context& block) { _registrator.lock(); if (!_registered) { _registered = true; @@ -431,7 +431,7 @@ namespace nd4j { // rolling over inputs first int cnt = 0, inT = 0; - std::vector inputTypes(block.width()); + std::vector inputTypes(block.width()); if (block.isFastPath()) { for (auto array: block.fastpath_in()) { if (array == nullptr) @@ -451,7 +451,7 @@ namespace nd4j { auto var = block.variable(p); // we're not checking validity, if ANY types were explicitly allowed - //if (block.dataType(cnt) == nd4j::DataType::ANY) + //if (block.dataType(cnt) == sd::DataType::ANY) // continue; // only validating non-null variables @@ -583,7 +583,7 @@ namespace nd4j { return 
ND4J_STATUS_OK; } - Nd4jStatus nd4j::ops::DeclarableOp::execute(Context* block) { + Nd4jStatus sd::ops::DeclarableOp::execute(Context* block) { nd4j_debug("Executing op: [%s]\n", this->getOpName()->c_str()); std::chrono::time_point timeEnter, timeStart, timeEnd; @@ -616,7 +616,7 @@ namespace nd4j { bool hasHelper = false; // platform helpers use might be forbidden for various reasons, so we'll check it out first - if (block->helpersAllowed() && nd4j::Environment::getInstance()->helpersAllowed()) { + if (block->helpersAllowed() && sd::Environment::getInstance()->helpersAllowed()) { // if we have platform-specific helper for this op - invoke it if (OpRegistrator::getInstance()->hasHelper(this->getOpHash(), block->engine())) { auto helper = OpRegistrator::getInstance()->getPlatformHelper(this->getOpHash(), block->engine()); @@ -654,7 +654,7 @@ namespace nd4j { // now we print out all outputs for this node - if (nd4j::Environment::getInstance()->isDebugAndVerbose()) { + if (sd::Environment::getInstance()->isDebugAndVerbose()) { auto vs = block->getVariableSpace(); for (int e = 0; e < numOutputs; e++) { @@ -722,7 +722,7 @@ namespace nd4j { */ } - Nd4jStatus nd4j::ops::DeclarableOp::validateArguments(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateArguments(Context& block) { /* * We're checking number of T and I arguments. 
If number of args is finite number - we check strict equality * If number of args is variable (-1), but variables MUST be present - we check for non-zero number of arguments @@ -755,7 +755,7 @@ namespace nd4j { return ND4J_STATUS_OK; } - Nd4jStatus nd4j::ops::DeclarableOp::validateInputDimensions(Context& block, int rank) { + Nd4jStatus sd::ops::DeclarableOp::validateInputDimensions(Context& block, int rank) { if (block.width() == 0) return ND4J_STATUS_OK; @@ -773,19 +773,19 @@ namespace nd4j { return ND4J_STATUS_OK; } - Nd4jStatus nd4j::ops::DeclarableOp::validateInput2D(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateInput2D(Context& block) { return validateInputDimensions(block, 2); } - Nd4jStatus nd4j::ops::DeclarableOp::validateInput3D(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateInput3D(Context& block) { return validateInputDimensions(block, 3); } - Nd4jStatus nd4j::ops::DeclarableOp::validateInput4D(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateInput4D(Context& block) { return validateInputDimensions(block, 4); } - Nd4jStatus nd4j::ops::DeclarableOp::validateNonEmptyInput(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateNonEmptyInput(Context& block) { if (this->getOpDescriptor()->getNumberOfInputs() == -2 || this->getOpDescriptor()->getNumberOfInputs() == 0) return Status::OK(); @@ -830,7 +830,7 @@ namespace nd4j { return ND4J_STATUS_OK; } - Nd4jStatus nd4j::ops::DeclarableOp::validateOrdersMatch(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateOrdersMatch(Context& block) { if (block.width() == 0) return ND4J_STATUS_OK; @@ -845,7 +845,7 @@ namespace nd4j { return ND4J_STATUS_OK; } - Nd4jStatus nd4j::ops::DeclarableOp::execute(nd4j::graph::RandomGenerator& rng, const std::vector& inputs, const std::vector& outputs, const std::vector& tArgs, const std::vector& iArgs, const std::vector& bArgs, const std::vector& dArgs, bool isInplace, nd4j::DataType type) { + Nd4jStatus 
sd::ops::DeclarableOp::execute(sd::graph::RandomGenerator& rng, const std::vector& inputs, const std::vector& outputs, const std::vector& tArgs, const std::vector& iArgs, const std::vector& bArgs, const std::vector& dArgs, bool isInplace, sd::DataType type) { VariableSpace variableSpace; FlowPath fp; variableSpace.setFlowPath(&fp); @@ -898,16 +898,16 @@ namespace nd4j { } Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs) { - return execute(inputs, outputs, std::vector(), std::vector(), std::vector(), std::vector()); + return execute(inputs, outputs, std::vector(), std::vector(), std::vector(), std::vector()); } template <> Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs, std::initializer_list tArgs) { - return execute(inputs, outputs, tArgs, std::vector(), std::vector(), std::vector()); + return execute(inputs, outputs, tArgs, std::vector(), std::vector(), std::vector()); } template <> - Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs, std::initializer_list dArgs) { + Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs, std::initializer_list dArgs) { return execute(inputs, outputs, std::vector(), std::vector(), std::vector(), dArgs); } @@ -917,12 +917,12 @@ namespace nd4j { for (auto v:tArgs) realArgs.emplace_back(v); - return execute(inputs, outputs, realArgs, std::vector(), std::vector(), std::vector()); + return execute(inputs, outputs, realArgs, std::vector(), std::vector(), std::vector()); } template <> Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs, std::initializer_list iArgs) { - return execute(inputs, outputs, std::vector(), iArgs, std::vector(), std::vector()); + return execute(inputs, outputs, std::vector(), iArgs, std::vector(), std::vector()); } template <> @@ -931,15 +931,15 @@ namespace nd4j { for (auto v:iArgs) realArgs.emplace_back(v); - return execute(inputs, 
outputs, std::vector(), realArgs, std::vector(), std::vector()); + return execute(inputs, outputs, std::vector(), realArgs, std::vector(), std::vector()); } template <> Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs, std::initializer_list bArgs) { - return execute(inputs, outputs, std::vector(), std::vector(), bArgs, std::vector()); + return execute(inputs, outputs, std::vector(), std::vector(), bArgs, std::vector()); } - Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs, const std::vector &dArgs, bool isInplace) { + Nd4jStatus DeclarableOp::execute(const std::vector &inputs, const std::vector &outputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs, const std::vector &dArgs, bool isInplace) { Context ctx(1); for (int e = 0; e < inputs.size(); e++) { @@ -962,49 +962,49 @@ namespace nd4j { return execute(&ctx); } - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs) { - return evaluate(inputs, std::vector(), std::vector(), std::vector(), std::vector()); + sd::ResultSet *DeclarableOp::evaluate(const std::vector &inputs) { + return evaluate(inputs, std::vector(), std::vector(), std::vector(), std::vector()); } template <> - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list iArgs) { + sd::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list iArgs) { std::vector realArgs; for (auto v:iArgs) realArgs.emplace_back(v); - return evaluate(inputs, std::vector(), realArgs, std::vector(), std::vector()); + return evaluate(inputs, std::vector(), realArgs, std::vector(), std::vector()); } template <> - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list iArgs) { - return evaluate(inputs, std::vector(), iArgs, std::vector(), std::vector()); + sd::ResultSet 
*DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list iArgs) { + return evaluate(inputs, std::vector(), iArgs, std::vector(), std::vector()); } template <> - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list tArgs) { + sd::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list tArgs) { std::vector realArgs; for (auto v:tArgs) realArgs.emplace_back(v); - return evaluate(inputs, realArgs, std::vector(), std::vector(), std::vector()); + return evaluate(inputs, realArgs, std::vector(), std::vector(), std::vector()); } template <> - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list tArgs) { - return evaluate(inputs, tArgs, std::vector(), std::vector(), std::vector()); + sd::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list tArgs) { + return evaluate(inputs, tArgs, std::vector(), std::vector(), std::vector()); } template <> - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list bArgs) { - return evaluate(inputs, std::vector(), std::vector(), bArgs, std::vector()); + sd::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list bArgs) { + return evaluate(inputs, std::vector(), std::vector(), bArgs, std::vector()); } template <> - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list bArgs) { + sd::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, std::initializer_list bArgs) { return evaluate(inputs, std::vector(), std::vector(), std::vector(), bArgs); } - nd4j::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs, const std::vector &dArgs, bool isInplace) { + sd::ResultSet *DeclarableOp::evaluate(const std::vector &inputs, const std::vector &tArgs, const std::vector &iArgs, const std::vector &bArgs, const std::vector &dArgs, 
bool isInplace) { VariableSpace variableSpace; //ResultSet arrayList; FlowPath fp; @@ -1023,7 +1023,7 @@ namespace nd4j { } Context block(1, &variableSpace, false); - block.setDataType(0, nd4j::DataType::FLOAT32); + block.setDataType(0, sd::DataType::FLOAT32); block.fillInputs(in); block.markInplace(isInplace); // block.setRNG(ProviderRNG::getInstance().getRNG()); @@ -1057,7 +1057,7 @@ namespace nd4j { auto arr = var->getNDArray(); if (!arr->isAttached()) { var->markRemovable(false); - arr->setContext(nd4j::LaunchContext::defaultContext()); + arr->setContext(sd::LaunchContext::defaultContext()); arrayList->push_back(arr); } else { arrayList->push_back(arr->detach()); @@ -1074,12 +1074,12 @@ namespace nd4j { return arrayList; } - nd4j::ResultSet* nd4j::ops::DeclarableOp::execute(const nd4j::OpArgsHolder& holder, bool isInplace) { + sd::ResultSet* sd::ops::DeclarableOp::execute(const sd::OpArgsHolder& holder, bool isInplace) { // FIXME: add DArgs to OpArgsHolder - return evaluate(holder.getInArrs(), holder.getTArgs(), holder.getIArgs(), holder.getBArgs(), std::vector(), isInplace); + return evaluate(holder.getInArrs(), holder.getTArgs(), holder.getIArgs(), holder.getBArgs(), std::vector(), isInplace); } - Nd4jStatus nd4j::ops::DeclarableOp::validateInputDimensionsMatch(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateInputDimensionsMatch(Context& block) { if (block.width() == 0) return ND4J_STATUS_OK; @@ -1093,7 +1093,7 @@ namespace nd4j { return ND4J_STATUS_OK; } - Nd4jStatus nd4j::ops::DeclarableOp::validateInputLengthMatch(Context& block) { + Nd4jStatus sd::ops::DeclarableOp::validateInputLengthMatch(Context& block) { if (block.width() == 0) return ND4J_STATUS_OK; @@ -1117,7 +1117,7 @@ namespace nd4j { /* template - int* nd4j::ops::DeclarableOp::calculateOutputShape(int* inputShape, nd4j::graph::Block& block) { + int* sd::ops::DeclarableOp::calculateOutputShape(int* inputShape, sd::graph::Block& block) { // default implementation suits transform, so 
just returns the same shape int* newshape; diff --git a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp index 98a60b28b..4f6646694 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableReductionOp.cpp @@ -25,13 +25,13 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { - DeclarableReductionOp::DeclarableReductionOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs) : nd4j::ops::DeclarableOp(numInputs, numOutputs, opName, allowsInplace, tArgs, iArgs) { + DeclarableReductionOp::DeclarableReductionOp(int numInputs, int numOutputs, const char *opName, bool allowsInplace, int tArgs, int iArgs) : sd::ops::DeclarableOp(numInputs, numOutputs, opName, allowsInplace, tArgs, iArgs) { // } - nd4j::ShapeList* DeclarableReductionOp::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { + sd::ShapeList* DeclarableReductionOp::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) { // int numDims = INT_ARG(0); std::vector dims; if (inputShape->size() > 1) { @@ -51,7 +51,7 @@ namespace nd4j { std::sort(dims.begin(), dims.end()); // special case - output is scalar - if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == nd4j::DataTypeUtils::max())) { + if (dims.size() == 0 || (dims.size() == 1 && dims.at(0) == sd::DataTypeUtils::max())) { auto newShape = ConstantShapeHelper::getInstance()->scalarShapeInfo(block.dataType()); return SHAPELIST(newShape); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyBroadcastBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyBroadcastBoolOp.cpp index 0cd91e3b5..03f34d269 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyBroadcastBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyBroadcastBoolOp.cpp @@ -19,13 +19,13 @@ // #include -#include +#include #include 
#include -#include +#include -namespace nd4j { +namespace sd { namespace ops { Nd4jStatus LegacyBroadcastBoolOp::validateAndExecute(Context &block) { auto x = INPUT_VARIABLE(0); @@ -41,7 +41,7 @@ namespace nd4j { int opNum = block.opNum() < 0 ? this->_opNum : block.opNum(); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); PointersManager manager(block.launchContext(), "LegacyBroadcastBoolOp"); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); @@ -57,7 +57,7 @@ namespace nd4j { else { // this is rare, but possible use case - X and Z might have different shapes/strides/orders. In this case we prepare and pass separate TAD info - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); auto zTadShape = Environment::getInstance()->isCPU() ? packZ.primaryShapeInfo() : packZ.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tadZ.tadOnlyShapeInfo, shape::shapeInfoByteLength(tadZ.tadOnlyShapeInfo)); auto zTadOffsets = Environment::getInstance()->isCPU() ? packZ.primaryOffsets() : packZ.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tadZ.tadOffsets, tadZ.numTads * sizeof(Nd4jLong)); @@ -89,7 +89,7 @@ namespace nd4j { /** * If external NDArray wasn't specified - the same shape is returned by all broadcast ops. 
*/ - ShapeList* LegacyBroadcastBoolOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList* LegacyBroadcastBoolOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(inShape, DataType::BOOL))); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyBroadcastOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyBroadcastOp.cpp index 4dee50c13..0297df28a 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyBroadcastOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyBroadcastOp.cpp @@ -23,9 +23,9 @@ #include #include #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { Nd4jStatus LegacyBroadcastOp::validateAndExecute(Context &block) { auto x = INPUT_VARIABLE(0); @@ -47,7 +47,7 @@ namespace nd4j { int opNum = block.opNum() < 0 ? this->_opNum : block.opNum(); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto tadLen = shape::length(packX.primaryShapeInfo()); REQUIRE_TRUE(tadLen == y->lengthOf(), 0, "Length of broadcast TAD should be equal to length of Y operand, but got [%i] vs [%i]",tadLen, (int) y->lengthOf()); @@ -62,7 +62,7 @@ namespace nd4j { z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo(), dims.data(), dims.size(), pTadShape, pTadOffsets, pTadShape, pTadOffsets); else { // this is rare, but possible use case - X and Z might have different shapes/strides/orders. In this case we prepare and pass separate TAD info - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); auto zTadShape = Environment::getInstance()->isCPU() ? 
packZ.primaryShapeInfo() : packZ.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tadZ.tadOnlyShapeInfo, shape::shapeInfoByteLength(tadZ.tadOnlyShapeInfo)); auto zTadOffsets = Environment::getInstance()->isCPU() ? packZ.primaryOffsets() : packZ.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tadZ.tadOffsets, tadZ.numTads * sizeof(Nd4jLong)); @@ -94,7 +94,7 @@ namespace nd4j { /** * If external NDArray wasn't specified - the same shape is returned by all broadcast ops. */ - ShapeList* LegacyBroadcastOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList* LegacyBroadcastOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); // FIXME: remove memcpy diff --git a/libnd4j/include/ops/declarable/impl/LegacyIndexReduceOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyIndexReduceOp.cpp index c63139bb2..c92577f3b 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyIndexReduceOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyIndexReduceOp.cpp @@ -21,11 +21,11 @@ #include #include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace ops { LegacyIndexReduceOp::LegacyIndexReduceOp() : LegacyOp::LegacyOp(1){ // @@ -39,7 +39,7 @@ namespace nd4j { return new LegacyIndexReduceOp(this->_opNum); } - ShapeList *LegacyIndexReduceOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyIndexReduceOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; @@ -139,7 +139,7 @@ namespace nd4j { if (dims.size() > 1) std::sort(dims.begin(), dims.end()); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->shapeInfo(), dims); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->shapeInfo(), dims); NativeOpExecutioner::execIndexReduce(block.launchContext(), opNum, x->getBuffer(), 
x->getShapeInfo(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), @@ -175,7 +175,7 @@ namespace nd4j { REQUIRE_TRUE(axis.size() > 0, 0, "Some dimensions required for reduction!"); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->shapeInfo(), axis); + auto tadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->shapeInfo(), axis); NativeOpExecutioner::execIndexReduce(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), diff --git a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp index e9920c409..c179488df 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyOp.cpp @@ -21,7 +21,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { LegacyOp::LegacyOp(int numInputs) : DeclarableOp::DeclarableOp(numInputs , 1, "LegacyOp", false) { _numInputs = numInputs; diff --git a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformBoolOp.cpp index 86eee11d5..8e257daaa 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformBoolOp.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { LegacyPairwiseTransformBoolOp::LegacyPairwiseTransformBoolOp() : LegacyOp::LegacyOp(2) { // just a no-op @@ -68,7 +68,7 @@ namespace nd4j { /** * Output shape of PWT operations always the same as input[0] shape, no exclusions. 
*/ - ShapeList *LegacyPairwiseTransformBoolOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyPairwiseTransformBoolOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(inShape, DataType::BOOL))); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp index 49f896be1..4b2b7a0f9 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyPairwiseTransformOp.cpp @@ -22,7 +22,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { LegacyPairwiseTransformOp::LegacyPairwiseTransformOp() : LegacyOp::LegacyOp(2) { this->getOpDescriptor()->allowInplace(true); @@ -68,7 +68,7 @@ namespace nd4j { /** * Output shape of PWT operations always the same as input[0] shape, no exclusions. 
*/ - ShapeList *LegacyPairwiseTransformOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyPairwiseTransformOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyRandomOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyRandomOp.cpp index 731c5a5f9..88ef39237 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyRandomOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyRandomOp.cpp @@ -20,12 +20,12 @@ #include #include -#include -#include -#include +#include +#include +#include #include -namespace nd4j { +namespace sd { namespace ops { LegacyRandomOp::LegacyRandomOp() : LegacyOp::LegacyOp(1) { // just a no-op @@ -61,7 +61,7 @@ namespace nd4j { (12, randomOps::AlphaDropOut) */ switch(opNum) { - case nd4j::random::UniformDistribution: { + case sd::random::UniformDistribution: { // uniform distribution T from, to; if (block.width() > 2) { @@ -87,7 +87,7 @@ namespace nd4j { //OVERWRITE_RESULT(z); } break; - case nd4j::random::DropOut: { + case sd::random::DropOut: { auto z = OUTPUT_VARIABLE(0); T prob; @@ -108,13 +108,13 @@ namespace nd4j { RandomLauncher::applyDropOut(block.launchContext(), block.randomGenerator(), z, prob); } break; - case nd4j::random::DropOutInverted: { + case sd::random::DropOutInverted: { auto z = OUTPUT_VARIABLE(0); - nd4j::ops::dropout op; + sd::ops::dropout op; return op.execute(&block); } break; - case nd4j::random::GaussianDistribution: { + case sd::random::GaussianDistribution: { // gaussian distribution T mean, stdev; if (block.width() > 2) { @@ -146,7 +146,7 @@ namespace nd4j { //OVERWRITE_RESULT(z); } break; - case nd4j::random::BernoulliDistribution: { + case sd::random::BernoulliDistribution: { // bernoulli distribution T prob; if (block.width() > 1) { @@ -174,7 +174,7 @@ namespace nd4j { //OVERWRITE_RESULT(z); } break; - case 
nd4j::random::BinomialDistributionEx: { + case sd::random::BinomialDistributionEx: { // BinomialEx distribution T prob; int trials; @@ -207,7 +207,7 @@ namespace nd4j { //OVERWRITE_RESULT(z); } break; - case nd4j::random::LogNormalDistribution: { + case sd::random::LogNormalDistribution: { // lognorm distribution T mean, stdev; if (block.width() > 2) { @@ -239,7 +239,7 @@ namespace nd4j { //OVERWRITE_RESULT(z); } break; - case nd4j::random::TruncatedNormalDistribution: { + case sd::random::TruncatedNormalDistribution: { // truncated norm distribution T mean, stdev; if (block.width() > 2) { @@ -271,7 +271,7 @@ namespace nd4j { //OVERWRITE_RESULT(z); } break; - case nd4j::random::AlphaDropOut: { + case sd::random::AlphaDropOut: { auto z = OUTPUT_VARIABLE(0); T prob, a, b, pa; @@ -304,7 +304,7 @@ namespace nd4j { RandomLauncher::applyAlphaDropOut(block.launchContext(), block.randomGenerator(), z, prob, a, b, pa); } break; - case nd4j::random::Linspace: { + case sd::random::Linspace: { auto z = OUTPUT_VARIABLE(0); auto start = INPUT_VARIABLE(0); auto finish = INPUT_VARIABLE(1); @@ -334,7 +334,7 @@ namespace nd4j { * But these ops already have CustomOp implementations. 
* */ - ShapeList *LegacyRandomOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyRandomOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); auto xType = ArrayOptions::dataType(inShape); Nd4jLong *newShape; @@ -357,14 +357,14 @@ namespace nd4j { return DeclarableOp::execute(block); } - nd4j::ResultSet* LegacyRandomOp::execute(nd4j::graph::RandomGenerator& rng, std::initializer_list inputs, std::initializer_list tArgs, std::initializer_list iArgs, bool isInplace) { + sd::ResultSet* LegacyRandomOp::execute(sd::graph::RandomGenerator& rng, std::initializer_list inputs, std::initializer_list tArgs, std::initializer_list iArgs, bool isInplace) { std::vector ins(inputs); std::vector tas(tArgs); std::vector ias(iArgs); return this->execute(rng, ins, tas, ias, isInplace); } - nd4j::ResultSet* LegacyRandomOp::execute(nd4j::graph::RandomGenerator& rng, std::vector& inputs, std::vector& tArgs, std::vector& iArgs, bool isInplace) { + sd::ResultSet* LegacyRandomOp::execute(sd::graph::RandomGenerator& rng, std::vector& inputs, std::vector& tArgs, std::vector& iArgs, bool isInplace) { VariableSpace variableSpace; auto arrayList = new ResultSet(); //ResultSet arrayList; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp index 684f09262..7143c3bbd 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduce3Op.cpp @@ -24,7 +24,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { Nd4jStatus LegacyReduce3Op::validateAndExecute(Context &block) { auto x = INPUT_VARIABLE(0); @@ -40,7 +40,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(), "LegacyReduce3Op"); - if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && 
INT_ARG(0) == nd4j::DataTypeUtils::max()))) { + if (x->isSameShape(y) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max()))) { // reduce3 to scalar NativeOpExecutioner::execReduce3Scalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), @@ -52,8 +52,8 @@ namespace nd4j { if (dims[e] < 0) dims[e] += x->rankOf(); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(z->getShapeInfo(), dims); REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions requuired for reduction!"); @@ -92,13 +92,13 @@ namespace nd4j { * For all reductions rules are simple: either you return scalar, or you return reduced NDArray. 
* It solely depends on input shape, and requested dimensions */ - ShapeList *LegacyReduce3Op::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyReduce3Op::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto xShape = inputShape->at(0); auto yShape = inputShape->at(1); Nd4jLong *zShape = nullptr; - if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()))) { + if (shape::equalsSoft(xShape, yShape) && (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max()))) { // reduce3 to scalar case ALLOCATE(zShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); zShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp index 12a25537d..433e173fc 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceBoolOp.cpp @@ -21,11 +21,11 @@ #include #include #include -#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { LegacyReduceBoolOp::LegacyReduceBoolOp() : LegacyOp::LegacyOp(1) { // @@ -61,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { + (axis.size() == 1 && axis[0] == sd::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -75,7 +75,7 @@ namespace nd4j { REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = 
nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? packX.primaryOffsets() : packX.specialOffsets(); //manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -101,7 +101,7 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceBoolScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { @@ -111,7 +111,7 @@ namespace nd4j { REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? packX.primaryOffsets() : packX.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -129,7 +129,7 @@ namespace nd4j { * For all reductions rules are simple: either you return scalar, or you return reduced NDArray. 
* It solely depends on input shape, and requested dimensions */ - ShapeList *LegacyReduceBoolOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyReduceBoolOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp index 2765e1b3f..23f863ba2 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceFloatOp.cpp @@ -21,11 +21,11 @@ #include #include #include -#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { LegacyReduceFloatOp::LegacyReduceFloatOp() : LegacyOp::LegacyOp(1) { // @@ -61,7 +61,7 @@ namespace nd4j { allAxes = true; // _axis.(block.getIArguments()->size() == 0) || - // (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) + // (block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max()) if (block.getAxis()->empty() || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), @@ -76,7 +76,7 @@ namespace nd4j { REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? 
packX.primaryOffsets() : packX.specialOffsets(); //manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -102,14 +102,14 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceFloatScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { // TAD REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? packX.primaryOffsets() : packX.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -130,7 +130,7 @@ namespace nd4j { * For all reductions rules are simple: either you return scalar, or you return reduced NDArray. 
* It solely depends on input shape, and requested dimensions */ - ShapeList *LegacyReduceFloatOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyReduceFloatOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp index 836564c79..17cba4227 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceLongOp.cpp @@ -21,11 +21,11 @@ #include #include #include -#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { LegacyReduceLongOp::LegacyReduceLongOp() : LegacyOp::LegacyOp(1) { // @@ -61,7 +61,7 @@ namespace nd4j { allAxes = true; if ((axis.empty()) || - (axis.size() == 1 && axis[0] == nd4j::DataTypeUtils::max()) || allAxes) { + (axis.size() == 1 && axis[0] == sd::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); @@ -78,7 +78,7 @@ namespace nd4j { REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? 
packX.primaryOffsets() : packX.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -104,14 +104,14 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceLongScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(x->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { // TAD REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? packX.primaryOffsets() : packX.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -130,7 +130,7 @@ namespace nd4j { * For all reductions rules are simple: either you return scalar, or you return reduced NDArray. 
* It solely depends on input shape, and requested dimensions */ - ShapeList *LegacyReduceLongOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyReduceLongOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceOp.cpp index 8fac021d9..46be149c6 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceOp.cpp @@ -22,7 +22,7 @@ #include #include #ifdef LEGACY_REDUCE_SAME_ONLY -namespace nd4j { +namespace sd { namespace ops { LegacyReduceOp::LegacyReduceOp() : LegacyOp::LegacyOp(1) { // @@ -140,7 +140,7 @@ namespace nd4j { * For all reductions rules are simple: either you return scalar, or you return reduced NDArray. * It solely depends on input shape, and requested dimensions */ - ShapeList *LegacyReduceOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyReduceOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp index 2340f39b0..3c96bca70 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyReduceSameOp.cpp @@ -21,11 +21,11 @@ #include #include #include -#include +#include #include #include -namespace nd4j { +namespace sd { namespace ops { LegacyReduceSameOp::LegacyReduceSameOp() : LegacyOp::LegacyOp(1) { // @@ -73,7 +73,7 @@ namespace nd4j { REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? packX.primaryOffsets() : packX.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -99,14 +99,14 @@ namespace nd4j { dims[e] = f >= 0 ? f : f += x->rankOf(); } - if ((block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max()) || allAxes) { + if ((block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max()) || allAxes) { // scalar NativeOpExecutioner::execReduceSameScalar(block.launchContext(), opNum, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->buffer(), z->shapeInfo(), z->specialBuffer(), z->specialShapeInfo()); } else { // TAD REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions required for reduction!"); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? packX.primaryOffsets() : packX.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -126,7 +126,7 @@ namespace nd4j { * For all reductions rules are simple: either you return scalar, or you return reduced NDArray. 
* It solely depends on input shape, and requested dimensions */ - ShapeList *LegacyReduceSameOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyReduceSameOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyScalarBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyScalarBoolOp.cpp index 040cde77c..46728ede1 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyScalarBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyScalarBoolOp.cpp @@ -19,11 +19,11 @@ // #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { LegacyScalarBoolOp::LegacyScalarBoolOp() : LegacyOp::LegacyOp(1) { // no-op @@ -41,7 +41,7 @@ namespace nd4j { _scalar = new NDArray(scalar.dup(scalar.ordering())); } - ShapeList *LegacyScalarBoolOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyScalarBoolOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp index 856bfdeaf..de104a11d 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyScalarOp.cpp @@ -19,11 +19,11 @@ // #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace ops { LegacyScalarOp::LegacyScalarOp() : LegacyOp::LegacyOp(1) { this->getOpDescriptor()->allowInplace(true); @@ -41,7 +41,7 @@ namespace nd4j { _scalar = new NDArray(scalar.dup(scalar.ordering())); } - ShapeList *LegacyScalarOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyScalarOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto 
inShape = inputShape->at(0); Nd4jLong *newShape; @@ -70,7 +70,7 @@ namespace nd4j { } else if (block.getTArguments()->size() > 0) { auto y = NDArrayFactory::create(x->dataType(), T_ARG(0), block.launchContext()); - x->applyScalarArr(static_cast(opNum), y, *z); + x->applyScalarArr(static_cast(opNum), y, *z); // NDArray::prepareSpecialUse({z}, {x, &y}); // NativeOpExecutioner::execScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), y.buffer(), y.shapeInfo(), y.specialBuffer(), y.specialShapeInfo(), extras.argumentsAsT(z->dataType(), 1)); diff --git a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp index 08ebb80de..74f82d162 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyStatsOp.cpp @@ -25,7 +25,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { Nd4jStatus LegacyStatsOp::validateAndExecute(Context &block) { auto x = INPUT_VARIABLE(0); @@ -44,7 +44,7 @@ namespace nd4j { ExtraArguments extras(*block.getTArguments()); PointersManager manager(block.launchContext(),"LegacyStatsOp"); - if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == nd4j::DataTypeUtils::max())) { + if (block.getIArguments()->size() == 1 || (block.getIArguments()->size() == 2 && INT_ARG(1) == sd::DataTypeUtils::max())) { // scalar NativeOpExecutioner::execSummaryStatsScalar(block.launchContext(), opNum, x->getBuffer(), x->getShapeInfo(), x->specialBuffer(), x->specialShapeInfo(), extras.argumentsAsT(z->dataType()), z->getBuffer(), z->getShapeInfo(), z->specialBuffer(), z->specialShapeInfo(), biasCorrected); @@ -58,7 +58,7 @@ namespace nd4j { REQUIRE_TRUE(dims.size() > 0, 0, "Some dimensions requuired for reduction!"); - auto packX = 
nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x->getShapeInfo(), dims); auto pTadShape = Environment::getInstance()->isCPU() ? packX.primaryShapeInfo() : packX.specialShapeInfo(); //(Nd4jLong *) manager.replicatePointer(tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); auto pTadOffsets = Environment::getInstance()->isCPU() ? packX.primaryOffsets() : packX.specialOffsets(); //(Nd4jLong *) manager.replicatePointer(tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); @@ -89,11 +89,11 @@ namespace nd4j { * For all reductions rules are simple: either you return scalar, or you return reduced NDArray. * It solely depends on input shape, and requested dimensions */ - ShapeList *LegacyStatsOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyStatsOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; - if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == nd4j::DataTypeUtils::max())) { + if (block.getIArguments()->size() == 0 || (block.getIArguments()->size() == 1 && INT_ARG(0) == sd::DataTypeUtils::max())) { // in this case we just return scalar ALLOCATE(newShape, block.getWorkspace(), shape::shapeInfoLength(2), Nd4jLong); newShape[0] = 2; diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformAnyOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformAnyOp.cpp index 2f08565e6..def577eb3 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformAnyOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformAnyOp.cpp @@ -20,10 +20,10 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { LegacyTransformAnyOp::LegacyTransformAnyOp() : LegacyOp::LegacyOp(1) { // just a no-op @@ -62,7 +62,7 @@ namespace nd4j { * But these ops already have 
CustomOp implementations. * */ - ShapeList *LegacyTransformAnyOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyTransformAnyOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformBoolOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformBoolOp.cpp index 96af1f632..99b856b8a 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformBoolOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformBoolOp.cpp @@ -20,10 +20,10 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { LegacyTransformBoolOp::LegacyTransformBoolOp() : LegacyOp::LegacyOp(1) { // just a no-op @@ -63,7 +63,7 @@ namespace nd4j { * But these ops already have CustomOp implementations. * */ - ShapeList *LegacyTransformBoolOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyTransformBoolOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(inShape, DataType::BOOL))); } diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformFloatOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformFloatOp.cpp index e5f4f62e9..f0795b7bb 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformFloatOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformFloatOp.cpp @@ -20,10 +20,10 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { LegacyTransformFloatOp::LegacyTransformFloatOp() : LegacyOp::LegacyOp(1) { // just a no-op @@ -62,7 +62,7 @@ namespace nd4j { * But these ops already have CustomOp implementations. 
* */ - ShapeList *LegacyTransformFloatOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyTransformFloatOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformOp.cpp index de8248d25..b073d9df1 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformOp.cpp @@ -20,10 +20,10 @@ #include -#include +#include #ifdef ONLY_SAME_TRANSFORM -namespace nd4j { +namespace sd { namespace ops { LegacyTransformOp::LegacyTransformOp() : LegacyOp::LegacyOp(1) { // just a no-op @@ -55,7 +55,7 @@ namespace nd4j { * But these ops already have CustomOp implementations. * */ - ShapeList *LegacyTransformOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyTransformOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp index 6b097c3af..0d827787e 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformSameOp.cpp @@ -20,10 +20,10 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { LegacyTransformSameOp::LegacyTransformSameOp() : LegacyOp::LegacyOp(1) { this->getOpDescriptor()->allowInplace(true); @@ -62,7 +62,7 @@ namespace nd4j { * But these ops already have CustomOp implementations. 
* */ - ShapeList *LegacyTransformSameOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyTransformSameOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp index a390a458c..f36853579 100644 --- a/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LegacyTransformStrictOp.cpp @@ -20,10 +20,10 @@ #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { LegacyTransformStrictOp::LegacyTransformStrictOp() : LegacyOp::LegacyOp(1) { this->getOpDescriptor()->allowInplace(true); @@ -61,7 +61,7 @@ namespace nd4j { * But these ops already have CustomOp implementations. * */ - ShapeList *LegacyTransformStrictOp::calculateOutputShape(ShapeList *inputShape, nd4j::graph::Context &block) { + ShapeList *LegacyTransformStrictOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { auto inShape = inputShape->at(0); Nd4jLong *newShape; diff --git a/libnd4j/include/ops/declarable/impl/LogicOp.cpp b/libnd4j/include/ops/declarable/impl/LogicOp.cpp index 7873defe4..ae24b5631 100644 --- a/libnd4j/include/ops/declarable/impl/LogicOp.cpp +++ b/libnd4j/include/ops/declarable/impl/LogicOp.cpp @@ -20,19 +20,19 @@ #include "ops/declarable/LogicOp.h" -namespace nd4j { +namespace sd { namespace ops { LogicOp::LogicOp(const char *name) : DeclarableOp::DeclarableOp(name, true) { // just using DeclarableOp constructor //this->_descriptor-> } - Nd4jStatus LogicOp::validateAndExecute(nd4j::graph::Context &block) { + Nd4jStatus LogicOp::validateAndExecute(sd::graph::Context &block) { nd4j_logger("WARNING: LogicOps should NOT be ever called\n", ""); return ND4J_STATUS_BAD_INPUT; } - ShapeList* LogicOp::calculateOutputShape(ShapeList *inputShape, 
nd4j::graph::Context &block) { + ShapeList* LogicOp::calculateOutputShape(ShapeList *inputShape, sd::graph::Context &block) { // FIXME: we probably want these ops to evaluate scopes return SHAPELIST(); } diff --git a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp index 417fc0605..398c11729 100644 --- a/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp +++ b/libnd4j/include/ops/declarable/impl/OpDescriptor.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { namespace ops { OpDescriptor::OpDescriptor(const char * opName, bool isLogic) { @@ -33,8 +33,8 @@ namespace nd4j { _numOutputs = 1; _opName = opName; - _hash = nd4j::ops::HashHelper::getInstance()->getLongHash(_opName); - _opClass = nd4j::graph::OpClass_CONDITIONAL; + _hash = sd::ops::HashHelper::getInstance()->getLongHash(_opName); + _opClass = sd::graph::OpClass_CONDITIONAL; _scalar = isScalar; } @@ -44,8 +44,8 @@ namespace nd4j { _numOutputs = 1; _opName = opName; - _hash = nd4j::ops::HashHelper::getInstance()->getLongHash(_opName); - _opClass = nd4j::graph::OpClass_CONDITIONAL; + _hash = sd::ops::HashHelper::getInstance()->getLongHash(_opName); + _opClass = sd::graph::OpClass_CONDITIONAL; _scalar = isScalar; } @@ -77,11 +77,11 @@ namespace nd4j { std::string tmp(opName); _opName = tmp; _allowsInplace = allowsInplace; - _hash = nd4j::ops::HashHelper::getInstance()->getLongHash(tmp); + _hash = sd::ops::HashHelper::getInstance()->getLongHash(tmp); _divergent = false; // just default value - _opClass = nd4j::graph::OpClass_TRANSFORM; + _opClass = sd::graph::OpClass_TRANSFORM; } // constructor for configurable op @@ -159,12 +159,12 @@ namespace nd4j { return _inputType; } - OpDescriptor* OpDescriptor::setAllowedInputTypes(const std::initializer_list &dtypes) { + OpDescriptor* OpDescriptor::setAllowedInputTypes(const std::initializer_list &dtypes) { _allowedIns = dtypes; return this; } - OpDescriptor* 
OpDescriptor::setAllowedOutputTypes(const std::initializer_list &dtypes) { + OpDescriptor* OpDescriptor::setAllowedOutputTypes(const std::initializer_list &dtypes) { _allowedOuts = dtypes; return this; } @@ -174,24 +174,24 @@ namespace nd4j { return this; } - OpDescriptor* OpDescriptor::setAllowedInputTypes(const nd4j::DataType dtype) { + OpDescriptor* OpDescriptor::setAllowedInputTypes(const sd::DataType dtype) { _allowedIns.clear(); _allowedIns.emplace_back(dtype); return this; } - OpDescriptor* OpDescriptor::setAllowedOutputTypes(const nd4j::DataType dtype) { + OpDescriptor* OpDescriptor::setAllowedOutputTypes(const sd::DataType dtype) { _allowedOuts.clear(); _allowedOuts.emplace_back(dtype); return this; } - OpDescriptor* OpDescriptor::setInputType(const int idx, const nd4j::DataType dtype) { + OpDescriptor* OpDescriptor::setInputType(const int idx, const sd::DataType dtype) { _inputTypes[idx] = { dtype }; return this; } - OpDescriptor* OpDescriptor::setOutputType(const int idx, const nd4j::DataType dtype) { + OpDescriptor* OpDescriptor::setOutputType(const int idx, const sd::DataType dtype) { _outputTypes[idx] = { dtype }; return this; } @@ -201,17 +201,17 @@ namespace nd4j { return this; } - OpDescriptor* OpDescriptor::setAllowedInputTypes(int index, const std::vector &dtype) { + OpDescriptor* OpDescriptor::setAllowedInputTypes(int index, const std::vector &dtype) { _inputTypes[index] = dtype; return this; } - OpDescriptor* OpDescriptor::setAllowedOutputTypes(int index, const std::vector &dtype) { + OpDescriptor* OpDescriptor::setAllowedOutputTypes(int index, const std::vector &dtype) { _outputTypes[index] = dtype; return this; } - OpDescriptor* OpDescriptor::setAllowedInputTypes(int index, nd4j::DataType dtype) { + OpDescriptor* OpDescriptor::setAllowedInputTypes(int index, sd::DataType dtype) { if (_inputTypes.count(index) == 0) _inputTypes[index] = {dtype}; else @@ -220,7 +220,7 @@ namespace nd4j { return this; } - OpDescriptor* 
OpDescriptor::setAllowedOutputTypes(int index, nd4j::DataType dtype) { + OpDescriptor* OpDescriptor::setAllowedOutputTypes(int index, sd::DataType dtype) { if (_outputTypes.count(index) == 0) _outputTypes[index] = {dtype}; else @@ -229,7 +229,7 @@ namespace nd4j { return this; } - bool OpDescriptor::checkDataTypesMatch(nd4j::DataType needle, std::vector &haystack) const { + bool OpDescriptor::checkDataTypesMatch(sd::DataType needle, std::vector &haystack) const { // if haystack is empty - INHERIT is occurs - any type is perfect? if (haystack.empty()) return true; @@ -238,7 +238,7 @@ namespace nd4j { if (std::find(haystack.begin(), haystack.end(), needle) == haystack.end()) { // if direct input match failed - we're checking for ANY as allowed input - if (std::find(haystack.begin(), haystack.end(), nd4j::DataType::ANY) == haystack.end()) + if (std::find(haystack.begin(), haystack.end(), sd::DataType::ANY) == haystack.end()) return false; else return true; @@ -247,7 +247,7 @@ namespace nd4j { } } - bool OpDescriptor::checkInputMatch(int index, nd4j::DataType dataType) { + bool OpDescriptor::checkInputMatch(int index, sd::DataType dataType) { // we check for per-input types first if (_inputTypes.empty() || _inputTypes.count(index) == 0) { // checking global input types @@ -260,7 +260,7 @@ namespace nd4j { return true; } - bool OpDescriptor::checkOutputMatch(int index, nd4j::DataType dataType) { + bool OpDescriptor::checkOutputMatch(int index, sd::DataType dataType) { // we check for per-output types first if (_outputTypes.empty() || _outputTypes.count(index) == 0) { @@ -279,23 +279,23 @@ namespace nd4j { } bool OpDescriptor::isInherit(int index) { - if (std::find(_allowedOuts.begin(), _allowedOuts.end(), nd4j::DataType::INHERIT) != _allowedOuts.end()) + if (std::find(_allowedOuts.begin(), _allowedOuts.end(), sd::DataType::INHERIT) != _allowedOuts.end()) return true; if (_outputTypes.count(index) > 0) { auto vec = _outputTypes[index]; - if (std::find(vec.begin(), 
vec.end(), nd4j::DataType::INHERIT) != vec.end()) + if (std::find(vec.begin(), vec.end(), sd::DataType::INHERIT) != vec.end()) return true; } return false; } - std::vector OpDescriptor::getOutputTypesForOutput(int index) { + std::vector OpDescriptor::getOutputTypesForOutput(int index) { if (_outputTypes.count(index) > 0) return _outputTypes.at(index); else - return std::vector(); + return std::vector(); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp b/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp index 64abc4a3a..65d694dea 100644 --- a/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp +++ b/libnd4j/include/ops/declarable/impl/OpRegistrator.cpp @@ -23,7 +23,7 @@ #include #include -namespace nd4j { +namespace sd { namespace ops { /////////////////////////////// @@ -42,7 +42,7 @@ namespace nd4j { std::string newName(name); std::string oldName(oname); - OpRegistrator::getInstance()->updateMSVC(nd4j::ops::HashHelper::getInstance()->getLongHash(newName), oldName); + OpRegistrator::getInstance()->updateMSVC(sd::ops::HashHelper::getInstance()->getLongHash(newName), oldName); return; } OpRegistrator::getInstance()->registerOperation(name, ptr); @@ -53,7 +53,7 @@ namespace nd4j { OpRegistrator* OpRegistrator::getInstance() { if (!_INSTANCE) - _INSTANCE = new nd4j::ops::OpRegistrator(); + _INSTANCE = new sd::ops::OpRegistrator(); return _INSTANCE; } @@ -130,7 +130,7 @@ namespace nd4j { _locker.lock(); if (!isInit) { - for (MAP_IMPL::iterator it=_declarablesD.begin(); it!=_declarablesD.end(); ++it) { + for (MAP_IMPL::iterator it=_declarablesD.begin(); it!=_declarablesD.end(); ++it) { std::string op = it->first + ":" + local_to_string(it->second->getOpDescriptor()->getHash()) + ":" + local_to_string(it->second->getOpDescriptor()->getNumberOfInputs()) + ":" @@ -151,13 +151,13 @@ namespace nd4j { } - bool OpRegistrator::registerOperation(const char* name, nd4j::ops::DeclarableOp* op) { + bool 
OpRegistrator::registerOperation(const char* name, sd::ops::DeclarableOp* op) { std::string str(name); - std::pair pair(str, op); + std::pair pair(str, op); _declarablesD.insert(pair); - auto hash = nd4j::ops::HashHelper::getInstance()->getLongHash(str); - std::pair pair2(hash, op); + auto hash = sd::ops::HashHelper::getInstance()->getLongHash(str); + std::pair pair2(hash, op); _declarablesLD.insert(pair2); return true; } @@ -167,12 +167,12 @@ namespace nd4j { * * @param op */ - bool OpRegistrator::registerOperation(nd4j::ops::DeclarableOp *op) { + bool OpRegistrator::registerOperation(sd::ops::DeclarableOp *op) { _uniqueD.emplace_back(op); return registerOperation(op->getOpName()->c_str(), op); } - void OpRegistrator::registerHelper(nd4j::ops::platforms::PlatformHelper* op) { + void OpRegistrator::registerHelper(sd::ops::platforms::PlatformHelper* op) { std::pair p = {op->hash(), op->engine()}; if (_helpersLH.count(p) > 0) throw std::runtime_error("Tried to double register PlatformHelper"); @@ -181,14 +181,14 @@ namespace nd4j { nd4j_debug("Adding helper for op \"%s\": [%lld - %i]\n", op->name().c_str(), op->hash(), (int) op->engine()); - std::pair, nd4j::ops::platforms::PlatformHelper*> pair({op->name(), op->engine()}, op); + std::pair, sd::ops::platforms::PlatformHelper*> pair({op->name(), op->engine()}, op); _helpersH.insert(pair); - std::pair, nd4j::ops::platforms::PlatformHelper*> pair2(p, op); + std::pair, sd::ops::platforms::PlatformHelper*> pair2(p, op); _helpersLH.insert(pair2); } - nd4j::ops::DeclarableOp* OpRegistrator::getOperation(const char *name) { + sd::ops::DeclarableOp* OpRegistrator::getOperation(const char *name) { std::string str(name); return getOperation(str); } @@ -199,7 +199,7 @@ namespace nd4j { * @param name * @return */ - nd4j::ops::DeclarableOp *OpRegistrator::getOperation(Nd4jLong hash) { + sd::ops::DeclarableOp *OpRegistrator::getOperation(Nd4jLong hash) { if (!_declarablesLD.count(hash)) { if (!_msvc.count(hash)) { 
nd4j_printf("Unknown D operation requested by hash: [%lld]\n", hash); @@ -211,7 +211,7 @@ namespace nd4j { auto op = _declarablesD.at(str); auto oHash = op->getOpDescriptor()->getHash(); - std::pair pair(oHash, op); + std::pair pair(oHash, op); _declarablesLD.insert(pair); _locker.unlock(); @@ -221,7 +221,7 @@ namespace nd4j { return _declarablesLD.at(hash); } - nd4j::ops::DeclarableOp *OpRegistrator::getOperation(std::string& name) { + sd::ops::DeclarableOp *OpRegistrator::getOperation(std::string& name) { if (!_declarablesD.count(name)) { nd4j_debug("Unknown operation requested: [%s]\n", name.c_str()); return nullptr; @@ -230,7 +230,7 @@ namespace nd4j { return _declarablesD.at(name); } - nd4j::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash, samediff::Engine engine) { + sd::ops::platforms::PlatformHelper* OpRegistrator::getPlatformHelper(Nd4jLong hash, samediff::Engine engine) { std::pair p = {hash, engine}; if (_helpersLH.count(p) == 0) throw std::runtime_error("Requested helper can't be found"); @@ -257,7 +257,7 @@ namespace nd4j { return result; } - nd4j::ops::OpRegistrator* nd4j::ops::OpRegistrator::_INSTANCE = 0; + sd::ops::OpRegistrator* sd::ops::OpRegistrator::_INSTANCE = 0; } } diff --git a/libnd4j/include/ops/declarable/impl/OpTuple.cpp b/libnd4j/include/ops/declarable/impl/OpTuple.cpp index 126f864a2..fc43739e8 100644 --- a/libnd4j/include/ops/declarable/impl/OpTuple.cpp +++ b/libnd4j/include/ops/declarable/impl/OpTuple.cpp @@ -20,38 +20,38 @@ #include "ops/declarable/OpTuple.h" -nd4j::ops::OpTuple::OpTuple(const char *opName) { +sd::ops::OpTuple::OpTuple(const char *opName) { _opName = opName; } -nd4j::ops::OpTuple::OpTuple(const char *opName, std::initializer_list &&inputs, std::initializer_list &&tArgs, std::initializer_list &&iArgs) { +sd::ops::OpTuple::OpTuple(const char *opName, std::initializer_list &&inputs, std::initializer_list &&tArgs, std::initializer_list &&iArgs) { _opName = opName; _inputs = inputs; _iArgs 
= iArgs; _tArgs = tArgs; } -nd4j::ops::OpTuple::~OpTuple() { +sd::ops::OpTuple::~OpTuple() { for (auto v: _inputs) delete v; } -nd4j::ops::OpTuple *nd4j::ops::OpTuple::addInput(nd4j::NDArray *array) { +sd::ops::OpTuple *sd::ops::OpTuple::addInput(sd::NDArray *array) { _inputs.emplace_back(array); return this; } -nd4j::ops::OpTuple *nd4j::ops::OpTuple::addOutput(nd4j::NDArray *array) { +sd::ops::OpTuple *sd::ops::OpTuple::addOutput(sd::NDArray *array) { _outputs.emplace_back(array); return this; } -nd4j::ops::OpTuple *nd4j::ops::OpTuple::setTArgs(std::initializer_list tArgs) { +sd::ops::OpTuple *sd::ops::OpTuple::setTArgs(std::initializer_list tArgs) { _tArgs = tArgs; return this; } -nd4j::ops::OpTuple *nd4j::ops::OpTuple::setIArgs(std::initializer_list iArgs) { +sd::ops::OpTuple *sd::ops::OpTuple::setIArgs(std::initializer_list iArgs) { _iArgs = iArgs; return this; } diff --git a/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp b/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp index 86c84b0fb..fe0928ce6 100644 --- a/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp +++ b/libnd4j/include/ops/declarable/impl/PlatformHelper.cpp @@ -21,7 +21,7 @@ #include "../PlatformHelper.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { PlatformHelper::PlatformHelper(const char *name, samediff::Engine engine) { @@ -31,7 +31,7 @@ namespace nd4j { _engine = engine; } - nd4j::NDArray *PlatformHelper::getZ(graph::Context &ctx, int inputId) { + sd::NDArray *PlatformHelper::getZ(graph::Context &ctx, int inputId) { NDArray *z = nullptr; if (ctx.isFastPath()) { diff --git a/libnd4j/include/ops/declarable/platform/cudnn/avgpool2d.cu b/libnd4j/include/ops/declarable/platform/cudnn/avgpool2d.cu index 8ff0bafb1..015c08172 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/avgpool2d.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/avgpool2d.cu @@ -22,7 +22,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { 
namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/cudnn/avgpool3d.cu b/libnd4j/include/ops/declarable/platform/cudnn/avgpool3d.cu index 878f306b3..aeaaa6516 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/avgpool3d.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/avgpool3d.cu @@ -22,7 +22,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu b/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu index 1177d1a3c..8d0b1301a 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/batchnorm.cu @@ -22,7 +22,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -44,7 +44,7 @@ static void batchnormCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err); const std::vector xShape = input->getShapeAsVectorInt(); // input and output have same shapes @@ -77,7 +77,7 @@ static void batchnormCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(x, format, dataType, xRank, xShape.data()); else err = cudnnSetTensorNdDescriptor(x, dataType, xRank, xShape.data(), xStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); // output descriptor cudnnTensorDescriptor_t z; @@ -86,7 +86,7 @@ static void 
batchnormCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(z, format, dataType, xRank, xShape.data()); else err = cudnnSetTensorNdDescriptor(z, dataType, xRank, xShape.data(), zStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); // mean, variance, gamma and beta descriptor, the same descriptor for all of them cudnnTensorDescriptor_t params; @@ -95,7 +95,7 @@ static void batchnormCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(params, format, dataType, xRank, paramsShape.data()); else err = cudnnSetTensorNdDescriptor(params, dataType, xRank, paramsShape.data(), paramsStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/beta failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/beta failed", err); // provide scaling parameters const float alpha32(1), beta32(0); @@ -114,7 +114,7 @@ static void batchnormCUDNN(const LaunchContext* context, gamma->getSpecialBuffer(), beta->getSpecialBuffer(), mean->getSpecialBuffer(), variance->getSpecialBuffer(), epsilon); - if (err != 0) throw nd4j::cuda_exception::build("batchnormCUDNN: cudnnBatchNormalizationForwardInference failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormCUDNN: cudnnBatchNormalizationForwardInference failed", err); auto cudaErr = cudaStreamSynchronize(*context->getCudaStream()); if (cudaErr != 0) @@ -139,7 +139,7 @@ static void batchnormBpCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, 
*context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormBpCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("batchnormBpCUDNN: can't set stream for cuDNN", err); const std::vector xShape = input->getShapeAsVectorInt(); // input and output have same shapes @@ -174,7 +174,7 @@ static void batchnormBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(x, format, dataType, xRank, xShape.data()); else err = cudnnSetTensorNdDescriptor(x, dataType, xRank, xShape.data(), xStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); // gradO descriptor cudnnTensorDescriptor_t dz; @@ -183,7 +183,7 @@ static void batchnormBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(dz, format, dataType, xRank, xShape.data()); else err = cudnnSetTensorNdDescriptor(dz, dataType, xRank, xShape.data(), dzStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err); // gradI descriptor cudnnTensorDescriptor_t dx; @@ -192,7 +192,7 @@ static void batchnormBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(dx, format, dataType, xRank, xShape.data()); else err = cudnnSetTensorNdDescriptor(dx, dataType, xRank, xShape.data(), dxStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradI failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormBpCUDNN: 
cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradI failed", err); // mean, variance, gamma, gradG and gradB descriptor, the same descriptor for all of them cudnnTensorDescriptor_t params; @@ -201,7 +201,7 @@ static void batchnormBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(params, format, dataType, xRank, paramsShape.data()); else err = cudnnSetTensorNdDescriptor(params, dataType, xRank, paramsShape.data(), paramsStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("batchnormBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/gradG/gradB failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for mean/variance/gamma/gradG/gradB failed", err); // provide scaling parameters const float alpha32(1), beta32(0); @@ -223,7 +223,7 @@ static void batchnormBpCUDNN(const LaunchContext* context, epsilon, nullptr/*mean->getSpecialBuffer()*/, nullptr/*variance->getSpecialBuffer()*/); - if (err != 0) throw nd4j::cuda_exception::build("batchnormBpCUDNN: cudnnBatchNormalizationBackward failed", err); + if (err != 0) throw sd::cuda_exception::build("batchnormBpCUDNN: cudnnBatchNormalizationBackward failed", err); auto cudaErr = cudaStreamSynchronize(*context->getCudaStream()); if (cudaErr != 0) diff --git a/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu b/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu index 234dbffb7..b58cc40f3 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/conv2d.cu @@ -23,7 +23,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -42,7 +42,7 @@ static void conv2dCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) 
throw nd4j::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: can't set stream for cuDNN", err); cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; @@ -53,13 +53,13 @@ static void conv2dCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); // weights descriptor cudnnFilterDescriptor_t w; cudnnCreateFilterDescriptor(&w); err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW); - if(err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetFilter4dDescriptor failed", err); + if(err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnSetFilter4dDescriptor failed", err); // output descriptor cudnnTensorDescriptor_t z; @@ -68,27 +68,27 @@ static void conv2dCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW); else err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output 
failed", err); // description of convolution cudnnConvolutionDescriptor_t conv; cudnnCreateConvolutionDescriptor(&conv); err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType())); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err); // algorithm description cudnnConvolutionFwdAlgo_t algo; err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); // allocate auxiliary device memory, abbreviation ws means workspace size_t wsSize; err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); void* wsData; auto cudaErr = cudaMalloc(&wsData, wsSize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); // provide scaling parameters const float alpha32(1), beta32(0); @@ -100,7 +100,7 @@ static void conv2dCUDNN(const LaunchContext* context, // run calculation err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: 
cudnnConvolutionForward failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnConvolutionForward failed", err); // add bias if it is present if (bias != nullptr) { @@ -108,9 +108,9 @@ static void conv2dCUDNN(const LaunchContext* context, cudnnTensorDescriptor_t b; cudnnCreateTensorDescriptor(&b); err = cudnnSetTensor4dDescriptor(b, format, cudnnDataType(bias->dataType()), 1, isNCHW ? bias->lengthOf() : 1, 1, isNCHW ? 1: bias->lengthOf()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err); err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudnnAddTensor bias failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudnnAddTensor bias failed", err); } // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); @@ -118,7 +118,7 @@ static void conv2dCUDNN(const LaunchContext* context, // throw cuda_exception::build("conv2dCUDNN: cudaStreamSynchronize failed !", cudaErr); cudaErr = cudaFree(wsData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); NDArray::registerSpecialUse({output}, {input, weights, bias}); } @@ -139,7 +139,7 @@ static void conv2dBpCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: can't set stream for cuDNN", err); cudnnTensorFormat_t 
format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; @@ -150,7 +150,7 @@ static void conv2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); // gradO descriptor cudnnTensorDescriptor_t dz; @@ -159,7 +159,7 @@ static void conv2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW); else err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); // gradI descriptor cudnnTensorDescriptor_t dx; @@ -168,45 +168,45 @@ static void conv2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err); + if (err 
!= 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err); // gradW descriptor cudnnFilterDescriptor_t dw; cudnnCreateFilterDescriptor(&dw); err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, oC, iC, kH, kW); - if(err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err); + if(err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err); // description of convolution cudnnConvolutionDescriptor_t conv; cudnnCreateConvolutionDescriptor(&conv); err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType())); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err); // gradW algorithm description cudnnConvolutionBwdFilterAlgo_t algoGradW; err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); // gradI algorithm description cudnnConvolutionBwdDataAlgo_t algoGradI; err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace 
size_t wsGradWSize; err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); void* wsGradWData; auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace size_t wsGradISize; err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); void* wsGradIData; cudaErr = cudaMalloc(&wsGradIData, wsGradISize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); // provide scaling parameters const float alpha32(1), beta32(0); @@ -221,28 +221,28 @@ static void conv2dBpCUDNN(const LaunchContext* context, cudnnTensorDescriptor_t db; cudnnCreateTensorDescriptor(&db); err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 
1: gradB->lengthOf()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err); err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err); } // run calculation for gradW err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); // run calculation for gradI err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardData failed", err); + if (err != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudnnConvolutionBackwardData failed", err); // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); // if (cudaErr != 0) // throw cuda_exception::build("conv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr); cudaErr = cudaFree(wsGradWData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); cudaErr = 
cudaFree(wsGradIData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO}); } @@ -422,7 +422,7 @@ PLATFORM_CHECK(conv2d_bp, ENGINE_CUDA) { // auto handle = reinterpret_cast(block.launchContext()->getCuDnnHandle()); // auto res = cudnnSetStream(*handle, *block.launchContext()->getCudaStream()); // if (res != 0) -// throw nd4j::cuda_exception::build("Can't set stream for cuDNN", res); +// throw sd::cuda_exception::build("Can't set stream for cuDNN", res); // auto input = INPUT_VARIABLE(0); // [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // auto weights = INPUT_VARIABLE(1); // [kH, kW, iC, oC] always @@ -456,39 +456,39 @@ PLATFORM_CHECK(conv2d_bp, ENGINE_CUDA) { // cudnnCreateTensorDescriptor(&src); // res = cudnnSetTensor4dDescriptorEx(src, dtype, input->sizeAt(0), input->sizeAt(1), input->sizeAt(2), input->sizeAt(3), input->strideAt(0), input->strideAt(1), input->strideAt(2), input->strideAt(3)); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx src failed", res); +// throw sd::cuda_exception::build("cudnnSetTensor4dDescriptorEx src failed", res); // // TODO: we definitely want NHWC here as well // cudnnFilterDescriptor_t wght; // cudnnCreateFilterDescriptor(&wght); // res = cudnnSetFilter4dDescriptor(wght, dtype, CUDNN_TENSOR_NCHW, oC, iC, kH, kW); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnSetFilter4dDescriptor failed", res); +// throw sd::cuda_exception::build("cudnnSetFilter4dDescriptor failed", res); // cudnnConvolutionDescriptor_t cdc; // cudnnCreateConvolutionDescriptor(&cdc); // res = cudnnSetConvolution2dDescriptor(cdc, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, dtype); // if (res != 0) -// 
throw nd4j::cuda_exception::build("cudnnSetConvolution2dDescriptor failed", res); +// throw sd::cuda_exception::build("cudnnSetConvolution2dDescriptor failed", res); // cudnnTensorDescriptor_t dst; // cudnnCreateTensorDescriptor(&dst); // res = cudnnSetTensor4dDescriptorEx(dst, dtype, output->sizeAt(0), output->sizeAt(1), output->sizeAt(2), output->sizeAt(3), output->strideAt(0), output->strideAt(1), output->strideAt(2), output->strideAt(3)); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx dst failed", res); +// throw sd::cuda_exception::build("cudnnSetTensor4dDescriptorEx dst failed", res); // // TODO: workspace algorithms are supposed to be faster, so we should use it here if we have enough memory // cudnnConvolutionFwdAlgo_t algo; // res = cudnnGetConvolutionForwardAlgorithm(*handle, src, wght, cdc, dst, CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &algo); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnGetConvolutionForwardAlgorithm failed", res); +// throw sd::cuda_exception::build("cudnnGetConvolutionForwardAlgorithm failed", res); // // TODO: should be float if dtype is half/float, and double otherwise // float alpha = 1.0f; // float beta = 0.0f; // res = cudnnConvolutionForward(*handle, &alpha, src, input->specialBuffer(), wght, weights->specialBuffer(), cdc, algo, nullptr, 0, &beta, dst, output->specialBuffer()); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnConvolutionForward failed", res); +// throw sd::cuda_exception::build("cudnnConvolutionForward failed", res); // if (bias != nullptr) { @@ -497,16 +497,16 @@ PLATFORM_CHECK(conv2d_bp, ENGINE_CUDA) { // if (isNCHW) { // res = cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NCHW, dtype, 1, bias->lengthOf(), 1, 1); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res); +// throw sd::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res); // } else { // res = 
cudnnSetTensor4dDescriptor(bs, CUDNN_TENSOR_NHWC, dtype, 1, 1, 1, bias->lengthOf()); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res); +// throw sd::cuda_exception::build("cudnnSetTensor4dDescriptorEx bias NHWC failed", res); // } // res = cudnnAddTensor(*handle, &alpha, bs, bias->specialBuffer(), &alpha, dst, output->specialBuffer()); // if (res != 0) -// throw nd4j::cuda_exception::build("cudnnAddTensor failed", res); +// throw sd::cuda_exception::build("cudnnAddTensor failed", res); // } diff --git a/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu b/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu index 9d30ff04c..1e86aaa07 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/conv3d.cu @@ -23,7 +23,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -44,7 +44,7 @@ static void conv3dCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: can't set stream for cuDNN", err); const std::vector pads = {pD, pH, pW}; const std::vector filtStrides = {sD, sH, sW}; @@ -67,13 +67,13 @@ static void conv3dCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, xShape.data()); else err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: 
cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); // weights descriptor cudnnFilterDescriptor_t w; cudnnCreateFilterDescriptor(&w); err = cudnnSetFilterNdDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data()); - if(err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetFilterNdDescriptor failed", err); + if(err != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudnnSetFilterNdDescriptor failed", err); // output descriptor cudnnTensorDescriptor_t z; @@ -82,26 +82,26 @@ static void conv3dCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(z, format, cudnnDataType(output->dataType()), numDims, zShape.data()); else err = cudnnSetTensorNdDescriptor(z, cudnnDataType(output->dataType()), numDims, zShape.data(), zStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); // description of convolution cudnnConvolutionDescriptor_t conv; cudnnCreateConvolutionDescriptor(&conv); err = cudnnSetConvolutionNdDescriptor(conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType())); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetConvolutionNdDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudnnSetConvolutionNdDescriptor failed", err); // algorithm description cudnnConvolutionFwdAlgo_t algo; err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: 
cudnnGetConvolutionForwardAlgorithm failed", err); // allocate auxiliary device memory, abbreviation ws means workspace size_t wsSize; err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); void* wsData; auto cudaErr = cudaMalloc(&wsData, wsSize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); // provide scaling parameters const float alpha32(1), beta32(0); @@ -113,7 +113,7 @@ static void conv3dCUDNN(const LaunchContext* context, // run calculation err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnConvolutionForward failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudnnConvolutionForward failed", err); // add bias if it is present if (bias != nullptr) { @@ -121,9 +121,9 @@ static void conv3dCUDNN(const LaunchContext* context, cudnnTensorDescriptor_t b; cudnnCreateTensorDescriptor(&b); err = cudnnSetTensorNdDescriptorEx(b, format, cudnnDataType(bias->dataType()), numDims, bShape.data()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor for bias failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudnnSetTensorNdDescriptor for bias failed", err); err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: 
cudnnAddTensor bias failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudnnAddTensor bias failed", err); } // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); @@ -131,7 +131,7 @@ static void conv3dCUDNN(const LaunchContext* context, // throw cuda_exception::build("conv3dCUDNN: cudaStreamSynchronize failed !", cudaErr); cudaErr = cudaFree(wsData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv3dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); NDArray::registerSpecialUse({output}, {input, weights, bias}); } @@ -154,7 +154,7 @@ static void conv3dBpCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: can't set stream for cuDNN", err); const std::vector pads = {pD, pH, pW}; const std::vector filtStrides = {sD, sH, sW}; @@ -178,7 +178,7 @@ static void conv3dBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, xShape.data()); else err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape.data(), xStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); // gradO descriptor cudnnTensorDescriptor_t dz; @@ -187,7 +187,7 @@ static void conv3dBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(dz, format, cudnnDataType(gradO->dataType()), numDims, 
dzShape.data()); else err = cudnnSetTensorNdDescriptor(dz, cudnnDataType(gradO->dataType()), numDims, dzShape.data(), dzStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err); // gradI descriptor cudnnTensorDescriptor_t dx; @@ -196,45 +196,45 @@ static void conv3dBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(dx, format, cudnnDataType(gradI->dataType()), numDims, xShape.data()); else err = cudnnSetTensorNdDescriptor(dx, cudnnDataType(gradI->dataType()), numDims, xShape.data(), dxStrides.data()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradI failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradI failed", err); // gradW descriptor cudnnFilterDescriptor_t dw; cudnnCreateFilterDescriptor(&dw); err = cudnnSetFilterNdDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, numDims, wShape.data()); - if(err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetFilterNdDescriptor failed", err); + if(err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnSetFilterNdDescriptor failed", err); // description of convolution cudnnConvolutionDescriptor_t conv; cudnnCreateConvolutionDescriptor(&conv); err = cudnnSetConvolutionNdDescriptor(conv, numDims-2, pads.data(), filtStrides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType())); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetConvolutionNdDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnSetConvolutionNdDescriptor failed", err); // gradW algorithm 
description cudnnConvolutionBwdFilterAlgo_t algoGradW; err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); // gradI algorithm description cudnnConvolutionBwdDataAlgo_t algoGradI; err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace size_t wsGradWSize; err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); void* wsGradWData; auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace size_t wsGradISize; err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: 
cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); void* wsGradIData; cudaErr = cudaMalloc(&wsGradIData, wsGradISize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); // provide scaling parameters const float alpha32(1), beta32(0); @@ -250,28 +250,28 @@ static void conv3dBpCUDNN(const LaunchContext* context, cudnnTensorDescriptor_t db; cudnnCreateTensorDescriptor(&db); err = cudnnSetTensorNdDescriptorEx(db, format, cudnnDataType(gradB->dataType()), numDims, dbShape.data()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor for gradB failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnSetTensorNdDescriptor for gradB failed", err); err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardBias failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardBias failed", err); } // run calculation for gradW err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); // run calculation for gradI err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, 
gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardData failed", err); + if (err != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudnnConvolutionBackwardData failed", err); // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); // if (cudaErr != 0) // throw cuda_exception::build("conv3dBpCUDNN: cudaStreamSynchronize failed !", cudaErr); cudaErr = cudaFree(wsGradWData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); cudaErr = cudaFree(wsGradIData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("conv3dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO}); } diff --git a/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.cu b/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.cu index 02a302e61..22b0f9b1c 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.cu @@ -22,7 +22,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -128,7 +128,7 @@ void pooling2dCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dCUDNN: 
can't set stream for cuDNN", err); cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; @@ -139,7 +139,7 @@ void pooling2dCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); // output descriptor cudnnTensorDescriptor_t z; @@ -148,13 +148,13 @@ void pooling2dCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW); else err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err); // description of pooling cudnnPoolingDescriptor_t pooling; cudnnCreatePoolingDescriptor(&pooling); err = cudnnSetPooling2dDescriptor(pooling, mode, CUDNN_PROPAGATE_NAN, kH, kW, pH, pW, sH, sW); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dCUDNN: cudnnSetPooling2dDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dCUDNN: cudnnSetPooling2dDescriptor failed", err); // provide scaling parameters const float alpha32(1), beta32(0); @@ -166,7 +166,7 @@ void 
pooling2dCUDNN(const LaunchContext* context, // run calculation err = cudnnPoolingForward(*handle, pooling, alpha, x, input->getSpecialBuffer(), beta, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dCUDNN: cudnnPoolingForward failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dCUDNN: cudnnPoolingForward failed", err); auto cudaErr = cudaStreamSynchronize(*context->getCudaStream()); if (cudaErr != 0) @@ -191,7 +191,7 @@ void pooling2dBpCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dBpCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dBpCUDNN: can't set stream for cuDNN", err); cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; @@ -202,7 +202,7 @@ void pooling2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input/gradI failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input/gradI failed", err); // gradO descriptor cudnnTensorDescriptor_t dz; @@ -211,13 +211,13 @@ void pooling2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW); else err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), 
gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); // description of pooling cudnnPoolingDescriptor_t pooling; cudnnCreatePoolingDescriptor(&pooling); err = cudnnSetPooling2dDescriptor(pooling, mode, CUDNN_PROPAGATE_NAN, kH, kW, pH, pW, sH, sW); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dBpCUDNN: cudnnSetPooling2dDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dBpCUDNN: cudnnSetPooling2dDescriptor failed", err); // provide scaling parameters const float alpha32(1), beta32(0); @@ -229,7 +229,7 @@ void pooling2dBpCUDNN(const LaunchContext* context, // run calculation for gradI err = cudnnPoolingBackward(*handle, pooling, alpha, dz, gradO->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), x, input->getSpecialBuffer(), beta, x, gradI->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dBpCUDNN: cudnnPoolingBackward failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dBpCUDNN: cudnnPoolingBackward failed", err); auto cudaErr = cudaStreamSynchronize(*context->getCudaStream()); if (cudaErr != 0) @@ -249,7 +249,7 @@ void pooling3dCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dCUDNN: can't set stream for cuDNN", err); const int numDims = 5; @@ -276,7 +276,7 @@ void pooling3dCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, 
xShape); else err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape, xStrides); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input failed", err); // output descriptor cudnnTensorDescriptor_t z; @@ -285,13 +285,13 @@ void pooling3dCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(z, format, cudnnDataType(output->dataType()), numDims, zShape); else err = cudnnSetTensorNdDescriptor(z, cudnnDataType(output->dataType()), numDims, zShape, zStrides); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for output failed", err); // description of pooling cudnnPoolingDescriptor_t pooling; cudnnCreatePoolingDescriptor(&pooling); err = cudnnSetPoolingNdDescriptor(pooling, mode, CUDNN_PROPAGATE_NAN, numDims - 2, kSizes, pSizes, sSizes); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dCUDNN: cudnnSetPoolingNdDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dCUDNN: cudnnSetPoolingNdDescriptor failed", err); // provide scaling parameters const float alpha32(1), beta32(0); @@ -303,7 +303,7 @@ void pooling3dCUDNN(const LaunchContext* context, // run calculation err = cudnnPoolingForward(*handle, pooling, alpha, x, input->getSpecialBuffer(), beta, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dCUDNN: cudnnPoolingForward failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dCUDNN: cudnnPoolingForward failed", err); auto cudaErr = cudaStreamSynchronize(*context->getCudaStream()); if (cudaErr 
!= 0) @@ -324,7 +324,7 @@ void pooling3dBpCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dBpCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dBpCUDNN: can't set stream for cuDNN", err); const int numDims = 5; @@ -351,7 +351,7 @@ void pooling3dBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(x, format, cudnnDataType(input->dataType()), numDims, xShape); else err = cudnnSetTensorNdDescriptor(x, cudnnDataType(input->dataType()), numDims, xShape, xStrides); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input/gradI failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for input/gradI failed", err); // gradO descriptor cudnnTensorDescriptor_t dz; @@ -360,13 +360,13 @@ void pooling3dBpCUDNN(const LaunchContext* context, err = cudnnSetTensorNdDescriptorEx(dz, format, cudnnDataType(gradO->dataType()), numDims, dzShape); else err = cudnnSetTensorNdDescriptor(dz, cudnnDataType(gradO->dataType()), numDims, dzShape, dzStrides); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dBpCUDNN: cudnnSetTensorNdDescriptor/cudnnSetTensorNdDescriptorEx for gradO failed", err); // description of pooling cudnnPoolingDescriptor_t pooling; cudnnCreatePoolingDescriptor(&pooling); err = cudnnSetPoolingNdDescriptor(pooling, mode, CUDNN_PROPAGATE_NAN, numDims - 2, kSizes, pSizes, sSizes); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dBpCUDNN: cudnnSetPoolingNdDescriptor failed", err); + if (err != 0) throw 
sd::cuda_exception::build("pooling3dBpCUDNN: cudnnSetPoolingNdDescriptor failed", err); // provide scaling parameters const float alpha32(1), beta32(0); @@ -383,11 +383,11 @@ void pooling3dBpCUDNN(const LaunchContext* context, // run ff calculation err = cudnnPoolingForward(*handle, pooling, alpha, x, input->getSpecialBuffer(), beta, dz, temp.specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("pooling3dCUDNN: cudnnPoolingForward failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling3dCUDNN: cudnnPoolingForward failed", err); // run bp calculation for gradI err = cudnnPoolingBackward(*handle, pooling, alpha, dz, temp.getSpecialBuffer(), dz, gradO->getSpecialBuffer(), x, input->getSpecialBuffer(), beta, x, gradI->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dBpCUDNN: cudnnPoolingBackward failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dBpCUDNN: cudnnPoolingBackward failed", err); NDArray::registerSpecialUse({gradI}, {input, gradO, &temp}); } @@ -397,7 +397,7 @@ void pooling3dBpCUDNN(const LaunchContext* context, // run bp calculation for gradI err = cudnnPoolingBackward(*handle, pooling, alpha, dz, gradO->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), x, input->getSpecialBuffer(), beta, x, gradI->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("pooling2dBpCUDNN: cudnnPoolingBackward failed", err); + if (err != 0) throw sd::cuda_exception::build("pooling2dBpCUDNN: cudnnPoolingBackward failed", err); NDArray::registerSpecialUse({gradI}, {input, gradO}); } diff --git a/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h b/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h index 5c46fb7b0..3379979a3 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h +++ b/libnd4j/include/ops/declarable/platform/cudnn/cudnnUtils.h @@ -23,14 +23,14 @@ #include #include -#include +#include #include #include -#include +#include #include 
-namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -59,17 +59,17 @@ namespace platforms { DECLARE_PLATFORM(maxpool3dnew_bp, ENGINE_CUDA); ////////////////////////////////////////////////////////////////////////// -FORCEINLINE cudnnDataType_t cudnnDataType(nd4j::DataType dataType) { +FORCEINLINE cudnnDataType_t cudnnDataType(sd::DataType dataType) { switch (dataType) { - case nd4j::DataType::FLOAT32: + case sd::DataType::FLOAT32: return CUDNN_DATA_FLOAT; - case nd4j::DataType::DOUBLE: + case sd::DataType::DOUBLE: return CUDNN_DATA_DOUBLE; - case nd4j::DataType::HALF: + case sd::DataType::HALF: return CUDNN_DATA_HALF; - case nd4j::DataType::INT32: + case sd::DataType::INT32: return CUDNN_DATA_INT32; - case nd4j::DataType::INT8: + case sd::DataType::INT8: return CUDNN_DATA_INT8; default: throw datatype_exception::build("Unsupported data type", dataType); diff --git a/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu b/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu index d328fa92b..ae07ce944 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/depthwiseConv2d.cu @@ -22,7 +22,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -51,7 +51,7 @@ static void depthwiseConv2dCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: can't set stream for cuDNN", err); cudnnTensorFormat_t format = isNCHW ? 
CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; @@ -62,13 +62,13 @@ static void depthwiseConv2dCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); // weights descriptor cudnnFilterDescriptor_t w; cudnnCreateFilterDescriptor(&w); err = cudnnSetFilter4dDescriptor(w, cudnnDataType(weights->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW); - if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetFilter4dDescriptor failed", err); + if(err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetFilter4dDescriptor failed", err); // output descriptor cudnnTensorDescriptor_t z; @@ -77,28 +77,28 @@ static void depthwiseConv2dCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(z, format, cudnnDataType(output->dataType()), bS, oC, oH, oW); else err = cudnnSetTensor4dDescriptorEx(z, cudnnDataType(output->dataType()), bS, oC, oH, oW, output->strideAt(0), output->strideAt(indIOioC), output->strideAt(indOoH), output->strideAt(indOoH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for output failed", err); // description of convolution cudnnConvolutionDescriptor_t conv; cudnnCreateConvolutionDescriptor(&conv); err = 
cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(output->dataType())); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolution2dDescriptor failed", err); err = cudnnSetConvolutionGroupCount(conv, iC); // set number of groups (depthwise mode) in description of convolution, groupCount == iC - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolutionGroupCount failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetConvolutionGroupCount failed", err); // algorithm description cudnnConvolutionFwdAlgo_t algo; err = cudnnGetConvolutionForwardAlgorithm(*handle, x, w, conv, z, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardAlgorithm failed", err); // allocate auxiliary device memory, abbreviation ws means workspace size_t wsSize; err = cudnnGetConvolutionForwardWorkspaceSize(*handle, x, w, conv, z, algo, &wsSize); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnGetConvolutionForwardWorkspaceSize failed", err); void* wsData; auto cudaErr = cudaMalloc(&wsData, wsSize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudaMalloc for auxiliary workspace memory failed", cudaErr); // provide scaling parameters const float alpha32(1), beta32(0); @@ -110,7 +110,7 @@ static 
void depthwiseConv2dCUDNN(const LaunchContext* context, // run calculation err = cudnnConvolutionForward(*handle, alpha, x, input->getSpecialBuffer(), w, weights->getSpecialBuffer(), conv, algo, wsData, wsSize, beta, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnConvolutionForward failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnConvolutionForward failed", err); // add bias if it is present if (bias != nullptr) { @@ -118,9 +118,9 @@ static void depthwiseConv2dCUDNN(const LaunchContext* context, cudnnTensorDescriptor_t b; cudnnCreateTensorDescriptor(&b); err = cudnnSetTensor4dDescriptor(b, format, cudnnDataType(bias->dataType()), 1, isNCHW ? bias->lengthOf() : 1, 1, isNCHW ? 1: bias->lengthOf()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnSetTensor4dDescriptor for bias failed", err); err = cudnnAddTensor(*handle, alpha, b, bias->getSpecialBuffer(), alpha, z, output->specialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudnnAddTensor bias failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudnnAddTensor bias failed", err); } // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); @@ -128,7 +128,7 @@ static void depthwiseConv2dCUDNN(const LaunchContext* context, // throw cuda_exception::build("depthwiseConv2dCUDNN: cudaStreamSynchronize failed !", cudaErr); cudaErr = cudaFree(wsData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("depthwiseConv2dCUDNN: cudaFree for auxiliary workspace memory failed", cudaErr); NDArray::registerSpecialUse({output}, {input, weights, bias}); } @@ -158,7 +158,7 @@ 
static void depthwiseConv2dBpCUDNN(const LaunchContext* context, auto handle = reinterpret_cast(context->getCuDnnHandle()); cudnnStatus_t err = cudnnSetStream(*handle, *context->getCudaStream()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: can't set stream for cuDNN", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: can't set stream for cuDNN", err); cudnnTensorFormat_t format = isNCHW ? CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; @@ -169,7 +169,7 @@ static void depthwiseConv2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(x, format, cudnnDataType(input->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(x, cudnnDataType(input->dataType()), bS, iC, iH, iW, input->strideAt(0), input->strideAt(indIOioC), input->strideAt(indIiH), input->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for input failed", err); // gradO descriptor cudnnTensorDescriptor_t dz; @@ -178,7 +178,7 @@ static void depthwiseConv2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(dz, format, cudnnDataType(gradO->dataType()), bS, oC, oH, oW); else err = cudnnSetTensor4dDescriptorEx(dz, cudnnDataType(gradO->dataType()), bS, oC, oH, oW, gradO->strideAt(0), gradO->strideAt(indIOioC), gradO->strideAt(indOoH), gradO->strideAt(indOoH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradO failed", err); // gradI descriptor cudnnTensorDescriptor_t dx; @@ -187,47 +187,47 @@ static void 
depthwiseConv2dBpCUDNN(const LaunchContext* context, err = cudnnSetTensor4dDescriptor(dx, format, cudnnDataType(gradI->dataType()), bS, iC, iH, iW); else err = cudnnSetTensor4dDescriptorEx(dx, cudnnDataType(gradI->dataType()), bS, iC, iH, iW, gradI->strideAt(0), gradI->strideAt(indIOioC), gradI->strideAt(indIiH), gradI->strideAt(indIiH + 1)); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor/cudnnSetTensor4dDescriptorEx for gradI failed", err); // gradW descriptor cudnnFilterDescriptor_t dw; cudnnCreateFilterDescriptor(&dw); err = cudnnSetFilter4dDescriptor(dw, cudnnDataType(gradW->dataType()), CUDNN_TENSOR_NCHW, iC, mC, kH, kW); - if(err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err); + if(err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetFilter4dDescriptor gradW failed", err); // description of convolution cudnnConvolutionDescriptor_t conv; cudnnCreateConvolutionDescriptor(&conv); err = cudnnSetConvolution2dDescriptor(conv, pH, pW, sH, sW, dH, dW, CUDNN_CROSS_CORRELATION, cudnnDataType(gradO->dataType())); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolution2dDescriptor failed", err); err = cudnnSetConvolutionGroupCount(conv, iC); // set number of groups (depthwise mode) in description of convolution, groupCount == iC - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolutionGroupCount failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetConvolutionGroupCount failed", err); // gradW algorithm description cudnnConvolutionBwdFilterAlgo_t 
algoGradW; err = cudnnGetConvolutionBackwardFilterAlgorithm(*handle, x, dz, conv, dw, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &algoGradW); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterAlgorithm failed", err); // gradI algorithm description cudnnConvolutionBwdDataAlgo_t algoGradI; err = cudnnGetConvolutionBackwardDataAlgorithm(*handle, dw, dz, conv, x, CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, 0, &algoGradI); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataAlgorithm failed", err); // allocate auxiliary device memory for gradW calculation, abbreviation ws means workspace size_t wsGradWSize; err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*handle, x, dz, conv, dw, algoGradW, &wsGradWSize); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardFilterWorkspaceSize failed", err); void* wsGradWData; auto cudaErr = cudaMalloc(&wsGradWData, wsGradWSize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradWData failed", cudaErr); // allocate auxiliary device memory for gradI calculation, abbreviation ws means workspace size_t wsGradISize; err = cudnnGetConvolutionBackwardDataWorkspaceSize(*handle, dw, dz, conv, dx, algoGradI, &wsGradISize); - if (err != 0) throw 
nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnGetConvolutionBackwardDataWorkspaceSize failed", err); void* wsGradIData; cudaErr = cudaMalloc(&wsGradIData, wsGradISize); - if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaMalloc for auxiliary workspace memory wsGradIData failed", cudaErr); // provide scaling parameters const float alpha32(1), beta32(0); @@ -242,28 +242,28 @@ static void depthwiseConv2dBpCUDNN(const LaunchContext* context, cudnnTensorDescriptor_t db; cudnnCreateTensorDescriptor(&db); err = cudnnSetTensor4dDescriptor(db, format, cudnnDataType(gradB->dataType()), 1, isNCHW ? gradB->lengthOf() : 1, 1, isNCHW ? 1: gradB->lengthOf()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnSetTensor4dDescriptor for gradB failed", err); err = cudnnConvolutionBackwardBias(*handle, alpha, dz, gradO->getSpecialBuffer(), beta, db, gradB->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardBias failed", err); } // run calculation for gradW err = cudnnConvolutionBackwardFilter(*handle, alpha, x, input->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradW, wsGradWData, wsGradWSize, beta, dw, gradW->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); + if (err != 0) throw 
sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardFilter failed", err); // run calculation for gradI err = cudnnConvolutionBackwardData(*handle, alpha, dw, weights->getSpecialBuffer(), dz, gradO->getSpecialBuffer(), conv, algoGradI, wsGradIData, wsGradISize, beta, dx, gradI->getSpecialBuffer()); - if (err != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardData failed", err); + if (err != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudnnConvolutionBackwardData failed", err); // cudaErr = cudaStreamSynchronize(*context->getCudaStream()); // if (cudaErr != 0) // throw cuda_exception::build("depthwiseConv2dBpCUDNN: cudaStreamSynchronize failed !", cudaErr); cudaErr = cudaFree(wsGradWData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradWData failed", cudaErr); cudaErr = cudaFree(wsGradIData); - if (cudaErr != 0) throw nd4j::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); + if (cudaErr != 0) throw sd::cuda_exception::build("depthwiseConv2dBpCUDNN: cudaFree for auxiliary workspace memory wsGradIData failed", cudaErr); NDArray::registerSpecialUse({gradI, gradW, gradB}, {input, weights, gradO}); } diff --git a/libnd4j/include/ops/declarable/platform/cudnn/maxpool2d.cu b/libnd4j/include/ops/declarable/platform/cudnn/maxpool2d.cu index 6d5affe79..841faa0d3 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/maxpool2d.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/maxpool2d.cu @@ -22,7 +22,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/cudnn/maxpool3d.cu 
b/libnd4j/include/ops/declarable/platform/cudnn/maxpool3d.cu index fc2e38577..82e7b9f84 100644 --- a/libnd4j/include/ops/declarable/platform/cudnn/maxpool3d.cu +++ b/libnd4j/include/ops/declarable/platform/cudnn/maxpool3d.cu @@ -22,7 +22,7 @@ #include "cudnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp index 4c8a582f0..9df7bedf3 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling2d.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" @@ -31,7 +31,7 @@ using namespace dnnl; using namespace samediff; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -78,7 +78,7 @@ PLATFORM_CHECK(avgpool2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } ////////////////////////////////////////////////////////////////////////// @@ -125,7 +125,7 @@ PLATFORM_CHECK(avgpool2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp index 39e85de98..e8582658e 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/avgpooling3d.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" @@ -30,7 +30,7 @@ using namespace dnnl; 
-namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -78,7 +78,7 @@ PLATFORM_CHECK(avgpool3dnew, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } ////////////////////////////////////////////////////////////////////// @@ -130,7 +130,7 @@ PLATFORM_CHECK(avgpool3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp index f63690e81..42b7d231c 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/batchnorm.cpp @@ -23,15 +23,15 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" #include -#include +#include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -369,7 +369,7 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const // x - mean NDArray xMinusMean(x); // empty array with same shape as x - const_cast(x)->applyBroadcast(nd4j::broadcast::Subtract, axes, *mean, xMinusMean); + const_cast(x)->applyBroadcast(sd::broadcast::Subtract, axes, *mean, xMinusMean); // stdInv NDArray stdInv = *variance + epsilon; @@ -377,30 +377,30 @@ static void batchnormBackPropMKLDNN(const NDArray* x, const NDArray* mean, const stdInv.applyTransform(transform::Sqrt, stdInv); // 1 / (variance + epsilon)^0.5 // dfdm / N - auto dfdm = dLdO->reduceAlongDimension(nd4j::reduce::Sum, excludedAxes); + auto dfdm = dLdO->reduceAlongDimension(sd::reduce::Sum, excludedAxes); dfdm *= stdInv; dfdm *= -Ninv; // dvdm / 
2 NDArray dvdm(mean); // empty array with same shape as mean - xMinusMean.reduceAlongDimension(nd4j::reduce::Sum, dvdm, excludedAxes); + xMinusMean.reduceAlongDimension(sd::reduce::Sum, dvdm, excludedAxes); dvdm *= -Ninv; // (2/N)*dfdv NDArray dfdv(variance); // empty array with same shape as variance - (xMinusMean * *dLdO).reduceAlongDimension(nd4j::reduce::Sum, dfdv, excludedAxes); + (xMinusMean * *dLdO).reduceAlongDimension(sd::reduce::Sum, dfdv, excludedAxes); dfdv *= stdInv*stdInv*stdInv; dfdv *= -Ninv; // dvdm/2 + (x - m) - xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, dvdm, xMinusMean); + xMinusMean.applyBroadcast(sd::broadcast::Add, axes, dvdm, xMinusMean); // dfdv * (dvdm/2 + (x - m)) - xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, dfdv, xMinusMean); + xMinusMean.applyBroadcast(sd::broadcast::Multiply, axes, dfdv, xMinusMean); // add dfdm / N - xMinusMean.applyBroadcast(nd4j::broadcast::Add, axes, dfdm, xMinusMean); + xMinusMean.applyBroadcast(sd::broadcast::Add, axes, dfdm, xMinusMean); // * gamma auto gamma = (*weights)({0,1, 0,0}); - xMinusMean.applyBroadcast(nd4j::broadcast::Multiply, axes, gamma, xMinusMean); + xMinusMean.applyBroadcast(sd::broadcast::Multiply, axes, gamma, xMinusMean); *dLdI += xMinusMean; } @@ -644,7 +644,7 @@ PLATFORM_CHECK(batchnorm, ENGINE_CPU) { // axes.push_back(input->rankOf() - 1); // return block.isUseMKLDNN() && -// nd4j::MKLDNNStream::isSupported({input, mean, variance, gamma, beta, output}) && +// sd::MKLDNNStream::isSupported({input, mean, variance, gamma, beta, output}) && // axes.size() == 1; // } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp index 2d88a73ef..fd34368a6 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv2d.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" @@ -30,7 +30,7 @@ using 
namespace dnnl; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -327,7 +327,7 @@ static void conv2dBpMKLDNN(const NDArray *input, const NDArray *weights, const N /* ////////////////////////////////////////////////////////////////////// -static void conv2dMKLDNN(nd4j::graph::Context &block, const NDArray *input, const NDArray *weights, +static void conv2dMKLDNN(sd::graph::Context &block, const NDArray *input, const NDArray *weights, const NDArray *bias, NDArray *output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW) { @@ -404,7 +404,7 @@ static void conv2dMKLDNN(nd4j::graph::Context &block, const NDArray *input, cons } ////////////////////////////////////////////////////////////////////// -static void conv2dBpMKLDNN(nd4j::graph::Context &block, +static void conv2dBpMKLDNN(sd::graph::Context &block, const NDArray *input, const NDArray *weights, const NDArray *bias, const NDArray *gradO, NDArray *gradI, NDArray *gradW, NDArray *gradB, const int kH, const int kW, const int sH,const int sW, int pH, int pW, const int dH, const int dW, @@ -577,8 +577,8 @@ PLATFORM_CHECK(conv2d, ENGINE_CPU) { auto weights = INPUT_VARIABLE(1); // conv2d is only available for float32 dtype - return block.isUseMKLDNN() && input->dataType() == nd4j::DataType::FLOAT32 && - weights->dataType() == nd4j::DataType::FLOAT32; + return block.isUseMKLDNN() && input->dataType() == sd::DataType::FLOAT32 && + weights->dataType() == sd::DataType::FLOAT32; } ////////////////////////////////////////////////////////////////////// @@ -639,7 +639,7 @@ PLATFORM_CHECK(conv2d_bp, ENGINE_CPU) { return block.isUseMKLDNN() && - nd4j::MKLDNNStream::isSupported({input, weights, bias, gradO, gradI, gradW, gradB}); + sd::MKLDNNStream::isSupported({input, weights, bias, gradO, gradI, gradW, gradB}); } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp 
b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp index 7c10b0d1e..3003713e3 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/conv3d.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" @@ -29,7 +29,7 @@ using namespace dnnl; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -342,7 +342,7 @@ static void conv3dBpMKLDNN(const NDArray *input, const NDArray *weights, const N /* ////////////////////////////////////////////////////////////////////// -static void conv3dMKLDNN(nd4j::graph::Context &block, +static void conv3dMKLDNN(sd::graph::Context &block, const NDArray *input, const NDArray *weights, const NDArray *bias, NDArray *output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, int pD, int pH, int pW, const int dD, const int dH, const int dW, @@ -416,7 +416,7 @@ static void conv3dMKLDNN(nd4j::graph::Context &block, ////////////////////////////////////////////////////////////////////// -static void conv3dBpMKLDNN(nd4j::graph::Context &block, +static void conv3dBpMKLDNN(sd::graph::Context &block, const NDArray *input, const NDArray *weights, const NDArray *bias, const NDArray *gradO, NDArray *gradI, NDArray *gradW, NDArray *gradB, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, int pD, int pH, int pW, const int dD, const int dH, const int dW, @@ -584,7 +584,7 @@ PLATFORM_CHECK(conv3dnew, ENGINE_CPU) { auto bias = block.width() > 2 ? 
INPUT_VARIABLE(2) : nullptr; // [oC] auto output = OUTPUT_VARIABLE(0); // [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW) - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, weights, bias, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, weights, bias, output}); } ////////////////////////////////////////////////////////////////////// @@ -651,7 +651,7 @@ PLATFORM_CHECK(conv3dnew_bp, ENGINE_CPU) { auto gradB = block.width() > 3 ? OUTPUT_VARIABLE(2) : nullptr; // [oC] return block.isUseMKLDNN() && - nd4j::MKLDNNStream::isSupported({input, weights, bias, gradO, gradI, gradW, gradB}); + sd::MKLDNNStream::isSupported({input, weights, bias, gradO, gradI, gradW, gradB}); } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp index 1879ef8fb..9a2051232 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d.cpp @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp index 7c6582ab4..2e210d0f4 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv2d_tf.cpp @@ -20,13 +20,13 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp index 5daab8228..50e766d3b 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/deconv3d.cpp 
@@ -20,14 +20,14 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp index 4da2c2cb0..db0a1979c 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/depthwiseConv2d.cpp @@ -21,14 +21,14 @@ #include #include -#include +#include #include #include #include "mkldnnUtils.h" using namespace dnnl; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp index a0f2f6151..583ab0852 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/lrn.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" @@ -29,7 +29,7 @@ using namespace dnnl; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { PLATFORM_IMPL(lrn, ENGINE_CPU) { @@ -86,7 +86,7 @@ namespace nd4j { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } } } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp index 26aeacaa3..ad612435d 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/lstmLayer.cpp @@ -23,7 +23,7 @@ using namespace dnnl; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp 
b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp index 53d18e3cd..7345b6543 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/matmul.cpp @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" #include -namespace nd4j { +namespace sd { namespace ops { namespace platforms { diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp index 3e7979f2f..1b60684a1 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling2d.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" @@ -30,7 +30,7 @@ using namespace dnnl; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -75,7 +75,7 @@ PLATFORM_CHECK(maxpool2d, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } ////////////////////////////////////////////////////////////////////////// @@ -119,7 +119,7 @@ PLATFORM_CHECK(maxpool2d_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp index 7f6e95418..fbd17d882 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/maxpooling3d.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include "mkldnnUtils.h" @@ -29,7 +29,7 @@ 
using namespace dnnl; -namespace nd4j { +namespace sd { namespace ops { namespace platforms { @@ -76,7 +76,7 @@ PLATFORM_CHECK(maxpool3dnew, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } ////////////////////////////////////////////////////////////////////////// @@ -125,7 +125,7 @@ PLATFORM_CHECK(maxpool3dnew_bp, ENGINE_CPU) { auto input = INPUT_VARIABLE(0); auto output = OUTPUT_VARIABLE(0); - return block.isUseMKLDNN() && nd4j::MKLDNNStream::isSupported({input, output}); + return block.isUseMKLDNN() && sd::MKLDNNStream::isSupported({input, output}); } } diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp index 02bba4300..6cb74d628 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.cpp @@ -25,7 +25,7 @@ using namespace dnnl; -namespace nd4j { +namespace sd { namespace mkldnnUtils { ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h index 10adf533d..e0e1b83c2 100644 --- a/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h +++ b/libnd4j/include/ops/declarable/platform/mkldnn/mkldnnUtils.h @@ -22,18 +22,18 @@ #ifndef DEV_TESTS_MKLDNNUTILS_H #define DEV_TESTS_MKLDNNUTILS_H -#include -#include +#include +#include #include -#include +#include #include #include -#include +#include using namespace samediff; -namespace nd4j{ +namespace sd{ namespace ops { namespace platforms { /** diff --git a/libnd4j/include/ops/gemm.h b/libnd4j/include/ops/gemm.h index c967425c7..23f1636a2 100644 --- a/libnd4j/include/ops/gemm.h +++ 
b/libnd4j/include/ops/gemm.h @@ -22,11 +22,11 @@ #define LIBND4J_GEMM_H #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace blas { template static void * transpose(int orderSource, int orderTarget, int rows, int cols, void *source); @@ -42,7 +42,7 @@ namespace nd4j { }; template - class GEMV : public nd4j::blas::GEMM{ + class GEMV : public sd::blas::GEMM{ public: static void op(int TRANS, int M, int N, double alpha, void* vA, int lda, void* vX, int incx, double beta, void* vY, int incy ); }; diff --git a/libnd4j/include/ops/impl/BroadcastBoolOpsTuple.cpp b/libnd4j/include/ops/impl/BroadcastBoolOpsTuple.cpp index 31fa9cc66..7e903346b 100644 --- a/libnd4j/include/ops/impl/BroadcastBoolOpsTuple.cpp +++ b/libnd4j/include/ops/impl/BroadcastBoolOpsTuple.cpp @@ -19,8 +19,8 @@ // #include -namespace nd4j { - BroadcastBoolOpsTuple BroadcastBoolOpsTuple::custom(nd4j::scalar::BoolOps scalar, nd4j::pairwise::BoolOps pairwise, nd4j::broadcast::BoolOps broadcast) { +namespace sd { + BroadcastBoolOpsTuple BroadcastBoolOpsTuple::custom(sd::scalar::BoolOps scalar, sd::pairwise::BoolOps pairwise, sd::broadcast::BoolOps broadcast) { BroadcastBoolOpsTuple t(scalar, pairwise, broadcast); return t; } diff --git a/libnd4j/include/ops/impl/BroadcastIntOpsTuple.cpp b/libnd4j/include/ops/impl/BroadcastIntOpsTuple.cpp index 607572b59..5680b8056 100644 --- a/libnd4j/include/ops/impl/BroadcastIntOpsTuple.cpp +++ b/libnd4j/include/ops/impl/BroadcastIntOpsTuple.cpp @@ -19,8 +19,8 @@ // #include -namespace nd4j { - BroadcastIntOpsTuple BroadcastIntOpsTuple::custom(nd4j::scalar::IntOps scalar, nd4j::pairwise::IntOps pairwise, nd4j::broadcast::IntOps broadcast) { +namespace sd { + BroadcastIntOpsTuple BroadcastIntOpsTuple::custom(sd::scalar::IntOps scalar, sd::pairwise::IntOps pairwise, sd::broadcast::IntOps broadcast) { BroadcastIntOpsTuple t(scalar, pairwise, broadcast); return t; } diff --git a/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp 
b/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp index 26cda74a4..71afe8260 100644 --- a/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp +++ b/libnd4j/include/ops/impl/BroadcastOpsTuple.cpp @@ -19,48 +19,48 @@ // #include -namespace nd4j { - BroadcastOpsTuple BroadcastOpsTuple::custom(nd4j::scalar::Ops scalar, nd4j::pairwise::Ops pairwise, nd4j::broadcast::Ops broadcast) { +namespace sd { + BroadcastOpsTuple BroadcastOpsTuple::custom(sd::scalar::Ops scalar, sd::pairwise::Ops pairwise, sd::broadcast::Ops broadcast) { BroadcastOpsTuple t(scalar, pairwise, broadcast); return t; } BroadcastOpsTuple BroadcastOpsTuple::Add() { - return custom(nd4j::scalar::Add, nd4j::pairwise::Add, nd4j::broadcast::Add); + return custom(sd::scalar::Add, sd::pairwise::Add, sd::broadcast::Add); } BroadcastOpsTuple BroadcastOpsTuple::Assign() { - return custom(nd4j::scalar::CopyPws, nd4j::pairwise::CopyPws, nd4j::broadcast::CopyPws); + return custom(sd::scalar::CopyPws, sd::pairwise::CopyPws, sd::broadcast::CopyPws); } BroadcastOpsTuple BroadcastOpsTuple::Divide() { - return custom(nd4j::scalar::Divide, nd4j::pairwise::Divide, nd4j::broadcast::Divide); + return custom(sd::scalar::Divide, sd::pairwise::Divide, sd::broadcast::Divide); } BroadcastOpsTuple BroadcastOpsTuple::DivideNoNan() { - return custom(nd4j::scalar::DivideNoNan, nd4j::pairwise::DivideNoNan, nd4j::broadcast::DivideNoNan); + return custom(sd::scalar::DivideNoNan, sd::pairwise::DivideNoNan, sd::broadcast::DivideNoNan); } BroadcastOpsTuple BroadcastOpsTuple::Multiply() { - return custom(nd4j::scalar::Multiply, nd4j::pairwise::Multiply, nd4j::broadcast::Multiply); + return custom(sd::scalar::Multiply, sd::pairwise::Multiply, sd::broadcast::Multiply); } BroadcastOpsTuple BroadcastOpsTuple::Subtract() { - return custom(nd4j::scalar::Subtract, nd4j::pairwise::Subtract, nd4j::broadcast::Subtract); + return custom(sd::scalar::Subtract, sd::pairwise::Subtract, sd::broadcast::Subtract); } BroadcastOpsTuple BroadcastOpsTuple::IGamma() { 
- return custom(nd4j::scalar::IGamma, nd4j::pairwise::IGamma, nd4j::broadcast::IGamma); + return custom(sd::scalar::IGamma, sd::pairwise::IGamma, sd::broadcast::IGamma); } BroadcastOpsTuple BroadcastOpsTuple::IGammac() { - return custom(nd4j::scalar::IGammac, nd4j::pairwise::IGammac, nd4j::broadcast::IGammac); + return custom(sd::scalar::IGammac, sd::pairwise::IGammac, sd::broadcast::IGammac); } BroadcastOpsTuple BroadcastOpsTuple::Pow() { - return custom(nd4j::scalar::Pow, nd4j::pairwise::Pow, nd4j::broadcast::Pow); + return custom(sd::scalar::Pow, sd::pairwise::Pow, sd::broadcast::Pow); } BroadcastOpsTuple BroadcastOpsTuple::PowDerivative() { - return custom(nd4j::scalar::PowDerivative, nd4j::pairwise::PowDerivative, nd4j::broadcast::PowDerivative); + return custom(sd::scalar::PowDerivative, sd::pairwise::PowDerivative, sd::broadcast::PowDerivative); } } diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp index 8ef8032bb..e9d262f58 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp @@ -21,7 +21,7 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp index 5bb518d76..a61a98870 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, 
LIBND4J_TYPES_1); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp index 27b68e732..89deb3d9c 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp index 80e2258c7..7690749bf 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp index e34b0c528..505ea9921 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp index 96797cc98..caa9d2dfa 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" 
-namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp index 70c7f3990..9646534a9 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp index e2d1df0e9..3230c1fbc 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp index 25e14d39f..a56b335b6 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp index f3b4cbcb6..bb13c0415 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp +++ 
b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp @@ -21,6 +21,6 @@ #include "../specials_double.hpp" -namespace nd4j { +namespace sd { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp index 4d1575123..f74717f05 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp index b50c487b7..cbacbb60e 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp index 972b936dd..b1c7c0db6 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp index 9eb99b238..d340500e5 100644 --- 
a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp index 6558d7284..b8ea2a933 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp index d89652899..cc3fe3f0b 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp index 40c9598ee..4e0b96a82 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp 
b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp index e49ace221..e8bd8d950 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp index 973b25edc..b2581352e 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp index b3bf0beeb..5105affa8 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp @@ -21,6 +21,6 @@ #include "../specials_single.hpp" -namespace nd4j { +namespace sd { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index efd57a7c5..0c4ab167c 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -19,12 +19,12 @@ // Modified by GS on 3/9/2018 // -#include +#include #include -#include +#include #include -namespace nd4j { +namespace sd { namespace blas { template @@ -95,7 +95,7 @@ namespace nd4j { for (int k = 0; k < K; k++) { aIdx = (transAFlag ? linearIndexC(M, K, r, k) : linearIndexF(M, K, r, k)); bIdx = (transBFlag ? 
linearIndexC(K, N, k, c) : linearIndexF(K, N, k, c)); - dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]nd4j::math::nd4j_dot(aX, bX, K) * alpha; + dot += static_cast(alpha) * static_cast(A[aIdx]) * static_cast(B[bIdx]);//A[aIdx]sd::math::nd4j_dot(aX, bX, K) * alpha; } } @@ -127,14 +127,14 @@ namespace nd4j { auto y = reinterpret_cast(vY); auto z = reinterpret_cast(vZ); - auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; + auto aT = TRANS == CblasTrans ? reinterpret_cast(sd::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; auto func = PRAGMA_THREADS_FOR { for (auto r = start; r < stop; r++) { int aIdx = linearIndexC(M, N, r, 0); auto aX = aT + aIdx; - auto dot = nd4j::math::nd4j_dot(aX, y, lda) * static_cast(alpha); + auto dot = sd::math::nd4j_dot(aX, y, lda) * static_cast(alpha); z[r] = beta == 0.0f ? dot : dot + static_cast(beta) * z[r]; } }; diff --git a/libnd4j/include/ops/impl/specials_double.hpp b/libnd4j/include/ops/impl/specials_double.hpp index 73f50c772..96f7d2db2 100644 --- a/libnd4j/include/ops/impl/specials_double.hpp +++ b/libnd4j/include/ops/impl/specials_double.hpp @@ -20,17 +20,17 @@ // -#include +#include #include #include -#include -#include -#include +#include +#include +#include #include #include #include -namespace nd4j { +namespace sd { template diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp index ad4c96e7c..317cbeb42 100644 --- a/libnd4j/include/ops/impl/specials_single.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -20,17 +20,17 @@ // -#include +#include #include #include -#include -#include -#include +#include +#include +#include #include #include #include -namespace nd4j { +namespace sd { /** * Concatneate multi array of the same shape together * along a particular dimension @@ -100,7 +100,7 @@ namespace nd4j { // auto func = 
PRAGMA_THREADS_FOR { // for (auto i = start; i < stop; i += increment) { // auto temp = output(indices[i], true); -// nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); +// sd::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); // } // }; @@ -211,7 +211,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint for(int i = 0; i < numArrays; ++i) inputs[i] = new NDArray(static_cast(data[i]), static_cast(inputShapeInfo[i])); - nd4j::SpecialMethods::concatCpuGeneric(inputs, output, dimension); + sd::SpecialMethods::concatCpuGeneric(inputs, output, dimension); for(int i = 0; i < numArrays; ++i) delete inputs[i]; @@ -486,7 +486,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) continue; T val = dx[e]; - T abs = nd4j::math::nd4j_abs(val); + T abs = sd::math::nd4j_abs(val); int bitId = e % 16; diff --git a/libnd4j/include/ops/impl/specials_sparse.cpp b/libnd4j/include/ops/impl/specials_sparse.cpp index 6a76c9209..c782ccf18 100644 --- a/libnd4j/include/ops/impl/specials_sparse.cpp +++ b/libnd4j/include/ops/impl/specials_sparse.cpp @@ -19,8 +19,8 @@ // #include -#include -#include +#include +#include #include #include #ifdef _OPENMP @@ -29,7 +29,7 @@ #include #include -namespace nd4j { +namespace sd { namespace sparse { template diff --git a/libnd4j/include/ops/meta_ops.h b/libnd4j/include/ops/meta_ops.h index 53be344b6..a2120477d 100644 --- a/libnd4j/include/ops/meta_ops.h +++ b/libnd4j/include/ops/meta_ops.h @@ -18,8 +18,8 @@ #ifndef FUSED_OPS_H_ #define FUSED_OPS_H_ -#include -#include +#include +#include #include diff --git a/libnd4j/include/ops/ops.h b/libnd4j/include/ops/ops.h index ff3ee570b..e49165e78 100644 --- a/libnd4j/include/ops/ops.h +++ b/libnd4j/include/ops/ops.h @@ -18,11 +18,11 @@ #ifndef OPS_H_ #define OPS_H_ -#include +#include #include #include 
#include -#include +#include #include #include @@ -228,23 +228,23 @@ namespace simdOps { op_def static Z op(X z, Y c) { auto zz = static_cast(z); auto zc = static_cast(c); - return (nd4j::math::nd4j_exp(c) - zz * zc + (zz * nd4j::math::nd4j_log(z) - zz + static_cast(0.5f) * nd4j::math::nd4j_log(static_cast(DOUBLE_PI_X) * zz))); + return (sd::math::nd4j_exp(c) - zz * zc + (zz * sd::math::nd4j_log(z) - zz + static_cast(0.5f) * sd::math::nd4j_log(static_cast(DOUBLE_PI_X) * zz))); } op_def static Z op(X z, Y c, Z *params) { auto zz = static_cast(z); auto zc = static_cast(c); - return (nd4j::math::nd4j_exp(c) - zz * zc + (zz * nd4j::math::nd4j_log(z) - zz + static_cast(0.5f) * nd4j::math::nd4j_log(static_cast(DOUBLE_PI_X) * zz))); + return (sd::math::nd4j_exp(c) - zz * zc + (zz * sd::math::nd4j_log(z) - zz + static_cast(0.5f) * sd::math::nd4j_log(static_cast(DOUBLE_PI_X) * zz))); } op_def static Z op(X z) { auto zz = static_cast(z); - return (zz * nd4j::math::nd4j_log(z) - zz + static_cast(0.5f) * nd4j::math::nd4j_log(static_cast(DOUBLE_PI_X) * zz)); + return (zz * sd::math::nd4j_log(z) - zz + static_cast(0.5f) * sd::math::nd4j_log(static_cast(DOUBLE_PI_X) * zz)); } // op for MetaOps op_def static X op(X z, Y *params) { - return (nd4j::math::nd4j_exp(params[0]) - z * params[0] + (z * nd4j::math::nd4j_log(z) - z + static_cast(0.5f) * nd4j::math::nd4j_log(DOUBLE_PI_X * z))); + return (sd::math::nd4j_exp(params[0]) - z * params[0] + (z * sd::math::nd4j_log(z) - z + static_cast(0.5f) * sd::math::nd4j_log(DOUBLE_PI_X * z))); } }; @@ -255,13 +255,13 @@ namespace simdOps { op_def static Z op(X z, Y c) { auto zz = static_cast(z); auto zc = static_cast(c); - return (nd4j::math::nd4j_exp(c) - zz * zc); + return (sd::math::nd4j_exp(c) - zz * zc); } op_def static Z op(X z, Y c, Z *params) { auto zz = static_cast(z); auto zc = static_cast(c); - return (nd4j::math::nd4j_exp(c) - zz * zc); + return (sd::math::nd4j_exp(c) - zz * zc); } op_def static Z op(X z) { @@ -270,7 +270,7 @@ 
namespace simdOps { // op for MetaOps op_def static Z op(X z, Y *params) { - return (nd4j::math::nd4j_exp(params[0]) - static_cast(z) * static_cast(params[0])); + return (sd::math::nd4j_exp(params[0]) - static_cast(z) * static_cast(params[0])); } }; @@ -383,20 +383,20 @@ namespace simdOps { class FloorDiv { public: op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_floor(static_cast(d1 / d2)); + return sd::math::nd4j_floor(static_cast(d1 / d2)); } op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_floor(static_cast(d1 / d2)); + return sd::math::nd4j_floor(static_cast(d1 / d2)); } op_def static Z op(X d1) { - return nd4j::math::nd4j_floor(static_cast(d1)); + return sd::math::nd4j_floor(static_cast(d1)); } // op for MetaOps op_def static Z op(X d1, Y *params) { - return nd4j::math::nd4j_floor(static_cast(d1 / params[0])); + return sd::math::nd4j_floor(static_cast(d1 / params[0])); } }; @@ -458,11 +458,11 @@ namespace simdOps { class Remainder { public: op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_remainder(d1, d2); + return sd::math::nd4j_remainder(d1, d2); } op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_remainder(d1, d2); + return sd::math::nd4j_remainder(d1, d2); } op_def static Z op(X d1) { @@ -471,7 +471,7 @@ namespace simdOps { // op for MetaOps op_def static Z op(X d1, Y *params) { - return nd4j::math::nd4j_remainder(d1, params[0]); + return sd::math::nd4j_remainder(d1, params[0]); } }; @@ -479,11 +479,11 @@ namespace simdOps { class FMod { public: op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_fmod(d1, d2); + return sd::math::nd4j_fmod(d1, d2); } op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_fmod(d1, d2); + return sd::math::nd4j_fmod(d1, d2); } op_def static Z op(X d1) { @@ -492,7 +492,7 @@ namespace simdOps { // op for MetaOps op_def static Z op(X d1, Y *params) { - return nd4j::math::nd4j_fmod(d1, params[0]); + return sd::math::nd4j_fmod(d1, params[0]); } }; 
@@ -500,13 +500,13 @@ namespace simdOps { class FloorMod { public: op_def static Z op(X d1, Y d2) { - auto m = nd4j::math::nd4j_fmod(d1, d2); - return (d1 < static_cast(0)) == (d2 < static_cast(0)) ? m : nd4j::math::nd4j_fmod(m + static_cast(d2), d2); + auto m = sd::math::nd4j_fmod(d1, d2); + return (d1 < static_cast(0)) == (d2 < static_cast(0)) ? m : sd::math::nd4j_fmod(m + static_cast(d2), d2); } op_def static Z op(X d1, Y d2, Z *params) { - auto m = nd4j::math::nd4j_fmod(d1, d2); - return (d1 < static_cast(0.0f)) == (d2 < static_cast(0)) ? m : nd4j::math::nd4j_fmod(m + static_cast(d2), d2); + auto m = sd::math::nd4j_fmod(d1, d2); + return (d1 < static_cast(0.0f)) == (d2 < static_cast(0)) ? m : sd::math::nd4j_fmod(m + static_cast(d2), d2); } op_def static Z op(X d1) { @@ -722,7 +722,7 @@ namespace simdOps { public: op_def static X op(X d1, X d2) { - return nd4j::math::nd4j_rotl(d1, d2); + return sd::math::nd4j_rotl(d1, d2); } op_def static X op(X d1, X d2, X *params) { @@ -735,7 +735,7 @@ namespace simdOps { public: op_def static X op(X d1, X d2) { - return nd4j::math::nd4j_rotr(d1, d2); + return sd::math::nd4j_rotr(d1, d2); } op_def static X op(X d1, X d2, X *params) { @@ -969,7 +969,7 @@ namespace simdOps { op_def static Z op(X d1, X d2) { X diff = d1 - d2; - X absDiff = nd4j::math::nd4j_abs(diff); + X absDiff = sd::math::nd4j_abs(diff); if (absDiff <= static_cast(MIN_V)) return static_cast(1); return static_cast(0); @@ -1100,7 +1100,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_abs(d1); + return sd::math::nd4j_abs(d1); } }; @@ -1112,7 +1112,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_ceil(d1); + return sd::math::nd4j_ceil(d1); } }; @@ -1124,7 +1124,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_cos(d1); + return sd::math::nd4j_cos(d1); } }; @@ 
-1136,7 +1136,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_exp(d1); + return sd::math::nd4j_exp(d1); } }; @@ -1178,7 +1178,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_floor(d1); + return sd::math::nd4j_floor(d1); } }; @@ -1190,7 +1190,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_log(d1); + return sd::math::nd4j_log(d1); } }; @@ -1201,7 +1201,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_log(1 + d1); + return sd::math::nd4j_log(1 + d1); } }; @@ -1210,7 +1210,7 @@ namespace simdOps { public: op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_log(d1) / nd4j::math::nd4j_log(d2) ; + return sd::math::nd4j_log(d1) / sd::math::nd4j_log(d2) ; } }; @@ -1222,7 +1222,7 @@ namespace simdOps { op_def static X op(X d1, X *params) { if (d1 <= static_cast(0)) - return static_cast(nd4j::DataTypeUtils::min()); + return static_cast(sd::DataTypeUtils::min()); else return d1; } }; @@ -1235,7 +1235,7 @@ namespace simdOps { op_def static X op(X d1, X *params) { if (d1 <= static_cast(0)) - return nd4j::DataTypeUtils::min(); + return sd::DataTypeUtils::min(); else return d1; } }; @@ -1270,7 +1270,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_erf(d1); + return sd::math::nd4j_erf(d1); } }; @@ -1282,7 +1282,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_erfc(d1); + return sd::math::nd4j_erfc(d1); } }; @@ -1307,11 +1307,11 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return nd4j::math::nd4j_pow(d1, static_cast(2)); + return sd::math::nd4j_pow(d1, static_cast(2)); } op_def static Z op(X d1) { - return 
nd4j::math::nd4j_pow(d1, static_cast(2)); + return sd::math::nd4j_pow(d1, static_cast(2)); } }; @@ -1323,7 +1323,7 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_re(d1, d2); + return sd::math::nd4j_re(d1, d2); } op_def static Z op(X d1, Y d2, Z *params) { @@ -1343,7 +1343,7 @@ namespace simdOps { op_def static Z op(X d1, Y d2, Z *params) { X threshold = params[0]; - return nd4j::math::nd4j_re(d1, d2) > threshold ? static_cast(1) : static_cast(0); + return sd::math::nd4j_re(d1, d2) > threshold ? static_cast(1) : static_cast(0); } op_def static Z op(X d1) { @@ -1361,13 +1361,13 @@ namespace simdOps { X d2 = params[0]; X thresholdRelative = params[1]; X thresholdAbsolute = params[2]; - return nd4j::math::nd4j_re(d1, d2) > thresholdRelative ? (nd4j::math::nd4j_abs(d1 - static_cast(d2)) < thresholdAbsolute ? static_cast(0) : static_cast(1)) : static_cast(0); + return sd::math::nd4j_re(d1, d2) > thresholdRelative ? (sd::math::nd4j_abs(d1 - static_cast(d2)) < thresholdAbsolute ? static_cast(0) : static_cast(1)) : static_cast(0); } op_def static Z op(X d1, Y d2, Z *params) { X thresholdRelative = params[0]; X thresholdAbsolute = params[1]; - return nd4j::math::nd4j_re(d1, d2) > thresholdRelative ? (nd4j::math::nd4j_abs(d1 - static_cast(d2)) < thresholdAbsolute ? static_cast(0) : static_cast(1)) : static_cast(0); + return sd::math::nd4j_re(d1, d2) > thresholdRelative ? (sd::math::nd4j_abs(d1 - static_cast(d2)) < thresholdAbsolute ? 
static_cast(0) : static_cast(1)) : static_cast(0); } op_def static Z op(X d1) { @@ -1382,15 +1382,15 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return nd4j::math::nd4j_pow(params[0], d1); + return sd::math::nd4j_pow(params[0], d1); } op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_pow(d2, d1); + return sd::math::nd4j_pow(d2, d1); } op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_pow(d2, d1); + return sd::math::nd4j_pow(d2, d1); } op_def static Z op(X d1) { @@ -1405,15 +1405,15 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return nd4j::math::nd4j_pow(d1, params[0]); + return sd::math::nd4j_pow(d1, params[0]); } op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_pow(d1, d2); + return sd::math::nd4j_pow(d1, d2); } op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_pow(d1, d2); + return sd::math::nd4j_pow(d1, d2); } op_def static Z op(X d1) { @@ -1429,15 +1429,15 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return params[0] * nd4j::math::nd4j_pow(d1, static_cast(params[0]) - static_cast(1.f)); + return params[0] * sd::math::nd4j_pow(d1, static_cast(params[0]) - static_cast(1.f)); } op_def static Z op(X d1, Y d2) { - return static_cast(d2) * nd4j::math::nd4j_pow(d1, static_cast(d2) - static_cast(1.f)); + return static_cast(d2) * sd::math::nd4j_pow(d1, static_cast(d2) - static_cast(1.f)); } op_def static Z op(X d1, Y d2, Z *params) { - return static_cast(d2) * nd4j::math::nd4j_pow(d1, static_cast(d2) - static_cast(1.f)); + return static_cast(d2) * sd::math::nd4j_pow(d1, static_cast(d2) - static_cast(1.f)); } op_def static Z op(X d1) { @@ -1453,15 +1453,15 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return nd4j::math::nd4j_igamma(d1, params[0]); + return sd::math::nd4j_igamma(d1, params[0]); } op_def static Z op(X d1, Y d2) { - return 
nd4j::math::nd4j_igamma(d1, d2); + return sd::math::nd4j_igamma(d1, d2); } op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_igamma(d1, d2); + return sd::math::nd4j_igamma(d1, d2); } op_def static Z op(X d1) { @@ -1476,15 +1476,15 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return nd4j::math::nd4j_igammac(d1, params[0]); + return sd::math::nd4j_igammac(d1, params[0]); } op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_igammac(d1, d2); + return sd::math::nd4j_igammac(d1, d2); } op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_igammac(d1, d2); + return sd::math::nd4j_igammac(d1, d2); } op_def static Z op(X d1) { @@ -1499,7 +1499,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_round(d1); + return sd::math::nd4j_round(d1); } }; @@ -1513,7 +1513,7 @@ namespace simdOps { no_op_exec_special_accumulation_cuda op_def static Z op(X d1, X *params) { - return nd4j::math::nd4j_isnan(d1) ? static_cast(1) : static_cast(0); + return sd::math::nd4j_isnan(d1) ? static_cast(1) : static_cast(0); } op_def static X startingValue(const X *input) { @@ -1543,7 +1543,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_exp(d1) - static_cast(1); + return sd::math::nd4j_exp(d1) - static_cast(1); } }; @@ -1617,7 +1617,7 @@ namespace simdOps { no_op_exec_special_accumulation_cuda op_def static Z op(X d1, X *params) { - return nd4j::math::nd4j_isinf(d1) ? static_cast(1) : static_cast(0); + return sd::math::nd4j_isinf(d1) ? static_cast(1) : static_cast(0); } op_def static X startingValue(const X *input) { @@ -1650,7 +1650,7 @@ namespace simdOps { no_op_exec_special_accumulation_cuda op_def static Z op(X d1, X *params) { - return nd4j::math::nd4j_isfin(d1) ? static_cast(0) : static_cast(1); + return sd::math::nd4j_isfin(d1) ? 
static_cast(0) : static_cast(1); } op_def static X startingValue(const X *input) { @@ -1683,7 +1683,7 @@ namespace simdOps { no_op_exec_special_accumulation_cuda op_def static Z op(X d1, X *params) { - return nd4j::math::nd4j_isfin(d1) ? static_cast(1) : static_cast(0); + return sd::math::nd4j_isfin(d1) ? static_cast(1) : static_cast(0); } op_def static X startingValue(const X *input) { @@ -1742,7 +1742,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return d1 * nd4j::math::nd4j_sigmoid(d1); + return d1 * sd::math::nd4j_sigmoid(d1); } }; @@ -1753,7 +1753,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return d1 * nd4j::math::nd4j_tanh(nd4j::math::nd4j_softplus(d1)); + return d1 * sd::math::nd4j_tanh(sd::math::nd4j_softplus(d1)); } }; @@ -1764,11 +1764,11 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - auto ex = nd4j::math::nd4j_exp(d1); + auto ex = sd::math::nd4j_exp(d1); auto e2x = ex * ex; auto e3x = ex * ex * ex; - return (ex * (4 * (d1 + 1) + 4 * e2x + e3x + ex *(4 * d1 + 6))) / nd4j::math::nd4j_pow((2 * ex + e2x + 2), (X) 2.f); + return (ex * (4 * (d1 + 1) + 4 * e2x + e3x + ex *(4 * d1 + 6))) / sd::math::nd4j_pow((2 * ex + e2x + 2), (X) 2.f); } }; @@ -1779,7 +1779,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return d1 * nd4j::math::nd4j_sigmoid(static_cast(1.702f) * d1); + return d1 * sd::math::nd4j_sigmoid(static_cast(1.702f) * d1); } }; @@ -1790,9 +1790,9 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - auto sp = nd4j::math::nd4j_sqrt(static_cast(2) / static_cast(M_PI)); - auto xp = d1 + nd4j::math::nd4j_pow(static_cast(0.044715) * d1, static_cast(3)); - return (d1 / static_cast(2)) * (static_cast(1) + nd4j::math::nd4j_tanh(sp * xp)); + auto sp = sd::math::nd4j_sqrt(static_cast(2) / static_cast(M_PI)); + auto xp = d1 + 
sd::math::nd4j_pow(static_cast(0.044715) * d1, static_cast(3)); + return (d1 / static_cast(2)) * (static_cast(1) + sd::math::nd4j_tanh(sp * xp)); } }; @@ -1804,9 +1804,9 @@ namespace simdOps { op_def static X op(X d1, X *params) { auto x17 = static_cast(1.702f) * d1; - auto ep = nd4j::math::nd4j_pow(static_cast(M_E), x17); + auto ep = sd::math::nd4j_pow(static_cast(M_E), x17); // (E^(1.702 x) (1. + E^(1.702 x) + 1.702 x))/(1. + E^(1.702 x))^2 - return (ep * (static_cast(1.f) + ep + x17)) / nd4j::math::nd4j_pow((static_cast(1.f) + ep), 2); + return (ep * (static_cast(1.f) + ep + x17)) / sd::math::nd4j_pow((static_cast(1.f) + ep), 2); } }; @@ -1818,12 +1818,12 @@ namespace simdOps { op_def static X op(X d1, X *params) { auto x79 = static_cast(0.797885) * d1; - auto x03 = nd4j::math::nd4j_pow(static_cast(0.0356774) * d1, 3); + auto x03 = sd::math::nd4j_pow(static_cast(0.0356774) * d1, 3); auto x39 = static_cast(0.398942) * d1; - auto x05 = nd4j::math::nd4j_pow(static_cast(0.0535161) * d1, 3); - auto scz = nd4j::math::nd4j_sech(x79 + x03); + auto x05 = sd::math::nd4j_pow(static_cast(0.0535161) * d1, 3); + auto scz = sd::math::nd4j_sech(x79 + x03); // 0.5 + (0.398942 x + 0.0535161 x^3) Sech[0.797885 x + 0.0356774 x^3]^2 + 0.5 Tanh[0.797885 x + 0.0356774 x^3] - return static_cast(0.5) + (x39 + x05) * (scz * scz) + static_cast(0.5) * nd4j::math::nd4j_tanh(x79 + x03); + return static_cast(0.5) + (x39 + x05) * (scz * scz) + static_cast(0.5) * sd::math::nd4j_tanh(x79 + x03); } }; @@ -1835,8 +1835,8 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - X ex = nd4j::math::nd4j_pow(static_cast(M_E), d1); - return (ex * (d1 + ex + static_cast(1.f))) / nd4j::math::nd4j_pow((ex + static_cast(1.f)) , static_cast(2.f)); + X ex = sd::math::nd4j_pow(static_cast(M_E), d1); + return (ex * (d1 + ex + static_cast(1.f))) / sd::math::nd4j_pow((ex + static_cast(1.f)) , static_cast(2.f)); } }; @@ -1848,7 +1848,7 @@ namespace simdOps { 
no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_log(nd4j::math::nd4j_sigmoid(d1)); + return sd::math::nd4j_log(sd::math::nd4j_sigmoid(d1)); } }; @@ -1859,7 +1859,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - X ex = nd4j::math::nd4j_pow(M_E, d1); + X ex = sd::math::nd4j_pow(M_E, d1); return static_cast(1.f) / (ex + static_cast(1.f)); } }; @@ -1871,7 +1871,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_sigmoid(d1); + return sd::math::nd4j_sigmoid(d1); } }; @@ -1893,7 +1893,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_sigmoidderivative(d1); + return sd::math::nd4j_sigmoidderivative(d1); } }; @@ -1905,7 +1905,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_min(static_cast(1), nd4j::math::nd4j_max(static_cast(0), (static_cast(0.2f)) * d1 + static_cast(0.5f))); + return sd::math::nd4j_min(static_cast(1), sd::math::nd4j_max(static_cast(0), (static_cast(0.2f)) * d1 + static_cast(0.5f))); } }; @@ -1936,11 +1936,11 @@ namespace simdOps { if (static_cast(d1) >= min && static_cast(d1) <= max) return d1; if (min == static_cast(0) && max == static_cast(1)) { - auto val = static_cast(1) / (static_cast(1) + nd4j::math::nd4j_exp(-d1)); - return (nd4j::math::nd4j_floor(val * (max - min)) + min); + auto val = static_cast(1) / (static_cast(1) + sd::math::nd4j_exp(-d1)); + return (sd::math::nd4j_floor(val * (max - min)) + min); } - return (nd4j::math::nd4j_floor(d1 * (max - min)) + min); + return (sd::math::nd4j_floor(d1 * (max - min)) + min); } }; @@ -1952,7 +1952,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_sin(d1); + return sd::math::nd4j_sin(d1); } }; @@ -1974,7 +1974,7 @@ namespace simdOps { 
no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return nd4j::math::nd4j_sqrt(d1); + return sd::math::nd4j_sqrt(d1); } }; @@ -1985,7 +1985,7 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Z *params) { - return static_cast(1) / nd4j::math::nd4j_sqrt(d1); + return static_cast(1) / sd::math::nd4j_sqrt(d1); } }; @@ -1996,7 +1996,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_rint(d1); + return sd::math::nd4j_rint(d1); } }; @@ -2008,7 +2008,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_softplus(d1); + return sd::math::nd4j_softplus(d1); } }; @@ -2047,7 +2047,7 @@ namespace simdOps { // keep 2/3 as runtime variable, to match precision auto dis = (static_cast(2) / static_cast(3)) * d1; - auto tanh = nd4j::math::nd4j_sgn(dis) * (static_cast(1) - (static_cast(1) / (static_cast(1) + static_cast(nd4j::math::nd4j_abs(dis)) + nd4j::math::nd4j_pow(dis, static_cast(2)) + static_cast(1.41645f) * nd4j::math::nd4j_pow(dis, static_cast(4)) ))); + auto tanh = sd::math::nd4j_sgn(dis) * (static_cast(1) - (static_cast(1) / (static_cast(1) + static_cast(sd::math::nd4j_abs(dis)) + sd::math::nd4j_pow(dis, static_cast(2)) + static_cast(1.41645f) * sd::math::nd4j_pow(dis, static_cast(4)) ))); return static_cast(1.7159f) * tanh; } }; @@ -2061,9 +2061,9 @@ namespace simdOps { op_def static X op(X d1, X *params) { auto dis = (static_cast(2.f) / static_cast(3.f)) * d1; - auto a = static_cast(1.f) + nd4j::math::nd4j_abs(dis) + nd4j::math::nd4j_pow(dis, static_cast(2.f)) + static_cast(1.41645f) * nd4j::math::nd4j_pow(dis, static_cast(4)); + auto a = static_cast(1.f) + sd::math::nd4j_abs(dis) + sd::math::nd4j_pow(dis, static_cast(2.f)) + static_cast(1.41645f) * sd::math::nd4j_pow(dis, static_cast(4)); - auto tDeriv = (static_cast(1.f) + nd4j::math::nd4j_sign(dis) * (static_cast(2.f) * dis + static_cast(4.f) * 
static_cast(1.41645f) * nd4j::math::nd4j_pow(dis, static_cast(3)))) / (a * a); + auto tDeriv = (static_cast(1.f) + sd::math::nd4j_sign(dis) * (static_cast(2.f) * dis + static_cast(4.f) * static_cast(1.41645f) * sd::math::nd4j_pow(dis, static_cast(3)))) / (a * a); return static_cast(1.7159f) * (static_cast(2.f) / static_cast(3.f)) * tDeriv; } @@ -2076,7 +2076,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_tanh(d1); + return sd::math::nd4j_tanh(d1); } }; @@ -2087,7 +2087,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return params[0] * nd4j::math::nd4j_tanh(params[1] * d1); + return params[0] * sd::math::nd4j_tanh(params[1] * d1); } }; @@ -2098,7 +2098,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_max(static_cast(0), nd4j::math::nd4j_tanh(d1)); + return sd::math::nd4j_max(static_cast(0), sd::math::nd4j_tanh(d1)); } }; @@ -2109,7 +2109,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return d1 > static_cast(0.f) ? nd4j::math::nd4j_tanhderivative(d1) : static_cast(0.f); + return d1 > static_cast(0.f) ? 
sd::math::nd4j_tanhderivative(d1) : static_cast(0.f); } }; @@ -2120,7 +2120,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_atanh(d1); + return sd::math::nd4j_atanh(d1); } }; @@ -2131,7 +2131,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_tanhderivative(d1); + return sd::math::nd4j_tanhderivative(d1); } }; @@ -2165,7 +2165,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_acos(d1); + return sd::math::nd4j_acos(d1); } }; @@ -2176,7 +2176,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_asinh(d1); + return sd::math::nd4j_asinh(d1); } }; @@ -2187,7 +2187,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return static_cast(1.f) / (nd4j::math::nd4j_sqrt(nd4j::math::nd4j_pow(d1, static_cast(2.f)) + static_cast(1.f))); + return static_cast(1.f) / (sd::math::nd4j_sqrt(sd::math::nd4j_pow(d1, static_cast(2.f)) + static_cast(1.f))); } }; @@ -2198,7 +2198,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_acosh(d1); + return sd::math::nd4j_acosh(d1); } }; @@ -2210,7 +2210,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return static_cast(1.f) / (nd4j::math::nd4j_sqrt(d1 - static_cast(1.f)) * nd4j::math::nd4j_sqrt(d1 + static_cast(1.f))); + return static_cast(1.f) / (sd::math::nd4j_sqrt(d1 - static_cast(1.f)) * sd::math::nd4j_sqrt(d1 + static_cast(1.f))); } }; @@ -2236,7 +2236,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_softsign(d1); + return sd::math::nd4j_softsign(d1); } }; @@ -2248,7 +2248,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X 
*params) { - return nd4j::math::nd4j_softsignderivative(d1); + return sd::math::nd4j_softsignderivative(d1); } }; @@ -2268,9 +2268,9 @@ namespace simdOps { switch (mode) { case 0: // equals - return nd4j::math::nd4j_abs(d1 - compare) <= eps ? true : false; + return sd::math::nd4j_abs(d1 - compare) <= eps ? true : false; case 1: // not equals - return nd4j::math::nd4j_abs(d1 - compare) > eps ? true : false; + return sd::math::nd4j_abs(d1 - compare) > eps ? true : false; case 2: // less_than return d1 < compare ? true : false; case 3: // greater_than @@ -2280,27 +2280,27 @@ namespace simdOps { case 5: // greater_or_equals_than return d1 >= compare ? true : false; case 6: // abs_less_than - return nd4j::math::nd4j_abs(d1) < compare ? true : false; + return sd::math::nd4j_abs(d1) < compare ? true : false; case 7: // abs_greater_than - return nd4j::math::nd4j_abs(d1) > compare ? true : false; + return sd::math::nd4j_abs(d1) > compare ? true : false; case 8: // is inf - return nd4j::math::nd4j_isinf(d1) ? true : false; + return sd::math::nd4j_isinf(d1) ? true : false; case 9: // is nan - return nd4j::math::nd4j_isnan(d1) ? true : false; + return sd::math::nd4j_isnan(d1) ? true : false; case 10: return (d1 == compare) ? true : false; case 11: return (d1 != compare) ? true : false; case 12: // abs_greater_or_equals_than - return nd4j::math::nd4j_abs(d1) >= compare ? true : false; + return sd::math::nd4j_abs(d1) >= compare ? true : false; case 13: // abs_less_or_equals_than - return nd4j::math::nd4j_abs(d1) <= compare ? true : false; + return sd::math::nd4j_abs(d1) <= compare ? 
true : false; case 14: // isFinite - return !(nd4j::math::nd4j_isinf(d1) || nd4j::math::nd4j_isnan(d1)); + return !(sd::math::nd4j_isinf(d1) || sd::math::nd4j_isnan(d1)); case 15: // isInfinite - return nd4j::math::nd4j_isinf(d1) || nd4j::math::nd4j_isnan(d1); + return sd::math::nd4j_isinf(d1) || sd::math::nd4j_isnan(d1); default: printf("Undefined match condition: [%i]\n", mode); } @@ -2333,9 +2333,9 @@ namespace simdOps { op_def static Z op(X d1, X compare, X eps, int mode) { switch (mode) { case 0: // equals - return nd4j::math::nd4j_abs(d1 - compare) <= eps ? 1 : 0; + return sd::math::nd4j_abs(d1 - compare) <= eps ? 1 : 0; case 1: // not equals - return nd4j::math::nd4j_abs(d1 - compare) > eps ? 1 : 0; + return sd::math::nd4j_abs(d1 - compare) > eps ? 1 : 0; case 2: // less_than return d1 < compare ? 1 : 0; case 3: // greater_than @@ -2345,27 +2345,27 @@ namespace simdOps { case 5: // greater_or_equals_than return d1 >= compare ? 1 : 0; case 6: // abs_less_than - return nd4j::math::nd4j_abs(d1) < compare ? 1 : 0; + return sd::math::nd4j_abs(d1) < compare ? 1 : 0; case 7: // abs_greater_than - return nd4j::math::nd4j_abs(d1) > compare ? 1 : 0; + return sd::math::nd4j_abs(d1) > compare ? 1 : 0; case 8: // is inf - return nd4j::math::nd4j_isinf(d1) ? 1 : 0; + return sd::math::nd4j_isinf(d1) ? 1 : 0; case 9: // is nan - return nd4j::math::nd4j_isnan(d1) ? 1 : 0; + return sd::math::nd4j_isnan(d1) ? 1 : 0; case 10: return (d1 == compare) ? 1 : 0; case 11: return (d1 != compare) ? 1 : 0; case 12: // abs_greater_or_equals_than - return nd4j::math::nd4j_abs(d1) >= compare ? 1 : 0; + return sd::math::nd4j_abs(d1) >= compare ? 1 : 0; case 13: // abs_less_or_equals_than - return nd4j::math::nd4j_abs(d1) <= compare ? 1 : 0; + return sd::math::nd4j_abs(d1) <= compare ? 1 : 0; case 14: // isFinite - return !(nd4j::math::nd4j_isinf(d1) || nd4j::math::nd4j_isnan(d1)) ? 1 : 0; + return !(sd::math::nd4j_isinf(d1) || sd::math::nd4j_isnan(d1)) ? 
1 : 0; case 15: // isInfinite - return nd4j::math::nd4j_isinf(d1) || nd4j::math::nd4j_isnan(d1) ? 1 : 0; + return sd::math::nd4j_isinf(d1) || sd::math::nd4j_isnan(d1) ? 1 : 0; default: printf("Undefined match condition: [%i]\n", mode); } @@ -2404,7 +2404,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_elu(d1, static_cast(d2)); + return sd::math::nd4j_elu(d1, static_cast(d2)); } }; @@ -2416,7 +2416,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_eluderivative(d1, static_cast(d2)); + return sd::math::nd4j_eluderivative(d1, static_cast(d2)); } }; @@ -2474,7 +2474,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return d1 > static_cast(0.0f) ? static_cast(SELU_LAMBDA) * static_cast(d1) : static_cast(SELU_LAMBDA) * (static_cast(SELU_ALPHA) * nd4j::math::nd4j_exp(d1) - static_cast(SELU_ALPHA)); + return d1 > static_cast(0.0f) ? static_cast(SELU_LAMBDA) * static_cast(d1) : static_cast(SELU_LAMBDA) * (static_cast(SELU_ALPHA) * sd::math::nd4j_exp(d1) - static_cast(SELU_ALPHA)); } }; @@ -2485,7 +2485,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return d1 > static_cast(0.f) ? static_cast(SELU_LAMBDA) : static_cast(SELU_ALPHA) * static_cast(SELU_LAMBDA) * nd4j::math::nd4j_exp(d1); + return d1 > static_cast(0.f) ? 
static_cast(SELU_LAMBDA) : static_cast(SELU_ALPHA) * static_cast(SELU_LAMBDA) * sd::math::nd4j_exp(d1); } }; @@ -2511,7 +2511,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_asin(d1); + return sd::math::nd4j_asin(d1); } }; @@ -2522,7 +2522,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_sinh(d1); + return sd::math::nd4j_sinh(d1); } }; @@ -2533,7 +2533,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_cosh(d1); + return sd::math::nd4j_cosh(d1); } }; @@ -2544,7 +2544,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_cosh(d1); + return sd::math::nd4j_cosh(d1); } }; @@ -2556,7 +2556,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_tan(d1); + return sd::math::nd4j_tan(d1); } }; @@ -2567,7 +2567,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return static_cast(1.f) / nd4j::math::nd4j_pow(nd4j::math::nd4j_cos(d1), static_cast(2.0f)); + return static_cast(1.f) / sd::math::nd4j_pow(sd::math::nd4j_cos(d1), static_cast(2.0f)); } }; @@ -2578,7 +2578,7 @@ namespace simdOps { no_op_exec_special_same_cuda op_def static X op(X d1, X *params) { - return nd4j::math::nd4j_atan(d1); + return sd::math::nd4j_atan(d1); } }; @@ -2589,7 +2589,7 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_atan2(d2, d1); + return sd::math::nd4j_atan2(d2, d1); } op_def static Z op(X d1, Y d2, Z *params) { @@ -2706,11 +2706,11 @@ namespace simdOps { op_def static X op(X d1, X *extraParams) { auto f1 = static_cast(d1); - return static_cast(nd4j::math::nd4j_pow(f1, 3) - + nd4j::math::nd4j_log(f1) * nd4j::math::nd4j_sin(f1) - / nd4j::math::nd4j_tanh(static_cast(M_E) * 
static_cast(M_PI) * f1) - * nd4j::math::nd4j_sqrt(static_cast(M_PI) / f1) - - nd4j::math::nd4j_atan(static_cast(M_E) / f1)); + return static_cast(sd::math::nd4j_pow(f1, 3) + + sd::math::nd4j_log(f1) * sd::math::nd4j_sin(f1) + / sd::math::nd4j_tanh(static_cast(M_E) * static_cast(M_PI) * f1) + * sd::math::nd4j_sqrt(static_cast(M_PI) / f1) + - sd::math::nd4j_atan(static_cast(M_E) / f1)); } op_def static X postProcess(X reduction, Nd4jLong n, X *extraParams) { @@ -2741,7 +2741,7 @@ namespace simdOps { op_def static Z op(X d1, Z *extraParams) { auto p = d1 * d1; - return static_cast(p) * nd4j::math::nd4j_log(p); + return static_cast(p) * sd::math::nd4j_log(p); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { @@ -2771,12 +2771,12 @@ namespace simdOps { } op_def static Z op(X d1, Z *extraParams) { - return static_cast(d1) * nd4j::math::nd4j_log(d1); + return static_cast(d1) * sd::math::nd4j_log(d1); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { //entropy is -sum(p(x) * log(p(x))); log entropy is log of this - return nd4j::math::nd4j_log(-reduction); + return sd::math::nd4j_log(-reduction); } }; @@ -2801,7 +2801,7 @@ namespace simdOps { } op_def static Z op(X d1, Z *extraParams) { - return static_cast(d1) * nd4j::math::nd4j_log(d1); + return static_cast(d1) * sd::math::nd4j_log(d1); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { @@ -2823,19 +2823,19 @@ namespace simdOps { } op_def static X merge(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_abs(opOutput) + nd4j::math::nd4j_abs(old); + return sd::math::nd4j_abs(opOutput) + sd::math::nd4j_abs(old); } op_def static X update(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_abs(opOutput) + nd4j::math::nd4j_abs(old); + return sd::math::nd4j_abs(opOutput) + sd::math::nd4j_abs(old); } op_def static X op(X d1, X *extraParams) { - return nd4j::math::nd4j_abs(d1); + return sd::math::nd4j_abs(d1); } op_def static X postProcess(X 
reduction, Nd4jLong n, X *extraParams) { - return nd4j::math::nd4j_abs(reduction); + return sd::math::nd4j_abs(reduction); } }; @@ -3039,11 +3039,11 @@ namespace simdOps { op_def static Z op(X d1, Z *extraParams) { auto f1 = static_cast(d1); - return static_cast(nd4j::math::nd4j_pow(f1, 3) - + nd4j::math::nd4j_log(f1) * nd4j::math::nd4j_sin(f1) - / nd4j::math::nd4j_tanh(static_cast(M_E) * static_cast(M_PI) * f1) - * nd4j::math::nd4j_sqrt(static_cast(M_PI) / f1) - - nd4j::math::nd4j_atan(static_cast(M_E) / f1)); + return static_cast(sd::math::nd4j_pow(f1, 3) + + sd::math::nd4j_log(f1) * sd::math::nd4j_sin(f1) + / sd::math::nd4j_tanh(static_cast(M_E) * static_cast(M_PI) * f1) + * sd::math::nd4j_sqrt(static_cast(M_PI) / f1) + - sd::math::nd4j_atan(static_cast(M_E) / f1)); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { @@ -3065,7 +3065,7 @@ namespace simdOps { } op_def static Z merge(Z old, Z opOutput, Z *extraParams) { - return nd4j::math::nd4j_abs(opOutput) + nd4j::math::nd4j_abs(old); + return sd::math::nd4j_abs(opOutput) + sd::math::nd4j_abs(old); } op_def static Z update(Z old, Z opOutput, Z *extraParams) { @@ -3073,11 +3073,11 @@ namespace simdOps { } op_def static Z op(X d1, Z *extraParams) { - return nd4j::math::nd4j_abs(d1); + return sd::math::nd4j_abs(d1); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { - return nd4j::math::nd4j_abs(reduction) / static_cast(n); + return sd::math::nd4j_abs(reduction) / static_cast(n); } }; @@ -3090,23 +3090,23 @@ namespace simdOps { const static functions::ReduceType reduceType = functions::ReduceType::MAX; op_def static X startingValue(const X *input) { - return -nd4j::DataTypeUtils::infOrMax(); + return -sd::DataTypeUtils::infOrMax(); } op_def static X merge(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_max(old, opOutput); + return sd::math::nd4j_max(old, opOutput); } op_def static X update(X old, X opOutput, X *extraParams) { - return 
nd4j::math::nd4j_max(opOutput, old); + return sd::math::nd4j_max(opOutput, old); } op_def static X op(X d1, X d2, X *params) { - return nd4j::math::nd4j_max(d1, d2); + return sd::math::nd4j_max(d1, d2); } op_def static X op(X d1, X d2) { - return nd4j::math::nd4j_max(d1, d2); + return sd::math::nd4j_max(d1, d2); } // FIXME: this signature overlaps with MetaOp @@ -3131,7 +3131,7 @@ namespace simdOps { auto z1 = static_cast(d1); auto z2 = static_cast(d2); - if (nd4j::math::nd4j_abs(z1) > nd4j::math::nd4j_abs(z2)) + if (sd::math::nd4j_abs(z1) > sd::math::nd4j_abs(z2)) return z1; else return z2; @@ -3150,7 +3150,7 @@ namespace simdOps { auto z1 = static_cast(d1); auto z2 = static_cast(d2); - if (nd4j::math::nd4j_abs(z1) < nd4j::math::nd4j_abs(z2)) + if (sd::math::nd4j_abs(z1) < sd::math::nd4j_abs(z2)) return z1; else return z2; @@ -3161,11 +3161,11 @@ namespace simdOps { class MaxPairwise { public: op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_max(static_cast(d1), static_cast(d2)); + return sd::math::nd4j_max(static_cast(d1), static_cast(d2)); } op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_max(static_cast(d1), static_cast(d2)); + return sd::math::nd4j_max(static_cast(d1), static_cast(d2)); } }; @@ -3174,11 +3174,11 @@ namespace simdOps { class MinPairwise { public: op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_min(static_cast(d1), static_cast(d2)); + return sd::math::nd4j_min(static_cast(d1), static_cast(d2)); } op_def static Z op(X d1, Y d2) { - return nd4j::math::nd4j_min(static_cast(d1), static_cast(d2)); + return sd::math::nd4j_min(static_cast(d1), static_cast(d2)); } }; @@ -3195,28 +3195,28 @@ namespace simdOps { } op_def static X merge(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_max(nd4j::math::nd4j_abs(old), nd4j::math::nd4j_abs(opOutput)); + return sd::math::nd4j_max(sd::math::nd4j_abs(old), sd::math::nd4j_abs(opOutput)); } op_def static X update(X old, X opOutput, X *extraParams) 
{ - return nd4j::math::nd4j_max(nd4j::math::nd4j_abs(opOutput), nd4j::math::nd4j_abs(old)); + return sd::math::nd4j_max(sd::math::nd4j_abs(opOutput), sd::math::nd4j_abs(old)); } op_def static X op(X d1, X d2, X *params) { - return nd4j::math::nd4j_max(nd4j::math::nd4j_abs(d1), nd4j::math::nd4j_abs(d2)); + return sd::math::nd4j_max(sd::math::nd4j_abs(d1), sd::math::nd4j_abs(d2)); } op_def static X op(X d1, X d2) { - return nd4j::math::nd4j_abs(d1) > nd4j::math::nd4j_abs(d2) ? d1 : d2; + return sd::math::nd4j_abs(d1) > sd::math::nd4j_abs(d2) ? d1 : d2; } // FIXME: this signature overlaps with MetaOp op_def static X op(X d1, X *extraParams) { - return nd4j::math::nd4j_abs(d1); + return sd::math::nd4j_abs(d1); } op_def static X postProcess(X reduction, Nd4jLong n, X *extraParams) { - return nd4j::math::nd4j_abs(reduction); + return sd::math::nd4j_abs(reduction); } }; @@ -3234,28 +3234,28 @@ namespace simdOps { } op_def static X merge(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_min(nd4j::math::nd4j_abs(old), nd4j::math::nd4j_abs(opOutput)); + return sd::math::nd4j_min(sd::math::nd4j_abs(old), sd::math::nd4j_abs(opOutput)); } op_def static X update(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_min(nd4j::math::nd4j_abs(opOutput), nd4j::math::nd4j_abs(old)); + return sd::math::nd4j_min(sd::math::nd4j_abs(opOutput), sd::math::nd4j_abs(old)); } op_def static X op(X d1, X d2, X *params) { - return nd4j::math::nd4j_min(nd4j::math::nd4j_abs(d1), nd4j::math::nd4j_abs(d2)); + return sd::math::nd4j_min(sd::math::nd4j_abs(d1), sd::math::nd4j_abs(d2)); } op_def static X op(X d1, X d2) { - return nd4j::math::nd4j_min(nd4j::math::nd4j_abs(d1), nd4j::math::nd4j_abs(d2)); + return sd::math::nd4j_min(sd::math::nd4j_abs(d1), sd::math::nd4j_abs(d2)); } // FIXME: this signature overlaps with MetaOp op_def static X op(X d1, X *extraParams) { - return nd4j::math::nd4j_abs(d1); + return sd::math::nd4j_abs(d1); } op_def static X postProcess(X reduction, 
Nd4jLong n, X *extraParams) { - return nd4j::math::nd4j_abs(reduction); + return sd::math::nd4j_abs(reduction); } }; @@ -3268,23 +3268,23 @@ namespace simdOps { const static functions::ReduceType reduceType = functions::ReduceType::MIN; op_def static X startingValue(const X *input) { - return nd4j::DataTypeUtils::infOrMax(); + return sd::DataTypeUtils::infOrMax(); } op_def static X merge(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_min(old, opOutput); + return sd::math::nd4j_min(old, opOutput); } op_def static X update(X old, X opOutput, X *extraParams) { - return nd4j::math::nd4j_min(opOutput, old); + return sd::math::nd4j_min(opOutput, old); } op_def static X op(X d1, X d2, X *params) { - return nd4j::math::nd4j_min(d1, d2); + return sd::math::nd4j_min(d1, d2); } op_def static X op(X d1, X d2) { - return nd4j::math::nd4j_min(d1, d2); + return sd::math::nd4j_min(d1, d2); } // FIXME: this signature overlaps with MetaOp @@ -3321,7 +3321,7 @@ namespace simdOps { } op_def static Z op(X d1, Z *extraParams) { - return static_cast(nd4j::math::nd4j_abs(d1)); + return static_cast(sd::math::nd4j_abs(d1)); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { @@ -3353,7 +3353,7 @@ namespace simdOps { op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { - return nd4j::math::nd4j_sqrt(reduction); + return sd::math::nd4j_sqrt(reduction); } op_def static Z op(X d1, Z *extraParams) { @@ -3413,12 +3413,12 @@ namespace simdOps { } op_def static Z op(X d1, Z *extraParams) { - X v = nd4j::math::nd4j_abs(d1); + X v = sd::math::nd4j_abs(d1); return static_cast(v * v); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { - return nd4j::math::nd4j_sqrt(reduction); + return sd::math::nd4j_sqrt(reduction); } }; @@ -3443,11 +3443,11 @@ namespace simdOps { } op_def static Z op(X d1, Z *extraParams) { - return nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(d1), extraParams[0]); + return 
sd::math::nd4j_pow(sd::math::nd4j_abs(d1), extraParams[0]); } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { - return nd4j::math::nd4j_pow(reduction, static_cast(1.0f) / extraParams[0]); + return sd::math::nd4j_pow(reduction, static_cast(1.0f) / extraParams[0]); } }; @@ -3469,8 +3469,8 @@ namespace simdOps { } op_def static Z update(Z old, Z opOutput, Z *extraParams) { - return nd4j::math::nd4j_max(nd4j::math::nd4j_abs(old), - nd4j::math::nd4j_abs(opOutput)); + return sd::math::nd4j_max(sd::math::nd4j_abs(old), + sd::math::nd4j_abs(opOutput)); } op_def static Z op(X d1, Z *extraParams) { @@ -3478,7 +3478,7 @@ namespace simdOps { } op_def static Z postProcess(Z reduction, Nd4jLong n, Z *extraParams) { - return nd4j::math::nd4j_max(nd4j::math::nd4j_abs(reduction), nd4j::math::nd4j_abs(reduction)); + return sd::math::nd4j_max(sd::math::nd4j_abs(reduction), sd::math::nd4j_abs(reduction)); } }; @@ -3511,7 +3511,7 @@ namespace simdOps { op_def static Z postProcess(X reduction, Nd4jLong n, Z *extraParams) { // T bias = extraParams[1]; - // return (reduction - (nd4j::math::nd4j_pow(bias, static_cast(2.0f)) / static_cast(n))) / (n - 1) + // return (reduction - (sd::math::nd4j_pow(bias, static_cast(2.0f)) / static_cast(n))) / (n - 1) return static_cast(reduction) / static_cast(n - 1); } }; @@ -3548,7 +3548,7 @@ namespace simdOps { op_def static Z postProcess(X reduction, Nd4jLong n, Z *extraParams) { Z ret = Variance::postProcess(reduction, n, extraParams); - Z sqrtRet = nd4j::math::nd4j_sqrt(ret); + Z sqrtRet = sd::math::nd4j_sqrt(ret); return sqrtRet; } }; @@ -3572,7 +3572,7 @@ namespace simdOps { } op_def static Y postProcess(Y reduction, Nd4jLong n, Y *extraParams) { - return reduction / (nd4j::math::nd4j_sqrt(extraParams[0]) * nd4j::math::nd4j_sqrt(extraParams[1])); + return reduction / (sd::math::nd4j_sqrt(extraParams[0]) * sd::math::nd4j_sqrt(extraParams[1])); } op_def static Y op(X d1, X d2, Y *extraParams) { @@ -3588,8 +3588,8 @@ namespace 
simdOps { #ifdef __CUDACC__ static _CUDA_D inline Y opAtomic(X d1, X d2, Y *extraParams) { - nd4j::math::atomics::nd4j_atomicAdd(&extraParams[0],static_cast(d1 * d1)); - nd4j::math::atomics::nd4j_atomicAdd(&extraParams[1],static_cast(d2 * d2)); + sd::math::atomics::nd4j_atomicAdd(&extraParams[0],static_cast(d1 * d1)); + sd::math::atomics::nd4j_atomicAdd(&extraParams[1],static_cast(d2 * d2)); return static_cast(d1 * d2); } @@ -3630,11 +3630,11 @@ namespace simdOps { } op_def static Y num(X d1, X d2) { - return nd4j::math::nd4j_min(d1, d2); + return sd::math::nd4j_min(d1, d2); } op_def static Y denom(X d1, X d2) { - return nd4j::math::nd4j_max(d1, d2); + return sd::math::nd4j_max(d1, d2); } op_def static Y op(X d1, X d2, Y *extraParams) { @@ -3651,8 +3651,8 @@ namespace simdOps { #ifdef __CUDACC__ __device__ static inline Y opAtomic(X d1, X d2, Y *extraParams) { - nd4j::math::atomics::nd4j_atomicAdd(&extraParams[0],num(d1, d2)); - nd4j::math::atomics::nd4j_atomicAdd(&extraParams[1], denom(d1, d2)); + sd::math::atomics::nd4j_atomicAdd(&extraParams[0],num(d1, d2)); + sd::math::atomics::nd4j_atomicAdd(&extraParams[1], denom(d1, d2)); return static_cast(0.0f); } @@ -3735,12 +3735,12 @@ namespace simdOps { } op_def static Y postProcess(Y reduction, Nd4jLong n, Y *extraParams) { - return (static_cast(1.0f)) - (reduction / (nd4j::math::nd4j_sqrt(extraParams[0]) * nd4j::math::nd4j_sqrt(extraParams[1]))); + return (static_cast(1.0f)) - (reduction / (sd::math::nd4j_sqrt(extraParams[0]) * sd::math::nd4j_sqrt(extraParams[1]))); } op_def static Y op(X d1, X d2, Y *extraParams) { - extraParams[0] += static_cast(nd4j::math::nd4j_abs(d1) * nd4j::math::nd4j_abs(d1)); - extraParams[1] += static_cast(nd4j::math::nd4j_abs(d2) * nd4j::math::nd4j_abs(d2)); + extraParams[0] += static_cast(sd::math::nd4j_abs(d1) * sd::math::nd4j_abs(d1)); + extraParams[1] += static_cast(sd::math::nd4j_abs(d2) * sd::math::nd4j_abs(d2)); return (d1 * d2); } @@ -3751,8 +3751,8 @@ namespace simdOps { #ifdef 
__CUDACC__ static _CUDA_D inline Y opAtomic(X d1, X d2, Y *extraParams) { - nd4j::math::atomics::nd4j_atomicAdd(&extraParams[0], nd4j::math::nd4j_abs(d1) * nd4j::math::nd4j_abs(d1)); - nd4j::math::atomics::nd4j_atomicAdd(&extraParams[1], nd4j::math::nd4j_abs(d2) * nd4j::math::nd4j_abs(d2)); + sd::math::atomics::nd4j_atomicAdd(&extraParams[0], sd::math::nd4j_abs(d1) * sd::math::nd4j_abs(d1)); + sd::math::atomics::nd4j_atomicAdd(&extraParams[1], sd::math::nd4j_abs(d2) * sd::math::nd4j_abs(d2)); return (d1 * d2); } @@ -3843,8 +3843,8 @@ namespace simdOps { } op_def static Z op(X d1, X d2, Z *extraParamsRef) { - double eps = nd4j::math::nd4j_abs(extraParamsRef[2]); - return static_cast(!nd4j::math::nd4j_eq(d1, d2, eps)); + double eps = sd::math::nd4j_abs(extraParamsRef[2]); + return static_cast(!sd::math::nd4j_eq(d1, d2, eps)); } @@ -3886,7 +3886,7 @@ namespace simdOps { } op_def static Y postProcess(Y reduction, Nd4jLong n, Y *extraParamsRef) { - return nd4j::math::nd4j_sqrt(reduction); + return sd::math::nd4j_sqrt(reduction); } op_def static Y op(X d1, X d2, Y *extraParamsRef) { @@ -3936,7 +3936,7 @@ namespace simdOps { } op_def static Y op(X d1, X d2, Y *extraParamsRef) { - return nd4j::math::nd4j_abs(d1 - d2); + return sd::math::nd4j_abs(d1 - d2); } op_def static Y update(Y old, Y opOutput, Y *extraParamsRef) { @@ -3968,12 +3968,12 @@ namespace simdOps { class IndexAbsoluteMax { public: static _CUDA_HD inline functions::indexreduce::IndexValue op(functions::indexreduce::IndexValue val, X *extraParams) { - return nd4j::math::nd4j_abs(val); + return sd::math::nd4j_abs(val); } static _CUDA_HD inline functions::indexreduce::IndexValue update(functions::indexreduce::IndexValue &old, functions::indexreduce::IndexValue &opOutput, X *extraParams) { - opOutput.value = nd4j::math::nd4j_abs(opOutput.value); - old.value = nd4j::math::nd4j_abs(old.value); + opOutput.value = sd::math::nd4j_abs(opOutput.value); + old.value = sd::math::nd4j_abs(old.value); if (opOutput.value > 
old.value) return opOutput; #ifdef __CUDACC__ @@ -3989,7 +3989,7 @@ namespace simdOps { static _CUDA_HD inline functions::indexreduce::IndexValue merge( functions::indexreduce::IndexValue f1, functions::indexreduce::IndexValue f2, X *extraParams) { - if (nd4j::math::nd4j_abs(f1.value) > nd4j::math::nd4j_abs(f2.value)) + if (sd::math::nd4j_abs(f1.value) > sd::math::nd4j_abs(f2.value)) return f2; return f1; } @@ -4048,7 +4048,7 @@ namespace simdOps { } static _CUDA_HD inline X startingValue(const X *input) { - return -nd4j::DataTypeUtils::infOrMax(); + return -sd::DataTypeUtils::infOrMax(); } static _CUDA_HD inline functions::indexreduce::IndexValue startingIndexValue(X *input) { @@ -4107,7 +4107,7 @@ namespace simdOps { } static _CUDA_HD inline X startingValue(const X *input) { - return -nd4j::DataTypeUtils::infOrMax(); + return -sd::DataTypeUtils::infOrMax(); } static _CUDA_HD inline functions::indexreduce::IndexValue startingIndexValue(X *input) { @@ -4175,7 +4175,7 @@ namespace simdOps { } static _CUDA_HD inline X startingValue(const X *input) { - return -nd4j::DataTypeUtils::infOrMax(); + return -sd::DataTypeUtils::infOrMax(); } static _CUDA_HD inline functions::indexreduce::IndexValue startingIndexValue(X *input) { @@ -4201,7 +4201,7 @@ namespace simdOps { } static _CUDA_HD inline X startingValue(const X *input) { - return nd4j::DataTypeUtils::infOrMax(); + return sd::DataTypeUtils::infOrMax(); } static _CUDA_HD inline functions::indexreduce::IndexValue startingIndexValue(X *input) { @@ -4212,8 +4212,8 @@ namespace simdOps { } static _CUDA_HD inline functions::indexreduce::IndexValue update(functions::indexreduce::IndexValue &old, functions::indexreduce::IndexValue &opOutput, X *extraParams) { - opOutput.value = nd4j::math::nd4j_abs(opOutput.value); - old.value = nd4j::math::nd4j_abs(old.value); + opOutput.value = sd::math::nd4j_abs(opOutput.value); + old.value = sd::math::nd4j_abs(old.value); if (opOutput.value < old.value) return opOutput; @@ -4230,7 +4230,7 
@@ namespace simdOps { static _CUDA_HD inline functions::indexreduce::IndexValue merge( functions::indexreduce::IndexValue f1, functions::indexreduce::IndexValue f2, X *extraParams) { - if (nd4j::math::nd4j_abs(f1.value) < nd4j::math::nd4j_abs(f2.value)) + if (sd::math::nd4j_abs(f1.value) < sd::math::nd4j_abs(f2.value)) return f2; return f1; } @@ -4257,7 +4257,7 @@ namespace simdOps { } static _CUDA_HD inline X startingValue(const X *input) { - return nd4j::DataTypeUtils::infOrMax(); + return sd::DataTypeUtils::infOrMax(); } static _CUDA_HD inline functions::indexreduce::IndexValue startingIndexValue(X *input) { @@ -4328,11 +4328,11 @@ namespace simdOps { if (biasCorrected) { auto ret = static_cast(val.varianceBiasCorrected()); if (ret < static_cast(0.0f)) - return nd4j::math::nd4j_sqrt(val.variance()); + return sd::math::nd4j_sqrt(val.variance()); else - return nd4j::math::nd4j_sqrt(ret); + return sd::math::nd4j_sqrt(ret); } - return nd4j::math::nd4j_sqrt(val.variance()); + return sd::math::nd4j_sqrt(val.variance()); } static _CUDA_HD inline functions::summarystats::SummaryStatsData op(functions::summarystats::SummaryStatsData d1, Z *extraParams) { @@ -4352,7 +4352,7 @@ namespace simdOps { #ifdef __CUDACC__ X length = params[1]; X tid = blockIdx.x * blockDim.x + threadIdx.x; - X rnd = nd4j::math::nd4j_abs(nd4j::math::nd4j_cos(static_cast(clock64()) * static_cast(tid) + static_cast(length) * static_cast(tid))); + X rnd = sd::math::nd4j_abs(sd::math::nd4j_cos(static_cast(clock64()) * static_cast(tid) + static_cast(length) * static_cast(tid))); #else X rnd = static_cast(rand() / RAND_MAX); #endif @@ -4374,7 +4374,7 @@ namespace simdOps { #ifdef __CUDACC__ X length = params[1]; X tid = blockIdx.x * blockDim.x + threadIdx.x; - X rnd = nd4j::math::nd4j_abs(nd4j::math::nd4j_cos(static_cast(clock64()) * static_cast(tid) + static_cast(length) * static_cast(tid))); + X rnd = sd::math::nd4j_abs(sd::math::nd4j_cos(static_cast(clock64()) * static_cast(tid) + 
static_cast(length) * static_cast(tid))); #else X rnd = static_cast(rand() / RAND_MAX); #endif @@ -4390,7 +4390,7 @@ namespace simdOps { no_op_exec_special_cuda op_def static Z op(X d1, Y d2, Z *params) { - return nd4j::math::nd4j_isnan(d1) ? static_cast(d2) : static_cast(d1) ; + return sd::math::nd4j_isnan(d1) ? static_cast(d2) : static_cast(d1) ; } }; @@ -4406,12 +4406,12 @@ namespace simdOps { auto eps = params[2]; int mode = (int) params[3]; if (mode == 0) // equals - if (nd4j::math::nd4j_abs(zd1 - compare) <= eps) + if (sd::math::nd4j_abs(zd1 - compare) <= eps) return zd2; else return zd1; else if (mode == 1) // not equals eps - if (nd4j::math::nd4j_abs(zd1 - compare) > eps) + if (sd::math::nd4j_abs(zd1 - compare) > eps) return zd2; else return zd1; @@ -4436,22 +4436,22 @@ namespace simdOps { else return zd1; else if (mode == 6) // abs_less_than - if (nd4j::math::nd4j_abs(zd1) < compare) + if (sd::math::nd4j_abs(zd1) < compare) return zd2; else return zd1; else if (mode == 7) // abs_greater_than - if (nd4j::math::nd4j_abs(zd1) > compare) + if (sd::math::nd4j_abs(zd1) > compare) return zd2; else return zd1; else if (mode == 8) // is inf - if (nd4j::math::nd4j_isinf(zd1)) + if (sd::math::nd4j_isinf(zd1)) return zd2; else return zd1; else if (mode == 9) // is nan - if (nd4j::math::nd4j_isnan(zd1)) + if (sd::math::nd4j_isnan(zd1)) return zd2; else return zd1; @@ -4466,12 +4466,12 @@ namespace simdOps { else return zd1; else if (mode == 12) // abs_greater_or_equals_than - if (nd4j::math::nd4j_abs(zd1) >= compare) + if (sd::math::nd4j_abs(zd1) >= compare) return zd2; else return zd1; else if (mode == 13) // abs_less_or_equals_than - if (nd4j::math::nd4j_abs(zd1) <= compare) + if (sd::math::nd4j_abs(zd1) <= compare) return zd2; else return zd1; @@ -4494,12 +4494,12 @@ namespace simdOps { auto eps = params[2]; auto mode = static_cast(params[3]); if (mode == 0) // equals - if (nd4j::math::nd4j_abs(d2 - compare) <= eps) + if (sd::math::nd4j_abs(d2 - compare) <= eps) 
return d2; else return d1; else if (mode == 1) // not equals - if (nd4j::math::nd4j_abs(d2 - compare) > eps) + if (sd::math::nd4j_abs(d2 - compare) > eps) return d2; else return d1; @@ -4524,22 +4524,22 @@ namespace simdOps { else return d1; else if (mode == 6) // abs_less_than - if (nd4j::math::nd4j_abs(d2) < compare) + if (sd::math::nd4j_abs(d2) < compare) return d2; else return d1; else if (mode == 7) // abs_greater_than - if (nd4j::math::nd4j_abs(d2) > compare) + if (sd::math::nd4j_abs(d2) > compare) return d2; else return d1; else if (mode == 8) // is inf - if (nd4j::math::nd4j_isinf(d2)) + if (sd::math::nd4j_isinf(d2)) return d2; else return d1; else if (mode == 9) // is nan - if (nd4j::math::nd4j_isnan(d2)) + if (sd::math::nd4j_isnan(d2)) return d2; else return d1; @@ -4554,12 +4554,12 @@ namespace simdOps { else return d1; else if (mode == 12) // abs_greater_or_equals_than - if (nd4j::math::nd4j_abs(d1) >= compare) + if (sd::math::nd4j_abs(d1) >= compare) return d2; else return d1; else if (mode == 13) // abs_less_or_equals_than - if (nd4j::math::nd4j_abs(d1) <= compare) + if (sd::math::nd4j_abs(d1) <= compare) return d2; else return d1; @@ -4585,17 +4585,17 @@ namespace simdOps { // with mode == 0 we do set if d1 equals to compare, and with mode == 1 - we go otherwise int mode = (int) params[3]; if (mode == 0) // equals - if (nd4j::math::nd4j_abs(d1 - compare) <= eps) + if (sd::math::nd4j_abs(d1 - compare) <= eps) return set; else return d1; - //return nd4j::math::nd4j_abs(d1 - compare) <= eps ? set : d1; + //return sd::math::nd4j_abs(d1 - compare) <= eps ? set : d1; else if (mode == 1) // not equals - if (nd4j::math::nd4j_abs(d1 - compare) > eps) + if (sd::math::nd4j_abs(d1 - compare) > eps) return set; else return d1; - //return nd4j::math::nd4j_abs(d1 - compare) > eps ? set : d1; + //return sd::math::nd4j_abs(d1 - compare) > eps ? 
set : d1; else if (mode == 2) // less_than if (d1 < compare) return set; @@ -4617,22 +4617,22 @@ namespace simdOps { else return d1; else if (mode == 6) // abs_less_than - if (nd4j::math::nd4j_abs(d1) < compare) + if (sd::math::nd4j_abs(d1) < compare) return set; else return d1; else if (mode == 7) // abs_greater_than - if (nd4j::math::nd4j_abs(d1) > compare) + if (sd::math::nd4j_abs(d1) > compare) return set; else return d1; else if (mode == 8) // is inf - if (nd4j::math::nd4j_isinf(d1)) + if (sd::math::nd4j_isinf(d1)) return set; else return d1; else if (mode == 9) // is nan - if (nd4j::math::nd4j_isnan(d1)) + if (sd::math::nd4j_isnan(d1)) return set; else return d1; @@ -4647,12 +4647,12 @@ namespace simdOps { else return d1; else if (mode == 12) // abs_greater_or_equals_than - if (nd4j::math::nd4j_abs(d1) >= compare) + if (sd::math::nd4j_abs(d1) >= compare) return set; else return d1; else if (mode == 13) // abs_less_or_equals_than - if (nd4j::math::nd4j_abs(d1) <= compare) + if (sd::math::nd4j_abs(d1) <= compare) return set; else return d1; diff --git a/libnd4j/include/ops/random_ops.h b/libnd4j/include/ops/random_ops.h index 8eb25c84c..844f88ed3 100644 --- a/libnd4j/include/ops/random_ops.h +++ b/libnd4j/include/ops/random_ops.h @@ -28,9 +28,9 @@ #endif // since we can't inherit/overwrite static methods - we just define default impls -#define method_idx random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator* rng, T *extraParams) { return -1.0f; } -#define method_X random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator* rng, T *extraParams) { return -2.0f; } -#define method_XY random_def T op(T valueX, T valueY, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator* rng, T *extraParams) { return -3.0f; } +#define method_idx random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator* rng, T *extraParams) { return -1.0f; } +#define method_X random_def T op(T valueX, Nd4jLong idx, Nd4jLong 
length, sd::graph::RandomGenerator* rng, T *extraParams) { return -2.0f; } +#define method_XY random_def T op(T valueX, T valueY, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator* rng, T *extraParams) { return -3.0f; } #define no_exec_special static const bool requiresSpecial = false; static inline void specialOp(Nd4jPointer state, T *x, Nd4jLong *xShapeBuffer, T *y, Nd4jLong *yShapeBuffer, T *z, Nd4jLong *zShapeBuffer, T *extraArguments) { } @@ -59,7 +59,7 @@ namespace randomOps { method_idx method_X - random_def T op(T valueX, T valueY, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, T valueY, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T threshold = extraParams[0]; T randVal = helper->relativeT(idx); @@ -80,7 +80,7 @@ namespace randomOps { method_XY method_X - random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { return helper->relativeT(idx, extraParams[0], extraParams[1]); } }; @@ -96,11 +96,11 @@ namespace randomOps { method_XY - random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { return extraParams[0] >= helper->relativeT(idx) ? (T) 1.0f : (T) 0.0f; } - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { return valueX >= helper->relativeT(idx) ? 
(T) 1.0f : (T) 0.0f; } }; @@ -117,15 +117,15 @@ namespace randomOps { method_XY - random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T lambda = extraParams[0]; - T x = helper->relativeT(idx, -nd4j::DataTypeUtils::template max() / 10 , nd4j::DataTypeUtils::template max() / 10); - return x <= (T)0.f ? (T)0.f : (T)1.f - nd4j::math::nd4j_pow((T) M_E, -(lambda * x)); + T x = helper->relativeT(idx, -sd::DataTypeUtils::template max() / 10 , sd::DataTypeUtils::template max() / 10); + return x <= (T)0.f ? (T)0.f : (T)1.f - sd::math::nd4j_pow((T) M_E, -(lambda * x)); } - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T lambda = extraParams[0]; - return valueX <= (T)0.f ? (T)0.f : (T)1.f - nd4j::math::nd4j_pow((T) M_E, -(lambda * valueX)); + return valueX <= (T)0.f ? (T)0.f : (T)1.f - sd::math::nd4j_pow((T) M_E, -(lambda * valueX)); } }; @@ -137,15 +137,15 @@ namespace randomOps { method_XY - random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T lambda = extraParams[0]; - T x = helper->relativeT(idx, -nd4j::DataTypeUtils::template max() / 10 , nd4j::DataTypeUtils::template max() / 10); - return x <= (T)0.f ? (T)0.f : nd4j::math::nd4j_igammac(nd4j::math::nd4j_floor(x), lambda); + T x = helper->relativeT(idx, -sd::DataTypeUtils::template max() / 10 , sd::DataTypeUtils::template max() / 10); + return x <= (T)0.f ? 
(T)0.f : sd::math::nd4j_igammac(sd::math::nd4j_floor(x), lambda); } - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T lambda = extraParams[0]; - return valueX <= (T)0.f ? (T)0.f : (T)nd4j::math::nd4j_igammac(nd4j::math::nd4j_floor(valueX), lambda); + return valueX <= (T)0.f ? (T)0.f : (T)sd::math::nd4j_igammac(sd::math::nd4j_floor(valueX), lambda); } }; @@ -157,17 +157,17 @@ namespace randomOps { method_XY - random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T alpha = extraParams[0]; T beta = extraParams[1]; - T x = helper->relativeT(idx, -nd4j::DataTypeUtils::template max() / 10 , nd4j::DataTypeUtils::template max() / 10); - return x <= (T)0.f ? (T)0.f : nd4j::math::nd4j_igamma(alpha, x * beta); + T x = helper->relativeT(idx, -sd::DataTypeUtils::template max() / 10 , sd::DataTypeUtils::template max() / 10); + return x <= (T)0.f ? (T)0.f : sd::math::nd4j_igamma(alpha, x * beta); } - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T alpha = extraParams[0]; T beta = extraParams[1]; - return valueX <= (T)0.f ? (T)0.f : nd4j::math::nd4j_igamma(alpha, beta * valueX); + return valueX <= (T)0.f ? 
(T)0.f : sd::math::nd4j_igamma(alpha, beta * valueX); } }; @@ -185,7 +185,7 @@ namespace randomOps { method_XY // please note: prob is chance to retain original value - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T randVal = helper->relativeT(idx); return randVal >= extraParams[0] ? (T) 0.0f : valueX; } @@ -202,7 +202,7 @@ namespace randomOps { method_XY // please note: prob is chance to retain original value - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T randVal = helper->relativeT(idx); // extraParams[0] == p // [1] = a @@ -226,7 +226,7 @@ namespace randomOps { method_XY // please note: prob is chance to retain original value - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T prob = extraParams[0]; T randVal = helper->relativeT(idx); return randVal >= prob ? 
(T) 0.0f : valueX / prob; @@ -244,7 +244,7 @@ namespace randomOps { method_X method_XY - random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T from = extraParams[0]; T to = extraParams[1]; T step = extraParams[2]; @@ -266,15 +266,15 @@ namespace randomOps { method_XY - random_def T op(Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T lambda = extraParams[0]; - T x = helper->relativeT(idx, nd4j::DataTypeUtils::template min(), (T)1.f); - return -nd4j::math::nd4j_log((T)1.f - x) / lambda; + T x = helper->relativeT(idx, sd::DataTypeUtils::template min(), (T)1.f); + return -sd::math::nd4j_log((T)1.f - x) / lambda; } - random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, nd4j::graph::RandomGenerator *helper, T *extraParams) { + random_def T op(T valueX, Nd4jLong idx, Nd4jLong length, sd::graph::RandomGenerator *helper, T *extraParams) { T lambda = extraParams[0]; - return -nd4j::math::nd4j_log((T)1.f - valueX) / lambda; // valueX must be within (0, 1] + return -sd::math::nd4j_log((T)1.f - valueX) / lambda; // valueX must be within (0, 1] } }; diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index 354f8e328..50a50752e 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include namespace randomOps { @@ -63,16 +63,16 @@ namespace randomOps { __shared__ char yOrder; __shared__ char zOrder; - __shared__ nd4j::graph::RandomGenerator *rng; + __shared__ sd::graph::RandomGenerator *rng; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator *devRng; + __shared__ 
sd::graph::RandomGenerator *devRng; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - rng = (nd4j::graph::RandomGenerator*) shmem; + rng = (sd::graph::RandomGenerator*) shmem; cB = shmem; - devRng = reinterpret_cast (state); + devRng = reinterpret_cast (state); dB = reinterpret_cast (state); xLength = shape::length(xShapeBuffer); @@ -89,7 +89,7 @@ namespace randomOps { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -148,8 +148,8 @@ namespace randomOps { * Z will hold results */ - //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + //sd::random::RandomBuffer *buffer = reinterpret_cast (state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); // TODO: we probably might want to skip this sum, and state that probabilities array should be real probabilities, i.e. 
should sum to 1.0 //T probSum = extraArguments[0]; @@ -162,8 +162,8 @@ namespace randomOps { auto zEWS = shape::elementWiseStride(zShapeBuffer); int elementsPerThread = zLength / TAD_THRESHOLD; - int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); + int _threads = sd::math::nd4j_max(1, elementsPerThread); + _threads = sd::math::nd4j_min(_threads, sd::Environment::getInstance()->maxThreads()); if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { auto func = PRAGMA_THREADS_FOR { @@ -244,19 +244,19 @@ namespace randomOps { __shared__ T *tZ; - __shared__ nd4j::graph::RandomGenerator* rng; + __shared__ sd::graph::RandomGenerator* rng; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator *devRng; + __shared__ sd::graph::RandomGenerator *devRng; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - rng = reinterpret_cast(shmem); + rng = reinterpret_cast(shmem); cB = shmem; - devRng = reinterpret_cast (state); + devRng = reinterpret_cast (state); dB = reinterpret_cast (state); - tZ = reinterpret_cast(shmem + sizeof(nd4j::graph::RandomGenerator)); + tZ = reinterpret_cast(shmem + sizeof(sd::graph::RandomGenerator)); zLength = shape::length(zShapeBuffer); zEWS = shape::elementWiseStride(zShapeBuffer); @@ -274,7 +274,7 @@ namespace randomOps { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -292,11 +292,11 @@ namespace randomOps { T realMean0 = y == z ? 
mean : y[e * yEWS]; - z[e * zEWS] = (nd4j::math::nd4j_sqrt(t * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; + z[e * zEWS] = (sd::math::nd4j_sqrt(t * sd::math::nd4j_log(r0)) * sd::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; if (epm < zLength) { T realMean1 = y == z ? mean : y[epm * yEWS]; - z[epm * zEWS] = (nd4j::math::nd4j_sqrt(t * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; + z[epm * zEWS] = (sd::math::nd4j_sqrt(t * sd::math::nd4j_log(r0)) * sd::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; } } } @@ -314,16 +314,16 @@ namespace randomOps { auto middle = zLength % 2 + zLength / 2; int elementsPerThread = middle / TAD_THRESHOLD; - int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); + int _threads = sd::math::nd4j_max(1, elementsPerThread); + _threads = sd::math::nd4j_min(_threads, sd::Environment::getInstance()->maxThreads()); int span = (middle / _threads) + 8; // we're enforcing even chunks, since it's mandatory for this algorithm span -= span % 2; - //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + //sd::random::RandomBuffer *buffer = reinterpret_cast (state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); const T mean = extraArguments[0]; const T stddev = extraArguments[1]; @@ -339,14 +339,14 @@ namespace randomOps { T realMean0 = y == z ? mean : y[e * yEWS]; - auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * - nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; + auto z0 = (sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * + sd::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; z[e * zEWS] = z0; if (epm < zLength) { T realMean1 = y == z ? 
mean : y[epm * yEWS]; - auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * - nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; + auto z1 = (sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * + sd::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; z[epm * zEWS] = z1; } } @@ -381,15 +381,15 @@ namespace randomOps { __shared__ int yEWS; __shared__ int zEWS; - __shared__ nd4j::graph::RandomGenerator* rng; + __shared__ sd::graph::RandomGenerator* rng; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator *devRng; + __shared__ sd::graph::RandomGenerator *devRng; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - rng = reinterpret_cast(shmem); + rng = reinterpret_cast(shmem); cB = shmem; - devRng = reinterpret_cast(state); + devRng = reinterpret_cast(state); dB = reinterpret_cast (state); zLength = shape::length(zShapeBuffer); @@ -399,7 +399,7 @@ namespace randomOps { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -433,12 +433,12 @@ namespace randomOps { auto zEWS = shape::elementWiseStride(zShapeBuffer); int elementsPerThread = zLength / TAD_THRESHOLD; - int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); + int _threads = sd::math::nd4j_max(1, elementsPerThread); + _threads = sd::math::nd4j_min(_threads, sd::Environment::getInstance()->maxThreads()); T prob = extraArguments[1]; - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -488,15 +488,15 @@ namespace randomOps { __shared__ int yEWS; __shared__ int 
zEWS; - __shared__ nd4j::graph::RandomGenerator* rng; + __shared__ sd::graph::RandomGenerator* rng; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator *devRng; + __shared__ sd::graph::RandomGenerator *devRng; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - rng = (nd4j::graph::RandomGenerator*) shmem; + rng = (sd::graph::RandomGenerator*) shmem; cB = shmem; - devRng = reinterpret_cast (state); + devRng = reinterpret_cast (state); dB = reinterpret_cast (state); zLength = shape::length(zShapeBuffer); @@ -506,7 +506,7 @@ namespace randomOps { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -541,13 +541,13 @@ namespace randomOps { auto zEWS = shape::elementWiseStride(zShapeBuffer); int elementsPerThread = zLength / TAD_THRESHOLD; - int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); + int _threads = sd::math::nd4j_max(1, elementsPerThread); + _threads = sd::math::nd4j_min(_threads, sd::Environment::getInstance()->maxThreads()); T prob = extraArguments[1]; - //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + //sd::random::RandomBuffer *buffer = reinterpret_cast (state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -577,7 +577,7 @@ namespace randomOps { template class TruncatedNormalDistribution { private: - static inline _CUDA_HD T step(nd4j::graph::RandomGenerator* rng, T mean, T stddev, Nd4jLong e, Nd4jLong middle, T& z) { + static inline _CUDA_HD T step(sd::graph::RandomGenerator* rng, T mean, T stddev, Nd4jLong e, Nd4jLong 
middle, T& z) { auto epm = e + middle; const T two_pi = static_cast(2.0f) * static_cast(3.14159265358979323846); const T epsilon = static_cast(1.e-5f); @@ -587,12 +587,12 @@ namespace randomOps { T realMean0 = mean; - auto z0 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; + auto z0 = (sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * sd::math::nd4j_cos(two_pi * r1)) * stddev + realMean0; z = z0; if (epm < middle) { T realMean1 = mean; - auto z1 = (nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * - nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; + auto z1 = (sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * + sd::math::nd4j_sin(two_pi * r1)) * stddev + realMean1; z = z1; } return z; @@ -619,20 +619,20 @@ namespace randomOps { __shared__ T *tZ; - __shared__ nd4j::graph::RandomGenerator* rng; + __shared__ sd::graph::RandomGenerator* rng; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator* devRng; + __shared__ sd::graph::RandomGenerator* devRng; __shared__ Nd4jLong middle; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - rng = reinterpret_cast(shmem); + rng = reinterpret_cast(shmem); cB = shmem; - devRng = reinterpret_cast (state); + devRng = reinterpret_cast (state); dB = reinterpret_cast (state); - tZ = reinterpret_cast(shmem + sizeof(nd4j::graph::RandomGenerator)); + tZ = reinterpret_cast(shmem + sizeof(sd::graph::RandomGenerator)); zLength = shape::length(zShapeBuffer); zEWS = shape::elementWiseStride(zShapeBuffer); @@ -650,7 +650,7 @@ namespace randomOps { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -660,13 +660,13 @@ namespace randomOps { 
GaussianDistribution::specialOpCuda(state, x, xShapeBuffer, y, yShapeBuffer, z, zShapeBuffer, extraArguments); __syncthreads(); - T ds = nd4j::math::nd4j_abs(stddev) * static_cast(2.0f); + T ds = sd::math::nd4j_abs(stddev) * static_cast(2.0f); for (Nd4jLong e = tid; e < zLength; e += step) { if (z[e] > mean + ds || z[e] < mean - ds) { z[e] = TruncatedNormalDistribution::step(rng, mean, stddev, e, middle, z[e]); if (z[e] > mean + ds || z[e] < mean - ds) - z[e] = mean + nd4j::DataTypeUtils::min(); + z[e] = mean + sd::DataTypeUtils::min(); } } } @@ -678,14 +678,14 @@ namespace randomOps { Nd4jLong zLength = shape::length(zShapeBuffer); //auto yEWS = shape::elementWiseStride(yShapeBuffer); //auto zEWS = shape::elementWiseStride(zShapeBuffer); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); T mean = extraArguments[0]; T stddev = extraArguments[1]; - T ds = nd4j::math::nd4j_abs(stddev) * (T) 2.0f; + T ds = sd::math::nd4j_abs(stddev) * (T) 2.0f; Nd4jLong middle = zLength / 2 + (zLength % 2); int elementsPerThread = middle / TAD_THRESHOLD; - int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); + int _threads = sd::math::nd4j_max(1, elementsPerThread); + _threads = sd::math::nd4j_min(_threads, sd::Environment::getInstance()->maxThreads()); const T epsilon = static_cast(1e-5); @@ -695,7 +695,7 @@ namespace randomOps { z[e] = step(rng, mean, stddev, e, middle, z[e]); if (z[e] > mean + ds || z[e] < mean - ds) - z[e] = mean + nd4j::DataTypeUtils::min(); + z[e] = mean + sd::DataTypeUtils::min(); } } }; @@ -731,20 +731,20 @@ namespace randomOps { __shared__ T *tZ; - __shared__ nd4j::graph::RandomGenerator* rng; + __shared__ sd::graph::RandomGenerator* rng; __shared__ unsigned char *cB; __shared__ unsigned char *dB; - __shared__ nd4j::graph::RandomGenerator* devRng; + __shared__ sd::graph::RandomGenerator* 
devRng; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; - rng = reinterpret_cast(state); + rng = reinterpret_cast(state); cB = shmem; - devRng = reinterpret_cast(state); + devRng = reinterpret_cast(state); dB = reinterpret_cast (state); - tZ = reinterpret_cast(shmem + sizeof(nd4j::graph::RandomGenerator)); + tZ = reinterpret_cast(shmem + sizeof(sd::graph::RandomGenerator)); zLength = shape::length(zShapeBuffer); zEWS = shape::elementWiseStride(zShapeBuffer); @@ -762,7 +762,7 @@ namespace randomOps { __syncthreads(); // using this loop instead of memcpy - for (int e = threadIdx.x; e < sizeof(nd4j::graph::RandomGenerator); e+= blockDim.x) + for (int e = threadIdx.x; e < sizeof(sd::graph::RandomGenerator); e+= blockDim.x) cB[e] = dB[e]; __syncthreads(); @@ -780,11 +780,11 @@ namespace randomOps { T realMean = y == z ? mean : y[e * yEWS]; - z[e *zEWS] = nd4j::math::nd4j_exp((nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean); + z[e *zEWS] = sd::math::nd4j_exp((sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * sd::math::nd4j_cos(two_pi * r1)) * stddev + realMean); if (epm < zLength) { realMean = y == z ? mean : y[epm * yEWS]; - z[epm *zEWS] = nd4j::math::nd4j_exp((nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean); + z[epm *zEWS] = sd::math::nd4j_exp((sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * sd::math::nd4j_sin(two_pi * r1)) * stddev + realMean); } } } @@ -801,16 +801,16 @@ namespace randomOps { auto middle = zLength % 2 == 0 ? 
zLength / 2 : zLength / 2 + 1; int elementsPerThread = middle / TAD_THRESHOLD; - int _threads = nd4j::math::nd4j_max(1, elementsPerThread); - _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads()); + int _threads = sd::math::nd4j_max(1, elementsPerThread); + _threads = sd::math::nd4j_min(_threads, sd::Environment::getInstance()->maxThreads()); int span = (zLength / _threads) + 8; // we're enforcing even chunks, since it's mandatory for this algorithm span -= span % 2; -// auto buffer = reinterpret_cast (state); - nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); +// auto buffer = reinterpret_cast (state); + sd::graph::RandomGenerator* rng = reinterpret_cast(state); const T mean = extraArguments[0]; const T stddev = extraArguments[1]; @@ -827,11 +827,11 @@ namespace randomOps { T realMean = y == z ? mean : y[e * yEWS]; - z[e * zEWS] = nd4j::math::nd4j_exp((nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_cos(two_pi * r1)) * stddev + realMean); + z[e * zEWS] = sd::math::nd4j_exp((sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * sd::math::nd4j_cos(two_pi * r1)) * stddev + realMean); if (epm < zLength) { realMean = y == z ? 
mean : y[epm * yEWS]; - z[epm * zEWS] = nd4j::math::nd4j_exp((nd4j::math::nd4j_sqrt(static_cast(-2.0f) * nd4j::math::nd4j_log(r0)) * nd4j::math::nd4j_sin(two_pi * r1)) * stddev + realMean); + z[epm * zEWS] = sd::math::nd4j_exp((sd::math::nd4j_sqrt(static_cast(-2.0f) * sd::math::nd4j_log(r0)) * sd::math::nd4j_sin(two_pi * r1)) * stddev + realMean); } } }; diff --git a/libnd4j/include/ops/specials.h b/libnd4j/include/ops/specials.h index 94fce8477..77726e799 100644 --- a/libnd4j/include/ops/specials.h +++ b/libnd4j/include/ops/specials.h @@ -27,10 +27,10 @@ #define TAD_THRESHOLD 2 #endif -#include +#include #include -namespace nd4j { +namespace sd { class NDArray; //FIXME: get rid of this redefinition diff --git a/libnd4j/include/ops/specials_sparse.h b/libnd4j/include/ops/specials_sparse.h index 741c9cce8..cd0e2f6b5 100644 --- a/libnd4j/include/ops/specials_sparse.h +++ b/libnd4j/include/ops/specials_sparse.h @@ -23,9 +23,9 @@ #ifndef LIBND4J_SPECIALS_SPARSE_H #define LIBND4J_SPECIALS_SPARSE_H -#include +#include -namespace nd4j { +namespace sd { namespace sparse { template diff --git a/libnd4j/include/performance/benchmarking/BenchmarkSuit.h b/libnd4j/include/performance/benchmarking/BenchmarkSuit.h index 1a77dbd9f..7805e570e 100644 --- a/libnd4j/include/performance/benchmarking/BenchmarkSuit.h +++ b/libnd4j/include/performance/benchmarking/BenchmarkSuit.h @@ -22,12 +22,12 @@ #define LIBND4J_BENCHMARKSUIT_H #include -#include -#include -#include -#include +#include +#include +#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT BenchmarkSuit { public: BenchmarkSuit() = default; diff --git a/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h b/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h index dc2b63a4d..6b2314b96 100644 --- a/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h +++ b/libnd4j/include/performance/benchmarking/FullBenchmarkSuit.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { class 
FullBenchmarkSuit : public BenchmarkSuit { public: std::string runSuit() override; diff --git a/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h b/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h index 35215d032..65a74b1fe 100644 --- a/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h +++ b/libnd4j/include/performance/benchmarking/LightBenchmarkSuit.h @@ -23,7 +23,7 @@ #include -namespace nd4j { +namespace sd { class LightBenchmarkSuit : public BenchmarkSuit { public: std::string runSuit() override; diff --git a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp index b4960bc90..0f020d348 100644 --- a/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/FullBenchmarkSuit.cpp @@ -78,7 +78,7 @@ int limit3 = 1; #endif -namespace nd4j { +namespace sd { static std::string layerNormBenchmark() { std::string output; @@ -126,7 +126,7 @@ namespace nd4j { return ctx; }; - nd4j::ops::layer_norm layerNorm; + sd::ops::layer_norm layerNorm; DeclarableBenchmark benchmark(layerNorm, "layer norm"); output += helper.runOperationSuit(&benchmark, generator, batch, "Layer Norm"); @@ -141,7 +141,7 @@ namespace nd4j { BoolParameters ncdhw("ncdhw"); //1 = ndhwc ParametersBatch batch({&ncdhw}); - nd4j::ops::maxpool3dnew maxpool3Dnew; + sd::ops::maxpool3dnew maxpool3Dnew; DeclarableBenchmark benchmark(maxpool3Dnew, "maxPool3d"); #ifdef _RELEASE @@ -207,7 +207,7 @@ namespace nd4j { BoolParameters ncdhw("ncdhw"); //1 = ndhwc ParametersBatch batch({&ncdhw}); - nd4j::ops::conv3dnew conv3Dnew; + sd::ops::conv3dnew conv3Dnew; DeclarableBenchmark benchmark(conv3Dnew, "conv3d"); #ifdef _RELEASE @@ -284,7 +284,7 @@ namespace nd4j { #endif ParametersBatch batch({&format, &mb, &nInOut}); - nd4j::ops::lstmBlock lstmBlock; + sd::ops::lstmBlock lstmBlock; DeclarableBenchmark benchmark(lstmBlock, "lstm"); int 
seqLength = 32; @@ -414,7 +414,7 @@ namespace nd4j { return ctx; }; - nd4j::ops::batchnorm batchnorm; + sd::ops::batchnorm batchnorm; DeclarableBenchmark benchmark(batchnorm, "batchnorm"); output += helper.runOperationSuit(&benchmark, generator, batch, "Batch Normalization"); @@ -471,11 +471,11 @@ namespace nd4j { return ctx; }; - nd4j::ops::avgpool2d avgpool2d; + sd::ops::avgpool2d avgpool2d; DeclarableBenchmark benchmark1(avgpool2d, "avgpool"); output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pooling 2d Operation"); - nd4j::ops::maxpool2d maxpool2d; + sd::ops::maxpool2d maxpool2d; DeclarableBenchmark benchmark2(maxpool2d, "maxpool"); output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pooling 2d Operation"); return output; @@ -497,7 +497,7 @@ namespace nd4j { PredefinedParameters hw("hw", {8}); #endif ParametersBatch batch({&nhwc, &k, &c, &hw}); - nd4j::ops::conv2d conv2d; + sd::ops::conv2d conv2d; DeclarableBenchmark benchmark(conv2d, "conv2d"); auto generator = PARAMETRIC_D() { @@ -573,27 +573,27 @@ namespace nd4j { return ctx; }; - nd4j::ops::LegacyRandomOp unif(random::UniformDistribution); + sd::ops::LegacyRandomOp unif(random::UniformDistribution); DeclarableBenchmark dbU(unif, "uniform"); output += helper.runOperationSuit(&dbU, gen01, batch, "Uniform Distribution"); - nd4j::ops::LegacyRandomOp gaussian(random::GaussianDistribution); + sd::ops::LegacyRandomOp gaussian(random::GaussianDistribution); DeclarableBenchmark dbG(gaussian, "gaussian"); output += helper.runOperationSuit(&dbG, gen01, batch, "Gaussian Distribution"); - nd4j::ops::LegacyRandomOp trunc(random::TruncatedNormalDistribution); + sd::ops::LegacyRandomOp trunc(random::TruncatedNormalDistribution); DeclarableBenchmark dbTU(unif, "trunc.norm"); output += helper.runOperationSuit(&dbTU, gen01, batch, "Truncated Normal Distribution"); - nd4j::ops::LegacyRandomOp ln(random::LogNormalDistribution); + sd::ops::LegacyRandomOp 
ln(random::LogNormalDistribution); DeclarableBenchmark dbLN(ln, "uniform"); output += helper.runOperationSuit(&dbLN, gen01, batch, "Log Normal Distribution"); - nd4j::ops::LegacyRandomOp bernoulli(random::BernoulliDistribution); + sd::ops::LegacyRandomOp bernoulli(random::BernoulliDistribution); DeclarableBenchmark dbB(bernoulli, "bernoulli"); output += helper.runOperationSuit(&dbB, gen05, batch, "Bernoulli Distribution"); - nd4j::ops::LegacyRandomOp dropout(random::BernoulliDistribution); + sd::ops::LegacyRandomOp dropout(random::BernoulliDistribution); DeclarableBenchmark dbD(dropout, "dropout"); output += helper.runOperationSuit(&dbD, gen05, batch, "Dropout"); @@ -764,7 +764,7 @@ namespace nd4j { return ctx; }; - nd4j::ops::matmul mmul; + sd::ops::matmul mmul; DeclarableBenchmark benchmark(mmul, "mmul (batch)"); output += helper.runOperationSuit(&benchmark, generator, b, "MMul (batch)"); @@ -822,7 +822,7 @@ namespace nd4j { ParametersBatch batch({&length}); //Gather 1D tests - 1d ref, 1d indices, 1d updates -> 1d output - nd4j::ops::scatter_upd scatter_update1; + sd::ops::scatter_upd scatter_update1; DeclarableBenchmark sa1d(scatter_update1, "scatter_update1d"); auto generator = PARAMETRIC_D() { auto ctx = new Context(1); @@ -856,7 +856,7 @@ namespace nd4j { IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^16 in steps of 2: 2^10, ..., 2^20 PredefinedParameters cols("cols", {32}); ParametersBatch batch2({&rows, &cols}); - nd4j::ops::scatter_upd scatter_update2; + sd::ops::scatter_upd scatter_update2; DeclarableBenchmark sa2d(scatter_update2, "scatter_update2d"); auto generator2 = PARAMETRIC_D() { auto ctx = new Context(1); @@ -891,7 +891,7 @@ namespace nd4j { IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); PredefinedParameters sz1("sz1", {32}); ParametersBatch batch3({&sz0, &sz1}); - nd4j::ops::scatter_upd scatter_update3; + sd::ops::scatter_upd scatter_update3; DeclarableBenchmark sa3d(scatter_update3, "scatter3d"); auto 
generator3 = PARAMETRIC_D() { auto ctx = new Context(1); @@ -932,7 +932,7 @@ namespace nd4j { ParametersBatch batch({&length}); //Gather 1D tests - 1d input, 1d indices -> 1d output - nd4j::ops::gather gather1; + sd::ops::gather gather1; DeclarableBenchmark gather1d(gather1, "gather1d"); auto generator = PARAMETRIC_D() { auto ctx = new Context(1); @@ -962,7 +962,7 @@ namespace nd4j { IntPowerParameters rows("rows", 2, 8, gatherOpPowLimit2, 4); //2^10 to 2^20 in steps of 2: 2^10, ..., 2^20 PredefinedParameters cols("cols", {32}); ParametersBatch batch2({&rows, &cols}); - nd4j::ops::gather gather2; + sd::ops::gather gather2; DeclarableBenchmark gather2d(gather2, "gather2d"); auto generator2 = PARAMETRIC_D() { auto ctx = new Context(1); @@ -994,7 +994,7 @@ namespace nd4j { IntPowerParameters sz0("sz0", 2, 8, gatherOpPowLimit3, 4); //2^8 to 2^16 in steps of 4 PredefinedParameters sz1("sz1", {32}); ParametersBatch batch3({&sz0, &sz1}); - nd4j::ops::gather gather3; + sd::ops::gather gather3; DeclarableBenchmark gather3d(gather3, "gather3d"); auto generator3 = PARAMETRIC_D() { auto ctx = new Context(1); @@ -1166,7 +1166,7 @@ namespace nd4j { name += "Broadcast Matrix Add (Custom) - Rank"; name += std::to_string(rank); - nd4j::ops::add op; + sd::ops::add op; DeclarableBenchmark benchmark(op, "add"); output += helper.runOperationSuit(&benchmark, generator, b, name.c_str()); } @@ -1207,7 +1207,7 @@ namespace nd4j { }; std::string s("add"); - nd4j::ops::add op; + sd::ops::add op; DeclarableBenchmark benchmark(op, "add"); output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d"); return output; @@ -1432,7 +1432,7 @@ namespace nd4j { return ctx; }; - nd4j::ops::argmax opArgmax; + sd::ops::argmax opArgmax; DeclarableBenchmark dbArgmax(opArgmax, "stridedArgmax"); output += helper.runOperationSuit(&dbArgmax, generator3, batch, "Strided Argmax"); return output; @@ -1508,7 +1508,7 @@ namespace nd4j { std::string s5("Argmax Along Dimension - "); 
s5 += std::to_string(length[i]); - nd4j::ops::argmax opArgmax; + sd::ops::argmax opArgmax; DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str()); } @@ -1537,7 +1537,7 @@ namespace nd4j { output += helper.runOperationSuit(&rbSum, (const std::function)(generator), batch, "Sum - Full Array Reduction"); //Index reduction - nd4j::ops::argmax opArgmax; + sd::ops::argmax opArgmax; DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); auto generator3 = PARAMETRIC_D(){ auto ctx = new Context(1); @@ -1640,7 +1640,7 @@ namespace nd4j { output += helper.runOperationSuit(&erf, generator, batch, "Error Function (Erf)"); ParametersBatch batch2({&length}); - nd4j::ops::polygamma op1; + sd::ops::polygamma op1; DeclarableBenchmark pg(op1, "polygamma"); auto generator2 = PARAMETRIC_D() { auto ctx = new Context(1); @@ -1657,7 +1657,7 @@ namespace nd4j { IntPowerParameters lengthBetaInc("length", 2, 10, heavyPowLimit, 4); //2^10 to 2^22 in steps of 4 ParametersBatch batch3({&lengthBetaInc}); - nd4j::ops::betainc op2; + sd::ops::betainc op2; DeclarableBenchmark binc(op2, "betainc"); auto generator3 = PARAMETRIC_D() { auto ctx = new Context(1); diff --git a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp index 9e179db7f..3c1011e09 100644 --- a/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp +++ b/libnd4j/include/performance/benchmarking/impl/LightBenchmarkSuit.cpp @@ -32,7 +32,7 @@ #endif -namespace nd4j { +namespace sd { template static std::string transformBenchmark() { @@ -262,7 +262,7 @@ namespace nd4j { output += helper.runOperationSuit(&rbMax, (const std::function)(generator), batch, "Maximum - Full Array Reduction"); //Index reduction - nd4j::ops::argmax opArgmax; + sd::ops::argmax opArgmax; DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); auto generator3 = PARAMETRIC_D(){ auto ctx = new Context(1); @@ 
-353,7 +353,7 @@ namespace nd4j { std::string s5("Argmax Along Dimension - "); s5 += std::to_string(length[i]); - nd4j::ops::argmax opArgmax; + sd::ops::argmax opArgmax; DeclarableBenchmark dbArgmax(opArgmax, "Argmax"); output += helper.runOperationSuit(&dbArgmax, generator3, batch, s5.c_str()); } @@ -371,7 +371,7 @@ namespace nd4j { PredefinedParameters k("k", {2, 3}); ParametersBatch batch({&nhwc, &k}); - nd4j::ops::conv2d conv2d; + sd::ops::conv2d conv2d; DeclarableBenchmark benchmark(conv2d, "conv2d"); int hw = 64; @@ -462,11 +462,11 @@ namespace nd4j { return ctx; }; - nd4j::ops::avgpool2d avgpool2d; + sd::ops::avgpool2d avgpool2d; DeclarableBenchmark benchmark1(avgpool2d, "avgpool"); output += helper.runOperationSuit(&benchmark1, generator, batch, "Average Pool 2d"); - nd4j::ops::maxpool2d maxpool2d; + sd::ops::maxpool2d maxpool2d; DeclarableBenchmark benchmark2(maxpool2d, "maxpool"); output += helper.runOperationSuit(&benchmark2, generator, batch, "Max Pool 2d"); return output; @@ -483,7 +483,7 @@ namespace nd4j { int n = 128; ParametersBatch batch({&format, &mb}); - nd4j::ops::lstmBlock lstmBlock; + sd::ops::lstmBlock lstmBlock; DeclarableBenchmark benchmark(lstmBlock, "lstm"); int seqLength = 8; @@ -585,7 +585,7 @@ namespace nd4j { }; std::string s("add"); - nd4j::ops::add op; + sd::ops::add op; DeclarableBenchmark benchmark(op, "add"); output += helper.runOperationSuit(&benchmark, generator, batch, "Broadcast (Custom) Add - 2d"); return output; @@ -593,9 +593,9 @@ namespace nd4j { std::string LightBenchmarkSuit::runSuit() { #ifdef RELEASE_BUILD - std::vector dtypes({nd4j::DataType::FLOAT32, nd4j::DataType::HALF}); + std::vector dtypes({sd::DataType::FLOAT32, sd::DataType::HALF}); #else - std::vector dtypes({nd4j::DataType::FLOAT32}); + std::vector dtypes({sd::DataType::FLOAT32}); #endif std::string result; diff --git a/libnd4j/include/samediff.h b/libnd4j/include/samediff.h new file mode 100644 index 000000000..4907c9802 --- /dev/null +++ 
b/libnd4j/include/samediff.h @@ -0,0 +1,39 @@ +/******************************************************************************* + * Copyright (c) 2020 Konduit K.K. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com +// + +#ifndef _SAMEDIFF_H +#define _SAMEDIFF_H + +/** + * This file is a basic include that pulls everything in + */ + +// basic NDArray-related includes +#include +#include + +// basic Graph-related includes +#include +#include + +// ML ops includes +#include + +#endif //S_SAMEDIFF_H diff --git a/libnd4j/blas/BlasVersionHelper.h b/libnd4j/include/system/BlasVersionHelper.h similarity index 96% rename from libnd4j/blas/BlasVersionHelper.h rename to libnd4j/include/system/BlasVersionHelper.h index 93e8d75e3..7cc97a26c 100644 --- a/libnd4j/blas/BlasVersionHelper.h +++ b/libnd4j/include/system/BlasVersionHelper.h @@ -21,11 +21,11 @@ #ifndef SAMEDIFF_BLASVERSIONHELPER_H #define SAMEDIFF_BLASVERSIONHELPER_H -#include +#include #include #include -namespace nd4j { +namespace sd { class ND4J_EXPORT BlasVersionHelper { public: int _blasMajorVersion = 0; diff --git a/libnd4j/blas/Environment.h b/libnd4j/include/system/Environment.h similarity index 95% rename from libnd4j/blas/Environment.h rename to libnd4j/include/system/Environment.h index 5bef3f1e4..9a998d705 100644 --- a/libnd4j/blas/Environment.h +++ 
b/libnd4j/include/system/Environment.h @@ -23,13 +23,13 @@ #include #include -#include +#include #include #include #include -#include +#include -namespace nd4j{ +namespace sd{ class ND4J_EXPORT Environment { private: std::atomic _tadThreshold; @@ -38,7 +38,7 @@ namespace nd4j{ std::atomic _debug; std::atomic _leaks; std::atomic _profile; - std::atomic _dataType; + std::atomic _dataType; std::atomic _precBoost; std::atomic _useMKLDNN{true}; std::atomic _allowHelpers{true}; @@ -125,8 +125,8 @@ namespace nd4j{ bool isUseMKLDNN() { return _useMKLDNN.load(); } void setUseMKLDNN(bool useMKLDNN) { _useMKLDNN.store(useMKLDNN); } - nd4j::DataType defaultFloatDataType(); - void setDefaultFloatDataType(nd4j::DataType dtype); + sd::DataType defaultFloatDataType(); + void setDefaultFloatDataType(sd::DataType dtype); bool precisionBoostAllowed(); void allowPrecisionBoost(bool reallyAllow); diff --git a/libnd4j/include/buffer.h b/libnd4j/include/system/buffer.h similarity index 98% rename from libnd4j/include/buffer.h rename to libnd4j/include/system/buffer.h index 79197753d..5072965ca 100755 --- a/libnd4j/include/buffer.h +++ b/libnd4j/include/system/buffer.h @@ -28,15 +28,15 @@ #include #include #endif -#include +#include #include #include #include -#include +#include //Question: Should the indexes here really be int? Isn't size_t or Nd4jLong more appropriate? 
-namespace nd4j { +namespace sd { namespace buffer { /** * Represents both a cpu and gpu @@ -288,7 +288,7 @@ __host__ void copyDataFromGpu(Buffer **buffer, cudaStream_t stream) { #ifdef __CUDACC__ template -__host__ void printArr(nd4j::buffer::Buffer *buff) { +__host__ void printArr(sd::buffer::Buffer *buff) { for (int i = 0; i < buff->length; i++) { printf("Buffer[%d] was %f\n", i, buff->data[i]); } diff --git a/libnd4j/include/dll.h b/libnd4j/include/system/dll.h similarity index 97% rename from libnd4j/include/dll.h rename to libnd4j/include/system/dll.h index 91d5a7677..71098f8bf 100644 --- a/libnd4j/include/dll.h +++ b/libnd4j/include/system/dll.h @@ -21,7 +21,7 @@ #ifndef NATIVEOPERATIONS_DLL_H #define NATIVEOPERATIONS_DLL_H -#include +#include #ifdef _WIN32 //#include diff --git a/libnd4j/include/enum_boilerplate.h b/libnd4j/include/system/enum_boilerplate.h similarity index 99% rename from libnd4j/include/enum_boilerplate.h rename to libnd4j/include/system/enum_boilerplate.h index 1d549527b..2acb1536a 100644 --- a/libnd4j/include/enum_boilerplate.h +++ b/libnd4j/include/system/enum_boilerplate.h @@ -21,7 +21,7 @@ #ifndef LIBND4J_ENUM_BOILERPLATE_H #define LIBND4J_ENUM_BOILERPLATE_H -#include +#include #define EN_1(WHAT, OP_PAIR) WHAT(OP_PAIR) diff --git a/libnd4j/include/msvc.h b/libnd4j/include/system/msvc.h similarity index 100% rename from libnd4j/include/msvc.h rename to libnd4j/include/system/msvc.h diff --git a/libnd4j/include/nd4jmalloc.h b/libnd4j/include/system/nd4jmalloc.h similarity index 100% rename from libnd4j/include/nd4jmalloc.h rename to libnd4j/include/system/nd4jmalloc.h diff --git a/libnd4j/include/nd4jmemset.h b/libnd4j/include/system/nd4jmemset.h similarity index 100% rename from libnd4j/include/nd4jmemset.h rename to libnd4j/include/system/nd4jmemset.h diff --git a/libnd4j/include/op_boilerplate.h b/libnd4j/include/system/op_boilerplate.h similarity index 96% rename from libnd4j/include/op_boilerplate.h rename to 
libnd4j/include/system/op_boilerplate.h index 5fef0c892..b4df39a29 100644 --- a/libnd4j/include/op_boilerplate.h +++ b/libnd4j/include/system/op_boilerplate.h @@ -65,8 +65,8 @@ #ifndef OP_BOILERPLATE_HH #define OP_BOILERPLATE_HH -#include -#include +#include +#include #include #include @@ -118,8 +118,8 @@ #endif -#define ELEMENT_THRESHOLD nd4j::Environment::getInstance()->elementwiseThreshold() -#define TAD_THRESHOLD nd4j::Environment::getInstance()->tadThreshold() +#define ELEMENT_THRESHOLD sd::Environment::getInstance()->elementwiseThreshold() +#define TAD_THRESHOLD sd::Environment::getInstance()->tadThreshold() #define SHAPELIST(...) new ShapeList({__VA_ARGS__}, block.workspace() != nullptr) @@ -129,8 +129,8 @@ #define PRINT_FIRST(...) printf(__VA_ARGS__); fflush(stdout) #endif -#define DEBUG_CALL(STREAM) if (nd4j::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) { throw std::runtime_error(); }; } -#define DEBUG_KERNEL(STREAM, OP_NUM) if (nd4j::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) {std::string tFile(__FILE__); std::string tOp = "Kernel OpNum failed: [" + nd4j::StringUtils::valueToString(OP_NUM) + std::string("]; File: ") + tFile + std::string(":") + nd4j::StringUtils::valueToString(__LINE__); throw std::runtime_error(tOp.c_str()); }; } +#define DEBUG_CALL(STREAM) if (sd::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) { throw std::runtime_error(); }; } +#define DEBUG_KERNEL(STREAM, OP_NUM) if (sd::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) {std::string tFile(__FILE__); std::string tOp = "Kernel OpNum failed: [" + sd::StringUtils::valueToString(OP_NUM) + std::string("]; File: ") + tFile + std::string(":") + 
sd::StringUtils::valueToString(__LINE__); throw std::runtime_error(tOp.c_str()); }; } #define LAUNCH(A, B, C, D) <<>> @@ -1228,8 +1228,8 @@ /// graph definitions -#define REQUIRE_OK(A) if (nd4j::ops::resultHelper( (A), #A, __FILE__, __LINE__ ) != 0) return ND4J_STATUS_VALIDATION; -#define REQUIRE_TRUE(COND, ...) if (!(COND)) { if (nd4j::ops::conditionHelper(__FILE__, __LINE__, COND, __VA_ARGS__) != 0) throw std::invalid_argument("Op validation failed");}; +#define REQUIRE_OK(A) if (sd::ops::resultHelper( (A), #A, __FILE__, __LINE__ ) != 0) return ND4J_STATUS_VALIDATION; +#define REQUIRE_TRUE(COND, ...) if (!(COND)) { if (sd::ops::conditionHelper(__FILE__, __LINE__, COND, __VA_ARGS__) != 0) throw std::invalid_argument("Op validation failed");}; #define DECLARE_ENTRY(NAME, ...) template struct ND4J_EXPORT __registratorFloat>; \ template struct ND4J_EXPORT __registratorHalf>; \ @@ -1243,13 +1243,13 @@ #define NOT_EXCLUDED(NAME) 1>0 #else // for now we don't want minifier mechanics working -//#define NOT_EXCLUDED(NAME) defined(LIBND4J_ALL_OPS) || defined(NAME) +//#define NOT_EXCLUDED(NAME) defined(SD_ALL_OPS) || defined(NAME) #define NOT_EXCLUDED(NAME) 1>0 #endif #ifdef __JAVACPP_HACK__ #define REGISTER_H(NAME) -#elif defined(LIBND4J_ALL_OPS) +#elif defined(SD_ALL_OPS) #define REGISTER_H(NAME) #else #define REGISTER_H(NAME) template \ @@ -1259,12 +1259,12 @@ OpRegistrator::getInstance()->registerOperation(ptr); \ }\ };\ - static nd4j::ops::__registrator_##NAME zzz_register_opd_##NAME; + static sd::ops::__registrator_##NAME zzz_register_opd_##NAME; #endif #ifdef __JAVACPP_HACK__ #define REGISTER_C(NAME) -#elif defined(LIBND4J_ALL_OPS) +#elif defined(SD_ALL_OPS) #define REGISTER_C(NAME) template \ struct __registrator_##NAME {\ __registrator_##NAME() {\ @@ -1272,63 +1272,63 @@ OpRegistrator::getInstance()->registerOperation(ptr); \ }\ };\ - static nd4j::ops::__registrator_##NAME zzz_register_opd_##NAME; + static sd::ops::__registrator_##NAME zzz_register_opd_##NAME; 
#else #define REGISTER_C(NAME) #endif -#define DECLARE_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableOp { \ +#define DECLARE_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public sd::ops::DeclarableOp { \ public:\ NAME(); \ - nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); \ + sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); \ protected: \ void registerTypes(); \ - Nd4jStatus validateAndExecute(nd4j::graph::Context& block); \ + Nd4jStatus validateAndExecute(sd::graph::Context& block); \ };\ REGISTER_H(NAME) -#define DECLARE_BOOLEAN_OP(NAME, NIN, SCALAR) class ND4J_EXPORT NAME: public nd4j::ops::BooleanOp { \ +#define DECLARE_BOOLEAN_OP(NAME, NIN, SCALAR) class ND4J_EXPORT NAME: public sd::ops::BooleanOp { \ public:\ NAME(); \ protected: \ void registerTypes(); \ - Nd4jStatus validateAndExecute(nd4j::graph::Context& block); \ + Nd4jStatus validateAndExecute(sd::graph::Context& block); \ }; \ REGISTER_H(NAME) -#define BOOLEAN_OP_IMPL(NAME, NIN, SCALAR) NAME::NAME() : nd4j::ops::BooleanOp(#NAME, NIN, SCALAR) { }; \ +#define BOOLEAN_OP_IMPL(NAME, NIN, SCALAR) NAME::NAME() : sd::ops::BooleanOp(#NAME, NIN, SCALAR) { }; \ REGISTER_C(NAME) \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -#define DECLARE_LIST_OP(NAME, NIN, NOUT, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableListOp { \ +#define DECLARE_LIST_OP(NAME, NIN, NOUT, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableListOp { \ public:\ NAME(); \ protected: \ - Nd4jStatus validateAndExecute(nd4j::graph::Context& block); \ + Nd4jStatus validateAndExecute(sd::graph::Context& block); \ };\ REGISTER_H(NAME) -#define LIST_OP_IMPL(NAME, NIN, NOUT, TARGS, IARGS) NAME::NAME() : nd4j::ops::DeclarableListOp(NIN, NOUT, #NAME, TARGS, IARGS) { }; \ 
+#define LIST_OP_IMPL(NAME, NIN, NOUT, TARGS, IARGS) NAME::NAME() : sd::ops::DeclarableListOp(NIN, NOUT, #NAME, TARGS, IARGS) { }; \ REGISTER_C(NAME) \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -#define DECLARE_LOGIC_OP(NAME) class ND4J_EXPORT NAME: public nd4j::ops::LogicOp { \ +#define DECLARE_LOGIC_OP(NAME) class ND4J_EXPORT NAME: public sd::ops::LogicOp { \ public:\ NAME(); \ protected: \ - Nd4jStatus validateAndExecute(nd4j::graph::Context& block); \ + Nd4jStatus validateAndExecute(sd::graph::Context& block); \ };\ REGISTER_H(NAME) -#define LOGIC_OP_IMPL(NAME) NAME::NAME() : nd4j::ops::LogicOp(#NAME) { }; \ +#define LOGIC_OP_IMPL(NAME) NAME::NAME() : sd::ops::LogicOp(#NAME) { }; \ REGISTER_C(NAME) \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) { return nd4j::ops::LogicOp::validateAndExecute(block); }; + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) { return sd::ops::LogicOp::validateAndExecute(block); }; -#define OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : nd4j::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE) { }; \ +#define OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : sd::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE) { }; \ REGISTER_C(NAME) \ - nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { \ + sd::ShapeList* sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) { \ auto shapeList = SHAPELIST(); \ auto opLimit = this->getOpDescriptor()->getNumberOfOutputs() < 1 ? 
block.width() : this->getOpDescriptor()->getNumberOfOutputs(); \ for (int e = 0; e < opLimit; e++) { \ @@ -1337,7 +1337,7 @@ } \ return shapeList; \ } \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) #define DECLARE_SYN(NAME, ORIGINAL) template \ @@ -1347,26 +1347,26 @@ if (ptr == nullptr) { \ std::string newName(name); \ std::string oldName(oname); \ - OpRegistrator::getInstance()->updateMSVC(nd4j::ops::HashHelper::getInstance()->getLongHash(newName), oldName);\ + OpRegistrator::getInstance()->updateMSVC(sd::ops::HashHelper::getInstance()->getLongHash(newName), oldName);\ return;\ }\ OpRegistrator::getInstance()->registerOperation(name, ptr);\ }\ };\ - static nd4j::ops::__registratorSynonym_##NAME zzz_register_opd_##NAME(#NAME, #ORIGINAL) + static sd::ops::__registratorSynonym_##NAME zzz_register_opd_##NAME(#NAME, #ORIGINAL) -#define DECLARE_DIVERGENT_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableOp { \ +#define DECLARE_DIVERGENT_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public sd::ops::DeclarableOp { \ public:\ NAME(); \ - nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); \ + sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); \ protected: \ - Nd4jStatus validateAndExecute(nd4j::graph::Context& block); \ + Nd4jStatus validateAndExecute(sd::graph::Context& block); \ };\ REGISTER_H(NAME) -#define DIVERGENT_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : nd4j::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, true) { }; \ +#define DIVERGENT_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : sd::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, true) { }; \ REGISTER_C(NAME) \ - nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { \ + sd::ShapeList* 
sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) { \ auto shapeList = SHAPELIST(); \ auto opLimit = this->getOpDescriptor()->getNumberOfOutputs() < 1 ? block.width() : this->getOpDescriptor()->getNumberOfOutputs(); \ for (int e = 0; e < opLimit; e++) { \ @@ -1376,21 +1376,21 @@ } \ return shapeList; \ } \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -#define DECLARE_CONFIGURABLE_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableOp { \ +#define DECLARE_CONFIGURABLE_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableOp { \ public:\ NAME(); \ - nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); \ + sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); \ protected: \ void registerTypes(); \ - Nd4jStatus validateAndExecute(nd4j::graph::Context& block); \ + Nd4jStatus validateAndExecute(sd::graph::Context& block); \ };\ REGISTER_H(NAME) -#define CONFIGURABLE_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : nd4j::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; \ +#define CONFIGURABLE_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : sd::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; \ REGISTER_C(NAME) \ - nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { \ + sd::ShapeList* sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) { \ auto shapeList = SHAPELIST(); \ auto opLimit = this->getOpDescriptor()->getNumberOfOutputs() < 1 ? 
block.width() : this->getOpDescriptor()->getNumberOfOutputs(); \ for (int e = 0; e < opLimit; e++) { \ @@ -1399,9 +1399,9 @@ } \ return shapeList; \ } \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(Context& block) -#define DECLARE_REDUCTION_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableReductionOp { \ +#define DECLARE_REDUCTION_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableReductionOp { \ public:\ NAME(); \ protected: \ @@ -1410,34 +1410,34 @@ };\ REGISTER_H(NAME) -#define REDUCTION_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : nd4j::ops::DeclarableReductionOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; \ +#define REDUCTION_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : sd::ops::DeclarableReductionOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; \ REGISTER_C(NAME) \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -#define DECLARE_CUSTOM_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableCustomOp { \ +#define DECLARE_CUSTOM_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableCustomOp { \ protected: \ void registerTypes(); \ Nd4jStatus validateAndExecute(Context& block); \ public:\ NAME(); \ - nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); \ + sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); \ };\ REGISTER_H(NAME) -#define CUSTOM_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME(): nd4j::ops::DeclarableCustomOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; \ +#define CUSTOM_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME(): 
sd::ops::DeclarableCustomOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; \ REGISTER_C(NAME) \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) // this declaration MUST follow DECLARE_CUSTOM_OP -#define DECLARE_SHAPE_FN(NAME) nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) +#define DECLARE_SHAPE_FN(NAME) sd::ShapeList* sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) -#define DECLARE_SAME_TYPE(NAME) void nd4j::ops::NAME::registerTypes() {this->getOpDescriptor()->setSameMode(true);} +#define DECLARE_SAME_TYPE(NAME) void sd::ops::NAME::registerTypes() {this->getOpDescriptor()->setSameMode(true);} -#define DECLARE_TYPES(NAME) void nd4j::ops::NAME::registerTypes() +#define DECLARE_TYPES(NAME) void sd::ops::NAME::registerTypes() -#define DECLARE_BROADCASTABLE_OP(NAME,TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::BroadcastableOp { \ +#define DECLARE_BROADCASTABLE_OP(NAME,TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::BroadcastableOp { \ protected: \ void registerTypes(); \ Nd4jStatus validateAndExecute(Context& block); \ @@ -1446,17 +1446,17 @@ };\ REGISTER_H(NAME) -#define BROADCASTABLE_OP_IMPL(NAME, TARGS, IARGS) NAME::NAME(): nd4j::ops::BroadcastableOp(#NAME, TARGS, IARGS) { }; \ +#define BROADCASTABLE_OP_IMPL(NAME, TARGS, IARGS) NAME::NAME(): sd::ops::BroadcastableOp(#NAME, TARGS, IARGS) { }; \ REGISTER_C(NAME) \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) #define DECLARE_DEVICE_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) #define REPLICATE_SHAPE(SRC, TGT) if (shape::order(SRC) == 'c')\ - shape::shapeBuffer(shape::rank(SRC), nd4j::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT);\ + shape::shapeBuffer(shape::rank(SRC), 
sd::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT);\ else \ - shape::shapeBufferFortran(shape::rank(SRC), nd4j::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT);\ + shape::shapeBufferFortran(shape::rank(SRC), sd::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT);\ #ifdef __CUDABLAS__ @@ -1464,14 +1464,14 @@ #ifdef _RELEASE // we intentionally add 8 tail bytes here to avoid problems with atomic operations -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(sd::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #else // we intentionally add 8 tail bytes here to avoid problems with atomic operations -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { nd4j::memory::MemoryTracker::getInstance()->countIn(nd4j::memory::MemoryType::DEVICE, VARIABLE, LENGTH * 
sizeof(TT)); }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } -#define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { nd4j::memory::MemoryTracker::getInstance()->countOut(VARIABLE); auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { sd::memory::MemoryTracker::getInstance()->countIn(sd::memory::MemoryType::DEVICE, VARIABLE, LENGTH * sizeof(TT)); }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(sd::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } +#define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { sd::memory::MemoryTracker::getInstance()->countOut(VARIABLE); auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #endif @@ -1489,8 +1489,8 @@ #else -#define ALLOCATE(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {VARIABLE = new TT[LENGTH]; nd4j::memory::MemoryTracker::getInstance()->countIn(nd4j::memory::MemoryType::HOST, VARIABLE, LENGTH * sizeof(TT)); } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(LENGTH * sizeof(TT))); }; memset(VARIABLE, 0, LENGTH * sizeof(TT)); -#define RELEASE(VARIABLE, WORKSPACE) if (WORKSPACE == nullptr) { nd4j::memory::MemoryTracker::getInstance()->countOut(VARIABLE); delete[] VARIABLE;}; +#define ALLOCATE(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {VARIABLE = new TT[LENGTH]; 
sd::memory::MemoryTracker::getInstance()->countIn(sd::memory::MemoryType::HOST, VARIABLE, LENGTH * sizeof(TT)); } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(LENGTH * sizeof(TT))); }; memset(VARIABLE, 0, LENGTH * sizeof(TT)); +#define RELEASE(VARIABLE, WORKSPACE) if (WORKSPACE == nullptr) { sd::memory::MemoryTracker::getInstance()->countOut(VARIABLE); delete[] VARIABLE;}; #endif @@ -1512,9 +1512,9 @@ #define UNSTASH(NAME) block.getStash()->extractArray(block.getNodeId(), NAME); #define INPUT_VARIABLE(INDEX) block.array(INDEX) -#define OUTPUT_VARIABLE(INDEX) reinterpret_cast(this->getZ(block, INDEX)) +#define OUTPUT_VARIABLE(INDEX) reinterpret_cast(this->getZ(block, INDEX)) -#define INPUT_LIST(INDEX) reinterpret_cast(block.getVariable(INDEX)->getNDArrayList()) +#define INPUT_LIST(INDEX) reinterpret_cast(block.getVariable(INDEX)->getNDArrayList()) #define D_ARG(INDEX) block.getDArguments()->at(INDEX) #define INT_ARG(INDEX) block.getIArguments()->at(INDEX) @@ -1559,7 +1559,7 @@ #endif // CUDACC -#define CHECK_ALLOC(PTR, MSG, BYTES) if (PTR == nullptr) { throw nd4j::allocation_exception::build(MSG, BYTES); }; +#define CHECK_ALLOC(PTR, MSG, BYTES) if (PTR == nullptr) { throw sd::allocation_exception::build(MSG, BYTES); }; diff --git a/libnd4j/include/op_enums.h b/libnd4j/include/system/op_enums.h similarity index 97% rename from libnd4j/include/op_enums.h rename to libnd4j/include/system/op_enums.h index 8a100153f..ad16d281e 100644 --- a/libnd4j/include/op_enums.h +++ b/libnd4j/include/system/op_enums.h @@ -23,10 +23,10 @@ #define LIBND4J_OP_ENUMS_H #include -#include -#include +#include +#include -namespace nd4j { +namespace sd { namespace random { enum Ops { BUILD_ENUMERATION(RANDOM_OPS) diff --git a/libnd4j/include/openmp_pragmas.h b/libnd4j/include/system/openmp_pragmas.h similarity index 100% rename from libnd4j/include/openmp_pragmas.h rename to libnd4j/include/system/openmp_pragmas.h diff --git a/libnd4j/include/optype.h 
b/libnd4j/include/system/optype.h similarity index 100% rename from libnd4j/include/optype.h rename to libnd4j/include/system/optype.h diff --git a/libnd4j/include/pairwise_util.h b/libnd4j/include/system/pairwise_util.h similarity index 98% rename from libnd4j/include/pairwise_util.h rename to libnd4j/include/system/pairwise_util.h index e0ed79a7e..d9e0965c8 100755 --- a/libnd4j/include/pairwise_util.h +++ b/libnd4j/include/system/pairwise_util.h @@ -32,12 +32,12 @@ #define omp_set_num_threads(threads) #endif -#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include #ifdef _OPENMP #include #endif @@ -274,8 +274,8 @@ public: BlockInformation(Nd4jLong length, int threshold) { threads = length / threshold; - threads = (1 < threads)?threads:1;//nd4j::math::nd4j_max(1, threads); - threads = (threads < omp_get_max_threads())?threads:omp_get_max_threads();//nd4j::math::nd4j_min(threads, omp_get_max_threads()); + threads = (1 < threads)?threads:1;//sd::math::nd4j_max(1, threads); + threads = (threads < omp_get_max_threads())?threads:omp_get_max_threads();//sd::math::nd4j_min(threads, omp_get_max_threads()); items = length / threads; remainder = length % threads; diff --git a/libnd4j/include/platform_boilerplate.h b/libnd4j/include/system/platform_boilerplate.h similarity index 97% rename from libnd4j/include/platform_boilerplate.h rename to libnd4j/include/system/platform_boilerplate.h index 5c73a1b38..bdbb1a051 100644 --- a/libnd4j/include/platform_boilerplate.h +++ b/libnd4j/include/system/platform_boilerplate.h @@ -44,7 +44,7 @@ } \ }; \ static __registratorPlatformHelper_##CNAME platformHelper_##CNAME; \ - Nd4jStatus PLATFORM_##CNAME::invokeHelper(nd4j::graph::Context &block) + Nd4jStatus PLATFORM_##CNAME::invokeHelper(sd::graph::Context &block) #define PLATFORM_IMPL(NAME, ENGINE) PLATFORM_IMPL_F(NAME, ENGINE, NAME ##_## ENGINE) diff --git a/libnd4j/include/play.h b/libnd4j/include/system/play.h similarity index 86% 
rename from libnd4j/include/play.h rename to libnd4j/include/system/play.h index d0fecee82..9e121d88b 100644 --- a/libnd4j/include/play.h +++ b/libnd4j/include/system/play.h @@ -22,7 +22,7 @@ #define LIBND4J_PLAY_H //#include -#include +#include /* #define DATA_TYPES \ (DATA_FLOAT, float) ,\ @@ -71,10 +71,10 @@ DECLARE_PLATFORM(conv2d, ENGINE_CPU) */ /* #define NATIVE_LAYERS \ - (0, nd4j::layers::DenseLayer) -// (1, nd4j::layers::ConvolutionLayer) ,\ -// (2, nd4j::layers::Pooling2DLayer) ,\ -// (3, nd4j::layers::LSTMLayer) + (0, sd::layers::DenseLayer) +// (1, sd::layers::ConvolutionLayer) ,\ +// (2, sd::layers::Pooling2DLayer) ,\ +// (3, sd::layers::LSTMLayer) */ @@ -114,26 +114,26 @@ EXECUTE_NOE((x, y, extras), OPS_A(PAIRWISE_TRANSFORM_OPS)) //EXECUTE_NOE((x, extras), OPS_A(SCALAR_OPS)) -//BUILD_CALL_1(template void nd4j::NDArray::applyTransform, float16, (NDArray* a, float16* b), TRANSFORM_OPS) +//BUILD_CALL_1(template void sd::NDArray::applyTransform, float16, (NDArray* a, float16* b), TRANSFORM_OPS) -//BUILD_CALL_1(template void nd4j::NDArray::applyPairwiseTransform, float16, (NDArray* other, float16* extraParams), PAIRWISE_TRANSFORM_OPS) +//BUILD_CALL_1(template void sd::NDArray::applyPairwiseTransform, float16, (NDArray* other, float16* extraParams), PAIRWISE_TRANSFORM_OPS) //BUILD_TRACKER(TRANSFORM, ACTIVATIONS) -//BUILD_CALL_1(template void nd4j::NDArray::applyScalar, float16, (float16 scalar, NDArray* target, float16 *extraParams) , ACTIVATIONS); +//BUILD_CALL_1(template void sd::NDArray::applyScalar, float16, (float16 scalar, NDArray* target, float16 *extraParams) , ACTIVATIONS); /* #define DECLARE_OP(NAME, NIN, NOUT) DECLARE_OP_UNIQ(__COUNTER__, NAME, NIN, NOUT) #define DECLARE_OP_UNIQ(CTR, NAME, NIN, NOUT) template \ - class NAME: public nd4j::ops::DeclarableOp { \ + class NAME: public sd::ops::DeclarableOp { \ public:\ - NAME() : nd4j::ops::DeclarableOp(NIN, NOUT, #NAME) { } \ + NAME() : sd::ops::DeclarableOp(NIN, NOUT, #NAME) { } \ protected: \ 
Nd4jStatus validateAndExecute(Block& block); \ };\ template \ - Nd4jStatus nd4j::ops::NAME::validateAndExecute(Block& block) + Nd4jStatus sd::ops::NAME::validateAndExecute(Block& block) */ -//#define END_OP(NAME) }; static nd4j::ops::__registrator> register_op##Name; +//#define END_OP(NAME) }; static sd::ops::__registrator> register_op##Name; //#DECLARE_OP(Concat, -1, 1) diff --git a/libnd4j/include/pointercast.h b/libnd4j/include/system/pointercast.h similarity index 98% rename from libnd4j/include/pointercast.h rename to libnd4j/include/system/pointercast.h index 66b28693f..2c64d608e 100644 --- a/libnd4j/include/pointercast.h +++ b/libnd4j/include/system/pointercast.h @@ -21,7 +21,7 @@ #ifndef NATIVEOPERATIONS_POINTERCAST_H #define NATIVEOPERATIONS_POINTERCAST_H -#include +#include #include typedef void* Nd4jPointer; diff --git a/libnd4j/include/type_boilerplate.h b/libnd4j/include/system/type_boilerplate.h similarity index 99% rename from libnd4j/include/type_boilerplate.h rename to libnd4j/include/system/type_boilerplate.h index af0fe369d..997fcab22 100644 --- a/libnd4j/include/type_boilerplate.h +++ b/libnd4j/include/system/type_boilerplate.h @@ -631,12 +631,12 @@ #define RANDOMTRIPLE(NAME, SIGNATURE, TYPES_X, TYPES_Y, TYPE_Z) _RANDOMTRIPLE(NAME, SIGNATURE, TYPE_Z, TYPES_X, TYPES_Y) -#define BROADCAST(NAME) nd4j::BroadcastOpsTuple::custom(nd4j::scalar::NAME, nd4j::pairwise::NAME, nd4j::broadcast::NAME) -#define BROADCAST_BOOL(NAME) nd4j::BroadcastBoolOpsTuple::custom(nd4j::scalar::NAME, nd4j::pairwise::NAME, nd4j::broadcast::NAME) +#define BROADCAST(NAME) sd::BroadcastOpsTuple::custom(sd::scalar::NAME, sd::pairwise::NAME, sd::broadcast::NAME) +#define BROADCAST_BOOL(NAME) sd::BroadcastBoolOpsTuple::custom(sd::scalar::NAME, sd::pairwise::NAME, sd::broadcast::NAME) -#define ALL_STRINGS nd4j::DataType::UTF8, nd4j::DataType::UTF16, nd4j::DataType::UTF32 -#define ALL_INDICES nd4j::DataType::INT32, nd4j::DataType::INT64 -#define ALL_INTS nd4j::DataType::INT8, 
nd4j::DataType::UINT8, nd4j::DataType::INT16, nd4j::DataType::UINT16, nd4j::DataType::INT32, nd4j::DataType::UINT32, nd4j::DataType::INT64, nd4j::DataType::UINT64 -#define ALL_FLOATS nd4j::DataType::HALF, nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE, nd4j::DataType::BFLOAT16 +#define ALL_STRINGS sd::DataType::UTF8, sd::DataType::UTF16, sd::DataType::UTF32 +#define ALL_INDICES sd::DataType::INT32, sd::DataType::INT64 +#define ALL_INTS sd::DataType::INT8, sd::DataType::UINT8, sd::DataType::INT16, sd::DataType::UINT16, sd::DataType::INT32, sd::DataType::UINT32, sd::DataType::INT64, sd::DataType::UINT64 +#define ALL_FLOATS sd::DataType::HALF, sd::DataType::FLOAT32, sd::DataType::DOUBLE, sd::DataType::BFLOAT16 #endif //TESTS_CPU_TYPE_BOILERPLATE_H diff --git a/libnd4j/include/util.h b/libnd4j/include/system/util.h similarity index 97% rename from libnd4j/include/util.h rename to libnd4j/include/system/util.h index 153f7f4ae..aa2055606 100644 --- a/libnd4j/include/util.h +++ b/libnd4j/include/system/util.h @@ -30,7 +30,7 @@ #include #endif -#include "pointercast.h" +#include "system/pointercast.h" static inline Nd4jLong microTime() { #ifdef WIN32 diff --git a/libnd4j/include/types/bfloat16.h b/libnd4j/include/types/bfloat16.h index 847c2ebda..a05909816 100644 --- a/libnd4j/include/types/bfloat16.h +++ b/libnd4j/include/types/bfloat16.h @@ -43,7 +43,7 @@ #define local_def inline #endif -//namespace nd4j +//namespace sd //{ struct bfloat16 { diff --git a/libnd4j/include/types/float16.h b/libnd4j/include/types/float16.h index 4aa0d5d66..761e66f1b 100644 --- a/libnd4j/include/types/float16.h +++ b/libnd4j/include/types/float16.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #if defined(__INTEL_COMPILER) || defined(SD_F16C) #include #endif diff --git a/libnd4j/include/types/float8.h b/libnd4j/include/types/float8.h index d7e4b80a3..6dc03bba4 100644 --- a/libnd4j/include/types/float8.h +++ b/libnd4j/include/types/float8.h @@ -33,10 +33,10 @@ #endif */ 
-#include +#include -namespace nd4j { +namespace sd { typedef struct { unsigned char x; diff --git a/libnd4j/include/types/impl/float8.cpp b/libnd4j/include/types/impl/float8.cpp index f5c6d2e9a..b36846e35 100644 --- a/libnd4j/include/types/impl/float8.cpp +++ b/libnd4j/include/types/impl/float8.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { /* template float8::float8(const float& rhs); diff --git a/libnd4j/include/types/impl/int16.cpp b/libnd4j/include/types/impl/int16.cpp index 50529aa97..67f90e9d8 100644 --- a/libnd4j/include/types/impl/int16.cpp +++ b/libnd4j/include/types/impl/int16.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { /* template int16::int16(const float& rhs); diff --git a/libnd4j/include/types/impl/int8.cpp b/libnd4j/include/types/impl/int8.cpp index 6b0d5a659..030695f96 100644 --- a/libnd4j/include/types/impl/int8.cpp +++ b/libnd4j/include/types/impl/int8.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { /* template int8::int8(const float& rhs); diff --git a/libnd4j/include/types/impl/pair.cpp b/libnd4j/include/types/impl/pair.cpp index 910bf6274..767bfa630 100644 --- a/libnd4j/include/types/impl/pair.cpp +++ b/libnd4j/include/types/impl/pair.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { Pair::Pair(int first, int second) { _first = first; _second = second; diff --git a/libnd4j/include/types/impl/triple.cpp b/libnd4j/include/types/impl/triple.cpp index b01afb680..0b39d4bac 100644 --- a/libnd4j/include/types/impl/triple.cpp +++ b/libnd4j/include/types/impl/triple.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { int Triple::first() const { return _first; } diff --git a/libnd4j/include/types/impl/uint16.cpp b/libnd4j/include/types/impl/uint16.cpp index c15f71021..5b858222d 100644 --- a/libnd4j/include/types/impl/uint16.cpp +++ b/libnd4j/include/types/impl/uint16.cpp @@ -18,10 +18,10 @@ // @author raver119@gmail.com // -#include +#include #include -namespace nd4j { 
+namespace sd { /* diff --git a/libnd4j/include/types/impl/uint8.cpp b/libnd4j/include/types/impl/uint8.cpp index 6e9423f97..a6d25c9d3 100644 --- a/libnd4j/include/types/impl/uint8.cpp +++ b/libnd4j/include/types/impl/uint8.cpp @@ -20,7 +20,7 @@ #include -namespace nd4j { +namespace sd { /* template uint8::uint8(const float& rhs); diff --git a/libnd4j/include/types/impl/utf8string.cpp b/libnd4j/include/types/impl/utf8string.cpp index 8cfeecb5c..a7df7cc28 100644 --- a/libnd4j/include/types/impl/utf8string.cpp +++ b/libnd4j/include/types/impl/utf8string.cpp @@ -21,7 +21,7 @@ #include #include -namespace nd4j { +namespace sd { utf8string::~utf8string() { if (_allocated) delete[] _buffer; diff --git a/libnd4j/include/types/int16.h b/libnd4j/include/types/int16.h index 695ba64fa..25a771381 100644 --- a/libnd4j/include/types/int16.h +++ b/libnd4j/include/types/int16.h @@ -22,10 +22,10 @@ #define LIBND4J_INT16_H #include -#include +#include -namespace nd4j { +namespace sd { float _CUDA_HD FORCEINLINE cpu_int162float(int16_t data); int16_t _CUDA_HD FORCEINLINE cpu_float2int16(float data); diff --git a/libnd4j/include/types/int8.h b/libnd4j/include/types/int8.h index eb7f5a3c8..19e1b91e1 100644 --- a/libnd4j/include/types/int8.h +++ b/libnd4j/include/types/int8.h @@ -22,10 +22,10 @@ #define LIBND4J_INT8_H #include -#include +#include -namespace nd4j { +namespace sd { float _CUDA_HD FORCEINLINE cpu_int82float(int8_t data); int8_t _CUDA_HD FORCEINLINE cpu_float2int8(float data); diff --git a/libnd4j/include/types/pair.h b/libnd4j/include/types/pair.h index 28d25f5ea..0471c45ed 100644 --- a/libnd4j/include/types/pair.h +++ b/libnd4j/include/types/pair.h @@ -21,9 +21,9 @@ #ifndef LIBND4J_PAIR_H #define LIBND4J_PAIR_H -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT Pair { protected: int _first = 0; diff --git a/libnd4j/include/types/triple.h b/libnd4j/include/types/triple.h index 8084c48e8..0a5310265 100644 --- a/libnd4j/include/types/triple.h +++ 
b/libnd4j/include/types/triple.h @@ -22,9 +22,9 @@ #define LIBND4J_TRIPLE_H -#include +#include -namespace nd4j { +namespace sd { class ND4J_EXPORT Triple { protected: int _first = 0; diff --git a/libnd4j/include/types/types.h b/libnd4j/include/types/types.h index 7322c6bd5..620faa838 100644 --- a/libnd4j/include/types/types.h +++ b/libnd4j/include/types/types.h @@ -21,7 +21,7 @@ #ifndef LIBND4J_TYPES_H #define LIBND4J_TYPES_H -#include +#include #include #include #include @@ -30,175 +30,175 @@ #include #include #include -#include +#include #define LIBND4J_STRINGTYPES \ - (nd4j::DataType::UTF8, std::string),\ - (nd4j::DataType::UTF16, std::u16string), \ - (nd4j::DataType::UTF32, std::u32string) + (sd::DataType::UTF8, std::string),\ + (sd::DataType::UTF16, std::u16string), \ + (sd::DataType::UTF32, std::u32string) #define LIBND4J_TYPES \ - (nd4j::DataType::BFLOAT16, bfloat16),\ - (nd4j::DataType::HALF, float16), \ - (nd4j::DataType::FLOAT32, float), \ - (nd4j::DataType::DOUBLE, double), \ - (nd4j::DataType::BOOL, bool), \ - (nd4j::DataType::INT8, int8_t), \ - (nd4j::DataType::UINT8, uint8_t), \ - (nd4j::DataType::UINT16, uint16_t), \ - (nd4j::DataType::UINT32, uint32_t), \ - (nd4j::DataType::UINT64, uint64_t), \ - (nd4j::DataType::INT16, int16_t), \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::INT64, Nd4jLong) + (sd::DataType::BFLOAT16, bfloat16),\ + (sd::DataType::HALF, float16), \ + (sd::DataType::FLOAT32, float), \ + (sd::DataType::DOUBLE, double), \ + (sd::DataType::BOOL, bool), \ + (sd::DataType::INT8, int8_t), \ + (sd::DataType::UINT8, uint8_t), \ + (sd::DataType::UINT16, uint16_t), \ + (sd::DataType::UINT32, uint32_t), \ + (sd::DataType::UINT64, uint64_t), \ + (sd::DataType::INT16, int16_t), \ + (sd::DataType::INT32, int32_t), \ + (sd::DataType::INT64, Nd4jLong) #define LIBND4J_TYPES_EXTENDED \ - (nd4j::DataType::HALF, float16), \ - (nd4j::DataType::FLOAT32, float), \ - (nd4j::DataType::DOUBLE, double), \ - (nd4j::DataType::BOOL, bool), \ - 
(nd4j::DataType::INT8, int8_t), \ - (nd4j::DataType::UINT8, uint8_t), \ - (nd4j::DataType::INT16, int16_t), \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::INT64, Nd4jLong), \ - (nd4j::DataType::UINT16, uint16_t), \ - (nd4j::DataType::UINT64, Nd4jULong), \ - (nd4j::DataType::UINT32, uint32_t), \ - (nd4j::DataType::BFLOAT16, bfloat16) + (sd::DataType::HALF, float16), \ + (sd::DataType::FLOAT32, float), \ + (sd::DataType::DOUBLE, double), \ + (sd::DataType::BOOL, bool), \ + (sd::DataType::INT8, int8_t), \ + (sd::DataType::UINT8, uint8_t), \ + (sd::DataType::INT16, int16_t), \ + (sd::DataType::INT32, int32_t), \ + (sd::DataType::INT64, Nd4jLong), \ + (sd::DataType::UINT16, uint16_t), \ + (sd::DataType::UINT64, Nd4jULong), \ + (sd::DataType::UINT32, uint32_t), \ + (sd::DataType::BFLOAT16, bfloat16) #define BOOL_TYPES \ - (nd4j::DataType::BOOL, bool) + (sd::DataType::BOOL, bool) #define LONG_TYPES \ - (nd4j::DataType::INT64, Nd4jLong),\ - (nd4j::DataType::UINT64, uint64_t) + (sd::DataType::INT64, Nd4jLong),\ + (sd::DataType::UINT64, uint64_t) #define FLOAT_TYPES \ - (nd4j::DataType::BFLOAT16, bfloat16) ,\ - (nd4j::DataType::HALF, float16), \ - (nd4j::DataType::FLOAT32, float), \ - (nd4j::DataType::DOUBLE, double) + (sd::DataType::BFLOAT16, bfloat16) ,\ + (sd::DataType::HALF, float16), \ + (sd::DataType::FLOAT32, float), \ + (sd::DataType::DOUBLE, double) #define INDEXING_TYPES \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::INT64, Nd4jLong) + (sd::DataType::INT32, int32_t), \ + (sd::DataType::INT64, Nd4jLong) #define FLOAT_NATIVE \ - (nd4j::DataType::FLOAT32, float), \ - (nd4j::DataType::DOUBLE, double) + (sd::DataType::FLOAT32, float), \ + (sd::DataType::DOUBLE, double) #define FLOAT_TYPES_0 \ - (nd4j::DataType::HALF, float16) + (sd::DataType::HALF, float16) #define FLOAT_TYPES_1 \ - (nd4j::DataType::FLOAT32, float) + (sd::DataType::FLOAT32, float) #define FLOAT_TYPES_2 \ - (nd4j::DataType::DOUBLE, double) + (sd::DataType::DOUBLE, double) #define 
FLOAT_TYPES_3 \ - (nd4j::DataType::BFLOAT16, bfloat16) + (sd::DataType::BFLOAT16, bfloat16) #define LIBND4J_TYPES_0 \ - (nd4j::DataType::HALF, float16) + (sd::DataType::HALF, float16) #define LIBND4J_TYPES_1 \ - (nd4j::DataType::FLOAT32, float) + (sd::DataType::FLOAT32, float) #define LIBND4J_TYPES_2 \ - (nd4j::DataType::DOUBLE, double) + (sd::DataType::DOUBLE, double) #define LIBND4J_TYPES_3 \ - (nd4j::DataType::BOOL, bool) + (sd::DataType::BOOL, bool) #define LIBND4J_TYPES_4 \ - (nd4j::DataType::INT8, int8_t) + (sd::DataType::INT8, int8_t) #define LIBND4J_TYPES_5 \ - (nd4j::DataType::UINT8, uint8_t) + (sd::DataType::UINT8, uint8_t) #define LIBND4J_TYPES_6 \ - (nd4j::DataType::INT16, int16_t),\ - (nd4j::DataType::UINT16, uint16_t) + (sd::DataType::INT16, int16_t),\ + (sd::DataType::UINT16, uint16_t) #define LIBND4J_TYPES_7 \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::UINT32, uint32_t) + (sd::DataType::INT32, int32_t), \ + (sd::DataType::UINT32, uint32_t) #define LIBND4J_TYPES_8 \ - (nd4j::DataType::INT64, Nd4jLong),\ - (nd4j::DataType::UINT64, uint64_t) + (sd::DataType::INT64, Nd4jLong),\ + (sd::DataType::UINT64, uint64_t) #define LIBND4J_TYPES_9 \ - (nd4j::DataType::BFLOAT16, bfloat16) + (sd::DataType::BFLOAT16, bfloat16) #define INTEGER_TYPES \ - (nd4j::DataType::INT8, int8_t), \ - (nd4j::DataType::UINT8, uint8_t), \ - (nd4j::DataType::UINT16, uint16_t), \ - (nd4j::DataType::UINT32, uint32_t), \ - (nd4j::DataType::UINT64, uint64_t), \ - (nd4j::DataType::INT16, int16_t), \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::INT64, Nd4jLong) + (sd::DataType::INT8, int8_t), \ + (sd::DataType::UINT8, uint8_t), \ + (sd::DataType::UINT16, uint16_t), \ + (sd::DataType::UINT32, uint32_t), \ + (sd::DataType::UINT64, uint64_t), \ + (sd::DataType::INT16, int16_t), \ + (sd::DataType::INT32, int32_t), \ + (sd::DataType::INT64, Nd4jLong) #define NUMERIC_TYPES \ - (nd4j::DataType::HALF, float16), \ - (nd4j::DataType::FLOAT32, float), \ - 
(nd4j::DataType::DOUBLE, double), \ - (nd4j::DataType::INT8, int8_t), \ - (nd4j::DataType::UINT8, uint8_t), \ - (nd4j::DataType::UINT16, uint16_t), \ - (nd4j::DataType::UINT32, uint32_t), \ - (nd4j::DataType::UINT64, uint64_t), \ - (nd4j::DataType::INT16, int16_t), \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::INT64, Nd4jLong), \ - (nd4j::DataType::BFLOAT16, bfloat16) + (sd::DataType::HALF, float16), \ + (sd::DataType::FLOAT32, float), \ + (sd::DataType::DOUBLE, double), \ + (sd::DataType::INT8, int8_t), \ + (sd::DataType::UINT8, uint8_t), \ + (sd::DataType::UINT16, uint16_t), \ + (sd::DataType::UINT32, uint32_t), \ + (sd::DataType::UINT64, uint64_t), \ + (sd::DataType::INT16, int16_t), \ + (sd::DataType::INT32, int32_t), \ + (sd::DataType::INT64, Nd4jLong), \ + (sd::DataType::BFLOAT16, bfloat16) #define NUMERIC_TYPES_0 \ - (nd4j::DataType::HALF, float16) + (sd::DataType::HALF, float16) #define NUMERIC_TYPES_1 \ - (nd4j::DataType::FLOAT32, float) + (sd::DataType::FLOAT32, float) #define NUMERIC_TYPES_2 \ - (nd4j::DataType::DOUBLE, double) + (sd::DataType::DOUBLE, double) #define NUMERIC_TYPES_3 \ - (nd4j::DataType::INT8, int8_t), \ - (nd4j::DataType::BFLOAT16, bfloat16) + (sd::DataType::INT8, int8_t), \ + (sd::DataType::BFLOAT16, bfloat16) #define NUMERIC_TYPES_4 \ - (nd4j::DataType::UINT8, uint8_t) + (sd::DataType::UINT8, uint8_t) #define NUMERIC_TYPES_5 \ - (nd4j::DataType::UINT16, uint16_t) + (sd::DataType::UINT16, uint16_t) #define NUMERIC_TYPES_6 \ - (nd4j::DataType::UINT32, uint32_t) + (sd::DataType::UINT32, uint32_t) #define NUMERIC_TYPES_7 \ - (nd4j::DataType::UINT64, uint64_t) + (sd::DataType::UINT64, uint64_t) #define NUMERIC_TYPES_8 \ - (nd4j::DataType::INT16, int16_t) + (sd::DataType::INT16, int16_t) #define NUMERIC_TYPES_9 \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::INT64, Nd4jLong) + (sd::DataType::INT32, int32_t), \ + (sd::DataType::INT64, Nd4jLong) #define GENERIC_NUMERIC_TYPES \ - (nd4j::DataType::HALF, float16), \ - 
(nd4j::DataType::FLOAT32, float), \ - (nd4j::DataType::DOUBLE, double), \ - (nd4j::DataType::INT32, int32_t), \ - (nd4j::DataType::INT64, Nd4jLong), \ - (nd4j::DataType::BFLOAT16, bfloat16) + (sd::DataType::HALF, float16), \ + (sd::DataType::FLOAT32, float), \ + (sd::DataType::DOUBLE, double), \ + (sd::DataType::INT32, int32_t), \ + (sd::DataType::INT64, Nd4jLong), \ + (sd::DataType::BFLOAT16, bfloat16) #ifdef __ND4J_EXPERIMENTAL__ diff --git a/libnd4j/include/types/u64.h b/libnd4j/include/types/u64.h index 0e9e63145..908a9ba1c 100644 --- a/libnd4j/include/types/u64.h +++ b/libnd4j/include/types/u64.h @@ -21,11 +21,11 @@ #define LIBND4J_U64_H #include -#include +#include #include -namespace nd4j { +namespace sd { typedef struct { int16_t _v0; int16_t _v1; diff --git a/libnd4j/include/types/uint16.h b/libnd4j/include/types/uint16.h index 7daa755e5..5fee50e7a 100644 --- a/libnd4j/include/types/uint16.h +++ b/libnd4j/include/types/uint16.h @@ -22,10 +22,10 @@ #define LIBND4J_UINT16_H #include -#include +#include -namespace nd4j { +namespace sd { uint16_t _CUDA_HD FORCEINLINE cpu_float2uint16(float data); float _CUDA_HD FORCEINLINE cpu_uint162float(uint16_t data); diff --git a/libnd4j/include/types/uint8.h b/libnd4j/include/types/uint8.h index 7feab81a7..a2505c9ab 100644 --- a/libnd4j/include/types/uint8.h +++ b/libnd4j/include/types/uint8.h @@ -22,10 +22,10 @@ #define LIBND4J_UINT8_H #include -#include +#include -namespace nd4j { +namespace sd { float _CUDA_HD FORCEINLINE cpu_uint82float(uint8_t data); uint8_t _CUDA_HD FORCEINLINE cpu_float2uint8(float data); diff --git a/libnd4j/include/types/utf8string.h b/libnd4j/include/types/utf8string.h index b3a794cee..ed25c6e10 100644 --- a/libnd4j/include/types/utf8string.h +++ b/libnd4j/include/types/utf8string.h @@ -22,9 +22,9 @@ #define DEV_TESTS_UTF8STRING_H #include -#include +#include -namespace nd4j { +namespace sd { struct ND4J_EXPORT utf8string { private: bool _allocated = false; diff --git 
a/libnd4j/minifier/minifier.cpp b/libnd4j/minifier/minifier.cpp index 071dacc17..7846c1846 100644 --- a/libnd4j/minifier/minifier.cpp +++ b/libnd4j/minifier/minifier.cpp @@ -23,12 +23,12 @@ #endif #include #include "graphopt.h" -#include +#include #include #include -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd::ops; +using namespace sd::graph; int main(int argc, char *argv[]) { diff --git a/libnd4j/server/CMakeLists.txt b/libnd4j/server/CMakeLists.txt index 7477e25b1..da9c36962 100644 --- a/libnd4j/server/CMakeLists.txt +++ b/libnd4j/server/CMakeLists.txt @@ -45,7 +45,7 @@ else() endif() else() set(CMAKE_CXX_FLAGS " -g -O0 -fPIC -std=c++11 -fmax-errors=2") - if (CPU_BLAS) + if (SD_CPU) SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") endif() endif() @@ -53,7 +53,7 @@ endif() # tests are always compiled with all ops included -SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true") +SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_ALL_OPS=true") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # using Clang @@ -117,7 +117,7 @@ find_package(GRPC REQUIRED) message("gRPC found, building GraphServer") add_executable(GraphServer ./GraphServer.cpp ../include/graph/generated/graph.grpc.fb.cc ../blas/cpu/NativeOps.cpp ../blas/cpu/GraphExecutioner.cpp ../blas/cpu/NativeOpExecutioner.cpp ../blas/cpu/NDArray.cpp - ../include/cnpy/cnpy.cpp ../include/nd4jmemset.h ../include/nd4jmalloc.h + ../include/cnpy/cnpy.cpp ../include/system/nd4jmemset.h ../include/system/nd4jmalloc.h ../blas/Environment.cpp ../blas/Environment.h ${LOOPS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${INDEXING_SOURCES} ${HELPERS_SOURCES} ${CUSTOMOPS_HELPERS_SOURCES} ${OPS_SOURCES}) diff --git a/libnd4j/server/GraphServer.cpp b/libnd4j/server/GraphServer.cpp index 10d0f1575..a9e8c3ddc 100644 --- a/libnd4j/server/GraphServer.cpp +++ b/libnd4j/server/GraphServer.cpp @@ -20,7 +20,7 @@ #include 
"GraphServer.h" #include -#include +#include #include #include #include @@ -33,7 +33,7 @@ -namespace nd4j { +namespace sd { namespace graph { grpc::Status GraphInferenceServerImpl::RegisterGraph( grpc::ServerContext *context, const flatbuffers::grpc::Message *request_msg, flatbuffers::grpc::Message *response_msg) { auto flat_graph = request_msg->GetRoot(); @@ -75,7 +75,7 @@ namespace nd4j { assert(response_msg->Verify()); return grpc::Status::OK; - } catch (nd4j::graph::unknown_graph_exception &e) { + } catch (sd::graph::unknown_graph_exception &e) { grpc::string gmsg(e.message()); return grpc::Status(grpc::StatusCode::NOT_FOUND, gmsg); } catch (std::runtime_error &e) { @@ -100,7 +100,7 @@ namespace nd4j { assert(response_msg->Verify()); return grpc::Status::OK; - } catch (nd4j::graph::unknown_graph_exception &e) { + } catch (sd::graph::unknown_graph_exception &e) { grpc::string gmsg(e.message()); return grpc::Status(grpc::StatusCode::NOT_FOUND, gmsg); } @@ -118,13 +118,13 @@ namespace nd4j { assert(response_msg->Verify()); return grpc::Status::OK; - } catch (nd4j::graph::no_results_exception &e) { + } catch (sd::graph::no_results_exception &e) { grpc::string gmsg(e.message()); return grpc::Status(grpc::StatusCode::INTERNAL, gmsg); - } catch (nd4j::graph::unknown_graph_exception &e) { + } catch (sd::graph::unknown_graph_exception &e) { grpc::string gmsg(e.message()); return grpc::Status(grpc::StatusCode::NOT_FOUND, gmsg); - } catch (nd4j::graph::graph_execution_exception &e) { + } catch (sd::graph::graph_execution_exception &e) { grpc::string gmsg(e.message()); return grpc::Status(grpc::StatusCode::INTERNAL, gmsg); } catch (std::runtime_error &e) { @@ -139,10 +139,10 @@ void RunServer(int port) { assert(port > 0 && port < 65535); std::string server_address("0.0.0.0:"); - server_address += nd4j::StringUtils::valueToString(port); + server_address += sd::StringUtils::valueToString(port); - nd4j::graph::GraphInferenceServerImpl service; - auto registrator = 
nd4j::ops::OpRegistrator::getInstance(); + sd::graph::GraphInferenceServerImpl service; + auto registrator = sd::ops::OpRegistrator::getInstance(); grpc::ServerBuilder builder; builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); @@ -181,7 +181,7 @@ int main(int argc, char *argv[]) { if(cmdOptionExists(argv, argv+argc, "-f")) { auto file = getCmdOption(argv, argv + argc, "-f"); auto graph = GraphExecutioner::importFromFlatBuffers(file); - nd4j::graph::GraphHolder::getInstance()->registerGraph(0L, graph); + sd::graph::GraphHolder::getInstance()->registerGraph(0L, graph); } RunServer(port); diff --git a/libnd4j/server/GraphServer.h b/libnd4j/server/GraphServer.h index 4e9c539d6..0dceacf25 100644 --- a/libnd4j/server/GraphServer.h +++ b/libnd4j/server/GraphServer.h @@ -20,13 +20,13 @@ #include -#include +#include #include #include #include -namespace nd4j { +namespace sd { namespace graph { class GraphInferenceServerImpl final : public GraphInferenceServer::Service { private: diff --git a/libnd4j/tests_cpu/layers_tests/ArrayOptionsTests.cpp b/libnd4j/tests_cpu/layers_tests/ArrayOptionsTests.cpp index 6ddae6019..18551909c 100644 --- a/libnd4j/tests_cpu/layers_tests/ArrayOptionsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ArrayOptionsTests.cpp @@ -20,9 +20,9 @@ #include "testlayers.h" #include -#include +#include -using namespace nd4j; +using namespace sd; class ArrayOptionsTests : public testing::Test { @@ -69,42 +69,42 @@ TEST_F(ArrayOptionsTests, TestShape_Basic_4) { auto dtype = ArrayOptions::dataType(shape); ASSERT_FALSE(ArrayOptions::isSparseArray(shape)); - ASSERT_TRUE(nd4j::DataType::HALF == ArrayOptions::dataType(shape)); - ASSERT_EQ(nd4j::ArrayType::DENSE, ArrayOptions::arrayType(shape)); - ASSERT_EQ(nd4j::SpaceType::QUANTIZED, ArrayOptions::spaceType(shape)); + ASSERT_TRUE(sd::DataType::HALF == ArrayOptions::dataType(shape)); + ASSERT_EQ(sd::ArrayType::DENSE, ArrayOptions::arrayType(shape)); + ASSERT_EQ(sd::SpaceType::QUANTIZED, 
ArrayOptions::spaceType(shape)); } TEST_F(ArrayOptionsTests, TestShape_Basic_5) { ArrayOptions::setPropertyBits(shape, {ARRAY_SPARSE, ARRAY_INT, ARRAY_CSC}); ASSERT_TRUE(ArrayOptions::isSparseArray(shape)); - ASSERT_TRUE(nd4j::DataType::INT32 == ArrayOptions::dataType(shape)); - ASSERT_EQ(nd4j::SparseType::CSC, ArrayOptions::sparseType(shape)); + ASSERT_TRUE(sd::DataType::INT32 == ArrayOptions::dataType(shape)); + ASSERT_EQ(sd::SparseType::CSC, ArrayOptions::sparseType(shape)); } TEST_F(ArrayOptionsTests, TestShape_Basic_6) { ArrayOptions::setPropertyBits(shape, {ARRAY_EMPTY, ARRAY_INT, ARRAY_CSC}); - ASSERT_EQ(nd4j::ArrayType::EMPTY, ArrayOptions::arrayType(shape)); + ASSERT_EQ(sd::ArrayType::EMPTY, ArrayOptions::arrayType(shape)); } TEST_F(ArrayOptionsTests, TestShape_Basic_7) { - ArrayOptions::setDataType(shape, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape, sd::DataType::FLOAT32); - ASSERT_EQ(nd4j::DataType::FLOAT32, ArrayOptions::dataType(shape)); + ASSERT_EQ(sd::DataType::FLOAT32, ArrayOptions::dataType(shape)); } TEST_F(ArrayOptionsTests, TestShape_Basic_8) { - ArrayOptions::setDataType(shape, nd4j::DataType::DOUBLE); - ArrayOptions::setDataType(shape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape, sd::DataType::DOUBLE); + ArrayOptions::setDataType(shape, sd::DataType::FLOAT32); - ASSERT_EQ(nd4j::DataType::FLOAT32, ArrayOptions::dataType(shape)); + ASSERT_EQ(sd::DataType::FLOAT32, ArrayOptions::dataType(shape)); } TEST_F(ArrayOptionsTests, TestShape_Basic_9) { - ArrayOptions::setDataType(shape, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape, nd4j::DataType::DOUBLE); + ArrayOptions::setDataType(shape, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape, sd::DataType::DOUBLE); - ASSERT_EQ(nd4j::DataType::DOUBLE, ArrayOptions::dataType(shape)); + ASSERT_EQ(sd::DataType::DOUBLE, 
ArrayOptions::dataType(shape)); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/AtomicTests.cu b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu index fdf543026..bd024ef3b 100644 --- a/libnd4j/tests_cpu/layers_tests/AtomicTests.cu +++ b/libnd4j/tests_cpu/layers_tests/AtomicTests.cu @@ -20,14 +20,14 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include #include -using namespace nd4j; +using namespace sd; class AtomicTests : public testing::Test { @@ -48,16 +48,16 @@ static _CUDA_G void multiplyKernel(void *vbuffer, uint64_t length, void *vresult auto rem = e % 4; auto i = (e - rem) / 4; - nd4j::math::atomics::nd4j_atomicMul(&result[i], buffer[e]); + sd::math::atomics::nd4j_atomicMul(&result[i], buffer[e]); } } template static void multiplyLauncher(void *vbuffer, uint64_t length, void *vresult) { - multiplyKernel<<<256, 256, 1024, *nd4j::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); - auto err = cudaStreamSynchronize(*nd4j::LaunchContext::defaultContext()->getCudaStream()); + multiplyKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); + auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - nd4j::cuda_exception::build("multiply failed", err); + sd::cuda_exception::build("multiply failed", err); } template @@ -71,16 +71,16 @@ static _CUDA_G void sumKernel(void *vbuffer, uint64_t length, void *vresult) { auto rem = e % 4; auto i = (e - rem) / 4; - nd4j::math::atomics::nd4j_atomicAdd(&result[i], buffer[e]); + sd::math::atomics::nd4j_atomicAdd(&result[i], buffer[e]); } } template static void sumLauncher(void *vbuffer, uint64_t length, void *vresult) { - sumKernel<<<256, 256, 1024, *nd4j::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); - auto err = cudaStreamSynchronize(*nd4j::LaunchContext::defaultContext()->getCudaStream()); + 
sumKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); + auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - nd4j::cuda_exception::build("sum failed", err); + sd::cuda_exception::build("sum failed", err); } template @@ -94,16 +94,16 @@ static _CUDA_G void subKernel(void *vbuffer, uint64_t length, void *vresult) { auto rem = e % 4; auto i = (e - rem) / 4; - nd4j::math::atomics::nd4j_atomicSub(&result[i], buffer[e]); + sd::math::atomics::nd4j_atomicSub(&result[i], buffer[e]); } } template static void subLauncher(void *vbuffer, uint64_t length, void *vresult) { - subKernel<<<256, 256, 1024, *nd4j::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); - auto err = cudaStreamSynchronize(*nd4j::LaunchContext::defaultContext()->getCudaStream()); + subKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); + auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - nd4j::cuda_exception::build("sub failed", err); + sd::cuda_exception::build("sub failed", err); } template @@ -117,16 +117,16 @@ static _CUDA_G void divKernel(void *vbuffer, uint64_t length, void *vresult) { auto rem = e % 4; auto i = (e - rem) / 4; - nd4j::math::atomics::nd4j_atomicDiv(&result[i], buffer[e]); + sd::math::atomics::nd4j_atomicDiv(&result[i], buffer[e]); } } template static void divLauncher(void *vbuffer, uint64_t length, void *vresult) { - divKernel<<<256, 256, 1024, *nd4j::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); - auto err = cudaStreamSynchronize(*nd4j::LaunchContext::defaultContext()->getCudaStream()); + divKernel<<<256, 256, 1024, *sd::LaunchContext::defaultContext()->getCudaStream()>>>(vbuffer, length, vresult); + auto err = cudaStreamSynchronize(*sd::LaunchContext::defaultContext()->getCudaStream()); if (err != 0) - 
nd4j::cuda_exception::build("div failed", err); + sd::cuda_exception::build("div failed", err); } static void multiplyHost(NDArray &input, NDArray &output) { @@ -146,7 +146,7 @@ static void divHost(NDArray &input, NDArray &output) { } TEST_F(AtomicTests, test_multiply) { - std::vector dtypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE, nd4j::DataType::INT16, nd4j::DataType::HALF}; + std::vector dtypes = {sd::DataType::FLOAT32, sd::DataType::DOUBLE, sd::DataType::INT16, sd::DataType::HALF}; for (auto t:dtypes) { nd4j_printf("Trying data type [%s]\n", DataTypeUtils::asString(t).c_str()); @@ -164,7 +164,7 @@ TEST_F(AtomicTests, test_multiply) { } TEST_F(AtomicTests, test_multiply_2) { - std::vector dtypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE, nd4j::DataType::HALF, nd4j::DataType::BFLOAT16}; + std::vector dtypes = {sd::DataType::FLOAT32, sd::DataType::DOUBLE, sd::DataType::HALF, sd::DataType::BFLOAT16}; for (auto t:dtypes) { nd4j_printf("Trying data type [%s]\n", DataTypeUtils::asString(t).c_str()); @@ -183,7 +183,7 @@ TEST_F(AtomicTests, test_multiply_2) { } TEST_F(AtomicTests, test_sum) { - std::vector dtypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE, nd4j::DataType::BFLOAT16, nd4j::DataType::HALF, nd4j::DataType::INT16}; + std::vector dtypes = {sd::DataType::FLOAT32, sd::DataType::DOUBLE, sd::DataType::BFLOAT16, sd::DataType::HALF, sd::DataType::INT16}; for (auto t:dtypes) { nd4j_printf("Trying data type [%s]\n", DataTypeUtils::asString(t).c_str()); @@ -202,7 +202,7 @@ TEST_F(AtomicTests, test_sum) { } TEST_F(AtomicTests, test_sub) { - std::vector dtypes = {nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE, nd4j::DataType::HALF}; + std::vector dtypes = {sd::DataType::FLOAT32, sd::DataType::DOUBLE, sd::DataType::HALF}; for (auto t:dtypes) { nd4j_printf("Trying data type [%s]\n", DataTypeUtils::asString(t).c_str()); @@ -222,7 +222,7 @@ TEST_F(AtomicTests, test_sub) { } TEST_F(AtomicTests, test_div) { - std::vector dtypes = 
{nd4j::DataType::FLOAT32, nd4j::DataType::DOUBLE, nd4j::DataType::BFLOAT16, nd4j::DataType::HALF}; + std::vector dtypes = {sd::DataType::FLOAT32, sd::DataType::DOUBLE, sd::DataType::BFLOAT16, sd::DataType::HALF}; for (auto t:dtypes) { nd4j_printf("Trying data type [%s]\n", DataTypeUtils::asString(t).c_str()); diff --git a/libnd4j/tests_cpu/layers_tests/AttentionTests.cpp b/libnd4j/tests_cpu/layers_tests/AttentionTests.cpp index aa9d941ea..28fe404e1 100644 --- a/libnd4j/tests_cpu/layers_tests/AttentionTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/AttentionTests.cpp @@ -20,13 +20,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class AttentionTests : public testing::Test { @@ -42,7 +42,7 @@ TEST_F(AttentionTests, basic_dot_product_attention) { auto values = NDArrayFactory::create('c', {10, 4, 3}); auto queries = NDArrayFactory::create('c', {10, 4, 1}); - nd4j::ops::dot_product_attention op; + sd::ops::dot_product_attention op; auto result = op.evaluate({&queries, &keys, &values}, {1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -57,7 +57,7 @@ TEST_F(AttentionTests, basic_dot_product_attention_bp) { auto queries = NDArrayFactory::create('c', {10, 4, 1}); auto eps = NDArrayFactory::create('c', {10, 4, 1}); - nd4j::ops::dot_product_attention_bp op; + sd::ops::dot_product_attention_bp op; auto result = op.execute({&queries, &keys, &values, &eps}, {}, {1, 0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -70,7 +70,7 @@ TEST_F(AttentionTests, basic_dot_product_attention_with_weights) { auto values = NDArrayFactory::create('c', {10, 4, 3}); auto queries = NDArrayFactory::create('c', {10, 4, 1}); - nd4j::ops::dot_product_attention op; + sd::ops::dot_product_attention op; auto result = op.evaluate({&queries, &keys, &values}, {1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -84,7 +84,7 @@ TEST_F(AttentionTests, basic_dot_product_attention_with_mask) { auto mask = 
NDArrayFactory::create('c', {10, 3}); mask.assign(1.); - nd4j::ops::dot_product_attention op; + sd::ops::dot_product_attention op; auto result = op.evaluate({&queries, &keys, &values, &mask}, {1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -101,7 +101,7 @@ TEST_F(AttentionTests, basic_dot_product_attention_bp_with_mask) { auto mask = NDArrayFactory::create('c', {10, 3}); mask.assign(1.); - nd4j::ops::dot_product_attention_bp op; + sd::ops::dot_product_attention_bp op; auto result = op.execute({&queries, &keys, &values, &eps, &mask}, {}, {1, 0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -116,7 +116,7 @@ TEST_F(AttentionTests, multi_head_input_dot_product_attention_with_mask) { auto mask = NDArrayFactory::create('c', {2, 3}); mask.assign(1.); - nd4j::ops::dot_product_attention op; + sd::ops::dot_product_attention op; auto result = op.evaluate({&queries, &keys, &values, &mask}, {1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -133,7 +133,7 @@ TEST_F(AttentionTests, multi_head_input_dot_product_attention_bp_with_mask) { auto mask = NDArrayFactory::create('c', {2, 3}); mask.assign(1.); - nd4j::ops::dot_product_attention_bp op; + sd::ops::dot_product_attention_bp op; auto result = op.execute({&queries, &keys, &values, &eps, &mask}, {}, {1, 0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -152,7 +152,7 @@ TEST_F(AttentionTests, basic_multi_head_dot_product_attention) { auto Wq = NDArrayFactory::create('c', {2, 3, 4}); auto Wo = NDArrayFactory::create('c', {2* 3, 4}); - nd4j::ops::multi_head_dot_product_attention op; + sd::ops::multi_head_dot_product_attention op; auto result = op.evaluate({&queries, &keys, &values, &Wk, &Wv, &Wq, &Wo}, {1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -174,7 +174,7 @@ TEST_F(AttentionTests, basic_multi_head_dot_product_bp_attention) { auto eps = NDArrayFactory::create('c', {10, 7, 2}); - nd4j::ops::multi_head_dot_product_attention_bp op; + sd::ops::multi_head_dot_product_attention_bp op; auto result = 
op.execute({&queries, &keys, &values, &Wk, &Wv, &Wq, &Wo, &eps}, {}, {1, 0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -196,7 +196,7 @@ TEST_F(AttentionTests, basic_multi_head_dot_product_attention_with_mask) { mask.assign(1.); - nd4j::ops::multi_head_dot_product_attention op; + sd::ops::multi_head_dot_product_attention op; auto result = op.evaluate({&queries, &keys, &values, &Wk, &Wv, &Wq, &Wo, &mask}, {1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -221,7 +221,7 @@ TEST_F(AttentionTests, basic_multi_head_dot_product_bp_attention_with_mask) { mask.assign(1.); - nd4j::ops::multi_head_dot_product_attention_bp op; + sd::ops::multi_head_dot_product_attention_bp op; auto result = op.execute({&queries, &keys, &values, &Wk, &Wv, &Wq, &Wo, &eps, &mask}, {}, {1, 0}, {}); ASSERT_EQ(Status::OK(), result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/BackpropTests.cpp b/libnd4j/tests_cpu/layers_tests/BackpropTests.cpp index 88bd9b286..fc08184b9 100644 --- a/libnd4j/tests_cpu/layers_tests/BackpropTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BackpropTests.cpp @@ -21,9 +21,9 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class BackpropTests : public testing::Test { public: @@ -32,11 +32,11 @@ public: TEST_F(BackpropTests, Test_Add_1) { - NDArray x('c', {2, 3, 4}, nd4j::DataType::FLOAT32); - NDArray y('c', {3, 4}, nd4j::DataType::FLOAT32); - NDArray e('c', {2, 3, 4}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 3, 4}, sd::DataType::FLOAT32); + NDArray y('c', {3, 4}, sd::DataType::FLOAT32); + NDArray e('c', {2, 3, 4}, sd::DataType::FLOAT32); - nd4j::ops::add_bp op; + sd::ops::add_bp op; auto result = op.evaluate({&x, &y, &e}); ASSERT_EQ(Status::OK(), result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/BitwiseUtilsTests.cpp b/libnd4j/tests_cpu/layers_tests/BitwiseUtilsTests.cpp index 
487f7ba9b..4174637e2 100644 --- a/libnd4j/tests_cpu/layers_tests/BitwiseUtilsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BitwiseUtilsTests.cpp @@ -20,12 +20,12 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class BitwiseUtilsTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp index 38aada40f..ce6b61707 100644 --- a/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BooleanOpsTests.cpp @@ -21,9 +21,9 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class BooleanOpsTests : public testing::Test { public: @@ -35,7 +35,7 @@ TEST_F(BooleanOpsTests, LtTest_1) { auto x = NDArrayFactory::create_(1.0f); auto y = NDArrayFactory::create_(2.0f); - nd4j::ops::lt_scalar op; + sd::ops::lt_scalar op; ASSERT_TRUE(op.verify({x, y})); @@ -48,7 +48,7 @@ TEST_F(BooleanOpsTests, LtTest_2) { auto x = NDArrayFactory::create_(2.0f); auto y = NDArrayFactory::create_(1.0f); - nd4j::ops::lt_scalar op; + sd::ops::lt_scalar op; ASSERT_FALSE(op.verify({x, y})); @@ -60,7 +60,7 @@ TEST_F(BooleanOpsTests, LtTest_2) { TEST_F(BooleanOpsTests, Is_non_decreasing_1) { auto x = NDArrayFactory::create('c', {2 , 2}, {1, 2, 4, 4}); - nd4j::ops::is_non_decreasing op; + sd::ops::is_non_decreasing op; ASSERT_TRUE(op.verify({&x})); @@ -69,7 +69,7 @@ TEST_F(BooleanOpsTests, Is_non_decreasing_1) { TEST_F(BooleanOpsTests, Is_non_decreasing_2) { auto x = NDArrayFactory::create('c', {2 , 2}, {1, 2, 4, 3}); - nd4j::ops::is_non_decreasing op; + sd::ops::is_non_decreasing op; ASSERT_FALSE(op.verify({&x})); @@ -78,7 +78,7 @@ TEST_F(BooleanOpsTests, Is_non_decreasing_2) { TEST_F(BooleanOpsTests, 
Is_strictly_increasing_1) { auto x = NDArrayFactory::create('c', {2 , 2}, {1, 2, 4, 5}); - nd4j::ops::is_strictly_increasing op; + sd::ops::is_strictly_increasing op; ASSERT_TRUE(op.verify({&x})); @@ -87,7 +87,7 @@ TEST_F(BooleanOpsTests, Is_strictly_increasing_1) { TEST_F(BooleanOpsTests, Is_strictly_increasing_2) { auto x = NDArrayFactory::create('c', {2 , 2}, {1, 2, 3, 3}); - nd4j::ops::is_strictly_increasing op; + sd::ops::is_strictly_increasing op; ASSERT_FALSE(op.verify({&x})); @@ -96,7 +96,7 @@ TEST_F(BooleanOpsTests, Is_strictly_increasing_2) { TEST_F(BooleanOpsTests, Is_strictly_increasing_3) { auto x = NDArrayFactory::create('c', {2 , 2}, {1, 2, 4, 3}); - nd4j::ops::is_strictly_increasing op; + sd::ops::is_strictly_increasing op; ASSERT_FALSE(op.verify({&x})); } @@ -105,7 +105,7 @@ TEST_F(BooleanOpsTests, Is_strictly_increasing_5) { auto x = NDArrayFactory::create('c', {64, 512}); x.linspace(1.0); - nd4j::ops::is_strictly_increasing op; + sd::ops::is_strictly_increasing op; ASSERT_TRUE(op.verify({&x})); } @@ -116,7 +116,7 @@ TEST_F(BooleanOpsTests, Is_strictly_increasing_6) { x.p(18, 1000323.f); - nd4j::ops::is_strictly_increasing op; + sd::ops::is_strictly_increasing op; ASSERT_FALSE(op.verify({&x})); } @@ -124,7 +124,7 @@ TEST_F(BooleanOpsTests, Is_strictly_increasing_6) { TEST_F(BooleanOpsTests, Is_numeric_tensor_1) { auto x = NDArrayFactory::create('c', {2 , 2}, {1.f, 2.f, 4.f, 3.f}); - nd4j::ops::is_numeric_tensor op; + sd::ops::is_numeric_tensor op; ASSERT_TRUE(op.verify({&x})); } @@ -134,7 +134,7 @@ TEST_F(BooleanOpsTests, test_where_1) { auto y = NDArrayFactory::create('c', {6}, { 2, -3, 1, 1, -2, 1 }); auto e = NDArrayFactory::create('c', {3}, { 4, 8, 5 }); - nd4j::ops::choose op; + sd::ops::choose op; auto result = op.evaluate({&x, &y}, {3}); ASSERT_EQ(Status::OK(), result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp index 9b6d06ec6..3861c0ad8 100644 
--- a/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BroadcastableOpsTests.cpp @@ -20,12 +20,12 @@ #include "testlayers.h" -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class BroadcastableOpsTests : public testing::Test { public: @@ -34,9 +34,9 @@ public: TEST_F(BroadcastableOpsTests, Test_Add_1) { - NDArray x('c', {5, 5}, nd4j::DataType::FLOAT32); - NDArray y('c', {1, 5}, nd4j::DataType::FLOAT32); - NDArray exp('c', {5, 5}, nd4j::DataType::FLOAT32); + NDArray x('c', {5, 5}, sd::DataType::FLOAT32); + NDArray y('c', {1, 5}, sd::DataType::FLOAT32); + NDArray exp('c', {5, 5}, sd::DataType::FLOAT32); x.linspace(1); y.linspace(1); exp.linspace(1); @@ -45,7 +45,7 @@ TEST_F(BroadcastableOpsTests, Test_Add_1) { exp.applyBroadcast(broadcast::Add, {1}, y, exp); - nd4j::ops::add op; + sd::ops::add op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -72,7 +72,7 @@ TEST_F(BroadcastableOpsTests, Test_Multiply_1) { exp.applyBroadcast(broadcast::Multiply, {1}, y, exp); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -97,7 +97,7 @@ TEST_F(BroadcastableOpsTests, Test_SquaredSubtract_1) { exp.applyBroadcast(broadcast::SquaredSubtract, {1}, y, exp); - nd4j::ops::squaredsubtract op; + sd::ops::squaredsubtract op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -116,7 +116,7 @@ TEST_F(BroadcastableOpsTests, Test_ScalarBroadcast_1) { auto y = NDArrayFactory::create('c', {1, 3}, {0, 1, 2}); auto exp = NDArrayFactory::create('c', {1,3}, {1, 0, -1}); - nd4j::ops::subtract op; + sd::ops::subtract op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -135,7 +135,7 @@ TEST_F(BroadcastableOpsTests, Test_ScalarBroadcast_2) { auto y = 
NDArrayFactory::create('c', {1, 3}, {0, 1, 2}); auto exp = NDArrayFactory::create('c', {1,3}, {1, 2, 3}); - nd4j::ops::add op; + sd::ops::add op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -154,7 +154,7 @@ TEST_F(BroadcastableOpsTests, Test_Maximum_1) { auto row = NDArrayFactory::create('c', {1, 3}, {2, 2, 2}); auto exp = NDArrayFactory::create('c', {2, 3}, {2, 2, 2, 2, 3, 2}); - nd4j::ops::maximum op; + sd::ops::maximum op; auto result = op.evaluate({&x, &row}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -172,7 +172,7 @@ TEST_F(BroadcastableOpsTests, Test_Minimum_1) { auto col = NDArrayFactory::create('c', {2, 1}, {2, 1}); auto exp = NDArrayFactory::create('c', {2, 3}, {1, 2, 1, 1, 1, 1}); - nd4j::ops::minimum op; + sd::ops::minimum op; auto result = op.evaluate({&x, &col}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -187,7 +187,7 @@ TEST_F(BroadcastableOpsTests, Test_Minimum_1) { TEST_F(BroadcastableOpsTests, Test_Shape_1) { - nd4j::ops::minimum op; + sd::ops::minimum op; Nd4jLong shapeX[] = {2, 2, 5, 5, 1, 8192, 1, 99}; Nd4jLong shapeY[] = {2, 2, 5, 5, 1, 8192, 1, 99}; @@ -204,7 +204,7 @@ TEST_F(BroadcastableOpsTests, Test_Shape_1) { } TEST_F(BroadcastableOpsTests, Test_Shape_2) { - nd4j::ops::minimum op; + sd::ops::minimum op; Nd4jLong shapeX[] = {2, 1, 1, 1, 1, 8192, 1, 99}; Nd4jLong shapeY[] = {2, 2, 5, 5, 1, 8192, 1, 99}; @@ -222,7 +222,7 @@ TEST_F(BroadcastableOpsTests, Test_Shape_2) { TEST_F(BroadcastableOpsTests, Test_Shape_3) { - nd4j::ops::minimum op; + sd::ops::minimum op; Nd4jLong shapeX[] = {2, 5, 3, 1, 1, 8192, 1, 99}; Nd4jLong shapeY[] = {2, 1, 3, 3, 1, 8192, 1, 99}; @@ -240,7 +240,7 @@ TEST_F(BroadcastableOpsTests, Test_Shape_3) { TEST_F(BroadcastableOpsTests, Test_Shape_4) { - nd4j::ops::minimum op; + sd::ops::minimum op; Nd4jLong shapeX[] = {2, 5, 3, 1, 1, 8192, 1, 99}; Nd4jLong shapeY[] = {2, 5, 1, 1, 1, 8192, 1, 99}; @@ -259,7 +259,7 @@ TEST_F(BroadcastableOpsTests, Test_Shape_4) { // (2,1,3) 
+ (4,3) = (2,4,3) TEST_F(BroadcastableOpsTests, Test_Shape_5) { - nd4j::ops::minimum op; + sd::ops::minimum op; Nd4jLong shapeX[] = {3, 2, 1, 3, 3, 3, 1, 8192, 1, 99}; Nd4jLong shapeY[] = {2, 4, 3, 3, 1, 8192, 1, 99}; @@ -281,7 +281,7 @@ TEST_F(BroadcastableOpsTests, Test_Scalar_Add_1) { auto y = NDArrayFactory::create(2.0f); auto exp = NDArrayFactory::create('c', {2, 2}, {3, 4, 5, 6}); - nd4j::ops::add op; + sd::ops::add op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -303,7 +303,7 @@ TEST_F(BroadcastableOpsTests, Test_Inplace_Output_1) { y.assign(1.0f); e.assign(1.0f); - nd4j::ops::add op; + sd::ops::add op; auto result = op.execute({&x, &y}, {&o}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); @@ -331,7 +331,7 @@ TEST_F(BroadcastableOpsTests, Test_Subtract_2) { auto y = NDArrayFactory::create('c', {2}, {0.0f, 1.0f}); auto e = NDArrayFactory::create('c', {2}, {1.0f, 0.0f}); - nd4j::ops::subtract op; + sd::ops::subtract op; auto result = op.evaluate({&x, &y}); auto z = result->at(0); @@ -346,7 +346,7 @@ TEST_F(BroadcastableOpsTests, Test_Subtract_3) { auto z = NDArrayFactory::create('c', {2}, {0.0f, 0.0f}); auto e = NDArrayFactory::create('c', {2}, {1.0f, 0.0f}); - nd4j::ops::subtract op; + sd::ops::subtract op; auto result = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); @@ -509,7 +509,7 @@ TEST_F(BroadcastableOpsTests, Test_Multiply_7) { auto y = NDArrayFactory::create('c', {1}, {4.f}); auto e = NDArrayFactory::create('c', {1}, {8.f}); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -525,7 +525,7 @@ TEST_F(BroadcastableOpsTests, Test_Multiply_8) { auto y = NDArrayFactory::create('c', {1, 1}, {4.f}); auto e = NDArrayFactory::create('c', {1, 1}, {8.f}); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -541,10 +541,10 @@ 
TEST_F(BroadcastableOpsTests, broadcast_add_1) { NDArray x('c', {4}, {1,1,1,1}); NDArray y('c', {1,4}, {1,2,3,4}); - NDArray z('c', {1,4}, nd4j::DataType::DOUBLE); - NDArray exp('c', {1,4}, {2,3,4,5}, nd4j::DataType::DOUBLE); + NDArray z('c', {1,4}, sd::DataType::DOUBLE); + NDArray exp('c', {1,4}, {2,3,4,5}, sd::DataType::DOUBLE); - nd4j::ops::add op; + sd::ops::add op; auto status = op.execute({&x, &y}, {&z}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -556,10 +556,10 @@ TEST_F(BroadcastableOpsTests, broadcast_equals_1) { NDArray x('c', {1,4}, {1,2,3,4}); NDArray y('c', {3,4}, {0,0,0,0, 1,2,3,4, 1,2,3,4}); - NDArray z('c', {3,4}, nd4j::DataType::BOOL); - NDArray exp('c', {3,4}, {0,0,0,0, 1,1,1,1, 1,1,1,1}, nd4j::DataType::BOOL); + NDArray z('c', {3,4}, sd::DataType::BOOL); + NDArray exp('c', {3,4}, {0,0,0,0, 1,1,1,1, 1,1,1,1}, sd::DataType::BOOL); - nd4j::ops::equals op; + sd::ops::equals op; auto status = op.execute({&x, &y}, {&z}); // z.printIndexedBuffer(); @@ -571,11 +571,11 @@ TEST_F(BroadcastableOpsTests, broadcast_equals_1) { TEST_F(BroadcastableOpsTests, broadcast_empty_1) { NDArray y('c', {3,4}, {0,0,0,0, 1,2,3,4, 1,2,3,4}); - NDArray x(nd4j::DataType::DOUBLE, y.getContext(), false); - NDArray z(nd4j::DataType::DOUBLE, y.getContext(), false); - NDArray zExp(nd4j::DataType::DOUBLE, y.getContext(), false); + NDArray x(sd::DataType::DOUBLE, y.getContext(), false); + NDArray z(sd::DataType::DOUBLE, y.getContext(), false); + NDArray zExp(sd::DataType::DOUBLE, y.getContext(), false); - nd4j::ops::multiply op; + sd::ops::multiply op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -589,7 +589,7 @@ TEST_F(BroadcastableOpsTests, broadcast_empty_2) { NDArray x = NDArrayFactory::create('c', {0, 4}); NDArray e = NDArrayFactory::create('c', {0, 4});; - nd4j::ops::multiply op; + sd::ops::multiply op; auto status = op.execute({&x, &y}, {&x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -600,10 +600,10 @@ 
TEST_F(BroadcastableOpsTests, broadcast_empty_2) { TEST_F(BroadcastableOpsTests, broadcast_empty_3) { NDArray x = NDArrayFactory::create('c', {1, 0, 2}); - NDArray y('c', {}, std::vector{0.1}, nd4j::DataType::FLOAT32); + NDArray y('c', {}, std::vector{0.1}, sd::DataType::FLOAT32); NDArray e = NDArrayFactory::create('c', {1, 0, 2});; - nd4j::ops::maximum op; + sd::ops::maximum op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -622,7 +622,7 @@ TEST_F(BroadcastableOpsTests, broadcast_empty_4) { NDArray y = NDArrayFactory::create('c', {1, 0, 2}); NDArray e = NDArrayFactory::create('c', {1, 0, 2});; - nd4j::ops::maximum op; + sd::ops::maximum op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -641,7 +641,7 @@ TEST_F(BroadcastableOpsTests, broadcast_empty_5) { NDArray y = NDArrayFactory::create('c', {1, 0, 2}); NDArray e = NDArrayFactory::create('c', {1, 0, 2});; - nd4j::ops::realdiv op; + sd::ops::realdiv op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -660,7 +660,7 @@ TEST_F(BroadcastableOpsTests, broadcast_empty_6) { NDArray y = NDArrayFactory::create('c', {1, 2}, {2, 2}); NDArray e = NDArrayFactory::create('c', {1, 0, 2});; - nd4j::ops::realdiv op; + sd::ops::realdiv op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -679,7 +679,7 @@ TEST_F(BroadcastableOpsTests, broadcast_empty_7) { NDArray y = NDArrayFactory::create('c', {1, 2, 0}); NDArray e = NDArrayFactory::create('c', {1, 0, 2, 0});; - nd4j::ops::realdiv op; + sd::ops::realdiv op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -696,11 +696,11 @@ TEST_F(BroadcastableOpsTests, broadcast_empty_7) { TEST_F(BroadcastableOpsTests, broadcast_bool_empty_1) { NDArray y('c', {3,4}, {0,0,0,0, 1,2,3,4, 1,2,3,4}); - NDArray x(nd4j::DataType::DOUBLE, y.getContext(), false); - NDArray z(nd4j::DataType::BOOL, y.getContext(), false); - NDArray 
zExp(nd4j::DataType::BOOL, y.getContext(), false); + NDArray x(sd::DataType::DOUBLE, y.getContext(), false); + NDArray z(sd::DataType::BOOL, y.getContext(), false); + NDArray zExp(sd::DataType::BOOL, y.getContext(), false); - nd4j::ops::greater op; + sd::ops::greater op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -715,7 +715,7 @@ TEST_F(BroadcastableOpsTests, broadcast_bool_empty_2) { NDArray e = NDArrayFactory::create('c', {0, 4});; - nd4j::ops::greater op; + sd::ops::greater op; auto result = op.evaluate({&x, &y}); auto z = result->at(0); @@ -731,16 +731,16 @@ TEST_F(BroadcastableOpsTests, broadcast_bool_empty_2) { TEST_F(BroadcastableOpsTests, broadcast_bool_1) { - NDArray x('c', {3, 1, 2}, nd4j::DataType::FLOAT32); - NDArray y('c', {2, 2}, nd4j::DataType::FLOAT32); - NDArray z('c', {3, 2, 2}, nd4j::DataType::BOOL); - NDArray e('c', {3, 2, 2}, nd4j::DataType::BOOL); + NDArray x('c', {3, 1, 2}, sd::DataType::FLOAT32); + NDArray y('c', {2, 2}, sd::DataType::FLOAT32); + NDArray z('c', {3, 2, 2}, sd::DataType::BOOL); + NDArray e('c', {3, 2, 2}, sd::DataType::BOOL); x.assign(4.f); y.assign(2.f); e.assign(true); - nd4j::ops::greater op; + sd::ops::greater op; auto status = op.execute({&x, &y}, {&z}); @@ -754,16 +754,16 @@ TEST_F(BroadcastableOpsTests, broadcast_bool_1) { TEST_F(BroadcastableOpsTests, broadcast_bool_2) { - NDArray x('c', {3, 1, 2}, nd4j::DataType::FLOAT32); - NDArray y('c', {2, 2}, nd4j::DataType::FLOAT32); - NDArray z('c', {3, 2, 2}, nd4j::DataType::BOOL); - NDArray e('c', {3, 2, 2}, nd4j::DataType::BOOL); + NDArray x('c', {3, 1, 2}, sd::DataType::FLOAT32); + NDArray y('c', {2, 2}, sd::DataType::FLOAT32); + NDArray z('c', {3, 2, 2}, sd::DataType::BOOL); + NDArray e('c', {3, 2, 2}, sd::DataType::BOOL); x.assign(1.f); y.assign(2.f); e.assign(false); - nd4j::ops::equals op; + sd::ops::equals op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); @@ -779,12 +779,12 @@ TEST_F(BroadcastableOpsTests, 
broadcast_bool_3) { auto x = NDArrayFactory::create(0); auto y = NDArrayFactory::create('c', {3}, {2, 1, 2}); - NDArray z('c', {3}, nd4j::DataType::BOOL); - NDArray e('c', {3}, nd4j::DataType::BOOL); + NDArray z('c', {3}, sd::DataType::BOOL); + NDArray e('c', {3}, sd::DataType::BOOL); e.assign(true); - nd4j::ops::less op; + sd::ops::less op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -796,16 +796,16 @@ TEST_F(BroadcastableOpsTests, broadcast_bool_3) { } TEST_F(BroadcastableOpsTests, broadcast_2) { - NDArray x('c', {3, 1, 2}, nd4j::DataType::FLOAT32); - NDArray y('c', {2, 2}, nd4j::DataType::FLOAT32); - NDArray z('c', {3, 2, 2}, nd4j::DataType::FLOAT32); - NDArray e('c', {3, 2, 2}, nd4j::DataType::FLOAT32); + NDArray x('c', {3, 1, 2}, sd::DataType::FLOAT32); + NDArray y('c', {2, 2}, sd::DataType::FLOAT32); + NDArray z('c', {3, 2, 2}, sd::DataType::FLOAT32); + NDArray e('c', {3, 2, 2}, sd::DataType::FLOAT32); x = 4.f; y = 2.f; e = -2.f; - nd4j::ops::reversesubtract op; // z = y - x; + sd::ops::reversesubtract op; // z = y - x; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); @@ -820,10 +820,10 @@ TEST_F(BroadcastableOpsTests, broadcast_2) { TEST_F(BroadcastableOpsTests, broadcast_3) { auto x = NDArrayFactory::create(0); auto y = NDArrayFactory::create('c', {3}, {2, 1, 2}); - NDArray z('c', {3}, nd4j::DataType::INT32); + NDArray z('c', {3}, sd::DataType::INT32); auto e = NDArrayFactory::create('c', {3}, {2, 1, 2}); - nd4j::ops::add op; + sd::ops::add op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -850,7 +850,7 @@ TEST_F(BroadcastableOpsTests, test_bert_multiply_1) { ctx.setInputArray(1, &y); ctx.setOutputArray(0, &z); - nd4j::ops::multiply op; + sd::ops::multiply op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); diff --git a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp index 
bc2ae2152..ed97c3137 100644 --- a/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/BrodcastTests.cpp @@ -19,7 +19,7 @@ // #include "testinclude.h" -#include +#include class BroadcastMultiDimTest : public testing::Test { public: @@ -54,7 +54,7 @@ TEST_F(BroadcastMultiDimTest,MultimDimTest) { tad->tadOnlyShapeInfo, //tadShapeInfo tad->tadOffsets, //tadOffset tad->tadOnlyShapeInfo, //tadShapeInfoZ - tad->tadOffsets, nd4j::LoopKind::COMMON, 0, tad->numTads); //tadOffsetZ + tad->tadOffsets, sd::LoopKind::COMMON, 0, tad->numTads); //tadOffsetZ for(int i = 0; i < 30; i++) { ASSERT_EQ(dataAssertion[i],result[i]); } diff --git a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt index 17ae714cd..07f473dbc 100644 --- a/libnd4j/tests_cpu/layers_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/layers_tests/CMakeLists.txt @@ -1,5 +1,5 @@ include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(../../include ../../layers ../../include/helpers ../../include/exceptions ../../include/execution ../../include/array ../../include/memory ../../include/loops ../../include/graph ../../include/ops ../../include/types ../../include/cnpy ../../blas ../lib/googletest-release-1.8.0/googletest/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) if(LINUX) link_directories(/usr/local/lib) link_directories(/usr/lib) @@ -21,7 +21,7 @@ if(WIN32) endforeach() endif() -if (CUDA_BLAS) +if (SD_CUDA) find_package(CUDA) message("Tests CUDA include directory: ${CUDA_INCLUDE_DIRS}") include_directories(${CUDA_INCLUDE_DIRS}) @@ -44,11 +44,11 @@ endif() if (APPLE) set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2 -D__APPLE_OS__=true") elseif(WIN32) - if (CPU_BLAS) + if (SD_CPU) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -march=native -mtune=native -O3") endif() - if (CPU_BLAS AND LINUX) + if (SD_CPU AND LINUX) set(CMAKE_CXX_FLAGS " -fPIC -fmax-errors=2") endif() else() @@ 
-60,7 +60,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native") endif() - if (CPU_BLAS) + if (SD_CPU AND SD_SANITIZE) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -fsanitize=address") else() # CUDA? @@ -69,7 +69,7 @@ endif() # tests are always compiled with all ops included -SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true -DBUILD_TESTS=true") +SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_ALL_OPS=true -DBUILD_TESTS=true") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # using Clang @@ -109,9 +109,9 @@ else() message("OPENMP NOT FOUND") endif() -if (CPU_BLAS) +if (SD_CPU) file(GLOB_RECURSE TEST_SOURCES false ./*.cpp ./*.h) -elseif (CUDA_BLAS) +elseif (SD_CUDA) file(GLOB_RECURSE TEST_SOURCES false ./*.cpp ./*.cu ./*.h) endif() @@ -124,14 +124,14 @@ foreach (TMP_PATH ${TEST_SOURCES}) endif () endforeach(TMP_PATH) -if (CPU_BLAS) +if (SD_CPU) if (NOT BLAS_LIBRARIES) set(BLAS_LIBRARIES "") endif() add_executable(runtests ${TEST_SOURCES}) - target_link_libraries(runtests ${LIBND4J_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main) -elseif(CUDA_BLAS) + target_link_libraries(runtests ${SD_LIBRARY_NAME}static ${MKLDNN_LIBRARIES} ${OPENBLAS_LIBRARIES} ${MKLDNN} ${BLAS_LIBRARIES} ${CPU_FEATURES} gtest gtest_main) +elseif(SD_CUDA) add_executable(runtests ${TEST_SOURCES}) @@ -148,5 +148,5 @@ elseif(CUDA_BLAS) message("CUDNN library: ${CUDNN}") endif() - target_link_libraries(runtests ${LIBND4J_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN} gtest gtest_main) + target_link_libraries(runtests ${SD_LIBRARY_NAME}static ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDNN} ${MKLDNN} gtest gtest_main) endif() \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/CnpyTests.cpp b/libnd4j/tests_cpu/layers_tests/CnpyTests.cpp index 086da26c5..ea8025592 100644 --- 
a/libnd4j/tests_cpu/layers_tests/CnpyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/CnpyTests.cpp @@ -20,7 +20,7 @@ #include "testinclude.h" #include -#include +#include class FileTest : public testing::Test { @@ -38,28 +38,28 @@ TEST_F(HeaderTest, test_dataTypes_1) { std::string header("0NUMPY6789{'descr': '>f4"); - ASSERT_EQ(nd4j::DataType::FLOAT32, dataTypeFromNpyHeader(const_cast(header.data()))); + ASSERT_EQ(sd::DataType::FLOAT32, dataTypeFromNpyHeader(const_cast(header.data()))); } TEST_F(HeaderTest, test_dataTypes_2) { std::string header("0NUMPY6789{'descr': '>f8"); - ASSERT_EQ(nd4j::DataType::DOUBLE, dataTypeFromNpyHeader(const_cast(header.data()))); + ASSERT_EQ(sd::DataType::DOUBLE, dataTypeFromNpyHeader(const_cast(header.data()))); } TEST_F(HeaderTest, test_dataTypes_3) { std::string header("0NUMPY6789{'descr': '(header.data()))); + ASSERT_EQ(sd::DataType::INT32, dataTypeFromNpyHeader(const_cast(header.data()))); } TEST_F(HeaderTest, test_dataTypes_4) { std::string header("0NUMPY6789{'descr': '>u2"); - ASSERT_EQ(nd4j::DataType::UINT16, dataTypeFromNpyHeader(const_cast(header.data()))); + ASSERT_EQ(sd::DataType::UINT16, dataTypeFromNpyHeader(const_cast(header.data()))); } /* diff --git a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp index 60ba4733c..00752ca0f 100644 --- a/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConditionalTests.cpp @@ -19,13 +19,13 @@ // #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ConditionalTests : public testing::Test { public: @@ -75,7 +75,7 @@ TEST_F(ConditionalTests, BasicTests_1) { auto nodeC0 = new Node(OpType_REDUCE_SAME, reduce::Sum, 7, {-1}); nodeC0->setScopeInfo(1, "scopeCondition"); - nd4j::ops::eq_scalar op; + sd::ops::eq_scalar op; auto nodeC1 = new Node(&op, 8, 
{7, -4}); nodeC1->setScopeInfo(1, "scopeCondition"); @@ -110,7 +110,7 @@ TEST_F(ConditionalTests, BasicTests_1) { * Condition is False */ TEST_F(ConditionalTests, Flat_Test_1) { - nd4j::ops::identity op0; + sd::ops::identity op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/simpleif_0_1.fb"); auto varSpace = graph->getVariableSpace(); @@ -141,7 +141,7 @@ TEST_F(ConditionalTests, Flat_Test_1) { TEST_F(ConditionalTests, Flat_Test_2) { Environment::getInstance()->setDebug(true); Environment::getInstance()->setVerbose(true); - nd4j::ops::identity op0; + sd::ops::identity op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/simpleif_0.fb"); auto varSpace = graph->getVariableSpace(); @@ -169,7 +169,7 @@ TEST_F(ConditionalTests, Flat_Test_2) { * Condition is false here, so there loop will be skipped */ TEST_F(ConditionalTests, Flat_Test_3) { - nd4j::ops::identity op0; + sd::ops::identity op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/simplewhile_0_3.fb"); auto varSpace = graph->getVariableSpace(); @@ -196,7 +196,7 @@ TEST_F(ConditionalTests, Flat_Test_3) { * just one cycle in body */ TEST_F(ConditionalTests, Flat_Test_4) { - nd4j::ops::identity op0; + sd::ops::identity op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/simplewhile_0_4.fb"); auto varSpace = graph->getVariableSpace(); @@ -225,7 +225,7 @@ TEST_F(ConditionalTests, Flat_Test_4) { * just two cycles in body */ TEST_F(ConditionalTests, Flat_Test_5) { - nd4j::ops::identity op0; + sd::ops::identity op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/simplewhile_0_4.fb"); auto varSpace = graph->getVariableSpace(); @@ -253,7 +253,7 @@ TEST_F(ConditionalTests, Flat_Test_5) { * While loop with multiple variables */ TEST_F(ConditionalTests, Flat_Test_6) { - nd4j::ops::identity op0; + sd::ops::identity op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/simplewhile_1.fb"); auto varSpace = 
graph->getVariableSpace(); @@ -280,7 +280,7 @@ TEST_F(ConditionalTests, Flat_Test_6) { } TEST_F(ConditionalTests, Flat_Test_7) { - nd4j::ops::identity op0; + sd::ops::identity op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/simplewhile_1.fb"); auto varSpace = graph->getVariableSpace(); diff --git a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp index aa1491b75..5b747ab5b 100644 --- a/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConstantShapeHelperTests.cpp @@ -20,14 +20,14 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class ConstantShapeHelperTests : public testing::Test { public: @@ -76,7 +76,7 @@ TEST_F(ConstantTadHelperTests, test_cachedAmount_1) { } TEST_F(ConstantShapeHelperTests, basic_test_1) { - auto ptr = ShapeBuilders::createShapeInfo(nd4j::DataType::BFLOAT16, 'f', {5, 10, 15}); + auto ptr = ShapeBuilders::createShapeInfo(sd::DataType::BFLOAT16, 'f', {5, 10, 15}); ShapeDescriptor descriptor(ptr); ShapeDescriptor descriptor2(ptr); @@ -85,7 +85,7 @@ TEST_F(ConstantShapeHelperTests, basic_test_1) { ASSERT_EQ(1, descriptor.ews()); ASSERT_EQ(3, descriptor.rank()); ASSERT_EQ('f', descriptor.order()); - ASSERT_EQ(nd4j::DataType::BFLOAT16, descriptor.dataType()); + ASSERT_EQ(sd::DataType::BFLOAT16, descriptor.dataType()); ASSERT_FALSE(descriptor.isEmpty()); ASSERT_FALSE(ConstantShapeHelper::getInstance()->checkBufferExistenceForShapeInfo(descriptor)); @@ -107,12 +107,12 @@ TEST_F(ConstantShapeHelperTests, basic_test_1) { TEST_F(ConstantShapeHelperTests, stress_test_1) { for (auto x = 0; x < 1000; x++) { - auto ptr = ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', {5, x + 10, x + 1}); + auto ptr = 
ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', {5, x + 10, x + 1}); ShapeDescriptor descriptor(ptr); ConstantShapeHelper::getInstance()->createShapeInfo(descriptor); delete [] ptr; } - ShapeDescriptor aShape(nd4j::DataType::FLOAT32, 'c', {(Nd4jLong)5, (Nd4jLong)382, (Nd4jLong)373}); + ShapeDescriptor aShape(sd::DataType::FLOAT32, 'c', {(Nd4jLong)5, (Nd4jLong)382, (Nd4jLong)373}); // nd4j_printf("%d\n", ConstantShapeHelper::getInstance()->cachedEntriesForDevice(0)); auto timeStart = std::chrono::system_clock::now(); @@ -145,7 +145,7 @@ TEST_F(ConstantShapeHelperTests, basic_test_4) { #ifdef __CUDABLAS__ ASSERT_TRUE(dup->specialShapeInfo() != nullptr); - PointersManager manager(nd4j::LaunchContext ::defaultContext(), "test"); + PointersManager manager(sd::LaunchContext ::defaultContext(), "test"); // manager.printDevContentOnDev(dup->specialShapeInfo(), shape::shapeInfoLength(2), 0); #endif @@ -169,8 +169,8 @@ TEST_F(ConstantShapeHelperTests, basic_test_5) { } TEST_F(ConstantShapeHelperTests, basic_test_6) { - ShapeDescriptor descriptorA(nd4j::DataType::INT32, 'c', {}); - ShapeDescriptor descriptorB(nd4j::DataType::FLOAT32, 'c', {10, 10}); + ShapeDescriptor descriptorA(sd::DataType::INT32, 'c', {}); + ShapeDescriptor descriptorB(sd::DataType::FLOAT32, 'c', {10, 10}); // ASSERT_FALSE(descriptorA < descriptorB); // ASSERT_TRUE(descriptorB < descriptorA); @@ -195,14 +195,14 @@ TEST_F(ConstantHelperTests, basic_test_1) { ConstantDescriptor descriptor({1, 2, 3}); - ConstantDataBuffer* fBuffer = ConstantHelper::getInstance()->constantBuffer(descriptor, nd4j::DataType::FLOAT32); + ConstantDataBuffer* fBuffer = ConstantHelper::getInstance()->constantBuffer(descriptor, sd::DataType::FLOAT32); auto fPtr = fBuffer->primaryAsT(); ASSERT_NEAR(1.f, fPtr[0], 1e-5); ASSERT_NEAR(2.f, fPtr[1], 1e-5); ASSERT_NEAR(3.f, fPtr[2], 1e-5); - auto iBuffer = ConstantHelper::getInstance()->constantBuffer(descriptor, nd4j::DataType::INT32); + auto iBuffer = 
ConstantHelper::getInstance()->constantBuffer(descriptor, sd::DataType::INT32); auto iPtr = iBuffer->primaryAsT(); ASSERT_EQ(1, iPtr[0]); @@ -215,14 +215,14 @@ TEST_F(ConstantHelperTests, basic_test_2) { double array[] = {1., 2., 3.}; ConstantDescriptor descriptor(array, 3); - ConstantDataBuffer* fBuffer = ConstantHelper::getInstance()->constantBuffer(descriptor, nd4j::DataType::FLOAT32); + ConstantDataBuffer* fBuffer = ConstantHelper::getInstance()->constantBuffer(descriptor, sd::DataType::FLOAT32); auto fPtr = fBuffer->primaryAsT(); ASSERT_NEAR(1.f, fPtr[0], 1e-5); ASSERT_NEAR(2.f, fPtr[1], 1e-5); ASSERT_NEAR(3.f, fPtr[2], 1e-5); - auto iBuffer = ConstantHelper::getInstance()->constantBuffer(descriptor, nd4j::DataType::INT32); + auto iBuffer = ConstantHelper::getInstance()->constantBuffer(descriptor, sd::DataType::INT32); auto iPtr = iBuffer->primaryAsT(); ASSERT_EQ(1, iPtr[0]); diff --git a/libnd4j/tests_cpu/layers_tests/ContextTests.cpp b/libnd4j/tests_cpu/layers_tests/ContextTests.cpp index 13316fe8d..57d9ce88d 100644 --- a/libnd4j/tests_cpu/layers_tests/ContextTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ContextTests.cpp @@ -21,9 +21,9 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class ContextTests : public testing::Test { public: @@ -327,7 +327,7 @@ TEST_F(ContextTests, test_short_context_2) { ASSERT_EQ(2, ctx.width()); - nd4j::ops::add op; + sd::ops::add op; op.execute(&ctx); ASSERT_EQ(exp, z); @@ -345,7 +345,7 @@ TEST_F(ContextTests, test_short_context_3) { ASSERT_EQ(2, ctx.width()); - nd4j::ops::add op; + sd::ops::add op; op.execute(&ctx); ASSERT_EQ(1, ctx.fastpath_out().size()); diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index a6b99f976..6b9109731 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -22,23 +22,23 @@ #define LIBND4J_CONVOLUTIONTESTS1_H #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include #include #include -#include -#include +#include +#include #ifdef HAVE_MKLDNN #include #endif -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ConvolutionTests1 : public testing::Test { public: @@ -102,7 +102,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_1) { // is NHWC block->getIArguments()->push_back(0); - nd4j::ops::conv2d op; + sd::ops::conv2d op; Nd4jStatus status = op.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -139,7 +139,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_2) { weights.assign(2.0); input.linspace(1); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto result = op.evaluate({&input, &weights}, {}, {1, 1, 1, 1, 0, 0, 1, 1, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -171,7 +171,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_3) { input = 2.; weights.linspace(0.1, 0.1); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -200,7 +200,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_4) { input = 2.; weights.linspace(0.1, 0.1); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -230,7 +230,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_5) { weights.linspace(0.1, 0.1); weights.permutei({2,3,1,0}); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto results = op.evaluate({&input, &weights, &bias}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -249,7 +249,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_6) { auto input = NDArrayFactory::create('c', {54, 
1, 12, 12}); auto weights = NDArrayFactory::create('c', {1, 2, 12, 2}); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto result = op.evaluate({&input, &weights}, {}, {-1,-1, 1,1, 0,0, 1,1, 1,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -270,7 +270,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_7) { input = 5.; weights = 3.; - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -304,7 +304,7 @@ TEST_F(ConvolutionTests1, conv2d_8) { 1.179925, 1.909109, 2.009143, 2.299778, 1.957207, 1.779718, 2.480604, 1.529086, 1.748063, 1.952856, 2.029487, 2.699131, 1.879842, 1.471205, 2.150177, 2.039078, 1.933456, 1.764169, 2.584944, 2.521004, 1.744296, 1.707578, 2.237938, 2.325231, 0.984485, 1.766936, 1.590640, 1.347524, 1.404648, 1.422042, 1.709862, 1.155412}); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto results = op.evaluate({&input, &weights, &bias}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -368,7 +368,7 @@ TEST_F(ConvolutionTests1, sconv2d_1) { // NOT same mode block->getIArguments()->push_back(0); - nd4j::ops::sconv2d op; + sd::ops::sconv2d op; Nd4jStatus status = op.execute(block); @@ -417,7 +417,7 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_2) { weightsD.applyScalar(scalar::Divide, 100.0, weightsD); weightsP.applyScalar(scalar::Divide, 100.0, weightsP); - nd4j::ops::sconv2d op; + sd::ops::sconv2d op; auto resultFF = op.evaluate({&input, &weightsD, &weightsP}, {5, 5, 1, 1, 0, 0, 1, 1, 0, 0}); @@ -451,7 +451,7 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_3) { auto expOutput = NDArrayFactory::create('c', {3, 2, 8, 8}); - nd4j::ops::sconv2d op; + sd::ops::sconv2d op; Nd4jStatus status = op.execute({&input, &weightsD, &weightsP, &bias}, {&output}, {1, 1, 1, 1, 0, 0, 1, 1, 0}); auto result = op.evaluate({&input, &weightsD, &weightsP, &bias}, {1, 1, 1, 1, 0, 0, 1, 1, 0}); @@ -492,7 
+492,7 @@ TEST_F(ConvolutionTests1, sconv2d_4) { 2.096277, 1.178815, 1.637460, 1.254187, 1.491076, 0.968625, 0.986342, 2.116042, 1.536920, 1.504321, 1.490398, 2.136795, 1.351860, 1.148578, 1.817408, 1.327139, 1.288620, 0.962232, 0.980667, 1.623775, 1.417320, 1.845710, 1.237095, 1.762792, 1.352515}); - nd4j::ops::sconv2d op; + sd::ops::sconv2d op; auto results = op.evaluate({&input, &weightsD, &weightsP, &biases}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -529,7 +529,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2D_BP_Bias_1) { epsilonNext.linspace(1); weights.permutei({2,3,1,0}); - nd4j::ops::conv2d_bp op; + sd::ops::conv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &epsilonNext}, {}, {3, 3, 1, 1, 0, 0, 1, 1, 1}, {}); @@ -579,7 +579,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2D_BP_NoBias_1) { epsilonNext.linspace(1); weights.permutei({2,3,1,0}); - nd4j::ops::conv2d_bp op; + sd::ops::conv2d_bp op; auto results = op.evaluate({&input, &weights, &epsilonNext}, {}, {3, 3, 1, 1, 0, 0, 1, 1, 1}, {}); @@ -663,7 +663,7 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_conv2d_1) { input.linspace(1); - nd4j::ops::sconv2d op; + sd::ops::sconv2d op; auto resultFF = op.evaluate({&input, &weightsD}, {}, {5, 5, 1, 1, 0, 0, 1, 1, 0}, {}); auto z = resultFF->at(0); @@ -672,7 +672,7 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_conv2d_1) { ASSERT_TRUE(z->equalsTo(&expFF, 1)); - nd4j::ops::conv2d op2d; + sd::ops::conv2d op2d; // weightsP.printShapeInfo(); auto result2D = op2d.evaluate({z, &weightsP}, {}, {1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, {}); @@ -694,10 +694,10 @@ TEST_F(ConvolutionTests1, TestDeconv_bp_1) { int dataFormat = 0; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32); - NDArray bias('c', {oC}, nd4j::DataType::FLOAT32); - NDArray weights('c',{kH,kW,oC,iC}, {1,3,5,2,4,6}, nd4j::DataType::FLOAT32); - NDArray gradO('c', {bS, oC, oH, oW},nd4j::DataType::FLOAT32); + NDArray input('c', {bS, 
iC, iH, iW}, sd::DataType::FLOAT32); + NDArray bias('c', {oC}, sd::DataType::FLOAT32); + NDArray weights('c',{kH,kW,oC,iC}, {1,3,5,2,4,6}, sd::DataType::FLOAT32); + NDArray gradO('c', {bS, oC, oH, oW},sd::DataType::FLOAT32); NDArray expGradI('c', {bS, iC, iH, iW}, {35.f, 38.f, 41.f, 44.f, 47.f, 50.f, 53.f, 56.f, 59.f, 62.f, 65.f, 68.f, 71.f, 74.f, 77.f, 80.f, 71.f, 78.f, 85.f, 92.f, 99.f, 106.f, 113.f, 120.f, 127.f, 134.f, 141.f, 148.f, 155.f, 162.f, 169.f, @@ -707,16 +707,16 @@ TEST_F(ConvolutionTests1, TestDeconv_bp_1) { 481.f, 492.f, 503.f, 514.f, 525.f, 536.f, 547.f, 558.f, 569.f, 580.f, 591.f, 602.f, 613.f, 624.f, 227.f, 230.f, 233.f, 236.f, 239.f, 242.f, 245.f, 248.f, 251.f, 254.f, 257.f, 260.f, 263.f, 266.f, 269.f, 272.f, 519.f, 526.f, 533.f, 540.f, 547.f, 554.f, 561.f, 568.f, 575.f, 582.f, 589.f, 596.f, 603.f, 610.f, 617.f, 624.f, 811.f, 822.f, 833.f, 844.f, 855.f, - 866.f, 877.f, 888.f, 899.f, 910.f, 921.f, 932.f, 943.f, 954.f, 965.f, 976.f}, nd4j::DataType::FLOAT32); - NDArray expGradW('c', {kH, kW, oC, iC}, {160008., 191112., 222216., 203400., 246792., 290184.f}, nd4j::DataType::FLOAT32); - NDArray expGradB('c', {oC}, {1944.f, 2712.f}, nd4j::DataType::FLOAT32); + 866.f, 877.f, 888.f, 899.f, 910.f, 921.f, 932.f, 943.f, 954.f, 965.f, 976.f}, sd::DataType::FLOAT32); + NDArray expGradW('c', {kH, kW, oC, iC}, {160008., 191112., 222216., 203400., 246792., 290184.f}, sd::DataType::FLOAT32); + NDArray expGradB('c', {oC}, {1944.f, 2712.f}, sd::DataType::FLOAT32); input.linspace(1); bias.linspace(1); gradO.linspace(1); - nd4j::ops::deconv2d_bp op; + sd::ops::deconv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -769,7 +769,7 @@ TEST_F(ConvolutionTests1, TestDeconv_bp_2) { bias.linspace(1); epsilon.linspace(1); - nd4j::ops::deconv2d_bp op; + sd::ops::deconv2d_bp op; auto result = op.evaluate({&input, &weights, &bias, &epsilon}, {}, {2, 2, 
1, 1, 0, 0, 2, 2, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -790,7 +790,7 @@ TYPED_TEST(TypedConvolutionTests1, Test_Conv1D_ff_1) { input.linspace(1); bias.linspace(1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto result_FF = op.evaluate({&input, &weights, &bias}, {}, {2, 1, 0, 1, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result_FF->status()); @@ -800,7 +800,7 @@ TYPED_TEST(TypedConvolutionTests1, Test_Conv1D_ff_1) { ASSERT_TRUE(expFF.isSameShape(z)); ASSERT_TRUE(expFF.equalsTo(z)); - nd4j::ops::conv1d_bp op_bp; + sd::ops::conv1d_bp op_bp; auto epsilonNxt = new NDArray(z->dup()); epsilonNxt->linspace(1); @@ -832,7 +832,7 @@ TYPED_TEST(TypedConvolutionTests1, Test_Conv1D_ff_2) { input.linspace(1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto result = op.evaluate({&input, &weights}, {}, {2, 1, 0, 1, 1,0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -859,7 +859,7 @@ TEST_F(ConvolutionTests1, conv1d_causal_1) { input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights, &bias}, {kW, sW, pW, dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -891,7 +891,7 @@ TEST_F(ConvolutionTests1, conv1d_causal_2) { input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights, &bias}, {kW, sW, pW, dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -922,7 +922,7 @@ TEST_F(ConvolutionTests1, conv1d_causal_3) { input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights, &bias}, {kW, sW, pW, dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -953,7 +953,7 @@ TEST_F(ConvolutionTests1, conv1d_causal_4) { input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights, &bias}, {kW, sW, pW, dW, paddingMode, 
dataFormat}); auto output = results->at(0); @@ -984,7 +984,7 @@ TEST_F(ConvolutionTests1, conv1d_causal_5) { input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights, &bias}, {kW, sW, pW, dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -1015,7 +1015,7 @@ TEST_F(ConvolutionTests1, conv1d_causal_6) { input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights, &bias}, {kW, sW, pW, dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -1035,19 +1035,19 @@ TEST_F(ConvolutionTests1, conv1d_causal_7) { int paddingMode = 2; // CAUSAL int dataFormat = 1; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iW, iC}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kW, iC, oC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iW, iC}, sd::DataType::FLOAT32); + NDArray weights('c', {kW, iC, oC}, sd::DataType::FLOAT32); NDArray expOutput('c', {bS, oW, oC}, {11.000000, 11.600000, 12.200000, 12.800000, 30.099998, 32.200001, 34.299999, 36.400002, 49.899998, 53.800003, 57.699997, 61.599998, 69.699997, 75.400002, 81.099998, 86.800003, 89.500000, 97.000000, 104.500000, 112.000000, 109.300003, 118.600006, 127.899994, 137.199997, 129.100006, 140.199997, 151.300003, 162.399994, 148.899994, 161.800003, 174.699997, 187.600006, 133.399994, 141.200012, 149.000000, 156.800003, 188.500000, 205.000000, 221.500000, 238.000000, 208.299988, 226.600006, 244.899994, 263.200012, 228.100006, 248.200012, 268.299988, 288.399994, 247.899994, 269.799988, 291.700012, - 313.600006, 267.700012, 291.399994, 315.100006, 338.799988, 287.500000, 313.000000, 338.500000, 364.000000, 307.299988, 334.600006, 361.899994, 389.200012}, nd4j::DataType::FLOAT32); + 313.600006, 267.700012, 291.399994, 315.100006, 338.799988, 287.500000, 313.000000, 338.500000, 364.000000, 307.299988, 334.600006, 361.899994, 389.200012}, 
sd::DataType::FLOAT32); input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights}, {kW, sW, pW, dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -1067,20 +1067,20 @@ TEST_F(ConvolutionTests1, conv1d_causal_8) { int paddingMode = 2; // CAUSAL int dataFormat = 1; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iW, iC}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kW, iC, oC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iW, iC}, sd::DataType::FLOAT32); + NDArray weights('c', {kW, iC, oC}, sd::DataType::FLOAT32); NDArray expOutput('c', {bS, oW, oC}, {11.000000, 11.600000, 12.200000, 12.800000, 26.299999, 27.799999, 29.299999, 30.799999, 45.399998, 48.399998, 51.400002, 54.400005, 65.199997, 70.000000, 74.800003, 79.600006, 85.000000, 91.600006, 98.199997, 104.800003, 104.799995, 113.199997, 121.600006, 130.000000, 124.599998, 134.800003, 145.000000, 155.200012, 144.399994, 156.399994, 168.399994, 180.400009, 133.400009, 141.199997, 149.000000, 156.800003, 148.699997, 157.400009, 166.099991, 174.800003, 203.800003, 221.200012, 238.599991, 256.000000, 223.599991, 242.799988, 262.000000, 281.200012, 243.399994, 264.399994, 285.399994, 306.399994, 263.199982, 286.000000, 308.799988, 331.600006, 283.000000, 307.600006, 332.200012, - 356.800018, 302.799988, 329.199982, 355.600006, 382.000000}, nd4j::DataType::FLOAT32); + 356.800018, 302.799988, 329.199982, 355.600006, 382.000000}, sd::DataType::FLOAT32); input.linspace(1., 1.); weights.linspace(0.1, 0.1); - nd4j::ops::conv1d op; + sd::ops::conv1d op; auto results = op.evaluate({&input, &weights}, {kW, sW, pW, dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -1112,8 +1112,8 @@ TEST_F(ConvolutionTests1, conv1d_causal_bp_1) { const OpArgsHolder argsHolderFF({&input, &weights, &bias}, {}, {kW, sW, pW, dW, paddingMode, dataFormat}); const OpArgsHolder argsHolderBP({&input, &weights, &bias, &gradO}, {}, {kW, 
sW, pW, dW, paddingMode, dataFormat}); - nd4j::ops::conv1d opFF; - nd4j::ops::conv1d_bp opBP; + sd::ops::conv1d opFF; + sd::ops::conv1d_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1128,7 +1128,7 @@ TEST_F(ConvolutionTests1, Test_Dilation2D_1) { input.linspace(1); weights.linspace(1); - nd4j::ops::dilation2d op; + sd::ops::dilation2d op; auto result = op.evaluate({&input, &weights}, {1, 1,2,2,1, 1,2,2,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -1148,7 +1148,7 @@ TEST_F(ConvolutionTests1, Test_Dilation2D_2) { input.linspace(1); weights.linspace(1); - nd4j::ops::dilation2d op; + sd::ops::dilation2d op; auto result = op.evaluate({&input, &weights}, {0, 1,2,2,1, 1,2,2,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -1187,7 +1187,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_bp_test1) { weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::conv2d_bp op; + sd::ops::conv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto gradI = results->at(0); auto gradW = results->at(1); @@ -1230,7 +1230,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_bp_test2) { weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::conv2d_bp op; + sd::ops::conv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto gradI = results->at(0); auto gradW = results->at(1); @@ -1275,7 +1275,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2d_bp_test3) { weights.permutei({2,3,1,0}); expGradW.permutei({2,3,1,0}); - nd4j::ops::conv2d_bp op; + sd::ops::conv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto gradI = results->at(0); auto gradW = results->at(1); @@ -1303,21 +1303,21 @@ TEST_F(ConvolutionTests1, conv2d_bp_4) { int paddingMode = 1; // 1-SAME, 0-VALID; int 
dataFormat = 0; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, oC}, nd4j::DataType::FLOAT32); - NDArray bias('c', {oC}, {1,2,3}, nd4j::DataType::FLOAT32); - NDArray gradO('c', {bS, oC, oH, oW}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iC, iH, iW}, sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, oC}, sd::DataType::FLOAT32); + NDArray bias('c', {oC}, {1,2,3}, sd::DataType::FLOAT32); + NDArray gradO('c', {bS, oC, oH, oW}, sd::DataType::FLOAT32); - NDArray gradI('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32); - NDArray gradW('c', {kH, kW, iC, oC}, nd4j::DataType::FLOAT32); - NDArray gradB('c', {oC}, nd4j::DataType::FLOAT32); + NDArray gradI('c', {bS, iC, iH, iW}, sd::DataType::FLOAT32); + NDArray gradW('c', {kH, kW, iC, oC}, sd::DataType::FLOAT32); + NDArray gradB('c', {oC}, sd::DataType::FLOAT32); input = 2.; weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::conv2d_bp op; + sd::ops::conv2d_bp op; auto status = op.execute({&input, &weights, &bias, &gradO}, {&gradI, &gradW, &gradB}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}, {}); ASSERT_EQ(Status::OK(), status); @@ -1357,7 +1357,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_bp_test1) { weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::conv3dnew_bp op; + sd::ops::conv3dnew_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto gradI = results->at(0); auto gradW = results->at(1); @@ -1405,7 +1405,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_bp_test2) { weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::conv3dnew_bp op; + sd::ops::conv3dnew_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto gradI = results->at(0); auto gradW = results->at(1); @@ -1458,7 +1458,7 @@ 
TYPED_TEST(TypedConvolutionTests1, conv3d_bp_test3) { weights.permutei({2, 3, 4, 1, 0}); expGradW.permutei({2, 3, 4, 1, 0}); - nd4j::ops::conv3dnew_bp op; + sd::ops::conv3dnew_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* gradI = results->at(0); auto* gradW = results->at(1); @@ -1493,15 +1493,15 @@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test1) { auto gradO = NDArrayFactory::create('c', {bS, oH, oW, oC}); NDArray expGradI('c', {bS, iH, iW, iC},{0.07 , 0.19 , 0.348, 0.652, 0.588, 0.956, 0.387, 0.687, 1.326, 2.022, 1.878, 2.67 , 1.071, 1.515, 2.982, 3.966, 3.534, 4.614, 1.606, 1.982, 3.932, 4.748, 4.428, 5.308, - 1.126, 1.63 , 3.228, 4.3 , 3.468, 4.604, 3.123, 3.999, 7.95 , 9.798, 8.502, 10.446, 3.807, 4.827, 9.606, 11.742,10.158, 12.39 , 4.198, 4.958, 9.884, 11.468,10.38 , 12.028}, nd4j::DataType::FLOAT32); + 1.126, 1.63 , 3.228, 4.3 , 3.468, 4.604, 3.123, 3.999, 7.95 , 9.798, 8.502, 10.446, 3.807, 4.827, 9.606, 11.742,10.158, 12.39 , 4.198, 4.958, 9.884, 11.468,10.38 , 12.028}, sd::DataType::FLOAT32); - NDArray expGradW('c', {kH, kW, iC, mC},{19.08, 19.44,19.8 , 20.16,12.24, 12.48,12.72, 12.96,22.56, 23.04,23.52, 24. ,14.4 , 14.72,15.04, 15.36,14.76, 15.12,15.48, 15.84, 9.36, 9.6 , 9.84, 10.08}, nd4j::DataType::FLOAT32); + NDArray expGradW('c', {kH, kW, iC, mC},{19.08, 19.44,19.8 , 20.16,12.24, 12.48,12.72, 12.96,22.56, 23.04,23.52, 24. 
,14.4 , 14.72,15.04, 15.36,14.76, 15.12,15.48, 15.84, 9.36, 9.6 , 9.84, 10.08}, sd::DataType::FLOAT32); input = 2.; weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::depthwise_conv2d_bp op; + sd::ops::depthwise_conv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* gradI = results->at(0); auto* gradW = results->at(1); @@ -1532,14 +1532,14 @@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test2) { auto gradO = NDArrayFactory::create('c', {bS, oH, oW, oC}); NDArray expGradI('c', {bS, iH, iW, iC},{0.005, 0.025,0.034, 0.106,0.061, 0.113,0.058, 0.162,0.292, 0.564,0.298, 0.466,0.234, 0.402,0.772, 1.172,0.602, 0.834,0.333, 0.449,0.882, 1.146,0.581, 0.729, - 0.053, 0.137,0.258, 0.458,0.237, 0.353,0.41 , 0.642,1.252, 1.78 ,0.906, 1.202,1.098, 1.394,2.756, 3.412,1.722, 2.082,0.893, 1.073,2.13 , 2.522,1.269, 1.481}, nd4j::DataType::FLOAT32); - NDArray expGradW('c', {kH, kW, iC, mC},{2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88}, nd4j::DataType::FLOAT32); + 0.053, 0.137,0.258, 0.458,0.237, 0.353,0.41 , 0.642,1.252, 1.78 ,0.906, 1.202,1.098, 1.394,2.756, 3.412,1.722, 2.082,0.893, 1.073,2.13 , 2.522,1.269, 1.481}, sd::DataType::FLOAT32); + NDArray expGradW('c', {kH, kW, iC, mC},{2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88,2.4 , 2.56,2.72, 2.88}, sd::DataType::FLOAT32); input = 2.; weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::depthwise_conv2d_bp op; + sd::ops::depthwise_conv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* gradI = results->at(0); auto* gradW = results->at(1); @@ -1581,10 +1581,10 @@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test4) { int paddingMode = 1; // 1-SAME, 0-VALID; int dataFormat = 1; // 1-NHWC, 
0-NCHW - NDArray input('c', {bS, iH, iW, iC}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, mC}, nd4j::DataType::FLOAT32); - NDArray gradO('c', {bS, oH, oW, oC}, nd4j::DataType::FLOAT32); - NDArray bias('c', {oC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iH, iW, iC}, sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, mC}, sd::DataType::FLOAT32); + NDArray gradO('c', {bS, oH, oW, oC}, sd::DataType::FLOAT32); + NDArray bias('c', {oC}, sd::DataType::FLOAT32); input.linspace(-10, 0.1); weights.linspace(-2, 0.1); @@ -1596,17 +1596,17 @@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test4) { -117.209999, -134.399994, -151.769989, -169.319992, -187.049988, -75.360008, -92.729996, -110.279991, -128.009979, -145.920013, -164.009995, -182.279984, -200.729996, -84.000000, -102.089996, -120.360016, -138.809967, -157.440002, -176.249969, -195.240005, -214.410019, -92.639999, -111.449997, -130.440018, -149.610016, -168.960007, -188.489990, -208.200012, -228.090012, -101.279976, -120.809982, -140.519989, -160.410004, -180.480011, -200.730011, -221.160034, -241.770020, -121.920006, -135.420013, -149.040009, -162.779999, -176.640015, -190.619995, -204.719986, -218.940002, -29.760002, -43.739998, -57.840000, -72.059998, -86.400009, -100.860001, -115.439995, -130.140015, -127.199997, -148.890015, -170.760010, -192.809998, -215.040024, -237.450012, -260.039978, -282.809998, -135.839996, -158.250000, -180.840012, -203.610046, -226.559982, -249.690002, -272.999969, -296.489990, -144.479980, -167.609985, -190.920013, -214.410019, -238.080032, -261.929993, -285.959991, -310.169983, -153.119995, -176.969986, -201.000031, -225.210022, -249.599976, -274.170013, -298.920013, -323.849976, -161.760040, -186.330017, -211.079987, -236.009995, -261.120026, -286.410034, -311.879974, -337.530029, -170.400009, -195.689987, -221.159973, -246.809998, -272.639954, -298.650024, -324.840057, -351.209991, -179.039963, -205.050018, -231.240021, -257.609985, -284.160004, 
-310.890015, -337.799988, -364.890015, -187.680023, -214.410004, -241.319977, -268.410004, -295.679993, -323.130005, -350.760010, -378.570038, -198.720016, -217.019989, -235.440002, -253.979980, -272.640045, -291.419983, -310.319977, -329.339996, -68.159981, -86.939987, -105.840012, -124.860001, -144.000000, -163.260010, -182.639984, -202.140015, -213.600021, -242.489990, -271.559937, -300.809998, -330.239990, -359.849976, -389.639984, -419.610016, -222.240036, -251.849960, -281.640015, -311.609985, -341.760040, -372.089996, -402.600037, -433.290009, -230.880005, -261.210022, -291.719971, -322.410034, -353.280029, -384.329956, -415.559998, -446.970001, -239.519989, -270.570007, -301.800018, -333.209991, -364.800018, -396.570007, -428.520020, -460.650024, -248.160034, -279.929962, -311.880005, -344.010010, -376.320038, -408.809998, -441.479980, -474.330017, -256.799988, -289.289978, -321.960022, -354.809967, -387.839996, -421.050018, -454.440002, -488.009979, -265.440002, -298.650024, -332.040009, -365.609985, -399.360016, -433.290009, -467.399963, -501.689941, -274.080017, -308.009949, -342.119995, -376.409973, -410.880005, -445.530029, -480.359985, -515.369995, -275.520020, -298.619995, -321.839966, -345.179993, -368.640015, -392.220001, -415.919952, -439.740021, -106.560005, -130.140030, -153.840027, -177.659973, -201.599991, -225.660019, -249.840012, -274.140015, -300.000000, -336.090057, -372.360046, -408.809937, -445.440002, -482.250031, -519.240051, -556.410034, -308.640015, -345.450012, -382.440002, -419.609955, -456.959961, -494.489960, -532.200012, -570.089966, -317.280029, -354.809998, -392.520020, -430.410004, -468.480042, -506.729980, -545.159912, -583.770020, -325.920013, -364.169952, -402.600037, -441.210022, -480.000000, -518.970032, -558.119873, -597.449951, -334.559967, -373.529999, -412.679993, -452.009949, -491.519989, -531.209961, -571.080017, -611.129944, -343.200012, -382.889984, -422.760071, -462.809906, -503.039978, -543.449951, -584.039978, 
-624.809998, -351.839966, -392.250000, -432.839966, -473.609955, -514.560120, -555.689941, -596.999939, -638.489990, -360.480011, -401.610016, -442.920044, -484.409912, -526.080017, -567.929993, -609.959961, -652.169983, -352.320007, -380.220001, -408.239990, -436.380005, -464.639984, -493.019989, -521.519958, -550.139954, -144.960022, -173.339996, -201.839996, -230.459976, -259.200043, -288.059998, -317.039978, -346.140015, -386.399963, -429.690002, -473.159912, -516.809937, -560.640076, -604.650024, -648.839966, -693.210022, -395.039978, -439.050018, -483.239929, -527.609985, -572.159973, -616.890015, -661.799988, -706.890015, -403.680023, -448.409973, -493.320007, -538.410034, -583.680054, -629.129944, -674.760010, -720.570068, -412.320007, -457.769897, -503.399963, -549.210083, -595.199951, -641.369995, -687.720093, -734.250000, -420.960052, -467.130035, -513.479980, -560.010010, -606.720093, -653.610046, -700.680054, -747.930115, -429.599976, -476.489990, -523.559998, -570.809937, -618.239990, -665.849976, -713.640015, -761.609985, -438.239990, -485.850037, -533.640015, -581.610046, -629.760010, -678.089966, -726.600037, -775.289917, -446.880035,-495.210052, -543.719971, -592.410034, -641.279968, -690.330017, -739.559937, -788.970093, -429.120026, -461.819946, -494.639984, -527.580017, -560.640015, -593.820007, -627.119995, -660.540039, -183.360016, -216.540009, -249.839996, -283.260040, -316.800018, -350.459961, -384.239990, -418.139984, -472.800049, -523.289917, -573.959961, -624.809998, -675.839966, -727.050049, -778.440063, -830.010010, -481.440002, -532.649963, -584.040100, -635.609985, -687.359924, -739.290039, -791.399963, -843.689941, -490.079987, -542.010010, -594.119995, -646.410034, -698.880005, -751.529968, -804.359985, -857.369995, -498.720032, -551.369995, -604.200012, -657.210022, -710.400024, -763.770081, -817.319946, -871.050049, -507.359955, -560.729919, -614.280029, -668.010010, -721.919983, -776.010010, -830.280029, -884.730042, 
-515.999939, -570.089966, -624.360046, -678.809937, -733.440002, - -788.250000, -843.239990, -898.410034, -524.639954, -579.449951, -634.440002, -689.609985, -744.960022, -800.489990, -856.200012, -912.090027, -533.280029, -588.810059, -644.520081, -700.409973, -756.480042, -812.730103, -869.159912, -925.769958, -505.920013, -543.420044, -581.040039, -618.780029, -656.640015, -694.620056, -732.719971, -770.940002, -447.359985, -471.559998, -495.840027, -520.200012, -544.640015, -569.159973, -593.760010, -618.440002, -815.359985, -852.140015, -889.040039, -926.059937, -963.200073, -1000.460022, -1037.839966, -1075.339966, -826.879944, -864.139954, -901.519958, -939.019958, -976.640076, -1014.379944, -1052.239990, -1090.219971, -838.400024, -876.140015, -913.999939, -951.979919, -990.080017, -1028.299927, -1066.640015, -1105.099976, -849.919983, -888.140015, -926.479980, -964.939941, -1003.520081, -1042.219971, -1081.040039, -1119.979980, -861.440063, -900.140015, -938.960022,-977.899963, -1016.960022, -1056.140015, -1095.440063, -1134.859985, -872.960022, -912.140015, -951.439941, -990.859985, -1030.400024, -1070.060059, -1109.839844, -1149.739990, -884.479980, -924.140015, -963.919922, -1003.819946, -1043.839966, -1083.979980, -1124.239990, -1164.619995, -896.000000, -936.140015, -976.399963, -1016.780029, -1057.280029, -1097.899902, -1138.640015, -1179.500122, -705.919983, -733.000000, -760.159912, -787.400024, -814.719971, -842.119995, -869.599976, -897.160034}, nd4j::DataType::FLOAT32); + -788.250000, -843.239990, -898.410034, -524.639954, -579.449951, -634.440002, -689.609985, -744.960022, -800.489990, -856.200012, -912.090027, -533.280029, -588.810059, -644.520081, -700.409973, -756.480042, -812.730103, -869.159912, -925.769958, -505.920013, -543.420044, -581.040039, -618.780029, -656.640015, -694.620056, -732.719971, -770.940002, -447.359985, -471.559998, -495.840027, -520.200012, -544.640015, -569.159973, -593.760010, -618.440002, -815.359985, -852.140015, 
-889.040039, -926.059937, -963.200073, -1000.460022, -1037.839966, -1075.339966, -826.879944, -864.139954, -901.519958, -939.019958, -976.640076, -1014.379944, -1052.239990, -1090.219971, -838.400024, -876.140015, -913.999939, -951.979919, -990.080017, -1028.299927, -1066.640015, -1105.099976, -849.919983, -888.140015, -926.479980, -964.939941, -1003.520081, -1042.219971, -1081.040039, -1119.979980, -861.440063, -900.140015, -938.960022,-977.899963, -1016.960022, -1056.140015, -1095.440063, -1134.859985, -872.960022, -912.140015, -951.439941, -990.859985, -1030.400024, -1070.060059, -1109.839844, -1149.739990, -884.479980, -924.140015, -963.919922, -1003.819946, -1043.839966, -1083.979980, -1124.239990, -1164.619995, -896.000000, -936.140015, -976.399963, -1016.780029, -1057.280029, -1097.899902, -1138.640015, -1179.500122, -705.919983, -733.000000, -760.159912, -787.400024, -814.719971, -842.119995, -869.599976, -897.160034}, sd::DataType::FLOAT32); NDArray expGradW('c', {kH, kW, iC, mC},{-104306.421875, -104786.734375, -105268.687500, -105752.250000, -106237.421875, -106724.242188, -107212.671875, -107702.734375, -116289.593750, -116823.296875, -117358.781250, -117896.109375, -118435.210938, -118976.109375, -119518.796875, -120063.296875, -104824.789062, -105305.117188, -105787.070312, -106270.640625, -106755.843750, -107242.640625, -107731.078125, -108221.117188, -126744.000000, -127277.710938, -127813.187500, -128350.484375, -128889.601562, -129430.515625, -129973.210938, -130517.703125, -140944.000000, -141536.984375, -142131.984375, -142729.000000, -143328.000000, - -143929.015625, -144532.000000, -145137.000000, -126744.000000, -127277.710938, -127813.187500, -128350.484375, -128889.601562, -129430.515625, -129973.210938, -130517.703125, -104824.789062, -105305.117188, -105787.070312, -106270.640625, -106755.843750, -107242.640625, -107731.078125, -108221.117188, -116289.593750, -116823.296875, -117358.781250, -117896.109375, -118435.210938, -118976.109375, 
-119518.796875, -120063.296875, -104306.421875, -104786.734375, -105268.687500, -105752.250000, -106237.421875, -106724.242188, -107212.671875, -107702.734375}, nd4j::DataType::FLOAT32); + -143929.015625, -144532.000000, -145137.000000, -126744.000000, -127277.710938, -127813.187500, -128350.484375, -128889.601562, -129430.515625, -129973.210938, -130517.703125, -104824.789062, -105305.117188, -105787.070312, -106270.640625, -106755.843750, -107242.640625, -107731.078125, -108221.117188, -116289.593750, -116823.296875, -117358.781250, -117896.109375, -118435.210938, -118976.109375, -119518.796875, -120063.296875, -104306.421875, -104786.734375, -105268.687500, -105752.250000, -106237.421875, -106724.242188, -107212.671875, -107702.734375}, sd::DataType::FLOAT32); - NDArray expGradB('c', {oC}, {-2960., -2970., -2980., -2990., -3000., -3010., -3020., -3030.}, nd4j::DataType::FLOAT32); + NDArray expGradB('c', {oC}, {-2960., -2970., -2980., -2990., -3000., -3010., -3020., -3030.}, sd::DataType::FLOAT32); - nd4j::ops::depthwise_conv2d_bp op; + sd::ops::depthwise_conv2d_bp op; ResultSet* results = op.evaluate({&input, &weights, &bias, &gradO}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); NDArray* gradI = results->at(0); NDArray* gradW = results->at(1); @@ -1635,10 +1635,10 @@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test5) { int paddingMode = 1; // 1-SAME, 0-VALID; int dataFormat = 0; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, mC}, nd4j::DataType::FLOAT32); - NDArray gradO('c', {bS, oC, oH, oW}, nd4j::DataType::FLOAT32); - NDArray bias('c', {oC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iC, iH, iW}, sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, mC}, sd::DataType::FLOAT32); + NDArray gradO('c', {bS, oC, oH, oW}, sd::DataType::FLOAT32); + NDArray bias('c', {oC}, sd::DataType::FLOAT32); input.linspace(-10, 0.1); weights.linspace(-2, 0.1); @@ -1650,18 +1650,18 
@@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test5) { -33.879997, -34.059994, -34.239994, -34.419994, -57.299995, -129.209991, -269.969971, -271.319977, -272.670044, -274.019989, -275.369995, -276.720001, -278.070007, -279.420013, -239.369980, -135.809998, -283.470001, -284.820007, -286.169983, -287.520020, -288.869995, -290.220001, -291.570038, -292.919983, -250.770004, -142.410004, -296.969971, -298.320007, -299.669983, -301.020020, -302.369995, -303.719971, -305.070007, -306.419983, -262.169983, -149.009995, -310.470001, -311.820007, -313.170013, -314.519989, -315.869995, -317.220001, -318.570007, -319.919983, -273.570007, -155.610016, -323.969971, -325.320038, -326.669983, -328.020020, -329.369965, -330.719971, -332.070007, -333.419983, -284.970001, -162.209991, -337.469971, -338.820007, -340.169983, -341.519958, -342.869995, -344.220001, -345.570007, -346.920013, -296.369995, -168.809998, -350.970001, -352.320007, -353.669983, -355.019989, -356.369995, -357.719971, -359.070038, -360.419983, -307.769989, -175.410004, -364.469971, -365.820007, -367.169983, -368.520020, -369.869995, -371.219971, -372.570007, -373.919983, -319.169983, -260.179993, -459.399994, -461.019958, -462.639984, -464.260010, -465.880005, -467.500000, -469.119995, -470.739990, -361.459991, 2.480003, -69.520004, -69.760025, -70.000000, -70.239990, -70.479996, -70.720001, -70.960007, -71.200005, -97.839996, -213.840012, -432.960022, -434.400055, -435.840027, -437.279999, -438.720001, -440.160065, -441.599976, -443.040039, -372.480011, -221.040009, -447.360016, -448.800018, -450.239990, -451.679993, -453.119995, -454.559967, -456.000061, -457.440033, -384.480011, -228.239990, -461.759979, -463.200012, -464.639984, -466.079956, -467.520081, -468.960052, -470.399963, -471.839996, -396.479980, -235.440002, -476.159912, -477.600006, -479.040039, -480.479980, -481.919952, -483.360046, -484.800079, -486.239990, -408.480042, -242.639999, -490.559967, -491.999969, -493.440063, -494.880035, 
-496.319946, -497.759979, -499.200012, -500.639984, -420.480011, -249.840012, -504.960052, -506.399963, -507.839996, -509.280029, -510.720001, -512.159973, -513.599976, -515.040039, -432.480011, -257.040009, -519.360046, -520.800049, -522.239990, -523.680054, -525.120056, -526.559998, -527.999939, -529.440002, -444.480011, -264.239990, -533.760010, -535.200012, -536.640015, -538.079956, -539.520020, -540.960022, -542.399963, -543.839966, -456.479980, -367.599976, -644.559998, -646.239929, -647.920044, -649.599976, -651.280029, -652.960022, -654.640076, -656.320007, -501.200043, -13.740002, -117.880005, -118.179993, -118.479996, -118.780014, -119.080002, -119.379990, -119.680008, -119.979996, -146.379990, -310.470001, -613.950012, -615.479980, -617.010071, -618.539978, -620.069946, -621.599976, -623.130005, -624.660034, -517.589966, -318.269958, -629.250000, -630.779968, -632.309937, -633.840027, -635.369995, -636.899902, -638.429993, -639.959961, -530.190063, -326.070038, -644.550049, -646.079956, -647.609985, -649.140015, -650.669922, -652.200012, -653.729980, -655.260010, -542.789978, -333.870026, -659.849976, -661.380005, -662.910034, -664.439941, -665.970093, -667.500000, -669.029968, -670.559937, -555.390015, -341.669983, -675.149902, -676.679993, -678.209961, -679.740051, -681.270020, -682.800049, -684.329956, -685.859985, -567.989990, -349.470001, -690.450012, -691.979980, -693.510010, -695.039978, -696.569946, -698.099976, -699.630005, -701.160034, -580.589966, -357.269958, -705.750000, -707.279968, -708.809937, -710.340027, -711.869995, -713.399902, -714.929993, -716.459961, -593.190002, -365.070038, -721.050049, -722.579956, -724.109985, -725.640015, -727.169922, -728.700012, -730.229980, -731.760010, -605.789978, -483.019958, -841.719971, -843.460022, -845.200073, -846.939941, -848.680054, -850.419983, -852.159973, -853.899963, -648.940002, -37.960014, -178.240021, -178.599976, -178.959991, -179.320007, -179.679993, -180.039978, -180.399994, -180.759964, 
-202.919983, -419.099915, -812.939941, -814.559937, -816.179993, -817.800049, -819.419922, -821.040039, -822.660034, -824.279968, -674.699951, -427.500031, -829.140015, -830.759949, -832.380005, -833.999939, -835.619995, -837.240051, -838.859924, -840.479980, -687.899963, -435.899994, -845.339966, -846.959961, -848.579956, -850.200012, -851.819885, -853.439941, -855.059937, -856.679993, -701.100037, -444.299927, -861.540039, -863.160034, -864.779968, -866.399963, -868.020020, -869.640015, -871.259949, -872.880005, -714.299988, -452.700012, -877.740051, -879.359924, -880.979980, -882.599915, -884.219971, -885.839966, -887.459961, -889.079956, -727.500000, -461.099915, -893.939941, -895.559937, -897.179993, -898.800049, -900.419922, -902.040039, -903.660034, -905.279968, -740.700012, -469.499969, -910.140015, -911.759949, -913.380005, -914.999939, -916.620056, -918.239990, -919.860046, -921.479919, -753.899963, -477.899902, -926.339905, -927.959961, -929.579956, -931.200012, -932.819946, -934.439880, -936.059937, -937.679932, -767.100037, -606.439941, -1050.880005, -1052.680054, -1054.479980, -1056.280029, -1058.079956, -1059.880005, -1061.679932, -1063.479980, -804.679993, -70.180008, -250.600006, -251.019958, -251.440033, -251.860001, -252.280029, -252.700043, -253.120026, -253.540039, -267.459991, -539.730042, -1029.929932, -1031.640137, -1033.350098, -1035.060059, -1036.770020, -1038.479980, -1040.190063, -1041.900024, -843.809998, -548.729980, -1047.030029, -1048.740112, -1050.449829, -1052.160034, -1053.870117, -1055.580078, -1057.289917, -1059.000122, -857.609985, -557.729980, - -1064.130005, -1065.840088, -1067.550049, -1069.260010, -1070.969849, -1072.679932, -1074.390137, -1076.100098, -871.410034, -566.729980, -1081.229980, -1082.940063, -1084.650024, -1086.359985, -1088.069946, -1089.780029, -1091.489990, -1093.199951, -885.210022, -575.729980, -1098.329956, -1100.040039, -1101.750122, -1103.460205, -1105.170166, -1106.879883, -1108.589966, -1110.300049, 
-899.010071, -584.730042, -1115.429932, -1117.140137, -1118.850098, -1120.560059, -1122.270020, -1123.979980, -1125.689941, -1127.400024, -912.810059, -593.730042, -1132.530029, -1134.240234, -1135.949951, -1137.659912, -1139.370117, -1141.079956, -1142.790039, -1144.500122, -926.610046, -602.730042, -1149.629883, -1151.339966, -1153.050049, -1154.760132, -1156.469971, -1158.179810, -1159.890137, -1161.600098, -940.410034, -737.859985, -1272.040039, -1273.899902, -1275.760010, -1277.619995, -1279.479980, -1281.340088, -1283.200195, -1285.060059, -968.420044}, nd4j::DataType::FLOAT32); + -1064.130005, -1065.840088, -1067.550049, -1069.260010, -1070.969849, -1072.679932, -1074.390137, -1076.100098, -871.410034, -566.729980, -1081.229980, -1082.940063, -1084.650024, -1086.359985, -1088.069946, -1089.780029, -1091.489990, -1093.199951, -885.210022, -575.729980, -1098.329956, -1100.040039, -1101.750122, -1103.460205, -1105.170166, -1106.879883, -1108.589966, -1110.300049, -899.010071, -584.730042, -1115.429932, -1117.140137, -1118.850098, -1120.560059, -1122.270020, -1123.979980, -1125.689941, -1127.400024, -912.810059, -593.730042, -1132.530029, -1134.240234, -1135.949951, -1137.659912, -1139.370117, -1141.079956, -1142.790039, -1144.500122, -926.610046, -602.730042, -1149.629883, -1151.339966, -1153.050049, -1154.760132, -1156.469971, -1158.179810, -1159.890137, -1161.600098, -940.410034, -737.859985, -1272.040039, -1273.899902, -1275.760010, -1277.619995, -1279.479980, -1281.340088, -1283.200195, -1285.060059, -968.420044}, sd::DataType::FLOAT32); NDArray expGradW('c', {kH, kW, iC, mC}, {-2586.600586, -2505.600098, -18624.595703, -50943.605469, -99462.601562, -164181.609375, -245100.609375, -342219.625000, -2880.149902, -2790.150146, -20700.152344, -56610.148438, -110520.156250, -182430.156250, -272340.156250, -380250.125000, -2594.701416, -2513.699951, -18632.699219, -50951.695312, -99470.695312, -164189.703125, -245108.687500, -342227.750000, -3043.501465, 
-2953.500244, -20863.500000, -56773.492188, -110683.515625, -182593.515625, -272503.531250, -380413.562500, -3383.499756, -3283.500000, -23183.501953, -63083.500000, -122983.500000, -202883.515625, -302783.531250, -422683.468750, -3043.501465, -2953.500244, -20863.500000, -56773.492188, -110683.515625, -182593.515625, -272503.531250, -380413.562500, - -2594.701416, -2513.699951, -18632.699219, -50951.695312, -99470.695312, -164189.703125, -245108.687500, -342227.750000, -2880.149902, -2790.150146, -20700.152344, -56610.148438, -110520.156250, -182430.156250, -272340.156250, -380250.125000, -2586.600586, -2505.600098, -18624.595703, -50943.605469, -99462.601562, -164181.609375, -245100.609375, -342219.625000}, nd4j::DataType::FLOAT32); + -2594.701416, -2513.699951, -18632.699219, -50951.695312, -99470.695312, -164189.703125, -245108.687500, -342227.750000, -2880.149902, -2790.150146, -20700.152344, -56610.148438, -110520.156250, -182430.156250, -272340.156250, -380250.125000, -2586.600586, -2505.600098, -18624.595703, -50943.605469, -99462.601562, -164181.609375, -245100.609375, -342219.625000}, sd::DataType::FLOAT32); - NDArray expGradB('c', {oC}, {505., -495., -1495., -2495., -3495., -4494.999512, -5495., -6495.}, nd4j::DataType::FLOAT32); + NDArray expGradB('c', {oC}, {505., -495., -1495., -2495., -3495., -4494.999512, -5495., -6495.}, sd::DataType::FLOAT32); - nd4j::ops::depthwise_conv2d_bp op; + sd::ops::depthwise_conv2d_bp op; ResultSet* results = op.evaluate({&input, &weights, &bias, &gradO}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); NDArray* gradI = results->at(0); NDArray* gradW = results->at(1); @@ -1705,7 +1705,7 @@ TEST_F(ConvolutionTests1, depthwise_conv2d_bp_test6) { weights.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::depthwise_conv2d_bp op; + sd::ops::depthwise_conv2d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* gradI = results->at(0); 
auto* gradW = results->at(1); @@ -1741,7 +1741,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test1) { input = 2.; weights = 1.; - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1773,7 +1773,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test2) { input = 2.; weights.linspace(0.1, 0.1); - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1800,7 +1800,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test3) { input = 2.; weights.linspace(0.1, 0.1); - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1826,7 +1826,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test4) { weights = 0.5; expected = 48.; - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1854,7 +1854,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test5) { expected = 49.; bias = 1.; - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights, &bias}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1883,7 +1883,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test6) { input = 2.; weights = 0.5; - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights, &bias}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1914,7 +1914,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test7) { weights.linspace(0.1, 0.1); 
weights.permutei({2, 3, 4, 1, 0}); - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights, &bias}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1943,7 +1943,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test8) { weights.linspace(0.1, 0.1); weights.permutei({2, 3, 4, 1, 0}); - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -1960,7 +1960,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test9) { auto y = NDArrayFactory::create('c', {2, 5, 5, 3, 4}); auto e = NDArrayFactory::create('c', {4, 1, 7, 10, 4}); - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto result = op.evaluate({&x, &y}, {}, {2,5,5, 5,4,3, 0,0,0, 1,1,1, 1,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -1976,7 +1976,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test10) { auto w = NDArrayFactory::create('c', {2, 5, 5, 3, 4}); auto exp = NDArrayFactory::create('c', {4, 1, 7, 10, 4}); - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto result = op.evaluate({&x, &w}, {}, {2,5,5, 5,4,3, 0,0,0, 1,1,1, 1,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -2038,7 +2038,7 @@ TYPED_TEST(TypedConvolutionTests1, pointwise_conv2d_test1) { weights.linspace(0.1, 0.1); bias = 1.; - nd4j::ops::pointwise_conv2d op; + sd::ops::pointwise_conv2d op; auto results = op.evaluate({&input, &weights, &bias}, {}, {dataFormat}); auto* output = results->at(0); @@ -2062,7 +2062,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test11) { input = 2.; weights = 1.; - nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -2086,7 +2086,7 @@ TYPED_TEST(TypedConvolutionTests1, conv3d_test12) { input = 2.; weights = 1.; - 
nd4j::ops::conv3dnew op; + sd::ops::conv3dnew op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -2102,8 +2102,8 @@ TEST_F(ConvolutionTests1, vol2col_test1) { int bS=2, iD=2,iH=3,iW=2, iC=3,oC=2, kD=2,kH=3,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=1,dH=1,dW=1; int oD=2,oH=3,oW=2; - NDArray volume('c', {bS, iC, iD, iH, iW}, nd4j::DataType::FLOAT32); - NDArray columns('c', {bS, iC, kD, kH, kW, oD, oH, oW}, nd4j::DataType::FLOAT32); + NDArray volume('c', {bS, iC, iD, iH, iW}, sd::DataType::FLOAT32); + NDArray columns('c', {bS, iC, kD, kH, kW, oD, oH, oW}, sd::DataType::FLOAT32); columns = -1.; volume.linspace(1); @@ -2121,10 +2121,10 @@ TEST_F(ConvolutionTests1, vol2col_test1) { 53., 54., 0., 0., 0., 0., 59., 60., 0., 0., 0., 0., 54., 0., 0., 0., 0., 0., 60., 0., 0., 0., 0., 0., 55., 56., 57., 58., 59., 60., 0., 0.,0., 0., 0., 0., 56., 0., 58., 0., 60., 0., 0., 0., 0., 0., 0., 0., 57., 58., 59., 60., 0., 0., 0., 0., 0., 0., 0., 0., 58., 0., 60., 0., 0., 0., 0., 0., 0., 0., 0., 0., 59., 60., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 60., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 61., 62., 63., 64., 65., 66., 67., 68., 69., 70., 71., 72., 62., 0., 64., 0., 66., 0., 68., 0., 70., 0., 72., 0., 63., 64., 65., 66., 0., 0., 69., 70., 71., 72., 0., 0., 64., 0., 66., 0., 0., 0., 70., 0., 72., 0., 0., 0., 65., 66., 0., 0., 0., 0., 71., 72., 0., 0., 0., 0., 66., 0., 0., 0., 0., 0., 72., 0., 0., 0., 0., 0., 67., 68., 69., 70., 71., 72., 0., 0., 0., 0., 0., 0., 68., 0., 70., 0., 72., 0., 0., 0., 0., 0., 0., -0., 69., 70., 71., 72., 0., 0., 0., 0., 0., 0., 0., 0., 70., 0., 72., 0., 0., 0., 0., 0., 0., 0., 0., 0., 71., 72., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 72., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, nd4j::DataType::FLOAT32); +0., 69., 70., 71., 72., 0., 0., 0., 0., 0., 0., 0., 0., 70., 0., 72., 0., 0., 0., 0., 0., 0., 0., 0., 0., 71., 72., 0., 0., 0., 0., 0., 0., 0., 
0., 0., 0., 72., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, sd::DataType::FLOAT32); graph::Context context(1); - nd4j::ops::ConvolutionUtils::vol2col(context, volume, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW); + sd::ops::ConvolutionUtils::vol2col(context, volume, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW); // columns.printBuffer(); ASSERT_TRUE(columns.equalsTo(columnsExpected)); @@ -2160,7 +2160,7 @@ TEST_F(ConvolutionTests1, vol2col_test2) { 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 70.f, 0.f, 72.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 71.f, 72.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 72.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}); graph::Context context(1); - nd4j::ops::ConvolutionUtils::vol2col(context, volume, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW); + sd::ops::ConvolutionUtils::vol2col(context, volume, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW); // columns.printBuffer(); ASSERT_TRUE(columns.equalsTo(columnsExpected)); @@ -2181,7 +2181,7 @@ TEST_F(ConvolutionTests1, col2im_test1) { auto imageExpected = NDArrayFactory::create('c', {bS, iC, iH, iW}, {1.f, 7.f, 12.f, 34.f, 17.f, 39.f, 44.f, 98.f, 33.f, 71.f, 76.f, 162.f, 49.f, 103.f, 108.f, 226.f}); LaunchContext ctx; - nd4j::ops::helpers::col2im(ctx, columns, image, sH, sW, pH, pW, iH, iW, dH, dW); + sd::ops::helpers::col2im(ctx, columns, image, sH, sW, pH, pW, iH, iW, dH, dW); ASSERT_TRUE(image.equalsTo(imageExpected)); } @@ -2204,7 +2204,7 @@ TEST_F(ConvolutionTests1, upsampling2d_test1) { 25.f, 26.f, 27.f, 25.f, 26.f, 27.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 28.f, 29.f, 30.f, 28.f, 29.f, 30.f, 25.f, 26.f, 27.f, 25.f, 26.f, 27.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 28.f, 29.f, 30.f, 28.f, 29.f, 30.f, 31.f, 32.f, 33.f, 31.f, 32.f, 33.f, 31.f, 32.f, 33.f, 34.f, 35.f, 36.f, 34.f, 35.f, 36.f, 34.f, 35.f, 36.f, 31.f, 32.f, 33.f, 31.f, 32.f, 33.f, 31.f, 32.f, 33.f, 34.f, 35.f, 36.f, 34.f, 35.f, 36.f, 34.f, 35.f, 36.f}); - nd4j::ops::upsampling2d op; + sd::ops::upsampling2d op; 
auto results = op.evaluate({&input}, {factorH, factorW, isNCHW}); auto* output = results->at(0); @@ -2232,7 +2232,7 @@ TEST_F(ConvolutionTests1, upsampling2d_test2) { 29.f, 29.f, 29.f, 30.f, 30.f, 30.f, 29.f, 29.f, 29.f, 30.f, 30.f, 30.f, 31.f, 31.f, 31.f, 32.f, 32.f, 32.f, 31.f, 31.f, 31.f, 32.f, 32.f, 32.f, 33.f, 33.f, 33.f, 34.f, 34.f, 34.f, 33.f, 33.f, 33.f, 34.f, 34.f, 34.f, 35.f, 35.f, 35.f, 36.f, 36.f, 36.f, 35.f, 35.f, 35.f, 36.f, 36.f, 36.f}); - nd4j::ops::upsampling2d op; + sd::ops::upsampling2d op; auto results = op.evaluate({&input}, {factorH, factorW, isNCHW}); auto* output = results->at(0); @@ -2270,7 +2270,7 @@ TEST_F(ConvolutionTests1, upsampling3d_test1) { 67.f, 68.f, 69.f, 67.f, 68.f, 69.f, 70.f, 71.f, 72.f, 70.f, 71.f, 72.f, 61.f, 62.f, 63.f, 61.f, 62.f, 63.f, 64.f, 65.f, 66.f, 64.f, 65.f, 66.f, 61.f, 62.f, 63.f, 61.f, 62.f, 63.f, 64.f, 65.f, 66.f, 64.f, 65.f, 66.f, 61.f, 62.f, 63.f, 61.f, 62.f, 63.f, 64.f, 65.f, 66.f, 64.f, 65.f, 66.f, 67.f, 68.f, 69.f, 67.f, 68.f, 69.f, 70.f, 71.f, 72.f, 70.f, 71.f, 72.f, 67.f, 68.f, 69.f, 67.f, 68.f, 69.f, 70.f, 71.f, 72.f, 70.f, 71.f, 72.f, 67.f, 68.f, 69.f, 67.f, 68.f, 69.f, 70.f, 71.f, 72.f, 70.f, 71.f, 72.f}); - nd4j::ops::upsampling3d op; + sd::ops::upsampling3d op; auto results = op.evaluate({&input}, {factorD, factorH, factorW, isNCDHW}); auto* output = results->at(0); @@ -2304,7 +2304,7 @@ TEST_F(ConvolutionTests1, upsampling3d_test2) { 61.f, 61.f, 62.f, 62.f, 61.f, 61.f, 62.f, 62.f, 61.f, 61.f, 62.f, 62.f, 63.f, 63.f, 64.f, 64.f, 63.f, 63.f, 64.f, 64.f, 63.f, 63.f, 64.f, 64.f, 61.f, 61.f, 62.f, 62.f, 61.f, 61.f, 62.f, 62.f, 61.f, 61.f, 62.f, 62.f, 63.f, 63.f, 64.f, 64.f, 63.f, 63.f, 64.f, 64.f, 63.f, 63.f, 64.f, 64.f, 65.f, 65.f, 66.f, 66.f, 65.f, 65.f, 66.f, 66.f, 65.f, 65.f, 66.f, 66.f, 67.f, 67.f, 68.f, 68.f, 67.f, 67.f, 68.f, 68.f, 67.f, 67.f, 68.f, 68.f, 65.f, 65.f, 66.f, 66.f, 65.f, 65.f, 66.f, 66.f, 65.f, 65.f, 66.f, 66.f, 67.f, 67.f, 68.f, 68.f, 67.f, 67.f, 68.f, 68.f, 67.f, 67.f, 68.f, 68.f, 
69.f, 69.f, 70.f, 70.f, 69.f, 69.f, 70.f, 70.f, 69.f, 69.f, 70.f, 70.f, 71.f, 71.f, 72.f, 72.f, 71.f, 71.f, 72.f, 72.f, 71.f, 71.f, 72.f, 72.f, 69.f, 69.f, 70.f, 70.f, 69.f, 69.f, 70.f, 70.f, 69.f, 69.f, 70.f, 70.f, 71.f, 71.f, 72.f, 72.f, 71.f, 71.f, 72.f, 72.f, 71.f, 71.f, 72.f, 72.f}); - nd4j::ops::upsampling3d op; + sd::ops::upsampling3d op; auto results = op.evaluate({&input}, {factorD, factorH, factorW, isNCDHW}); auto* output = results->at(0); @@ -2331,7 +2331,7 @@ TEST_F(ConvolutionTests1, upsampling3d_bp_test1) { auto expGradI = NDArrayFactory::create('c', {bS, iC, iD, iH, iW}); expGradI = 8.; - nd4j::ops::upsampling3d_bp op; + sd::ops::upsampling3d_bp op; auto results = op.evaluate({&input, &gradO}, {isNCDHW}); auto* gradI = results->at(0); @@ -2357,7 +2357,7 @@ TYPED_TEST(TypedConvolutionTests1, conv2D_input_BP_test1) { epsilonNext.linspace(1); weights.permutei({2,3,1,0}); - nd4j::ops::conv2d_input_bp op; + sd::ops::conv2d_input_bp op; auto results = op.evaluate({&inputShape, &weights, &epsilonNext}, {}, {3, 3, 1, 1, 0, 0, 1, 1, 1}); @@ -2378,7 +2378,7 @@ TEST_F(ConvolutionTests1, upsampling3d_bp_test3) { const int factorD=2, factorH=2, factorW=2; const int isNCDHW = 1; // data format, default is NCHW - NDArray input('c', {bS, iC, iD, iH, iW}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iC, iD, iH, iW}, sd::DataType::FLOAT32); NDArray gradO('c', {bS, iC, iD*factorD, iH*factorH, iW*factorW}, {0.6793504, 0.35508695, 0.84278935, 0.20031333, 0.7014987, 0.31069338, 0.44793984, 0.93800974, 0.32667395, 0.15187258, 0.38331753, 0.78212297, 0.1988072, 0.7985636, 0.1632634, 0.14696825, 0.26089668, 0.13505761, 0.7562093, 0.27545404, 0.36908787, 0.09282647, 0.83649176, 0.26841334, 0.09506222, 0.31279507, 0.13591796, 0.5175439, @@ -2415,15 +2415,15 @@ TEST_F(ConvolutionTests1, upsampling3d_bp_test3) { 0.10535406, 0.66438645, 0.4372345, 0.93851465, 0.8635335, 0.3405871, 0.45652762, 0.3636232, 0.52931345, 0.20154329, 0.07698499, 0.6125804, 0.3583082, 0.3894796, 
0.32601944, 0.5237369, 0.66683626, 0.08541841, 0.4815708, 0.11897489, 0.97555137, 0.3602705, 0.9620871, 0.6361821, 0.71167386, 0.5134439, 0.57761437, 0.58598644, 0.39387667, 0.6966405, 0.46841687, 0.85788506, 0.9957087, 0.051309288, 0.24846801, 0.55938333, - 0.10230542, 0.9370694, 0.57527155, 0.54656035, 0.28896323, 0.51303476, 0.8865, 0.38641605, 0.9836358}, nd4j::DataType::FLOAT32); + 0.10230542, 0.9370694, 0.57527155, 0.54656035, 0.28896323, 0.51303476, 0.8865, 0.38641605, 0.9836358}, sd::DataType::FLOAT32); NDArray expGradI('c', {bS, iC, iD, iH, iW}, {3.510932, 3.4310975, 3.538762, 4.148549, 2.8380678, 2.5431657, 3.3928843, 3.228055, 3.1467278, 3.2603023, 5.611751, 4.334653, 3.3697734, 4.603307, 4.4357986, 4.32991, 3.0532732, 3.1370173, 4.181534, 2.9965065, 2.8553872, 5.2719016, 4.5671935, 3.7027276, 3.3517184, 5.2544537, 3.5107024, 4.1496124, 3.9333878, 3.1798909, 3.1446428, 3.0932689, 3.9730802, 3.0466917, 4.9675374, 4.769673, 3.766952, 3.6375027, 3.6492167, 4.9440994, 3.8379507, 3.467589, 4.719474, 3.1295977, 4.5177174, 4.2760015, 2.8443856, - 4.225355, 4.377341, 4.4398847, 4.710785, 4.4199953, 3.928307, 4.8769503}, nd4j::DataType::FLOAT32); + 4.225355, 4.377341, 4.4398847, 4.710785, 4.4199953, 3.928307, 4.8769503}, sd::DataType::FLOAT32); - nd4j::ops::upsampling3d_bp op; + sd::ops::upsampling3d_bp op; auto results = op.evaluate({&input, &gradO}, {isNCDHW}); auto* gradI = results->at(0); @@ -2456,7 +2456,7 @@ TEST_F(ConvolutionTests1, deconv2d_test1) { input = 0.5; weights.linspace(0.1, 0.1); - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto results = op.evaluate({&input, &weights}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); ASSERT_EQ(Status::OK(), results->status()); @@ -2489,7 +2489,7 @@ TEST_F(ConvolutionTests1, deconv2d_test2) { input = 0.5; weights.linspace(0.1, 0.1); - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto results = op.evaluate({&input, &weights}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = 
results->at(0); @@ -2521,7 +2521,7 @@ TEST_F(ConvolutionTests1, deconv2d_test3) { weights.linspace(0.1, 0.1); bias = 0.2; - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto results = op.evaluate({&input, &weights}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); ASSERT_EQ(Status::OK(), results->status()); @@ -2536,8 +2536,8 @@ TEST_F(ConvolutionTests1, deconv2d_test3) { ////////////////////////////////////////////////////////////////////// TEST_F(ConvolutionTests1, deconv2d_test4) { - NDArray input('c', {2, 3, 4, 4}, nd4j::DataType::FLOAT32); - NDArray weights('c', {3, 3, 5, 5}, nd4j::DataType::FLOAT32); + NDArray input('c', {2, 3, 4, 4}, sd::DataType::FLOAT32); + NDArray weights('c', {3, 3, 5, 5}, sd::DataType::FLOAT32); NDArray exp('c', {2,3,8,8}, {6276.0,12831.0,19668.0,26790.0,27012.0,20703.0,14100.0,7200.0,13719.0,28023.0,42918.0,58410.0,58902.0,45105.0,30693.0,15660.0,22389.0,45696.0,69930.0,95100.0,95910.0,73386.0,49899.0,25440.0,32346.0,65970.0, 100884.0,137100.0,138276.0,105726.0,71838.0,36600.0,33726.0,68790.0,105204.0,142980.0,144156.0,110226.0,74898.0,38160.0,27555.0,56154.0,85806.0,116520.0,117474.0,89748.0,60933.0,31020.0,19917.0,40557.0,61926.0, 84030.0,84714.0,64671.0,43875.0,22320.0,10752.0,21879.0,33384.0,45270.0,45636.0,34815.0,23604.0,12000.0,7551.0,15456.0,23718.0,32340.0,32562.0,24978.0,17025.0,8700.0,16569.0,33873.0,51918.0,70710.0,71202.0, @@ -2550,13 +2550,13 @@ TEST_F(ConvolutionTests1, deconv2d_test4) { 268338.0,180882.0,91440.0,66867.0,135210.0,205038.0,276360.0,279042.0,211572.0,142581.0,72060.0,46845.0,94701.0,143574.0,193470.0,195306.0,148047.0,99747.0,50400.0,24576.0,49671.0,75288.0,101430.0,102372.0,77583.0, 52260.0,26400.0,22095.0,44688.0,67782.0,91380.0,92178.0,69906.0,47121.0,23820.0,46377.0,93777.0,142206.0,191670.0,193314.0,146571.0,98775.0,49920.0,72906.0,147387.0,223452.0,301110.0,303648.0,230175.0,155082.0, 
78360.0,101742.0,205638.0,311700.0,419940.0,423420.0,320898.0,216162.0,109200.0,106002.0,214218.0,324660.0,437340.0,440820.0,334038.0,224982.0,113640.0,83292.0,168285.0,254988.0,343410.0,346092.0,262197.0,176556.0, - 89160.0,58095.0,117351.0,177774.0,239370.0,241206.0,182697.0,122997.0,62100.0,30351.0,61296.0,92838.0,124980.0,125922.0,95358.0,64185.0,32400.0,26970.0,54513.0,82632.0,111330.0,112128.0,84981.0,57246.0,28920.0,56427.0,114027.0,172806.0,232770.0,234414.0,177621.0,119625.0,60420.0,88431.0,178662.0,270702.0,364560.0,367098.0,278100.0,187257.0,94560.0,123042.0,248538.0,376500.0,506940.0,510420.0,386598.0,260262.0,131400.0,127302.0,257118.0,389460.0,524340.0,527820.0,399738.0,269082.0,135840.0,99717.0,201360.0,304938.0,410460.0,413142.0,312822.0,210531.0,106260.0,69345.0,140001.0,211974.0,285270.0,287106.0,217347.0,146247.0,73800.0,36126.0,72921.0,110388.0,148530.0,149472.0,113133.0,76110.0,38400.0}, nd4j::DataType::FLOAT32); + 89160.0,58095.0,117351.0,177774.0,239370.0,241206.0,182697.0,122997.0,62100.0,30351.0,61296.0,92838.0,124980.0,125922.0,95358.0,64185.0,32400.0,26970.0,54513.0,82632.0,111330.0,112128.0,84981.0,57246.0,28920.0,56427.0,114027.0,172806.0,232770.0,234414.0,177621.0,119625.0,60420.0,88431.0,178662.0,270702.0,364560.0,367098.0,278100.0,187257.0,94560.0,123042.0,248538.0,376500.0,506940.0,510420.0,386598.0,260262.0,131400.0,127302.0,257118.0,389460.0,524340.0,527820.0,399738.0,269082.0,135840.0,99717.0,201360.0,304938.0,410460.0,413142.0,312822.0,210531.0,106260.0,69345.0,140001.0,211974.0,285270.0,287106.0,217347.0,146247.0,73800.0,36126.0,72921.0,110388.0,148530.0,149472.0,113133.0,76110.0,38400.0}, sd::DataType::FLOAT32); input.linspace(1); weights.linspace(1); weights.permutei({2,3,1,0}); - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto result = op.evaluate({&input, &weights}, {5, 5, 1, 1, 0, 0, 1, 1, 0, 0}); auto z = result->at(0); @@ -2583,7 +2583,7 @@ TEST_F(ConvolutionTests1, deconv2d_test5) { weights.linspace(1); 
weights.permutei({2,3,1,0}); - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto result = op.execute({&input, &weights}, {&z}, {5, 5, 1, 1, 0, 0, 1, 1, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result); @@ -2614,7 +2614,7 @@ TYPED_TEST(TypedConvolutionTests1, deconv2d_test6) { input.linspace(1); - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto results = op.evaluate({&input, &weights}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); ASSERT_EQ(Status::OK(), results->status()); @@ -2638,7 +2638,7 @@ TEST_F(ConvolutionTests1, deconv2d_test7) { input.linspace(1); bias.linspace(1); - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto result = op.evaluate({&input, &weights, &bias}, {1, 1, 1, 1, 0, 0, 1, 1, 1, 0}); @@ -2682,7 +2682,7 @@ TEST_F(ConvolutionTests1, deconv2d_test8) { 1.235054, 1.201363, 1.222816, 1.623673, 1.590317, 1.322463, 1.206481, 1.466262, 0.974741, 0.922343, 1.367100, 1.087943, 1.084952, 1.586691, 1.133576, 1.405098, 1.471922, 1.484062, 1.212039, 1.144419, 1.266123}); - nd4j::ops::deconv2d op; + sd::ops::deconv2d op; auto results = op.evaluate({&input, &weights, &bias}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); ASSERT_EQ(Status::OK(), results->status()); @@ -2717,7 +2717,7 @@ TYPED_TEST(TypedConvolutionTests1, deconv2d_tf_test1) { input = 0.5; weights.linspace(0.1, 0.1); - nd4j::ops::deconv2d_tf op; + sd::ops::deconv2d_tf op; auto results = op.evaluate({&outShape, &weights, &input}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp index 0c63f527e..e403f6a32 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests2.cpp @@ -23,19 +23,19 @@ #define LIBND4J_CONVOLUTIONTESTS2_H #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include #include #include -#include 
-#include +#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ConvolutionTests2 : public testing::Test { public: @@ -67,7 +67,7 @@ TEST_F(ConvolutionTests2, im2col_1) { int paddingMode = 0; // 1-SAME, 0-VALID; - NDArray image('c', {bS, iC, iH, iW}, nd4j::DataType::DOUBLE); + NDArray image('c', {bS, iC, iH, iW}, sd::DataType::DOUBLE); NDArray expected('c', {bS, iC, kH, kW, oH, oW}, {1, 2, 4, 5, 2, 3, 5, 6, 4, 5, 7, 8, 5, 6, 8, 9, 7, 8, 10, 11, 8, 9, 11, 12, 13, 14, 16, 17, 14, 15, 17, 18, 16, 17, 19, 20, 17, 18, 20, 21, 19, 20, 22, 23, 20, 21, 23, 24, 25, 26, 28, 29, 26, 27, 29, 30, 28, 29, 31, 32, 29, 30, 32, 33, 31, 32, 34, 35, 32, 33, 35, 36, 37, 38, 40, 41, 38, 39, 41, 42, 40, 41, 43, @@ -78,7 +78,7 @@ TEST_F(ConvolutionTests2, im2col_1) { image.linspace(1, 1); - nd4j::ops::im2col op; + sd::ops::im2col op; auto results = op.evaluate({&image}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode}); auto column = results->at(0); @@ -121,7 +121,7 @@ TYPED_TEST(TypedConvolutionTests2, deconv2d_tf_test2) { input = 0.5; weights.linspace(0.1, 0.1); - nd4j::ops::deconv2d_tf op; + sd::ops::deconv2d_tf op; auto results = op.evaluate({&outShape, &weights, &input}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -139,7 +139,7 @@ TYPED_TEST(TypedConvolutionTests2, Test_DeConv2D_TF_1) { auto input2 = NDArrayFactory::create('c', {12, 4, 4, 16}); auto exp = NDArrayFactory::create('c', {12, 5, 5, 32}); - nd4j::ops::deconv2d_tf op; + sd::ops::deconv2d_tf op; auto result = op.evaluate({&input0, &input1, &input2}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -169,7 +169,7 @@ TYPED_TEST(TypedConvolutionTests2, Test_DeConv2D_TF_2) { auto input2 = NDArrayFactory::create('c', {3, 4, 4, 5}, {0.98114507f, 0.96400015f, 0.58669623f, 0.60073098f, 0.75425418f, 0.44258752f, 0.76373084f, 0.96593234f, 0.34067846f, 0.57962620f, 0.77517051f, 
0.97472977f, 0.79237527f, 0.68690428f, 0.21719366f, 0.79959206f, 0.84814187f, 0.22496814f, 0.08646965f, 0.31110474f, 0.79813162f, 0.19661444f, 0.57760099f, 0.72138960f, 0.15244268f, 0.87687051f, 0.11130344f, 0.01087698f, 0.34817841f, 0.54992017f, 0.23443850f, 0.31725614f, 0.59755220f, 0.20364695f, 0.00531392f, 0.23403114f, 0.07442912f, 0.83707647f, 0.89291743f, 0.09044587f, 0.69041462f, 0.29904183f, 0.61904680f, 0.85306847f, 0.34467042f, 0.95839152f, 0.54517124f, 0.29640937f, 0.94855959f, 0.95970016f, 0.94045145f, 0.95510301f, 0.34666505f, 0.34717010f, 0.69245678f, 0.71669175f, 0.59043738f, 0.64924132f, 0.06033522f, 0.60185199f, 0.04690073f, 0.59241154f, 0.40229547f, 0.23002481f, 0.45161195f, 0.73743778f, 0.93209113f, 0.37294358f, 0.50177744f, 0.15072501f, 0.26146917f, 0.05252146f, 0.04758931f, 0.76448288f, 0.85149045f, 0.08840467f, 0.07692576f, 0.33180160f, 0.27241259f, 0.74834620f, 0.56453640f, 0.23057286f, 0.68429752f, 0.11961551f, 0.39045977f, 0.44356094f, 0.77018807f, 0.07984410f, 0.47926806f, 0.26165759f, 0.18606064f, 0.89972877f, 0.17962874f, 0.47273120f, 0.64641705f, 0.61890443f, 0.58730015f, 0.25937832f, 0.35231561f, 0.10243882f, 0.17459193f, 0.95906995f, 0.09227025f, 0.30003223f, 0.41601210f, 0.38269713f, 0.84799751f, 0.59295173f, 0.76277990f, 0.68910424f, 0.37672606f, 0.40675461f, 0.94346058f, 0.91438505f, 0.84728183f, 0.64367667f, 0.74899979f, 0.60570691f, 0.16417363f, 0.68852426f, 0.85486889f, 0.22585792f, 0.86953176f, 0.07465519f, 0.93096301f, 0.38008822f, 0.38752587f, 0.44004038f, 0.13170612f, 0.94541045f, 0.89349973f, 0.69245307f, 0.94978877f, 0.98776658f, 0.79445884f, 0.30607409f, 0.58264961f, 0.37980538f, 0.41810784f, 0.48903038f, 0.51615888f, 0.57682794f, 0.82481897f, 0.78341080f, 0.48446465f, 0.17447931f, 0.71125424f, 0.30263851f, 0.70675352f, 0.03215584f, 0.92381065f, 0.22343694f, 0.08851149f, 0.91402490f, 0.70074717f, 0.30912192f, 0.37723206f, 0.97579397f, 0.23554587f, 0.95939133f, 0.41565709f, 0.01741416f, 0.58362787f, 0.22106662f, 
0.89065537f, 0.31900249f, 0.41280911f, 0.67947610f, 0.04545590f, 0.15352812f, 0.85412524f, 0.84933222f, 0.80000225f, 0.93147073f, 0.70094105f, 0.69269875f, 0.95282194f, 0.65913582f, 0.79186874f, 0.59855248f, 0.39707430f, 0.95126239f, 0.15618217f, 0.33446689f, 0.98123758f, 0.84770758f, 0.98081012f, 0.54427413f, 0.18728519f, 0.89792955f, 0.53360126f, 0.72812986f, 0.13307744f, 0.51217443f, 0.66708084f, 0.29416915f, 0.31298995f, 0.39155037f, 0.29288291f, 0.87063305f, 0.61759154f, 0.73723332f, 0.37167635f, 0.82122716f, 0.22937430f, 0.76570536f, 0.47911792f, 0.02826214f, 0.94277323f, 0.59945469f, 0.19042060f, 0.68173155f, 0.82771295f, 0.95649538f, 0.40833101f, 0.90838542f, 0.55245881f, 0.49011012f, 0.36773444f, 0.34513527f, 0.42050683f, 0.16113964f, 0.30969388f, 0.27174174f, 0.12117655f, 0.35270175f, 0.81967867f, 0.63723136f, 0.84309389f, 0.71822576f, 0.84883484f, 0.32306117f, 0.08176457f, 0.56175486f, 0.34892198f, 0.09306929f, 0.85437582f, 0.13925577f, 0.48629188f, 0.29923539f}); auto exp = NDArrayFactory::create('c', {3, 8, 8, 16}, {5.98743296f, -2.83037376f, -0.87943113f, 1.41339970f, 1.32433391f, -1.20299149f, -0.02893090f, 2.05326009f, 1.19417048f, 5.58212376f, 3.28139353f, 1.19237995f, -1.09431255f, -2.55264497f, 3.11014652f, 6.81296825f, -2.09029293f, -4.32068443f, -0.52808392f, -1.97968531f, -0.18673831f, 0.84605980f, 4.55825520f, 2.71503139f, 0.15210046f, 0.85310984f, -3.82062817f, 2.76470995f, 3.69004202f, -1.45017099f, -2.59361267f, -1.35094655f, 7.24145126f, -5.25432396f, 0.19920218f, -4.30596399f, 1.35318923f, -3.88142037f, 3.67493343f, 2.25931478f, 2.87630725f, 1.66349852f, 6.21347952f, 0.94105923f, -1.61742055f, -2.35699606f, 0.12850338f, 1.79141688f, -2.09535933f, -6.35418081f, -0.06303531f, -4.38615131f, 0.48237842f, 0.26528549f, 3.38231516f, 3.76315165f, -0.40254810f, -0.23716694f, -6.13381910f, -0.41950428f, -0.89680839f, -1.46491277f, -1.98541689f, -0.99357355f, 5.58237648f, -2.38937521f, -0.00872564f, -2.37138414f, 4.91117287f, -4.51916361f, 
0.97943687f, 2.91052818f, -2.50362611f, 1.70252812f, 5.04137802f, 3.57108784f, -1.87532270f, -3.66677809f, -2.38861251f, 5.55765152f, -7.27571774f, -1.68887305f, -0.72266489f, -4.42809057f, -0.92118186f, 1.02381468f, 4.44284725f, 5.17150497f, -0.42438728f, 2.02693963f, -1.36484981f, -1.47912180f, 0.26649538f, -0.02091765f, -2.86906910f, -3.03046989f, 1.35122132f, -3.21707630f, 2.21112418f, 0.24121630f, 3.96940088f, -7.66105747f, 2.76352382f, -0.99061489f, -2.16720009f, -1.63170409f, 1.12701774f, -1.02415371f, -0.90435314f, -1.51372027f, -0.76884907f, 0.39066136f, -0.89562428f, -2.03204703f, 1.28074932f, -2.14551091f, -2.36843777f, 0.46580017f, 0.75451565f, -0.00336730f, -1.06597757f, 3.27195978f, -0.41307712f, -0.10376054f, -1.34102952f, -2.22901654f, 2.31929803f, 1.40851438f, -2.23774385f, 0.20417206f, -1.12153268f, -0.13188094f, -3.96649432f, 2.10269976f, 0.49845099f, 6.18937683f, -0.51783508f, -0.48048639f, -1.92970264f, 3.16670656f, 1.13355756f, -0.07890664f, 1.31536257f, -0.43924797f, -0.04562932f, -0.87974954f, 0.75411212f, -2.39745235f, -3.97132111f, 0.37202546f, -2.40399146f, -1.50796390f, -3.08302689f, 0.23075986f, -0.94316757f, 1.34948587f, 0.58591264f, 2.18529797f, 7.97652435f, 2.32798409f, -4.09404373f, 0.89634895f, 0.77697754f, -0.65091681f, -7.05506849f, 5.86194515f, 2.51394033f, 4.69959354f, 0.20835471f, 3.18049693f, -1.29682434f, 3.70832396f, -0.48123091f, -1.67904007f, -1.35418940f, 1.58435583f, -1.13851106f, -1.19225955f, 0.59713769f, -5.80462933f, -7.45143986f, -1.08658695f, 1.03244078f, -1.75307107f, -7.07100582f, 3.85825157f, 1.62127817f, 2.32572675f, 0.56171900f, -0.80591971f, 3.98835945f, 0.15742642f, -2.97832179f, 0.13821673f, -0.72556758f, -0.84936106f, -7.28444147f, 3.94134307f, 0.80779338f, 7.47784615f, 8.23335075f, 4.80595016f, -4.89574575f, 4.03362942f, -6.67522192f, -4.55204487f, 2.12511182f, -2.70781207f, -1.57226098f, -3.08408356f, -0.30812448f, -5.32870674f, -5.13238287f, 0.49605465f, -0.55042171f, 0.46324944f, -3.83545256f, 
-0.12562510f, -0.20978995f, -0.13068712f, -1.92144060f, -1.68787408f, 5.45581436f, -0.79583496f, -2.38866687f, -3.90546346f, -0.47028148f, -0.14319679f, -3.37016582f, 2.00905991f, -1.21345615f, 1.81376505f, 7.73004007f, 0.74310112f, -4.64536428f, 3.78111577f, -9.05182457f, -0.10674095f, 1.53476238f, 0.63345337f, -0.40907967f, -1.44729769f, -1.87145400f, -2.46623540f, 1.07472968f, 0.77390999f, -3.93438888f, 4.49174690f, -0.96686655f, 1.92278123f, 0.30049133f, -0.02388665f, -1.99777114f, -3.23885751f, 5.87784004f, 2.13776040f, 3.56758308f, -3.37774134f, -3.67526293f, 1.63700044f, -1.69959962f, -0.99112594f, 6.03103638f, 1.67399430f, -1.28699589f, 7.16759014f, 12.63490295f, 3.62937450f, -4.75982571f, 2.17861104f, -2.03065681f, 4.30207729f, -0.46797156f, -2.96022511f, -6.02702332f, 3.09229851f, -1.39771092f, -0.03471333f, 3.22175527f, 5.63565636f, 1.78195477f, -0.63545251f, -3.99497652f, 1.46043062f, 4.60050488f, -2.96651959f, -2.03159475f, -1.52386189f, -0.15129802f, -3.90390921f, -0.63852370f, 0.79210538f, 2.35288715f, -5.55609035f, 5.36427498f, -0.60248077f, -0.26181316f, 5.04884720f, 8.53192806f, 5.05080223f, -6.56371737f, 1.52260923f, -7.13623667f, 6.49414349f, 2.33445597f, -4.11490965f, -6.44347477f, -0.47079402f, -0.63467920f, 2.60399365f, 1.05958164f, 3.66901422f, -1.05657935f, 1.88611507f, -6.37475634f, 2.01480770f, 3.36020517f, -5.11001921f, -0.46132171f, 2.16525555f, 4.21938848f, -2.08346295f, 2.86168146f, 1.26987600f, 6.76066971f, -7.84916353f, 4.11700916f, 0.47985530f, -4.60113716f, 7.42062473f, 6.37472820f, 4.37820530f, -7.12197018f, 0.01357239f, -7.90392113f, 8.32131577f, -0.87593079f, -0.16994858f, -5.86345863f, -0.20697471f, -1.37845206f, 1.63819647f, 1.59720242f, -0.74357712f, -1.88725603f, -1.98357940f, -8.57950306f, -4.10104513f, 3.57231879f, -2.89855957f, -0.11263305f, 2.78033924f, 1.53078973f, -2.93089223f, 0.73189604f, 3.20563078f, 3.92601013f, -5.21916151f, 0.89163935f, -0.42978728f, -6.70888853f, 4.56477976f, 1.20105875f, 3.83393812f, 
-6.27205181f, 4.05993128f, -7.35513067f, 1.60660768f, -1.21052051f, 1.58191252f, -1.37899971f, -1.20117283f, 2.93301678f, 1.06302834f, 1.38993621f, -1.66884089f, -3.34452581f, 1.04498529f, -4.10412455f, -4.03310585f, 1.61513603f, -1.09388447f, 2.11451387f, -0.94192362f, -0.23287666f, 5.88265705f, -0.83010495f, -2.15317154f, -0.60276151f, -1.49265075f, 3.93397975f, 5.45194483f, 1.45161700f, -2.57401872f, -5.59288931f, 4.29170895f, 1.87151814f, 0.08362055f, -0.28767288f, 1.17675185f, 0.85266006f, 1.30549634f, -5.60830832f, 0.19398519f, -0.83982587f, 1.75940764f, -5.46077394f, 1.64495635f, 0.17102760f, -0.54459631f, -2.21975255f, -0.37443402f, -2.08474159f, 1.85959935f, 11.19680309f, -0.18611598f, -2.59765387f, 3.06330776f, -1.52183700f, -4.88415241f, -0.75097847f, 2.58201051f, 7.40885210f, 3.58994508f, 1.62457407f, 3.12514591f, -4.36833286f, 1.39830995f, 3.61003447f, -0.63837433f, -3.62661815f, 3.78898096f, 2.92802262f, 5.87374496f, -4.38554621f, -2.53411579f, -2.87311554f, -1.31391978f, -4.26736879f, 3.45099425f, 1.58769250f, 1.73341393f, -1.08842182f, 2.27120280f, -1.78938174f, -2.29940319f, 7.07046986f, 0.51426595f, -6.22928905f, 5.28968811f, 2.31827855f, -4.20915890f, -1.27249205f, 5.92120600f, 3.19458675f, 7.09252501f, 3.96577907f, 6.41484213f, -4.66009521f, 10.00181389f, 0.51108456f, -4.62243366f, -5.18351841f, 2.12961674f, 5.10694027f, 7.29412317f, 0.15912467f, -3.38902974f, -4.01918602f, -2.17383957f, 0.13118666f, 0.27872476f, -0.92317247f, 3.51440644f, 1.84171486f, 1.03378081f, 1.30569839f, -2.09583759f, 9.03952980f, -0.55187917f, -2.04549074f, 1.08294606f, -2.65263700f, -2.93977118f, 1.88909876f, 0.96043622f, 1.76579499f, 3.14314699f, 5.86394691f, 7.36944389f, -7.04524136f, 6.68673229f, -5.52591467f, -2.19745898f, -4.32036924f, 0.52971321f, 2.26268244f, 6.91575766f, -0.94590527f, -3.98923349f, -0.12266219f, 0.24294075f, -1.07783222f, 1.87989080f, -3.57109427f, 1.61553633f, 0.42486978f, 0.75852054f, -6.19481468f, -3.80570698f, 2.39946675f, -1.93851781f, 
-5.42234039f, -6.34092760f, -2.52374983f, -1.85044456f, 3.92693520f, 0.40042299f, 4.69742584f, 5.40483189f, -1.02398944f, 8.89605045f, 0.64680403f, 0.89943957f, 0.76993859f, -1.88244629f, 1.90714884f, 3.10836840f, -0.17064989f, 0.84892416f, -6.94988108f, 1.92141032f, -1.36458397f, 6.39284658f, 0.45201308f, 2.58823442f, 6.33375788f, -4.76916075f, -8.45738983f, -0.48962492f, 2.40652561f, 4.56602001f, -3.34420681f, 1.86862195f, -7.01420689f, -6.94657421f, -2.47419310f, -4.61693668f, -0.18822384f, -0.36949772f, 2.01374269f, 4.11018658f, -5.11564064f, 8.04294395f, 2.88567662f, -2.87645102f, -1.23238611f, -5.91409397f, -0.62205851f, 1.38689423f, -0.01120412f, 5.25955677f, -1.98474956f, -3.72012186f, 3.00445986f, 4.99141550f, 2.97457719f, 2.70827627f, 6.04544449f, -0.20756161f, -10.87035751f, 0.80454814f, 0.33568168f, -2.48132324f, -2.84452009f, 2.63126230f, -3.99351716f, -7.39294338f, 3.62798953f, -8.65815926f, 2.65992808f, -6.98126554f, 3.09881067f, 0.67735767f, -1.15946686f, 5.63180256f, -0.17694545f, -8.59651184f, 3.75297594f, -2.35913754f, -0.20330384f, 5.49958467f, 1.00861740f, 1.42849684f, 0.00062013f, -0.11073381f, 2.15207863f, 4.07368469f, 1.14344299f, -1.27953362f, 6.64699316f, -0.73672432f, -8.55606937f, -0.19439441f, -4.14319754f, -4.69964647f, -5.86446047f, 2.87106085f, -3.42714882f, -5.00668287f, 6.22464132f, -7.72335291f, 4.05667686f, -5.72637177f, 6.35073948f, -1.29593158f, 0.00813985f, 3.63368607f, -1.05764008f, -7.88486052f, 3.73919106f, 1.41835213f, -1.04935634f, 0.65119827f, 0.03547254f, 1.88996327f, 1.58701086f, -0.56215239f, -0.80187100f, 4.55604362f, -0.67249978f, 1.41084409f, 7.86281586f, -2.38301182f, -8.50535774f, -3.82098866f, -2.40856767f, -5.33439016f, -3.34747362f, 2.69389009f, -1.64118791f, 4.52447939f, 0.04468334f, -1.48768258f, -0.69848812f, -0.71123981f, 3.66259432f, 6.10314512f, 1.37305343f, -0.62758982f, -2.99383426f, 4.20510864f, 1.48497128f, -0.08954811f, 2.43872309f, -0.59880185f, 0.37431365f, 2.45458341f, -3.28401661f, -1.94629693f, 
-1.93975246f, -0.26385683f, -0.45814323f, -0.18108580f, -3.74811840f, -0.29739976f, -2.24116230f, -0.28150487f, -2.24421668f, 3.46930790f, 8.35415077f, 0.05562943f, -2.81079793f, 1.10388446f, -2.82245207f, -2.98102283f, -1.08132946f, 1.19089699f, 8.00183105f, 6.35385323f, 3.72591257f, 4.59467506f, -5.74890900f, 4.42238331f, -3.36533451f, 0.18350232f, 3.05606651f, 1.18788099f, 2.87450886f, 0.27472210f, -2.80111074f, -0.66314960f, -1.96376896f, 0.75167024f, -4.72056293f, 1.10629988f, -5.00775242f, 1.48246133f, -3.91681528f, -1.86573625f, -6.17714882f, -0.67820001f, 5.69730282f, 1.04399037f, -4.93794823f, 3.09619617f, 2.18692017f, -5.54232264f, -3.10046840f, -0.68972743f, 2.81824327f, 3.04334164f, 6.13203907f, 4.14081764f, 1.02573645f, 5.71970081f, -6.01574707f, -2.07346702f, 0.99554527f, 1.69641590f, 0.66776669f, -0.80132431f, -2.03513098f, -3.42513680f, -0.06704485f, -1.87195873f, -5.42428589f, -0.20748445f, -1.52408111f, 0.97084987f, -0.48799962f, -0.45379883f, -0.26652339f, -1.20720732f, 3.94169855f, -3.18480229f, -1.87440264f, -1.18028760f, 0.52011997f, -2.13437462f, -4.52583313f, 1.69722807f, -0.89371562f, 3.37972403f, 6.38838720f, 6.98663378f, -4.05421400f, 6.89512825f, -5.09085655f, -2.16257906f, -3.33272719f, -3.01246452f, 0.37613097f, 1.80455804f, -0.36456174f, -5.32273912f, -1.29978943f, -0.53685790f, -2.12896323f, 2.55506587f, -2.57999182f, 3.40891910f, 1.36033249f, 0.83864629f, -2.88629293f, -7.36048365f, 5.61314154f, 1.32668555f, -2.58041072f, -3.71943092f, 1.60647738f, -2.74816346f, 2.47269106f, 0.85507953f, 8.39183426f, 3.42624784f, -0.01519036f, 5.68412066f, 2.51771593f, 1.03045523f, -2.08733034f, -2.44337177f, 0.81668580f, 1.30275154f, 2.99679208f, -2.91957355f, -1.71337795f, 3.34979844f, 1.51825011f, 5.20375061f, 2.27888370f, 1.38787699f, 4.23474550f, -4.05878592f, -4.85074377f, -0.22794735f, 4.64402294f, 1.24391258f, -2.04935098f, 1.26285601f, -7.51862240f, 0.62138438f, -1.95792389f, -0.96587181f, 0.85141110f, 0.79354531f, 7.93766356f, 6.07677746f, 
2.05947518f, 6.55480623f, 1.44032848f, -0.70615625f, -0.07896036f, -5.08359432f, -0.01047915f, -1.89632201f, 2.57555676f, 3.83779287f, 0.42850614f, 1.80754125f, -0.06942326f, 6.35997963f, 6.06101418f, -0.97032297f, 5.71477222f, -6.06671238f, -3.46607208f, -4.98306370f, 2.84659123f, -2.11025190f, -0.04609144f, 5.26831341f, -9.56940651f, -3.67193556f, -1.71143103f, -1.35221267f, -4.26226807f, -6.89146233f, 8.21761799f, 5.69823503f, 2.28137946f, 1.88911343f, -1.44562483f, -1.60295713f, -0.52568185f, -3.31892347f, -2.81997776f, 0.35287106f, 2.98202395f, -1.39432132f, -2.70001364f, -4.14169264f, 3.50194883f, 4.12610435f, 5.52755260f, 2.65859175f, 3.61353087f, -0.83027136f, -5.10652542f, -4.48625374f, 2.06585884f, -2.76383352f, -0.64300913f, 8.19686604f, 0.96106279f, 2.45952058f, 2.47275925f, -1.03288829f, -0.64897656f, -3.77937531f, 4.27940083f, 2.58320260f, -0.57665241f, 1.87247813f, -3.81604433f, -0.24543774f, -1.62118483f, -0.73075479f, -0.48533297f, 2.05016756f, 0.45561486f, 0.03316188f, 0.77791005f, -1.56283605f, 2.36616826f, 5.58082104f, -1.30925488f, -1.06329608f, 2.17189479f, -3.43008828f, -4.71520567f, -2.56184673f, 0.17508316f, -3.25817418f, -0.41749167f, 0.18119079f, -0.73181152f, 3.99792433f, -3.08002281f, -0.99143314f, -1.83520067f, 1.18565679f, 2.98040128f, 5.67814350f, 2.35128760f, 1.41600966f, 4.02718067f, -0.08193968f, 0.64636409f, 1.35931289f, 2.37125754f, 1.75978124f, 3.90977740f, 1.50662971f, -2.84089065f, 1.29824126f, -3.38730979f, -1.61005294f, 0.58292413f, -0.03019404f, -1.57986510f, -0.56102908f, -3.03128719f, 0.51644313f, -2.01147819f, 0.98400700f, 3.00028515f, 0.74579155f, -3.37098312f, 0.93339360f, -1.29018497f, -2.14695001f, 1.30411184f, 0.71501279f, 7.47793055f, 4.06516457f, 3.50772929f, 3.52762985f, 0.55643129f, 0.32272506f, -4.30955982f, 2.49414706f, 2.07820845f, -0.34377906f, 4.39805031f, 2.77561307f, -3.91292810f, 2.43981409f, 0.18861845f, -2.76658440f, -4.97148752f, 3.25273705f, -0.08929539f, 0.19818619f, -5.83767605f, -0.97381884f, 
-5.68745661f, -5.42433214f, 3.98769903f, -0.40394354f, -1.83387578f, -0.80109525f, 1.47454357f, -3.14899540f, 0.80130816f, -2.26348829f, 4.06121159f, 6.13077354f, 5.31226397f, 2.94966197f, -3.65217376f, -1.08136678f, -7.14119816f, -0.85269439f, -0.70365787f, -0.81598872f, 3.62807679f, 3.08123684f, -7.82739496f, 4.07951784f, -0.14204243f, -0.66969109f, -5.07225513f, 2.88492823f, 0.47202343f, 0.72683257f, -6.84280777f, 0.41807127f, -5.09785986f, -3.74514675f, 2.03936672f, -1.06096244f, -1.52409148f, -0.97046643f, 2.27491093f, -1.55597985f, -1.29215479f, -0.79737484f, -0.01979581f, 7.65407991f, 5.54527044f, 4.04147148f, -2.64274883f, -1.89246953f, -3.89547634f, -1.06029689f, -2.85982800f, -1.41247237f, 1.55836034f, 3.38194537f, -2.97655582f, 0.87510300f, 1.26282072f, -1.77029657f, -3.57144690f, -4.19456863f, 0.53179169f, -1.42221975f, -3.09144497f, -0.84294832f, -5.02758694f, -2.68011904f, 0.89156240f, -0.34783912f, 4.64484835f, -2.34453487f, -1.28573155f, 0.09990287f, 0.01828218f, -1.79960847f, -1.06579173f, 1.08763921f, 0.43687880f, 3.24747229f, 3.83097172f, 1.07253766f, -1.33810723f, 0.76530832f, 1.58660865f, 5.60743904f, -3.54124737f, -0.89264417f, -3.83942485f, -1.03707337f, -1.61659896f, 1.65349591f, 1.72698796f, 4.96013832f, 0.78927267f, -0.35563886f, -3.48121166f, 3.79677629f, 2.59023166f, 2.74940348f, -2.17589283f, -5.91757107f, 2.43766379f, -4.15906048f, -1.74731481f, -2.49113035f, -0.57349741f, -4.04455185f, -1.46939647f, 2.21418452f, 0.09153593f, 2.23016739f, 7.91880608f, 4.04464149f, 0.07706618f, -2.41892862f, -2.19280314f, 7.61760712f, -5.89153862f, 0.33551922f, -1.70855618f, -0.30561331f, -0.14341974f, -2.48878574f, 1.31269515f, 3.45388412f, -0.02453184f, -0.12132037f, -4.27916241f, 1.25179088f, 4.09455204f, -1.83801770f, -1.86743176f, -4.02864933f, 3.44515228f, -4.39244986f, -0.56988084f, -1.69426417f, 2.18254852f, -4.78135824f, 1.73193693f, -2.27968478f, -1.49523509f, 2.51696730f, 4.03677559f, -2.03679037f, 1.32167840f, -2.22570705f, -2.74843621f, 
6.29655170f, -3.67230225f, -1.86765468f, -0.14842367f, -1.21552539f, -0.92038238f, -0.51692355f, 1.08433771f, -0.01929832f, 0.15660909f, 2.31432915f, -3.86507082f, -0.69797570f, 0.13505173f, -1.50951028f, -0.69980979f, -1.51297045f, 3.63725281f, 0.13388813f, 2.73131752f, -0.96528149f, 4.92000961f, -5.92699385f, 1.69444644f, -1.17121375f, -2.33710480f, 1.35302818f, 1.39608085f, 1.68293881f, 0.94960749f, 1.89011908f, -4.08865070f, 0.13722643f, -1.62849212f, -0.19044125f, 1.37906075f, -3.92504406f, -1.45033538f, -0.42085981f, 3.38237071f, -3.06508875f, -1.39420545f, 1.13067436f, 0.92206454f, 0.49917889f, -2.74508023f, -2.19221997f, 1.77914095f, 0.10854459f, -2.62178278f, 2.35042715f, -0.15322030f, -0.67014873f, -1.75627899f, 2.64074945f, 2.76339936f, 2.67275214f, -0.62736398f, 0.58251178f, -4.64895678f, 5.50419283f, 2.53566456f, -2.44196153f, -0.07845879f, -2.80389643f, -0.64810950f, -0.05813205f, 1.67155504f, -2.69673729f, -1.72486305f, -0.53888649f, 1.86805439f, -1.37128329f, -5.37923479f, -2.08133769f, 0.58187997f, -1.39498150f, 0.21874082f, 4.33726025f, 6.29673958f, 0.72312093f, -3.32683516f, 1.73482585f, -0.00766110f, -2.63785434f, -0.13511759f, 4.07195950f, 0.94139838f, 3.15717316f, 1.53720927f, 1.87664819f, -2.33655119f, 6.18176556f, -2.73912525f, -2.45279956f, 2.20392370f, -0.56854641f, 0.98915887f, -2.64472580f, 2.40633702f, -4.93327999f, -1.28942823f, 0.98247659f, 1.31774998f, 0.07669818f, -5.91169453f, -0.43135011f, 1.27404964f, -0.59787154f, -0.22716975f, 0.74409103f, 10.27316475f, -2.29192710f, -2.19403267f, 3.78925133f, 3.19553399f, -4.42490482f, -0.80781460f, 2.16568565f, -2.54165983f, 2.54885101f, 4.18779039f, 1.73079813f, -1.48891807f, 11.60153770f, -0.98686743f, -2.88813901f, 2.32898521f, -0.36101711f, 2.34522438f, 0.29057693f, 1.39800644f, -4.31848240f, -3.21217132f, 0.11740226f, -1.21613467f, 0.57248503f, -4.44853830f, 1.54665899f, 3.14459944f, 1.76809108f, 0.26693153f, 0.86913753f, 9.47121620f, -2.07677889f, 2.08578467f, 1.30181742f, 1.58683562f, 
-3.52757788f, -1.32763624f, 0.79821301f, -2.19358301f, 1.17707348f, 6.01983643f, 4.11209440f, -2.04209709f, 7.00413418f, -1.84904683f, -1.32542288f, -0.01298118f, 0.70377320f, 0.27815005f, 2.07879829f, -0.71606725f, -4.94399881f, -2.11898828f, -0.39051518f, -2.21034360f, 3.05337906f, -1.56889665f, 1.97065282f, 2.61320901f, -0.34063196f, -0.57001418f, -2.13183641f, 3.48879004f, -0.12067288f, 0.48568326f, -1.81424558f, 2.28868723f, 1.44802380f, 1.25918829f, -1.76415455f, 5.35742331f, 3.50682044f, 4.71371317f, 5.89110756f, 8.51241302f, 4.07391453f, -0.05887252f, -0.18202400f, 2.27119660f, 6.78274727f, -2.87470293f, -5.14336634f, 0.76443815f, 2.04625130f, -0.43199503f, -1.01353514f, 2.42951298f, 2.35641170f, 0.32345510f, -4.04195738f, -4.77967072f, 0.26564783f, 6.11455107f, -2.53868008f, -3.11839914f, -1.04203856f, 5.17195654f, -4.15338612f, -3.84149241f, 0.48130888f, 3.09706950f, -4.18423653f, 5.26233864f, 3.55831861f, 3.75122595f, 8.14969349f, 6.80038738f, 4.68907356f, -1.40135396f, -3.19287133f, -3.15895939f, 8.77363205f, -4.48793411f, -3.80537176f, -2.40145254f, -2.74341679f, -2.02862644f, 5.33402443f, 9.25365734f, 2.50246119f, 0.32847846f, -1.50564361f, -4.26163197f, -1.40994716f, 2.50708485f, 0.44500345f, -0.62516934f, 4.09846306f, 5.29355669f, -4.02224922f, 0.73442125f, 0.46648952f, 0.67028689f, -6.30715466f, 6.56297970f, 3.80854273f, -5.19078207f, 4.98839283f, 7.59161472f, 0.46010983f, -2.10227895f, 0.29324162f, -2.67019558f, 4.57838106f, -3.02338457f, -3.08647728f, -2.00112700f, -3.81710315f, -0.08346784f, 1.69288683f, 5.68807268f, 3.29351830f, 0.54618967f, 1.83540761f, -5.38810253f, 0.51326782f, 4.40081882f, -4.03805828f, 0.49482727f, -1.36024392f, 2.91845679f, -2.00959015f, 2.47489738f, -1.43354976f, 1.92024410f, -6.55897284f, 1.79488957f, -0.89570928f, -6.13094234f, -0.45504010f, 2.35239482f, 1.29039919f, -4.78849840f, -1.52545333f, -6.50420475f, 2.99257326f, -0.55620033f, 0.26807702f, -2.52090979f, -4.59419632f, 0.57965040f, 2.19423151f, 2.04760551f, 
-0.57048106f, -2.20812702f, -0.04777686f, 1.38053393f, -2.71448946f, -1.06219673f, -3.62008905f, 1.85719645f, 1.28355026f, -2.76315832f, 1.65295160f, -4.01645803f, -3.10454416f, -0.65713316f, 1.22384977f, -0.70416176f, 4.45064926f, 1.31602776f, 2.06907344f, 2.48872757f, 4.25775290f, 3.50504255f, -0.68262041f, 1.29799378f, -1.01969171f, 2.98593879f, 0.12607655f, 0.37219539f, -0.84196299f, -3.80019331f, -1.82315290f, -0.38489276f, -1.45200360f, -4.00882292f, 0.61042011f, -0.16738498f, 1.33787775f, -2.26938057f, 1.03656030f, 8.89089870f, -1.60370600f, -5.38691807f, 5.72182989f, 2.72854710f, -6.18535757f, -3.13408709f, 2.79175353f, 5.18425512f, 9.46434212f, 2.40110517f, 1.11330092f, -3.57366538f, 4.80967665f, 0.40691876f, -3.65484858f, 0.92398167f, 2.53852940f, 3.17747331f, 2.14199781f, -1.69107199f, -1.91864693f, -3.18452644f, -2.42408276f, -2.14332366f, -1.35526609f, -4.50732136f, 0.58234072f, -1.81547785f, 0.57311213f, 1.10584176f, -0.97226644f, 11.73174381f, -2.00559855f, -1.81175601f, 2.33131361f, 0.49264961f, -0.42245382f, -1.37528467f, 1.55768061f, 0.21152198f, 13.08896351f, 10.33674145f, 5.77929306f, -6.19886398f, 5.67007637f, -6.61288071f, -2.58029866f, -4.05192375f, 1.77221894f, 0.29821560f, 5.23508501f, -5.09560966f, -0.97536200f, -5.17957878f, 1.02876794f, -4.52072096f, 2.22126532f, -4.81708670f, 0.44538212f, -2.30738068f, 3.15900373f, -4.99227905f, 0.82632786f, 9.65415478f, -0.63819492f, -3.25479436f, -0.13276935f, 0.21337092f, -2.22116399f, -3.04922724f, 0.65568435f, -0.10706246f, 4.58047390f, 7.80782652f, 5.49080181f, -3.97114491f, 6.43327618f, -6.54772758f, -2.10962629f, -0.79831678f, -0.08316499f, 2.48658133f, 4.14070511f, -0.59806836f, -4.58636141f, -0.31166920f, 0.31757897f, -3.92562199f, 0.65357721f, 0.55871534f, 1.71843934f, 1.62395024f, 0.00695819f, -4.56716251f, -3.76420808f, 4.24979544f, -0.86128616f, 0.23126510f, -6.32968998f, 1.83346081f, 3.81335950f, 2.98407745f, -1.80454743f, 6.61764765f, -1.39372075f, -0.86780751f, 7.24317265f, 2.24205112f, 
1.05702817f, 0.55431479f, -1.54557061f, 3.36389136f, 4.70898724f, 1.11327887f, -3.78462076f, -3.63381767f, 2.86510396f, 0.74203897f, 0.81488025f, 3.54250598f, 3.24824381f, 3.19000244f, -0.58995843f, -7.05670738f, 3.18306041f, 3.95191574f, 0.81820154f, -1.91068232f, -2.05426741f, -1.05589008f, -3.18377590f, -1.86278260f, -8.80374908f, 0.93416154f, -4.60517359f, 8.38999462f, 5.26356745f, -8.89992714f, 8.95298958f, 4.22590351f, 1.00351548f, -6.90151119f, -8.07641125f, -4.82450199f, 8.02293015f, 4.11661243f, 0.95457208f, -7.07843113f, -4.30524826f, 5.02697992f, 5.21011686f, 0.80132771f, 3.23420191f, 3.82452774f, -2.13171721f, -7.88879967f, 1.31062031f, 1.90848613f, -3.51572514f, -3.75684500f, 3.62577081f, -5.76075602f, -2.79389215f, 0.32598805f, -4.28981733f, 4.21048594f, -3.84532523f, 3.19815183f, -0.40756655f, -2.19974327f, 6.25655174f, 3.42396951f, -1.88986623f, -1.92803884f, -2.97344875f, -0.09756154f, 5.24342251f, -0.72513700f, 1.06113195f, -1.30720282f, 4.69107103f, 0.58984971f, 2.33985567f, 1.46385121f, 3.16576266f, 6.77769995f, -5.92685127f, -12.61141014f, -2.83663774f, 4.90253258f, -6.32688522f, -3.00096869f, 2.38634992f, -7.21459866f, -5.89208746f, 2.84085894f, -1.21792030f, 6.70161343f, -4.00450230f, 5.29881001f, -1.45574808f, 0.77542424f, 1.38336325f, -0.21572059f, -3.38088870f, 2.33249640f, 0.68824625f, -3.68440270f, 0.33481622f, -0.39239681f, 0.14560902f, 1.61039007f, -3.11967754f, 2.49372435f, 2.68783092f, -1.17559779f, 0.95257235f, 4.35451412f, -0.56818569f, -7.32110357f, -7.58534050f, -2.10573673f, -3.34446383f, -0.32183546f, -0.78525496f, -1.76974547f, 5.19060802f, -2.11319876f, -3.41755080f, -0.36864156f, 1.32680905f, 0.45004874f, 6.17223930f, -1.60707474f, 0.46096295f, -3.88852644f, 1.84729624f, -0.03412050f, 0.99224162f, -2.05553341f, 3.47793245f, -0.06305170f, 0.51314175f, -2.91650558f, -1.78121483f, -2.85465693f, 0.24649808f, -2.70376635f, 0.42334458f, -1.13862336f, -0.98409218f, -0.96593523f, 2.22128963f, 0.53402066f, 3.33979344f, 8.57430458f, 
2.34217858f, -2.40062976f, 5.81624222f, 1.13290989f, -5.06850052f, -4.72865725f, 1.82859278f, 6.78569555f, 8.56885242f, 2.76462936f, 0.33891773f, -2.81092787f, 0.79498398f, -2.27208567f, 1.55182552f, 2.17166376f, 6.12517643f, 3.56859684f, 0.27685475f, -1.38408327f, -1.03533340f, -3.46618199f, 0.79240030f, -3.89390516f, -0.55852515f, -1.16367757f, -0.07008934f, -2.20105195f, 3.81210446f, -0.66834474f, 0.43603873f, 10.92334938f, 2.48571420f, -6.34997845f, 4.23135757f, 0.45045292f, -4.13489866f, -3.92324209f, 1.88537407f, 2.57159734f, 9.90973091f, 4.37453461f, 7.34546280f, -2.51120615f, 11.12575245f, -3.23452854f, -2.49947500f, 1.39819741f, -3.78950691f, 2.40617585f, 5.10036278f, -3.55743456f, -6.42888737f, -2.51929998f, -1.90880990f, -1.81618094f, 1.60946512f, -4.09737110f, 1.96408439f, -1.90115595f, 2.44444203f, -2.31254292f, -4.01332951f, 8.65541840f, -0.58626485f, -4.02226830f, 0.43893200f, -3.78272748f, -5.46277428f, 0.01306701f, 0.61185312f, 0.24469066f, 1.30214953f, 5.87789631f, 8.75197792f, -5.31634712f, 3.43556309f, -5.90755081f, 0.54375106f, -2.48162293f, -3.51843548f, 2.55853295f, 5.06387186f, -2.09662485f, -3.00377345f, -3.21781397f, -0.14537808f, -4.65453672f, 1.92747557f, 0.41553855f, 4.09379959f, 0.83387995f, 1.50868511f, -6.54959488f, -8.38881016f, 5.50689125f, -2.88616610f, -1.21597648f, -0.23817590f, 1.50816703f, -2.26873541f, 2.29862142f, -1.61143053f, 5.97371244f, 4.71440220f, -0.20635787f, 8.85926723f, 0.56064367f, -1.04103339f, -4.47060108f, -2.63824081f, 3.06782055f, -2.07702565f, 3.38269401f, -1.59988797f, -3.80122590f, 2.35341501f, 2.69095278f, 3.87612104f, 1.89984226f, 0.95496917f, 3.14841127f, -5.84543085f, -7.24945450f, -2.65708590f, 2.87417006f, 0.97556210f, -3.75203967f, 1.55287778f, -7.43401051f, -1.29005826f, -3.40252638f, -4.01049423f, 2.82721639f, -1.21479535f, 8.54563904f, 7.39749908f, -0.61361837f, 7.60177565f, 1.65812778f, -0.83008504f, -3.60961151f, -7.69062138f, -1.26275063f, -4.17071676f, 5.28448200f, 4.04685593f, -1.18231702f, 
1.15276611f, 1.58620787f, 6.75060844f, 3.29332161f, -0.67640316f, 5.78984785f, -3.14913464f, -6.41867924f, -2.58316016f, -2.04366302f, 2.01089478f, -3.81723452f, 3.63843751f, -5.13238430f, -3.79432917f, 4.86581373f, -1.06922054f, 3.95978498f, -0.78166616f, 8.35650539f, 5.35834265f, 0.35594034f, 9.41657066f, -0.84108615f, -6.54425859f, -3.44328952f, -6.55536795f, -0.08963367f, -1.53906262f, 0.17658240f, -0.13108420f, -0.44371247f, -0.78411150f, 2.64754868f, 9.66306782f, 1.70506203f, -0.31588936f, 4.31715870f, -6.16665173f, -10.43371868f, -3.72962189f, 4.35245228f, -1.75867891f, -4.20046234f, 8.62637043f, 1.45946813f, -3.30153608f, 0.85179043f, -2.66643381f, 3.01863337f, -2.52916121f, 8.35405540f, -0.37298933f, -0.89473486f, 6.88681793f, -4.46370125f, -7.50776386f, 3.80255938f, -3.55003357f, 1.43528831f, -2.20383263f, 2.34999895f, 2.03803205f, 1.94830751f, -1.85976326f, 0.97718471f, 5.53710842f, -0.80560827f, 0.23925614f, 5.98795223f, -2.03578377f, -7.77835321f, -2.79955530f, -1.88185954f, -2.49112058f, -0.76095992f, 2.71161270f, -0.55918610f, 0.83789903f, -1.42063200f, -0.61528748f, -4.18273115f, 1.76384258f, 4.21265936f, 5.50964785f, -0.93324339f, 3.83215356f, 1.52210593f, -0.91594946f, 1.31148386f, 3.20160103f, 1.24493563f, -0.72693497f, 1.84716725f, 3.09897518f, -1.34605026f, -1.17511916f, -1.05526352f, -1.08590937f, -1.41319299f, -3.75052118f, -2.67095542f, -0.76179552f, -3.32081509f, -1.04692316f, -1.30194843f, -1.98795474f, 5.01223469f, 0.21895903f, -1.85535169f, 3.12362719f, 0.16198632f, -3.86784005f, -2.03062248f, -0.15415624f, 8.22020721f, 4.83055592f, 4.50315666f, 4.19443417f, 0.42727345f, -4.67786789f, -5.18739986f, 2.53988838f, 3.19683266f, 1.80313504f, 1.94664574f, 0.59795094f, -4.21626759f, 0.50492239f, -0.41232634f, -0.99224532f, -3.94929314f, 1.74060190f, -0.92474866f, -1.00664830f, -6.17397356f, -1.33146775f, -3.78111315f, -4.91876888f, 2.50303864f, -0.34890354f, -1.25013232f, 0.38168997f, -1.84135628f, -4.46107960f, -4.05920792f, -2.61709857f, 
0.71046209f, 9.80566883f, 6.34086990f, 2.73394704f, -2.03342366f, -2.21424174f, -5.56514263f, -4.74755144f, -2.20672894f, 0.09010231f, 1.70423889f, 3.19200158f, -6.99027634f, 1.14216340f, 0.05824995f, -0.76996505f, -6.51575899f, -0.41109252f, 0.78229940f, 1.36170781f, -5.65170193f, 1.12221193f, -4.60430050f, -4.40174437f, 4.01805925f, 0.10774946f, -2.77991009f, -0.18023163f, 0.02151692f, -1.77023101f, -1.86639869f, -0.69443607f, 4.92290831f, 6.83520412f, 4.27372265f, 6.54272366f, -7.59249687f, -1.40776849f, -3.52368808f, 1.01398587f, -3.58802676f, -0.35658866f, 1.14716864f, 3.75847244f, -2.30159235f, -0.72130895f, -0.24564353f, -1.77531350f, -3.08677864f, -0.73486501f, -1.20357263f, 0.60789430f, -3.46990204f, -0.20668676f, -5.46096087f, -5.22016764f, 0.98259866f, 1.81012678f, 3.92534304f, -2.94997001f, 1.65154219f, 2.27040243f, 0.99095678f, 0.09144652f, -0.99103236f, -1.11210847f, 0.78181303f, 2.38706732f, 2.96695375f, -0.17279971f, 0.31143007f, 1.35465562f, 2.03586054f, 6.19515753f, -3.14652419f, -2.89027119f, -3.26665854f, -1.93043876f, -0.46601450f, 1.07655203f, 1.74946189f, 4.02148342f, 0.69275337f, 0.50094581f, -4.07613230f, 2.98369169f, 4.24537849f, 0.49480581f, -2.02408123f, -2.02068973f, 6.54505825f, -5.19377470f, -0.12596917f, -0.70204186f, -0.98308045f, -3.19708824f, 1.63609934f, 1.35475993f, 0.16313422f, 4.13918924f, 7.69187021f, 3.72601676f, -1.97790039f, -1.16739464f, -3.31835508f, 8.14553452f, -1.78718984f, 1.21505618f, -3.84255409f, -3.21992350f, 0.07376552f, -0.81223297f, 3.57002878f, 1.48521733f, -0.45995998f, 0.30551746f, -3.33944130f, 1.39538884f, 1.84758544f, -0.21494150f, -2.27316713f, -4.37771225f, 6.48841667f, -5.00251961f, -0.45162797f, -5.01056004f, 0.70199943f, -4.60057783f, -2.22394514f, 0.07777429f, -1.49820781f, 3.47308421f, 6.13231564f, 1.18605387f, -4.78924608f, -3.49548388f, -2.73382568f, 6.24617863f, -2.74291611f, -1.03833354f, -2.20752788f, -2.33219409f, 1.48633552f, 1.65796840f, 4.95045471f, 2.58479190f, -0.90922785f, 0.71312457f, 
-4.44465590f, 1.37020862f, 2.37683725f, 0.18805164f, -3.28422308f, -1.64939332f, 3.64181972f, -3.75277281f, 3.67203593f, -0.11204052f, 2.24140930f, -3.90657187f, 2.56883717f, -1.44016707f, -2.83842611f, -0.29104578f, 2.17757058f, -0.71431804f, 1.36911654f, 0.85083604f, -1.60110259f, -1.97247636f, -1.61163378f, -0.81236130f, -0.38993555f, -3.03631902f, -0.38213277f, 0.06394482f, 3.19348621f, 0.36771113f, 1.36763072f, 2.49159527f, -0.39599860f, -2.69996762f, -0.97561121f, -2.97563028f, -0.49662948f, -0.17564940f, -2.79042959f, 0.72395414f, 2.07260203f, -0.99439794f, -2.20248008f, -0.07389921f, 0.65536159f, 4.73054695f, -0.63917702f, 0.58788192f, -3.60156059f, 6.59609890f, 3.88419437f, -3.38469863f, -3.56237841f, -2.03295064f, 0.07279694f, 3.71804547f, 0.79928309f, -2.13411403f, -1.13909864f, -0.34193408f, -1.00338125f, -1.44231665f, -5.39835978f, -0.45086145f, 1.16064668f, 2.58335257f, 2.10072684f, 4.64244223f, 7.10090065f, 1.01974952f, -4.44687223f, 2.99792576f, 1.10303724f, -1.22736573f, -3.91514421f, 3.07458854f, 2.18765211f, 3.34481716f, 2.46166849f, 2.99648619f, -0.94046807f, 5.55028200f, 0.92199719f, -0.83934361f, -0.72042274f, 0.84869325f, 1.46914721f, 0.85937387f, 4.77306223f, -4.06436539f, -2.59847593f, 2.44828081f, 0.50484699f, -2.71092367f, -6.39010477f, 0.91778028f, 3.25469685f, 1.30310678f, 1.35258150f, 3.56171441f, 7.82435083f, -2.51527429f, -4.24328852f, 2.36876059f, 1.94595242f, -2.59290171f, -6.62389565f, 3.32567835f, 2.13659120f, 4.09299326f, 3.48293996f, 2.64965177f, -3.19157362f, 13.37204266f, -0.50297594f, -4.57448196f, 3.95582604f, -0.69038916f, 0.10098404f, 1.18737555f, 3.65761185f, -5.69623756f, -2.03357077f, 1.02868807f, -1.38448596f, -0.05690211f, -8.48874187f, 0.56755424f, 1.45485961f, 0.66273880f, 0.06495565f, 1.79539490f, 8.46864319f, -1.22696662f, -1.87585378f, -0.99768794f, 2.72801924f, -0.66980243f, -2.31924677f, 0.33271110f, 0.11666083f, 1.86980045f, 5.95332909f, 7.38583708f, -2.80956483f, 6.79227638f, -6.78070831f, 1.21884382f, 
-1.40695429f, 0.90236962f, -1.13695288f, 0.50760663f, 1.00955284f, -5.39029121f, 0.24987072f, 2.24283314f, -4.02145576f, 2.18057394f, -3.35627747f, 1.26061773f, 1.30342579f, 0.11311233f, -1.11199212f, -4.06509686f, 5.82649660f, -1.24059582f, 5.51652861f, -1.90937877f, 1.10658336f, -0.47065550f, -2.39167786f, -1.95931304f, 4.12717247f, 1.15396059f, 1.26015663f, 7.97836876f, 7.33633423f, 2.27785325f, -2.83802366f, -2.74850106f, 0.86126029f, 6.18781090f, -1.43707538f, -6.97134876f, -3.25486469f, -1.95214593f, 0.91066706f, 0.89637989f, 1.06481194f, 6.25791073f, 0.81779671f, -1.08384395f, -3.21191931f, 2.04216075f, 4.76030350f, -2.37217665f, -1.42571259f, -6.35876131f, 4.62536526f, -5.40060568f, -3.14868999f, -1.00587153f, 1.80662942f, -7.03201485f, 6.08373499f, 0.99862772f, 2.21717811f, 4.06814623f, 6.02428913f, 5.33422756f, -0.87013257f, -2.22477579f, -2.51505303f, 5.82925224f, -0.82854009f, -4.30698347f, -1.75007713f, 2.08352375f, -2.25235629f, 1.17517352f, 5.77717733f, 2.27472878f, 2.72778273f, -1.95411634f, -4.52602863f, 1.13983536f, 1.16340065f, -2.02740526f, -3.11290503f, -1.94906235f, 1.54855204f, -4.52984142f, 1.97465122f, -1.79415476f, 4.03510094f, -8.45349979f, 10.87430096f, 2.19863629f, -5.39083815f, 5.86213875f, 6.25744534f, 6.52600002f, -4.72149038f, -1.75254321f, -5.51459169f, 7.03155518f, -2.01889277f, -4.58441257f, -3.61226106f, 0.42395937f, -0.93263882f, 2.28703761f, 2.80611467f, 2.59498215f, 0.65989012f, -1.51268566f, -4.49465561f, -4.70453882f, 5.44696808f, -4.37603617f, 0.46670085f, 2.82488608f, 2.18854523f, -2.04817152f, 1.19557285f, 1.53618634f, 4.44758606f, -7.31593513f, 7.43966007f, -3.55480957f, -5.29834652f, 2.14622784f, 1.65194583f, 2.71262598f, -4.86145496f, 0.79726243f, -8.88541985f, 1.19627261f, 0.79660845f, -1.98016644f, 1.03741014f, -3.93128228f, 1.05535269f, 2.01378822f, -0.46086323f, -0.77754641f, -1.43942690f, 0.49809402f, -2.27861357f, -3.29815221f, 0.38201320f, -3.98481083f, 4.88261318f, -0.44555628f, -2.57224536f, 2.35001850f, 
-2.65835261f, -2.43422794f, -2.97889376f, 1.07349825f, 1.88157082f, 4.74075413f, 0.60376728f, -0.48894715f, -1.15800071f, 4.68110943f, -0.86976886f, 1.49192941f, 0.62665290f, 0.20652676f, 0.53916287f, -1.45706177f, 0.66133004f, 1.34405875f, -4.27689552f, -0.20838106f, -5.14266443f, -1.29718637f, -1.74506426f, -0.86022055f, -3.57553625f, 0.46880072f, -1.25287139f, 3.28596354f, 11.33191013f, 1.23942876f, -3.87616491f, 7.57880497f, -0.22940339f, -5.68512678f, -1.94969654f, 5.85449600f, 3.75705457f, 4.24395847f, 1.60086083f, 2.62553668f, -0.93964291f, 5.84753895f, -0.79931092f, 0.48274064f, 2.07170033f, 3.02243996f, 2.63509989f, -0.76043403f, -1.64048159f, -6.17683458f, -3.09974527f, -2.12773156f, -0.89379883f, 2.82242465f, -1.99981332f, -0.08763933f, 0.01921120f, -1.94142103f, 2.48067307f, 0.41083777f, 8.24922180f, -1.84516132f, -1.39224625f, 5.03956223f, 0.49562740f, -5.28296328f, -0.20005548f, 3.13672113f, 0.51187158f, 7.11563921f, 6.43059587f, 3.48430967f, -5.37095928f, 8.03863049f, -5.53923941f, -2.16421175f, -3.77641368f, 3.29633045f, 5.04030085f, 2.25945377f, -3.04169011f, -2.16198015f, -2.49559617f, -0.26252726f, -6.99201345f, 2.87374353f, -0.12568980f, 0.23314142f, -1.32087135f, 4.39030552f, -0.24638844f, -4.37242651f, 14.09276772f, 1.23987353f, -1.72249663f, 0.31124914f, -2.13725138f, -3.74915648f, -1.87147236f, 0.47318631f, 1.13337576f, 3.00416899f, 8.82548523f, 4.80538750f, -5.28486395f, 5.51870108f, -5.15801477f, 0.95712411f, -1.50416136f, 2.34657240f, 4.20726633f, 5.56757259f, -3.30645251f, -3.39945269f, -2.68488026f, -2.53525281f, -3.15145874f, 2.74529529f, -0.96283442f, 2.87778258f, 0.22186530f, 1.24905694f, -7.07941198f, -5.45916176f, 3.46988297f, 0.92430985f, -0.98330998f, -2.23672342f, -3.03262734f, 0.73941302f, 0.98004431f, 0.83219361f, 7.17411804f, 4.27849865f, 0.14765590f, 8.61269569f, 9.04497051f, 1.53991723f, -2.08305025f, -4.34939337f, 0.63786775f, 2.60098696f, 0.02432060f, -1.48516297f, -4.06825686f, 5.12420368f, -0.75312757f, 1.96927559f, 
4.91575956f, 3.41533065f, 3.62557888f, -4.35002136f, -5.91343403f, 0.45026422f, 4.93286371f, 3.45830250f, -4.39032364f, -0.51697755f, -7.41543341f, -3.06703568f, 1.01196158f, 2.47106576f, 5.54014874f, -4.65312243f, 8.61000633f, 8.25905323f, -1.41497111f, 8.69221878f, 0.40090930f, 1.11325574f, -1.67089832f, -4.01080132f, 1.07925677f, 2.68086481f, -0.73093414f, -1.35081220f, -7.85765076f, -5.98989439f, -0.04651213f, 4.63693142f, 2.07757711f, -0.22652936f, 3.45525455f, -0.69198442f, -10.39761639f, -2.02106953f, 4.77755499f, -2.67665577f, -1.72481167f, 4.49634743f, -2.55717134f, -4.55044937f, 0.46377492f, -3.08933020f, 3.86891365f, -2.79104614f, 8.36974335f, 0.86471701f, -5.39342690f, 12.54906940f, -0.41536295f, -5.29502535f, -3.94430566f, -5.67391300f, -4.65079165f, 2.22505951f, -0.30000746f, 2.27855444f, -4.81604433f, -1.73440599f, 4.68784523f, 5.00208044f, 0.18863934f, -1.74989462f, 3.17923450f, -1.59773099f, -12.59962940f, -1.54495025f, -0.00576371f, 1.79913878f, -2.43449807f, 1.49516344f, -3.90507102f, 1.68647158f, 4.50177765f, -5.32286358f, 3.47539330f, -2.90529680f, 1.61576962f, 0.83679676f, -5.55615807f, 3.78939056f, -4.46644831f, -5.95550919f, 0.37808037f, 0.51334500f, 1.74658906f, -0.82085419f, -0.65387219f, 3.67790437f, 0.03758264f, -2.42622781f, 1.83335185f, 4.73835945f, -0.83536482f, -0.03993917f, 3.78230667f, -4.81265640f, -8.26869011f, -1.30363441f, -2.09106350f, -3.96769738f, -1.89037073f, 0.38682747f, 0.05434489f, 5.72213697f, 0.55685395f, -3.47729349f, -1.11535001f, 2.09416127f, 5.08877802f, 5.72183466f, 1.29632664f, 0.16822398f, -2.43180108f, 3.49967623f, 2.15753818f, -0.26548505f, 3.24446392f, -0.00599277f, 1.08215356f, -0.23225522f, -2.40723038f, 0.18496060f, -3.70608735f, -0.19918591f, -1.64028871f, 0.80792952f, -0.85334057f, -2.52314138f, -3.12099195f, 0.17949918f, -0.82650864f, 2.32224989f, 9.56476116f, -0.20134282f, -0.48428559f, 2.86784410f, 0.07289505f, -3.92880869f, -2.11887884f, 0.59164631f, 6.31267452f, 7.49149418f, 2.88749456f, 
2.40504885f, -3.57608175f, -1.48019314f, -0.69410253f, 0.90275228f, -0.34111357f, 2.19190216f, 3.39090061f, 3.39631820f, -5.19105434f, 2.67546582f, -2.56549048f, -0.59797800f, -4.21802664f, 0.63918972f, -0.69969130f, 0.47496963f, -4.30976725f, 0.16531238f, -3.59595251f, -0.76877379f, 11.79971790f, -0.93276632f, -1.48630571f, 8.04754066f, 2.09168458f, -3.77018499f, -4.19337654f, 0.26171905f, 1.99359691f, 8.96759701f, 8.39609814f, 6.19231987f, -5.36037970f, 4.69818354f, -4.22453928f, -4.61665344f, -2.52073431f, 1.34026706f, 2.80182385f, 2.56681514f, -4.04676390f, -3.01466990f, -4.10480118f, 0.38737059f, -0.37146521f, -2.26529670f, -1.72867084f, 0.93472683f, -2.47562981f, 0.89871657f, -1.67618203f, -0.28950238f, 5.30124855f, -0.14731219f, -0.81319761f, -1.11265934f, 0.11356127f, -2.52802444f, -1.93826056f, 1.06187987f, 1.48062325f, 4.28070498f, 5.69893932f, 9.26904392f, -4.23773003f, 5.78582096f, -6.18445301f, -2.85200453f, -5.30461454f, -4.16009140f, -0.07239690f, 4.11531162f, -1.12266588f, -1.50265646f, 0.47661865f, -1.90043914f, -6.48978710f, 1.71005368f, 0.18256521f, -0.88272136f, -0.51324779f, -0.78045660f, -5.21036625f, -4.11805344f, 3.99454761f, -1.04999924f, -6.99629354f, -5.02737141f, 0.94748145f, -2.35882139f, 4.13982439f, -1.41835535f, 7.56763077f, 3.97024012f, -4.08156776f, 6.90305424f, 0.53571963f, -2.22625160f, -2.09144926f, -4.98530245f, -0.15102190f, 0.59995949f, 3.28562784f, 0.77991986f, -3.08389306f, 3.34046674f, 0.41394949f, 5.10031366f, 2.99692893f, 0.17706826f, 2.85998058f, -6.68330860f, -6.72653008f, -0.04071128f, 3.71085787f, 3.17834806f, -4.88019037f, 6.74075413f, -7.41782188f, -5.22026348f, -1.94595623f, -3.61318684f, 1.85610664f, 1.08613706f, 6.41580677f, 1.46376514f, -4.11524010f, 9.59146214f, -2.92772651f, -1.70753336f, -1.51594138f, -4.88185692f, 1.47331417f, -2.23893595f, 4.98459148f, 1.29359996f, -2.29221845f, -0.99594390f, 3.05759239f, 6.86030054f, 2.40487719f, 3.28339863f, 7.72739315f, -3.60563445f, -9.73502827f, -1.51672328f, 
-0.08473521f, -2.43673515f, -3.26616001f, 3.63767886f, -11.25394535f, -5.17597103f, -1.27523947f, -7.82669783f, 0.67929745f, -4.50530529f, 5.49323797f, 6.78993320f, -2.28033876f, 4.61412525f, 2.55109429f, -12.38607693f, -0.63024014f, -3.45992327f, -0.84092742f, -0.03252453f, 4.58635283f, 5.28213978f, -1.28417206f, -1.71185923f, -0.26850975f, 8.28257561f, 4.47432184f, 2.72818279f, 8.42217731f, -4.22216320f, -8.95128918f, -1.57179546f, 1.34253705f, -5.47035217f, -5.50866985f, 4.64156532f, -6.11207914f, -5.46734476f, 3.54298997f, -2.79237103f, -0.70766860f, -3.62739944f, 3.22660995f, -2.02262759f, 0.11224222f, 2.63832402f, -0.91955596f, -4.65958309f, -0.29729855f, -1.78957534f, -0.40749407f, 0.51688713f, 0.83725226f, 0.30945438f, 1.20769620f, -1.75219965f, 2.59689760f, 5.01501608f, -1.59034789f, 0.58155286f, 3.75831509f, -5.26110506f, -8.65382767f, -6.19066620f, -0.61932850f, -2.71863723f, -0.87443137f, 3.40582991f, -1.27868056f, 3.51236677f, -2.07806540f, -0.85076392f, -1.14599180f, 1.16361260f, 1.86411846f, 5.86179352f, 0.69029891f, -0.06060839f, 1.54649436f, -0.60351688f, 1.51970077f, 0.04187265f, 1.64540339f, 2.75502157f, 2.46308279f, 1.69071770f, -3.23827076f, 0.92096543f, -3.09458661f, -1.23823690f, 0.24035048f, -0.74456501f, -1.85476089f, -0.32914662f, -2.10325241f, 1.19795251f, -2.05372071f, 1.02114081f, 2.56286955f, 0.42165697f, -1.65826249f, 4.00724554f, -2.18727994f, -1.05848944f, -0.52338278f, -0.28714985f, 8.08780861f, 5.04444599f, 3.51866961f, 3.37445784f, -1.96067202f, -1.21509445f, -3.96595931f, -0.80801201f, 0.76944816f, 1.80147493f, 4.14419460f, -0.12201095f, -2.77788162f, 1.13284469f, -2.05441403f, -0.61129224f, -2.69690657f, 1.91634214f, -2.17146754f, -0.22308528f, -6.02561045f, 0.49161875f, -6.74280357f, -4.62689781f, 2.47910833f, 1.86534905f, -3.24152899f, -1.39898300f, 0.29427958f, -2.16338181f, 0.90073711f, 1.75551236f, 4.42651892f, 8.34437466f, 5.50070190f, 5.68162251f, 1.65345454f, -2.72315669f, -5.43411493f, -0.29380533f, 1.07508349f, 
-1.73533511f, 2.56912184f, 3.62010550f, -6.30422783f, 1.74158525f, -1.22070909f, -0.80982518f, -4.14757967f, 4.29217434f, 0.70600843f, -2.09282112f, -5.09018898f, -0.11623126f, -5.99775553f, -4.66743088f, 1.61512172f, -1.30276895f, -3.17103505f, -0.26310229f, -1.00843918f, -0.77664804f, -2.05240250f, 0.04728425f, 1.15720487f, 4.01001406f, 7.24615860f, 2.55452180f, -5.76347876f, 0.34683830f, -6.05540276f, -4.70677900f, -0.93182588f, -4.37759733f, 2.93209839f, 1.63947964f, -2.43563962f, 1.35213876f, 0.00670356f, -0.02742785f, -2.16460943f, 1.39449501f, 0.23929763f, 2.37476778f, -4.17733765f, -0.81475425f, -6.15027046f, -5.74441719f, 3.53978682f, 0.66798484f}); - nd4j::ops::deconv2d_tf op; + sd::ops::deconv2d_tf op; auto result = op.evaluate({&input0, &input1, &input2}, {}, {7,7, 2,2, 0,0, 1,1, 1,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -188,7 +188,7 @@ TEST_F(ConvolutionTests2, Test_Dilation2D_Again_1) { auto exp = NDArrayFactory::create('c', {4, 64, 43, 4}); - nd4j::ops::dilation2d op; + sd::ops::dilation2d op; auto result = op.evaluate({&x, &w}, {}, {1, 1,5,7,1, 1,2,3,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -204,7 +204,7 @@ TEST_F(ConvolutionTests2, Test_Dilation2D_Again_2) { auto x = NDArrayFactory::create('c', {4, 26, 19, 4}); auto w = NDArrayFactory::create('c', {11, 7, 4}); - nd4j::ops::dilation2d op; + sd::ops::dilation2d op; auto result = op.evaluate({&x, &w}, {}, {0, 1,2,3,1, 1,3,2,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -245,7 +245,7 @@ TYPED_TEST(TypedConvolutionTests2, sconv2d_bp_1) { weightsP.applyScalar(scalar::Divide, 100.0, weightsP); epsilonNext.applyScalar(scalar::Divide, 100.0, epsilonNext); - nd4j::ops::sconv2d_bp op; + sd::ops::sconv2d_bp op; auto resultBP = op.evaluate({&input, &epsilonNext, &weightsD, &weightsP },{}, {5, 5, 1, 1, 0, 0, 1, 1, 0}, {}); ASSERT_EQ(3, resultBP->size()); @@ -285,11 +285,11 @@ TYPED_TEST(TypedConvolutionTests2, sconv2d_bp_2) { int paddingMode = 0; // 1-SAME, 0-VALID; int dataFormat = 0; 
// 1-NHWC, 0-NCHW - NDArray input('c', {bS, iC, iH, iW}, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); - NDArray gradO('c', {bS, oC, oH, oW}, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); - NDArray weightsDepth('c', {kH, kW, iC, mC}, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); - NDArray weightsPoint('f', {1, 1, iC*mC, oC}, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); - NDArray bias('c', {1,oC}, {0.5, 0.5}, typeid(TypeParam) == typeid(float) ? nd4j::DataType::FLOAT32 : nd4j::DataType::DOUBLE); + NDArray input('c', {bS, iC, iH, iW}, typeid(TypeParam) == typeid(float) ? sd::DataType::FLOAT32 : sd::DataType::DOUBLE); + NDArray gradO('c', {bS, oC, oH, oW}, typeid(TypeParam) == typeid(float) ? sd::DataType::FLOAT32 : sd::DataType::DOUBLE); + NDArray weightsDepth('c', {kH, kW, iC, mC}, typeid(TypeParam) == typeid(float) ? sd::DataType::FLOAT32 : sd::DataType::DOUBLE); + NDArray weightsPoint('f', {1, 1, iC*mC, oC}, typeid(TypeParam) == typeid(float) ? sd::DataType::FLOAT32 : sd::DataType::DOUBLE); + NDArray bias('c', {1,oC}, {0.5, 0.5}, typeid(TypeParam) == typeid(float) ? 
sd::DataType::FLOAT32 : sd::DataType::DOUBLE); NDArray gradI(&input); NDArray gradWD(&weightsDepth); @@ -301,7 +301,7 @@ TYPED_TEST(TypedConvolutionTests2, sconv2d_bp_2) { weightsPoint.linspace(0.15, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::sconv2d_bp op; + sd::ops::sconv2d_bp op; Nd4jStatus status = op.execute({&input, &gradO, &weightsDepth, & weightsPoint, &bias}, {&gradI, &gradWD, &gradWP, &gradB}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}, {}); @@ -341,7 +341,7 @@ TYPED_TEST(TypedConvolutionTests2, sconv2d_bp_3) { auto epsilon = NDArrayFactory::create('c', {3, 3, 16, 16}); - nd4j::ops::sconv2d_bp op; + sd::ops::sconv2d_bp op; auto result = op.evaluate({&input, &epsilonNext, &weightsD, &weightsP}, {}, {2, 2, 1, 1, 0, 0, 2, 2, 0}); auto eps = result->at(0); @@ -377,7 +377,7 @@ TYPED_TEST(TypedConvolutionTests2, sconv2d_bp_4) { weightsDepth.linspace(0.1, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::sconv2d_bp op; + sd::ops::sconv2d_bp op; auto results = op.evaluate({&input, &gradO, &weightsDepth, &bias}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* gradI = results->at(0); auto* gradWD = results->at(1); @@ -418,7 +418,7 @@ TEST_F(ConvolutionTests2, sconv2d_bp_5) { weightsDepth.linspace(-0.5, 0.1); gradO.linspace(0.01, 0.01); - nd4j::ops::sconv2d_bp op; + sd::ops::sconv2d_bp op; auto status = op.execute({&input, &gradO, &weightsDepth, &weightsPoint, &bias}, {&gradI, &gradWD, &gradWP, &gradB}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}, {}); ASSERT_EQ(Status::OK(), status); } @@ -430,11 +430,11 @@ TEST_F(ConvolutionTests2, im2col_bp_1) { int oH=12,oW=12; // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW] - NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::DOUBLE); - NDArray gradO('c', {bS, iC, kH, kW, oH, oW}, nd4j::DataType::DOUBLE); - NDArray gradI('c', {bS, iC, iH, iW}, nd4j::DataType::DOUBLE); // output + NDArray input('c', {bS, iC, iH, iW}, sd::DataType::DOUBLE); + NDArray gradO('c', 
{bS, iC, kH, kW, oH, oW}, sd::DataType::DOUBLE); + NDArray gradI('c', {bS, iC, iH, iW}, sd::DataType::DOUBLE); // output - nd4j::ops::im2col_bp op; + sd::ops::im2col_bp op; Nd4jStatus status = op.execute({&input, &gradO}, {&gradI}, {}, {kH, kW, sH, sW, pH, pW, dH, dW, 1}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -462,7 +462,7 @@ TEST_F(ConvolutionTests2, deconv3d_test1) { input = 0.5; weights.linspace(0.1, 0.1); - nd4j::ops::deconv3d op; + sd::ops::deconv3d op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto output = results->at(0); @@ -496,7 +496,7 @@ TEST_F(ConvolutionTests2, deconv3d_test2) { input = 0.5; weights.linspace(0.1, 0.1); - nd4j::ops::deconv3d op; + sd::ops::deconv3d op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto output = results->at(0); @@ -529,7 +529,7 @@ TEST_F(ConvolutionTests2, deconv3d_test3) { weights.linspace(0.1, 0.1); weights.permutei({2, 3, 4, 1, 0}); - nd4j::ops::deconv3d op; + sd::ops::deconv3d op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto output = results->at(0); @@ -556,7 +556,7 @@ TEST_F(ConvolutionTests2, deconv3d_test4) { weights.linspace(0.1, 0.1); weights.permutei({2, 3, 4, 1, 0}); - nd4j::ops::deconv3d op; + sd::ops::deconv3d op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto output = results->at(0); @@ -599,7 +599,7 @@ TEST_F(ConvolutionTests2, deconv3d_test5) { weights.linspace(0.1, 0.1); bias = 0.2; - nd4j::ops::deconv3d op; + sd::ops::deconv3d op; auto results = op.evaluate({&input, &weights}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}); ASSERT_EQ(Status::OK(), results->status()); @@ -624,15 +624,15 @@ TEST_F(ConvolutionTests2, deconv3d_bp_test1) { auto bias = 
NDArrayFactory::create('c', {iC}); auto gradO = NDArrayFactory::create('c', {bS, iD, iH, iW, iC}); - NDArray expGradI('c', {bS, oD, oH, oW, oC}, {62., 67.6, 68.4, 74.8, 81.2, 89.2, 87.6, 96.4, 119.6, 132.4, 126., 139.6, 138.8, 154., 145.2, 161.2}, nd4j::DataType::FLOAT32); - NDArray expGradW('c', {kD, kH, kW, iC, oC}, {28., 28., 32., 32., 40., 40., 44., 44., 64, 64., 68., 68., 76., 76., 80., 80.}, nd4j::DataType::FLOAT32); - NDArray expGradB('c', {iC}, std::vector{364.5}, nd4j::DataType::FLOAT32); + NDArray expGradI('c', {bS, oD, oH, oW, oC}, {62., 67.6, 68.4, 74.8, 81.2, 89.2, 87.6, 96.4, 119.6, 132.4, 126., 139.6, 138.8, 154., 145.2, 161.2}, sd::DataType::FLOAT32); + NDArray expGradW('c', {kD, kH, kW, iC, oC}, {28., 28., 32., 32., 40., 40., 44., 44., 64, 64., 68., 68., 76., 76., 80., 80.}, sd::DataType::FLOAT32); + NDArray expGradB('c', {iC}, std::vector{364.5}, sd::DataType::FLOAT32); input = 0.5; weights.linspace(0.1, 0.1); gradO.linspace(0.5); - nd4j::ops::deconv3d_bp op; + sd::ops::deconv3d_bp op; auto results = op.evaluate({&input, &weights, &bias, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto gradI = results->at(0); @@ -665,14 +665,14 @@ TEST_F(ConvolutionTests2, deconv3d_bp_test2) { auto weights = NDArrayFactory::create('c', {kD, kH, kW, iC, oC}); auto gradO = NDArrayFactory::create('c', {bS, iD, iH, iW, iC}); - NDArray expGradI('c', {bS, oD, oH, oW, oC}, {34, 37.2, 16.6, 18.4, 15.4, 17.4, 7.1, 8.2, 10.6, 13., 4.3, 5.6, 2.9, 4.3, 0.75, 1.5}, nd4j::DataType::FLOAT32); - NDArray expGradW('c', {kD, kH, kW, iC, oC}, {16, 16, 9, 9, 10, 10, 5.5, 5.5, 12, 12, 6.5, 6.5, 7, 7, 3.75, 3.75}, nd4j::DataType::FLOAT32); + NDArray expGradI('c', {bS, oD, oH, oW, oC}, {34, 37.2, 16.6, 18.4, 15.4, 17.4, 7.1, 8.2, 10.6, 13., 4.3, 5.6, 2.9, 4.3, 0.75, 1.5}, sd::DataType::FLOAT32); + NDArray expGradW('c', {kD, kH, kW, iC, oC}, {16, 16, 9, 9, 10, 10, 5.5, 5.5, 12, 12, 6.5, 6.5, 7, 7, 3.75, 3.75}, sd::DataType::FLOAT32); input = 0.5; 
weights.linspace(0.1, 0.1); gradO.linspace(0.5); - nd4j::ops::deconv3d_bp op; + sd::ops::deconv3d_bp op; auto results = op.evaluate({&input, &weights, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto gradI = results->at(0); @@ -701,13 +701,13 @@ TEST_F(ConvolutionTests2, deconv3d_bp_test3) { auto weights = NDArrayFactory::create('c', {kD, kH, kW, iC, oC}, {0.1f, 0.9f, 0.2f, 0.1f, 0.3f, 1.1f, 0.4f, 1.2f, 0.5f, 1.3f, 0.6f, 1.4f, 0.7f, 1.5f, 0.8f, 1.6f}); auto gradO = NDArrayFactory::create('c', {bS, iC, iD, iH, iW}); - NDArray expGradI('c', {bS, oD, oH, oW, oC}, {33.8, 37.4, 44.6, 48.2, 66.2, 69.8, 77., 80.6, 77.25, 86.35, 104.55, 113.65, 159.15, 168.25, 186.45, 195.55}, nd4j::DataType::FLOAT32); - NDArray expGradW('c', {kD, kH, kW, iC, oC}, {28., 28, 32, 32, 40, 40, 44, 44, 64, 64, 68, 68, 76, 76, 80, 80.}, nd4j::DataType::FLOAT32); + NDArray expGradI('c', {bS, oD, oH, oW, oC}, {33.8, 37.4, 44.6, 48.2, 66.2, 69.8, 77., 80.6, 77.25, 86.35, 104.55, 113.65, 159.15, 168.25, 186.45, 195.55}, sd::DataType::FLOAT32); + NDArray expGradW('c', {kD, kH, kW, iC, oC}, {28., 28, 32, 32, 40, 40, 44, 44, 64, 64, 68, 68, 76, 76, 80, 80.}, sd::DataType::FLOAT32); input = 0.5; gradO.linspace(0.5); - nd4j::ops::deconv3d_bp op; + sd::ops::deconv3d_bp op; auto results = op.evaluate({&input, &weights, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto gradI = results->at(0); @@ -736,13 +736,13 @@ TEST_F(ConvolutionTests2, deconv3d_bp_test4) { auto weights = NDArrayFactory::create('c', {kD, kH, kW, iC, oC}, {0.1f, 0.9f, 0.2f, 0.1f, 0.3f, 1.1f, 0.4f, 1.2f, 0.5f, 1.3f, 0.6f, 1.4f, 0.7f, 1.5f, 0.8f, 1.6f}); auto gradO = NDArrayFactory::create('c', {bS, iC, iD, iH, iW}); - NDArray expGradI('c', {bS, oC, oD, oH, oW}, {0.4, 1.55, 1.05, 2.3, 5.7, 3.2, 1.5, 3.35, 1.75, 3.8, 8.3, 4.3, 9.0, 18.6, 9.2, 4.4, 8.7, 4.1, 1.8, 3.55, 1.65, 3.5, 6.5, 2.8, 1.3, 2.15, 0.75, 0.8, 3.15, 2.25, 4.7, 12.1, 7.2, 3.5, 8.15, 4.55, 7.8, 
17.9, 9.9, 19.75, 42.85, 23.6, 9.35, 21.55, 12.9, 5.4, 11.55, 6.05, 8.25, 20.75, 13.2, 0.65, 6.6, 6.75}, nd4j::DataType::FLOAT32); - NDArray expGradW('c', {kD, kH, kW, iC, oC}, {16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.}, nd4j::DataType::FLOAT32); + NDArray expGradI('c', {bS, oC, oD, oH, oW}, {0.4, 1.55, 1.05, 2.3, 5.7, 3.2, 1.5, 3.35, 1.75, 3.8, 8.3, 4.3, 9.0, 18.6, 9.2, 4.4, 8.7, 4.1, 1.8, 3.55, 1.65, 3.5, 6.5, 2.8, 1.3, 2.15, 0.75, 0.8, 3.15, 2.25, 4.7, 12.1, 7.2, 3.5, 8.15, 4.55, 7.8, 17.9, 9.9, 19.75, 42.85, 23.6, 9.35, 21.55, 12.9, 5.4, 11.55, 6.05, 8.25, 20.75, 13.2, 0.65, 6.6, 6.75}, sd::DataType::FLOAT32); + NDArray expGradW('c', {kD, kH, kW, iC, oC}, {16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.}, sd::DataType::FLOAT32); input = 0.5; gradO.linspace(0.5); - nd4j::ops::deconv3d_bp op; + sd::ops::deconv3d_bp op; auto results = op.evaluate({&input, &weights, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, dataFormat}, {}); auto gradI = results->at(0); @@ -775,7 +775,7 @@ TEST_F(ConvolutionTests2, maxpool2d_1) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dH,dW, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::maxpool2d pooling; + sd::ops::maxpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -819,7 +819,7 @@ TEST_F(ConvolutionTests2, maxpool2d_2) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dH,dW, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::maxpool2d pooling; + sd::ops::maxpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -846,8 +846,8 @@ TEST_F(ConvolutionTests2, maxpool2d_3) { const int pW = 0; 
const int dH = 1; const int dW = 1; - const int oH = (int) nd4j::math::nd4j_ceil(iH * 1.f / sH); - const int oW = (int) nd4j::math::nd4j_ceil(iW * 1.f / sW); + const int oH = (int) sd::math::nd4j_ceil(iH * 1.f / sH); + const int oW = (int) sd::math::nd4j_ceil(iW * 1.f / sW); auto x = NDArrayFactory::create_('c', {bS,iD,iH,iW}); @@ -863,7 +863,7 @@ TEST_F(ConvolutionTests2, maxpool2d_3) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dH,dW, 1}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::maxpool2d pooling; + sd::ops::maxpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -907,7 +907,7 @@ TEST_F(ConvolutionTests2, maxpool2d_4) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dH,dW, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::maxpool2d pooling; + sd::ops::maxpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -934,8 +934,8 @@ TEST_F(ConvolutionTests2, maxpool2d_5) { const int pW = 0; const int dH = 1; const int dW = 1; - const int oH = (int) nd4j::math::nd4j_ceil(iH * 1.f / sH); - const int oW = (int) nd4j::math::nd4j_ceil(iW * 1.f / sW); + const int oH = (int) sd::math::nd4j_ceil(iH * 1.f / sH); + const int oW = (int) sd::math::nd4j_ceil(iW * 1.f / sW); auto x = NDArrayFactory::create_('c', {bS,iD,iH,iW}); @@ -951,7 +951,7 @@ TEST_F(ConvolutionTests2, maxpool2d_5) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dH,dW, 1}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::maxpool2d pooling; + sd::ops::maxpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -970,7 +970,7 @@ 
TYPED_TEST(TypedConvolutionTests2, maxpool2d_6) { x.linspace(1); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -990,7 +990,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_7) { x.linspace(1); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 0, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1010,7 +1010,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_8) { x.linspace(1); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 0, 1, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1040,7 +1040,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_9) { auto input = NDArrayFactory::create('c', {bS, iC, iH, iW}); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; auto results = op.evaluate({&input}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, isSameMode, 1, 0}); auto output = results->at(0); @@ -1067,7 +1067,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_10) { 0.7135856f, 0.9613717f, 0.9613717f, 0.78289545f, 0.9613717f, 0.9613717f, 0.78289545f, 0.7997134f, 0.8536445f, 0.8536445f, 0.7997134f, 0.85019743f, 0.85019743f, 0.85722464f, 0.85722464f, 0.85019743f}); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; auto results = op.evaluate({&input}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode}); auto* output = results->at(0); @@ -1082,12 +1082,12 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_10) { ////////////////////////////////////////////////////////////////////// TEST_F(ConvolutionTests2, maxpool2d_11) { - NDArray input('c', {1,1,4,5}, nd4j::DataType::FLOAT32); - NDArray z('c', {1,1,4,5}, nd4j::DataType::FLOAT32); + NDArray input('c', {1,1,4,5}, sd::DataType::FLOAT32); + NDArray z('c', {1,1,4,5}, sd::DataType::FLOAT32); input.linspace(1.); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; auto results = 
op.evaluate({&input}, {}, {2,2, 1,1, 1,1, 2,2, 1,0,0}); ASSERT_EQ(Status::OK(), results->status()); @@ -1109,7 +1109,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_test1) { 154.5f,155.5f,157.5f,158.5f,166.5f,167.5f,169.5f,170.5f,190.5f,191.5f,193.5f,194.5f,202.5f,203.5f,205.5f,206.5f}); input.linspace(1.); - nd4j::ops::avgpool3dnew op; + sd::ops::avgpool3dnew op; auto results = op.evaluate({&input}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1137,7 +1137,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_test2) { 187.f, 188.f, 189.f, 190.f, 191.f, 192.f, 191.5f, 192.5f, 193.5f, 191.5f, 192.5f, 193.5f, 194.5f, 195.5f, 196.5f, 196.f, 197.f, 198.f, 200.5f, 201.5f, 202.5f, 203.5f, 204.5f, 205.5f, 205.f, 206.f, 207.f, 205.f, 206.f, 207.f, 208.f, 209.f, 210.f, 209.5f, 210.5f, 211.5f}); input.linspace(1.); - nd4j::ops::avgpool3dnew op; + sd::ops::avgpool3dnew op; auto results = op.evaluate({&input}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 0, dataFormat}); auto output = results->at(0); @@ -1162,7 +1162,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_test3) { 173.5f, 174.5f, 175.5f, 176.5f, 177.5f, 178.5f, 182.5f, 183.5f, 184.5f, 185.5f, 186.5f, 187.5f}); input.linspace(1.); - nd4j::ops::avgpool3dnew op; + sd::ops::avgpool3dnew op; auto results = op.evaluate({&input}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1202,7 +1202,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_test4) { 68.833336f, 138.00f, 138.666672f, 69.50f, 34.416668f, 69.00f, 69.333336f, 34.75f, 52.00f, 104.25f, 104.75f, 52.50f, 52.75f, 105.75f, 106.25f, 53.25f, 35.416668f, 71.00f, 71.333336f, 35.75f}); input.linspace(1.); - nd4j::ops::avgpool3dnew op; + sd::ops::avgpool3dnew op; auto results = op.evaluate({&input}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1226,7 +1226,7 @@ 
TYPED_TEST(TypedConvolutionTests2, maxpool3d_test1) { 128.f, 129.f, 131.f, 132.f, 140.f, 141.f, 143.f, 144.f, 164.f, 165.f, 167.f, 168.f, 176.f, 177.f, 179.f, 180.f, 200.f, 201.f, 203.f, 204.f, 212.f, 213.f, 215.f, 216.f}); input.linspace(1.); - nd4j::ops::maxpool3dnew op; + sd::ops::maxpool3dnew op; auto results = op.evaluate({&input}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1254,7 +1254,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool3d_test2) { 193.f, 194.f, 195.f, 196.f, 197.f, 198.f, 196.f, 197.f, 198.f, 202.f, 203.f, 204.f, 205.f, 206.f, 207.f, 205.f, 206.f, 207.f, 211.f, 212.f, 213.f, 214.f, 215.f, 216.f, 214.f, 215.f, 216.f, 211.f, 212.f, 213.f, 214.f, 215.f, 216.f, 214.f, 215.f, 216.f}); input.linspace(1.); - nd4j::ops::maxpool3dnew op; + sd::ops::maxpool3dnew op; auto results = op.evaluate({&input}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1278,7 +1278,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool3d_test3) { 166.f, 167.f, 168.f, 169.f, 170.f, 171.f, 175.f, 176.f, 177.f, 178.f, 179.f, 180.f, 202.f, 203.f, 204.f, 205.f, 206.f, 207.f, 211.f, 212.f, 213.f, 214.f, 215.f, 216.f}); input.linspace(1.); - nd4j::ops::maxpool3dnew op; + sd::ops::maxpool3dnew op; auto results = op.evaluate({&input}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1308,7 +1308,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool3d_test4) { 196.f, 197.f, 198.f, 198.f, 199.f, 200.f, 201.f, 201.f, 202.f, 203.f, 204.f, 204.f, 202.f, 203.f, 204.f, 204.f, 208.f, 209.f, 210.f, 210.f, 211.f, 212.f, 213.f, 213.f, 214.f, 215.f, 216.f, 216.f, 214.f, 215.f, 216.f, 216.f, 208.f, 209.f, 210.f, 210.f, 211.f, 212.f, 213.f, 213.f, 214.f, 215.f, 216.f, 216.f, 214.f, 215.f, 216.f, 216.f}); input.linspace(1.); - nd4j::ops::maxpool3dnew op; + sd::ops::maxpool3dnew op; auto results = op.evaluate({&input}, {}, 
{kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1341,7 +1341,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_bp_test1) { input.linspace(1.); gradO = 2.; - nd4j::ops::avgpool3dnew_bp op; + sd::ops::avgpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1375,7 +1375,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_bp_test2) { input.linspace(1.); gradO = 2.; - nd4j::ops::avgpool3dnew_bp op; + sd::ops::avgpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1410,7 +1410,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_bp_test3) { input.linspace(1.); gradO = 2.; - nd4j::ops::avgpool3dnew_bp op; + sd::ops::avgpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 0, dataFormat}); auto output = results->at(0); @@ -1442,7 +1442,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool3d_bp_test4) { input.linspace(1.); gradO = 2.; - nd4j::ops::avgpool3dnew_bp op; + sd::ops::avgpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 0, dataFormat}); auto output = results->at(0); @@ -1473,7 +1473,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool3d_bp_test1) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool3dnew_bp op; + sd::ops::maxpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1506,7 +1506,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool3d_bp_test2) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool3dnew_bp op; + sd::ops::maxpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, 
pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1538,7 +1538,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool3d_bp_test3) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool3dnew_bp op; + sd::ops::maxpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1570,7 +1570,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool3d_bp_test4) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool3dnew_bp op; + sd::ops::maxpool3dnew_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kD,kH,kW, sD,sH,sW, pD,pH,pW, dD,dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1599,7 +1599,7 @@ TEST_F(ConvolutionTests2, maxpool2d_bp_1) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dW,dH, 0, 0, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::maxpool2d_bp bp; + sd::ops::maxpool2d_bp bp; Nd4jStatus status = bp.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1629,7 +1629,7 @@ TEST_F(ConvolutionTests2, maxpool2d_bp_2) { std::initializer_list argI = {kH,kW, sH,sW, pH,pW, dW,dH, 0, 0, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; auto results = op.evaluate({&input, &epsilon}, {}, argI); auto output = results->at(0); @@ -1655,7 +1655,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_bp_3) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1682,7 +1682,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_bp_4) { input.linspace(1.); 
gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1709,7 +1709,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_bp_5) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1736,7 +1736,7 @@ TYPED_TEST(TypedConvolutionTests2, maxpool2d_bp_6) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1761,7 +1761,7 @@ TEST_F(ConvolutionTests2, maxpool2d_bp_7) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); // auto output = results->at(0); @@ -1790,7 +1790,7 @@ TEST_F(ConvolutionTests2, avgpool2d_bp_1) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dW,dH, 0, 1, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode, 9 - extraParam0 (unnecessary for avg mode), 10 - data format - nd4j::ops::avgpool2d_bp bp; + sd::ops::avgpool2d_bp bp; Nd4jStatus status = bp.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1819,7 +1819,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool2d_bp_2) { std::initializer_list argI = {kH,kW, sH,sW, pH,pW, dW,dH, 1, 1, 0}; - nd4j::ops::avgpool2d_bp op; + sd::ops::avgpool2d_bp op; auto results = op.evaluate({&input, &epsilon}, {}, argI); auto output = results->at(0); @@ -1848,7 +1848,7 @@ TYPED_TEST(TypedConvolutionTests2, 
avgpool2d_bp_3) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::avgpool2d_bp op; + sd::ops::avgpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1879,7 +1879,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool2d_bp_4) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::avgpool2d_bp op; + sd::ops::avgpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1908,7 +1908,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool2d_bp_5) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::avgpool2d_bp op; + sd::ops::avgpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 0, dataFormat}); auto output = results->at(0); @@ -1937,7 +1937,7 @@ TYPED_TEST(TypedConvolutionTests2, avgpool2d_bp_6) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::avgpool2d_bp op; + sd::ops::avgpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 1, dataFormat}); auto output = results->at(0); @@ -1968,7 +1968,7 @@ TEST_F(ConvolutionTests2, pnormpool2d_bp_1) { std::vector* argT = block->getTArguments(); *argT = {0.000001}; - nd4j::ops::pnormpool2d_bp bp; + sd::ops::pnormpool2d_bp bp; Nd4jStatus status = bp.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -2001,7 +2001,7 @@ TYPED_TEST(TypedConvolutionTests2, pnormpool2d_bp_2) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::pnormpool2d_bp op; + sd::ops::pnormpool2d_bp op; auto results = op.evaluate({&input, &gradO}, {eps}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, pnorm, dataFormat}); auto output = results->at(0); @@ -2034,7 +2034,7 @@ TYPED_TEST(TypedConvolutionTests2, pnormpool2d_bp_3) { input.linspace(1.); gradO.linspace(0.1, 0.1); - nd4j::ops::pnormpool2d_bp op; + sd::ops::pnormpool2d_bp op; auto 
results = op.evaluate({&input, &gradO}, {eps}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, pnorm, dataFormat}); auto output = results->at(0); @@ -2060,7 +2060,7 @@ TEST_F(ConvolutionTests2, upsampling2d_bp_1) { auto expGradI = NDArrayFactory::create('c', {bS, iC, iH, iW}); expGradI = 4.; - nd4j::ops::upsampling2d_bp op; + sd::ops::upsampling2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {isNCHW}); auto* gradI = results->at(0); @@ -2085,7 +2085,7 @@ TEST_F(ConvolutionTests2, upsampling2d_bp_2) { auto expGradI = NDArrayFactory::create('c', {bS, iH, iW, iC}); expGradI = 4.; - nd4j::ops::upsampling2d_bp op; + sd::ops::upsampling2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {isNCHW}); auto* gradI = results->at(0); @@ -2103,7 +2103,7 @@ TEST_F(ConvolutionTests2, upsampling2d_bp_3) { const int factorH=2, factorW=2; const int isNCHW = 1; // data format, default is NCHW - NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iC, iH, iW}, sd::DataType::FLOAT32); NDArray gradO('c', {bS, iC, iH*factorH, iW*factorW}, {0.6793504, 0.35508695, 0.84278935, 0.20031333, 0.7014987, 0.31069338, 0.44793984, 0.93800974, 0.32667395, 0.15187258, 0.38331753, 0.78212297, 0.1988072, 0.7985636, 0.1632634, 0.14696825, 0.26089668, 0.13505761, @@ -2111,12 +2111,12 @@ TEST_F(ConvolutionTests2, upsampling2d_bp_3) { 0.061735712, 0.39643127, 0.248016, 0.5489592, 0.115046196, 0.8143622, 0.7215636, 0.40449402, 0.29908907, 0.4038839, 0.9883108, 0.022296403, 0.927782, 0.3184157, 0.0685462, 0.28453344, 0.23272, 0.35214192, 0.058909304, 0.7112212, 0.6744568, 0.19694561, 0.6994972, 0.0743224, 0.42042503, 0.5842631, 0.14957358, 0.44640633, 0.72307247, 0.06448108, 0.48307765, 0.8759956, 0.5698191, 0.4458631, 0.5277549, - 0.016646361, 0.753678, 0.14063567, 0.7541292, 0.16193217, 0.7750374, 0.3326449, 0.11739397}, nd4j::DataType::FLOAT32); + 0.016646361, 0.753678, 0.14063567, 0.7541292, 0.16193217, 0.7750374, 0.3326449, 0.11739397}, 
sd::DataType::FLOAT32); NDArray expGradI('c', {bS, iC, iH, iW}, {2.4203868, 1.5216494, 2.1776323, 2.0290341, 0.772146, 1.5008594, 1.0523045, 1.3174672, 1.9263644, - 1.090545, 1.9094483, 1.3611296, 2.1195147, 2.0659215, 1.0423062, 2.3405795, 1.9105877, 1.2203633}, nd4j::DataType::FLOAT32); + 1.090545, 1.9094483, 1.3611296, 2.1195147, 2.0659215, 1.0423062, 2.3405795, 1.9105877, 1.2203633}, sd::DataType::FLOAT32); - nd4j::ops::upsampling2d_bp op; + sd::ops::upsampling2d_bp op; auto results = op.evaluate({&input, &gradO}, {}, {isNCHW}); auto* gradI = results->at(0); @@ -2147,7 +2147,7 @@ TYPED_TEST(TypedConvolutionTests2, depthwise_conv2d_1) { input = 2.; weights.linspace(0.1, 0.1); - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -2177,7 +2177,7 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_2) { input = 2.; weights.linspace(0.1, 0.1); - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -2203,13 +2203,13 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_3) { auto weights = NDArrayFactory::create('c', {mC, iC, kH, kW}); auto biases = NDArrayFactory::create('c', {iC*mC}, {1.f,2.f,3.f,4.f}); - NDArray expOutput('c', {bS, oC, oH, oW},{5.2, 5.2, 5.2, 5.2,20.6,20.6,20.6,20.6,14.4,14.4,14.4,14.4,29.8,29.8,29.8,29.8, 5.2, 5.2, 5.2, 5.2,20.6,20.6,20.6,20.6,14.4,14.4,14.4,14.4,29.8,29.8,29.8,29.8}, nd4j::DataType::FLOAT32); + NDArray expOutput('c', {bS, oC, oH, oW},{5.2, 5.2, 5.2, 5.2,20.6,20.6,20.6,20.6,14.4,14.4,14.4,14.4,29.8,29.8,29.8,29.8, 5.2, 5.2, 5.2, 5.2,20.6,20.6,20.6,20.6,14.4,14.4,14.4,14.4,29.8,29.8,29.8,29.8}, sd::DataType::FLOAT32); input = 2.; weights.linspace(0.1, 0.1); weights.permutei({2,3,1,0}); - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d 
op; auto results = op.evaluate({&input, &weights, &biases}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -2233,14 +2233,14 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_4) { const float unique = -1000000; - NDArray input('c', {bS, iH, iW, iC}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, mC}, nd4j::DataType::FLOAT32); - NDArray output('c', {bS, oH, oW, oC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iH, iW, iC}, sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, mC}, sd::DataType::FLOAT32); + NDArray output('c', {bS, oH, oW, oC}, sd::DataType::FLOAT32); input.linspace(0.1, 0.0001); weights = 0.5; output = unique; - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; Nd4jStatus status = op.execute({&input, &weights}, {&output} , {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}, {}); ASSERT_EQ(Status::OK(), status); @@ -2261,12 +2261,12 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_5) { auto input = NDArrayFactory::create('c', {bS, iH, iW, iC}); auto weights = NDArrayFactory::create('c', {kH, kW, iC, mC}); - NDArray expOutput('c', {bS, oH, oW, oC}, {10., 12., 14., 16., 8., 9., 22., 24., 26., 28., 14., 15., 14., 15., 16., 17., 8.5, 9.}, nd4j::DataType::FLOAT32); + NDArray expOutput('c', {bS, oH, oW, oC}, {10., 12., 14., 16., 8., 9., 22., 24., 26., 28., 14., 15., 14., 15., 16., 17., 8.5, 9.}, sd::DataType::FLOAT32); input.linspace(1.); weights = 0.5; - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); @@ -2287,14 +2287,14 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_6) { int paddingMode = 1; // 1-SAME, 0-VALID; int dataFormat = 1; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iH, iW, iC}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, mC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iH, iW, iC}, 
sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, mC}, sd::DataType::FLOAT32); - NDArray expOutput('c', {bS, oH, oW, oC}, {20., 24.,28., 32.,16., 18.,44., 48.,52., 56.,28., 30.,28., 30.,32., 34.,17., 18.}, nd4j::DataType::FLOAT32); + NDArray expOutput('c', {bS, oH, oW, oC}, {20., 24.,28., 32.,16., 18.,44., 48.,52., 56.,28., 30.,28., 30.,32., 34.,17., 18.}, sd::DataType::FLOAT32); input.linspace(1.); weights = 1.; - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); NDArray* output = results->at(0); // output.printIndexedBuffer(); @@ -2318,18 +2318,18 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_7) { NDArray input('c', {bS, iC, iH, iW}, {0.6793503761291504, 0.35508695244789124, 0.842789351940155, 0.20031332969665527, 0.7014986872673035, 0.3106933832168579, 0.44793984293937683, 0.9380097389221191, 0.3266739547252655, 0.15187257528305054, 0.3833175301551819, 0.7821229696273804, - 0.19880719482898712, 0.7985635995864868, 0.16326339542865753, 0.14696824550628662, 0.2608966827392578, 0.13505761325359344}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, mC}, {0.1308445781469345, 0.6442840099334717, 0.5698848366737366, 0.19896849989891052}, nd4j::DataType::FLOAT32); - NDArray biases('c', {1,iC*mC}, {0.6123566627502441, 0.37637925148010254, 0.17464971542358398, 0.4270855486392975}, nd4j::DataType::FLOAT32); + 0.19880719482898712, 0.7985635995864868, 0.16326339542865753, 0.14696824550628662, 0.2608966827392578, 0.13505761325359344}, sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, mC}, {0.1308445781469345, 0.6442840099334717, 0.5698848366737366, 0.19896849989891052}, sd::DataType::FLOAT32); + NDArray biases('c', {1,iC*mC}, {0.6123566627502441, 0.37637925148010254, 0.17464971542358398, 0.4270855486392975}, sd::DataType::FLOAT32); NDArray expOutput('c', {bS, oC, oH, oW}, {0.7012459761288241, 0.6588178652487691, 
0.722631079971582, 0.6385665758716108, 0.7041439625563628, 0.6530092074102978, 0.670967162534851, 0.735090151337225, 0.6551001785478623, 0.8140738359624038, 0.6051560970782859, 0.9193749546773375, 0.5054379267801892, 0.8283436386757472, 0.5765540302788565, 0.6649797296980537, 0.9807239274294943, 0.586850056971322, 0.261199593183985, 0.3930965634902499, 0.6203697362284615, 0.28794692117826504, 0.6297390019475202, 0.26769104886224415, 0.25840469001015975, 0.3233307788551656, 0.25161700129415276, 0.4573034071191504, 0.5033536625992294, 0.5827033826425385, - 0.4666419179635315, 0.585974550122895, 0.4595698215161401, 0.45632759998045813, 0.4789957702325296, 0.4539577593482922}, nd4j::DataType::FLOAT32); + 0.4666419179635315, 0.585974550122895, 0.4595698215161401, 0.45632759998045813, 0.4789957702325296, 0.4539577593482922}, sd::DataType::FLOAT32); - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; auto results = op.evaluate({&input, &weights, &biases}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto* output = results->at(0); @@ -2350,8 +2350,8 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_8) { int paddingMode = 1; // 1-SAME, 0-VALID; int dataFormat = 1; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iH, iW, iC}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, mC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iH, iW, iC}, sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, mC}, sd::DataType::FLOAT32); NDArray expOutput('c', {bS, oH, oW, oC}, {-42.879997, -43.959999, -44.959999, -45.879997, -46.720005, -47.480003, -48.160000, -48.760002, -43.519997, -45.139999, -46.639996, -48.020000, -49.280003, -50.419998, -51.440006, -52.340000, -31.999998, -33.139999, -34.160000, -35.060001, -35.840004, -36.500004, -37.039997, -37.459999, -20.480000, -21.139997, -21.680000, -22.100000, -22.399998, -22.579998, -22.639996, -22.580002, -8.960000, -9.139998, -9.200002, -9.140001, -8.960001, -8.660000, -8.240002, -7.700001, 
2.560000, 2.860002, 3.279998, 3.820000, 4.480001, 5.260000, 6.160001, 7.180000, 14.080000, 14.860000, 15.759998, 16.779999, 17.920002, 19.180000, 20.560001, 22.059998, @@ -2362,12 +2362,12 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_8) { 214.889999, 222.720001, 230.730011, 238.919998, 247.290009, 201.119995, 209.129990, 217.319992, 225.690002, 234.240005, 242.970001, 251.880005, 260.970001, 209.760010, 218.489990, 227.399994, 236.490005, 245.760010, 255.209991, 264.839996, 274.649994, 218.399994, 227.850006, 237.479996, 247.289993, 257.279999, 267.449982, 277.799988, 288.330017, 227.040009, 237.209991, 247.559998, 258.089996, 268.800018, 279.690002, 290.760010, 302.010010, 235.679993, 246.570007, 257.639984, 268.889984, 280.320007, 291.929993, 303.720001, 315.690002, 244.320007, 255.929993, 267.720001, 279.690002, 291.839996, 304.169983, 316.679993, 329.369995, 252.959991, 265.290009, 277.799988, 290.489990, 303.359985, 316.410004, 329.640015, 343.050018, 139.199997, 147.419998, 155.760010, 164.220001, 172.799988, 181.500000, 190.319992, 199.260010, 216.000000, 225.660004, 235.440002, 245.339996, 255.360016, 265.500000, 275.760010, 286.140015, 278.880005, 293.369995, 308.040009, 322.889984, 337.920013, 353.129974, 368.519989, - 384.090027, 287.520020, 302.730011, 318.119995, 333.690002, 349.440002, 365.369995, 381.479980, 397.770020, 296.160004, 312.089996, 328.199982, 344.489990, 360.960022, 377.609985, 394.440002, 411.449982, 304.799988, 321.450012, 338.280029, 355.289978, 372.480011, 389.850006, 407.399994, 425.130005, 313.440002, 330.809998, 348.359985, 366.089996, 384.000000, 402.090027, 420.359985, 438.809998, 322.079987, 340.169983, 358.440002, 376.889984, 395.520020, 414.329987, 433.320007, 452.489990, 330.720001, 349.530029, 368.520020, 387.690002, 407.039978, 426.570007, 446.279999, 466.170013, 339.360016, 358.890015, 378.599976, 398.490021, 418.559998, 438.809998, 459.239990, 479.849976, 177.600006, 190.619995, 203.759995, 217.020004, 230.399994, 
243.899994, 257.519989, 271.260010, 292.799988, 307.260010, 321.839996, 336.539978, 351.360016, 366.299988, 381.359985, 396.540009, 365.279999, 386.970001, 408.839996, 430.889984, 453.120026, 475.529968, 498.119995, 520.890015, 373.920013, 396.329987, 418.919983, 441.690002, 464.640015, 487.769958, 511.079987, 534.570007, 382.559998, 405.690002, 429.000000, 452.489990, 476.160004, 500.010010, 524.039978, 548.250000, 391.200012, 415.049988, 439.080017, 463.290009, 487.679993, 512.250000, 537.000000, 561.930054, 399.839996, 424.409973, 449.160034, 474.089966, 499.200012, 524.489990, 549.959961, 575.609985, 408.479980, 433.770020, 459.239990, 484.889954, 510.720032, 536.729980, 562.919983, 589.290039, 417.119995, 443.130005, 469.319977, 495.690002, 522.239990, 548.969971, 575.880005, 602.969971, 425.760010, 452.489990, 479.399994, 506.489990, 533.760010, 561.209961, 588.839966, 616.650024, 216.000000, 233.819992, 251.760010, 269.820007, 288.000000, 306.299988, 324.719971, 343.260010, 369.600006, 388.859985, 408.239990, 427.739990, 447.360016, 467.100006, 486.959961, 506.940002, 451.679993, 480.570007, 509.639984, 538.890015, 568.320007, 597.929993, 627.719971, 657.690002, 460.320007, 489.929993, 519.719971, 549.690002, 579.840027, 610.170044, 640.680054, 671.369995, 468.960022, 499.289978, 529.799988, 560.489990, 591.359985, 622.409973, 653.640015, 685.049988, 477.599976, 508.650024, 539.880005, 571.289978, 602.880005, 634.650024, 666.599976, 698.729980, 486.239990, 518.010010, 549.960022, 582.089966, 614.400024, 646.890015, 679.559937, 712.410034, 494.879974, 527.369995, 560.039978, 592.890015, 625.920044, 659.130005, 692.520020, 726.089966, 503.519989, 536.729980, 570.119995, 603.689941, 637.440063, 671.369995, 705.480042, 739.770020, 512.160034, 546.089966, 580.199951, 614.489990, 648.960022, 683.609985, 718.440002, 753.449951, 254.400009, 277.020020, 299.760010, 322.619995, 345.600006, 368.700012, 391.919983, 415.260010, 446.399994, 470.459961, 494.640015, 
518.940002, 543.360046, 567.900024, 592.559998, 617.340027, 538.080017, 574.170044, 610.440002, 646.890015, 683.520020, 720.329956, 757.320007, 794.489990, 546.719971, 583.530029, 620.520020, 657.690002, 695.040039, 732.570007, 770.279968, 808.169983, 555.359985, 592.889954, 630.599976, 668.489990, 706.559998, 744.809998, 783.239990, 821.849976, 564.000000, 602.250000, 640.679993, 679.289978, 718.080017, 757.050049, 796.199951, 835.530029, 572.640015, 611.609985, 650.760010, 690.089966, 729.600037, 769.289978, 809.160034, 849.210083, 581.279968, 620.970032, 660.839966, 700.889954, 741.119995, 781.529968, 822.119995, 862.890015, 589.919983, 630.330017, 670.919983, 711.690002, 752.640015, 793.770020, 835.079956, 876.570007, 598.559998, 639.690002, 681.000000, 722.490051, 764.160034, 806.010010, 848.039978, 890.250061, 292.799988, 320.220001, 347.760010, 375.419983, 403.200012, 431.100006, 459.119995, 487.260010, 523.199951, 552.059998, 581.040039, 610.139954, 639.360046, 668.699951, 698.159973, 727.739990, 624.479980, 667.770020, 711.239990, 754.890015, 798.719971, 842.729980, 886.919983, 931.290039, 633.119995, 677.130005, 721.319946, 765.690002, 810.239990, 854.969971, 899.880005, 944.969971, 641.760010, 686.489990, 731.400024, 776.489990, 821.760010, 867.209961, 912.839966, 958.650024, 650.400024, 695.849976, 741.479980, 787.290039, 833.279968, 879.449951, 925.799927, 972.330017, 659.040039, 705.210022, 751.559998, 798.089966, 844.800049, 891.690002, 938.760010, 986.010010, 667.679993, 714.569946, 761.640015, 808.890015, 856.320007, 903.929993, 951.719971, 999.690063, 676.320007, 723.929993, 771.719971, 819.690002, 867.839966, 916.169922, 964.679932, 1013.369995, 684.959961, 733.290039, 781.800049, 830.489990, 879.359985, 928.410034, 977.640015, 1027.050049, 331.199982, 363.419983, 395.760010, 428.220001, 460.799988, 493.500000, 526.320007, 559.260010, 600.000000, 633.660034, 667.440002, 701.339966, 735.359985, 769.500000, 803.759949, 838.140015, 710.880005, 
761.369995, 812.039978, 862.889893, 913.919983, 965.130005, 1016.520020, 1068.090088, 719.520020, 770.729980, 822.119934, 873.689941, 925.440063, 977.369995, 1029.479980, 1081.770020, 728.160034, 780.090088, 832.199951, 884.489990, 936.960022, 989.610046, 1042.439941, 1095.449951, 736.799927, 789.449951, 842.280029, 895.290039, 948.480042, 1001.849976, 1055.399902, 1109.129883, 745.439941, 798.810059, 852.359985, 906.089966, 960.000000, 1014.089966, 1068.359985, 1122.810059, 754.080017, 808.170044, 862.440002, 916.890015, 971.520020, 1026.330078, 1081.319946, 1136.489990, 762.720032, 817.530029, 872.520020, 927.689941, 983.040039, 1038.569946, 1094.280029, 1150.169922, 771.359985, 826.890015, 882.599976, 938.489990, 994.559998, 1050.810059, 1107.239990, 1163.849976, 369.599976, 406.619995, 443.760010, 481.020020, 518.400024, 555.900024, 593.520020, 631.260010, 113.279999, 136.839996, 160.480011, 184.199982, 208.000015, 231.880005, 255.839996, 279.880005, 31.359985, 66.699989, 102.160004, 137.740005, 173.440002, 209.260010, 245.199982, 281.260010, 31.359993, 67.179993, 103.120003, 139.179993, 175.360016, 211.660004, 248.079987, 284.619995, 31.359993, 67.659996, 104.080009, 140.619995, 177.280014, 214.060013, 250.959991, 287.980011, 31.359993, 68.139999, 105.039993, 142.059982, 179.200027, 216.459991, 253.839996, 291.339996, 31.360008, 68.619995, 106.000000, 143.499985, 181.119995, 218.860001, 256.719971, 294.700012, 31.360001, 69.099991, 106.959984, 144.939987, 183.040009, 221.260010, 259.600006, 298.059998, 31.360008, 69.579971, 107.920006, 146.379990, 184.960007, 223.660004, 262.479980, 301.419983, 31.360001, 70.059975, 108.880020, 147.819977, 186.880020, 226.059998, 265.359985, 304.779999, -83.840004, -58.040001, -32.159988, -6.200012, 19.840012, 45.959984, 72.159996, 98.440010}, nd4j::DataType::FLOAT32); + 384.090027, 287.520020, 302.730011, 318.119995, 333.690002, 349.440002, 365.369995, 381.479980, 397.770020, 296.160004, 312.089996, 328.199982, 344.489990, 
360.960022, 377.609985, 394.440002, 411.449982, 304.799988, 321.450012, 338.280029, 355.289978, 372.480011, 389.850006, 407.399994, 425.130005, 313.440002, 330.809998, 348.359985, 366.089996, 384.000000, 402.090027, 420.359985, 438.809998, 322.079987, 340.169983, 358.440002, 376.889984, 395.520020, 414.329987, 433.320007, 452.489990, 330.720001, 349.530029, 368.520020, 387.690002, 407.039978, 426.570007, 446.279999, 466.170013, 339.360016, 358.890015, 378.599976, 398.490021, 418.559998, 438.809998, 459.239990, 479.849976, 177.600006, 190.619995, 203.759995, 217.020004, 230.399994, 243.899994, 257.519989, 271.260010, 292.799988, 307.260010, 321.839996, 336.539978, 351.360016, 366.299988, 381.359985, 396.540009, 365.279999, 386.970001, 408.839996, 430.889984, 453.120026, 475.529968, 498.119995, 520.890015, 373.920013, 396.329987, 418.919983, 441.690002, 464.640015, 487.769958, 511.079987, 534.570007, 382.559998, 405.690002, 429.000000, 452.489990, 476.160004, 500.010010, 524.039978, 548.250000, 391.200012, 415.049988, 439.080017, 463.290009, 487.679993, 512.250000, 537.000000, 561.930054, 399.839996, 424.409973, 449.160034, 474.089966, 499.200012, 524.489990, 549.959961, 575.609985, 408.479980, 433.770020, 459.239990, 484.889954, 510.720032, 536.729980, 562.919983, 589.290039, 417.119995, 443.130005, 469.319977, 495.690002, 522.239990, 548.969971, 575.880005, 602.969971, 425.760010, 452.489990, 479.399994, 506.489990, 533.760010, 561.209961, 588.839966, 616.650024, 216.000000, 233.819992, 251.760010, 269.820007, 288.000000, 306.299988, 324.719971, 343.260010, 369.600006, 388.859985, 408.239990, 427.739990, 447.360016, 467.100006, 486.959961, 506.940002, 451.679993, 480.570007, 509.639984, 538.890015, 568.320007, 597.929993, 627.719971, 657.690002, 460.320007, 489.929993, 519.719971, 549.690002, 579.840027, 610.170044, 640.680054, 671.369995, 468.960022, 499.289978, 529.799988, 560.489990, 591.359985, 622.409973, 653.640015, 685.049988, 477.599976, 508.650024, 
539.880005, 571.289978, 602.880005, 634.650024, 666.599976, 698.729980, 486.239990, 518.010010, 549.960022, 582.089966, 614.400024, 646.890015, 679.559937, 712.410034, 494.879974, 527.369995, 560.039978, 592.890015, 625.920044, 659.130005, 692.520020, 726.089966, 503.519989, 536.729980, 570.119995, 603.689941, 637.440063, 671.369995, 705.480042, 739.770020, 512.160034, 546.089966, 580.199951, 614.489990, 648.960022, 683.609985, 718.440002, 753.449951, 254.400009, 277.020020, 299.760010, 322.619995, 345.600006, 368.700012, 391.919983, 415.260010, 446.399994, 470.459961, 494.640015, 518.940002, 543.360046, 567.900024, 592.559998, 617.340027, 538.080017, 574.170044, 610.440002, 646.890015, 683.520020, 720.329956, 757.320007, 794.489990, 546.719971, 583.530029, 620.520020, 657.690002, 695.040039, 732.570007, 770.279968, 808.169983, 555.359985, 592.889954, 630.599976, 668.489990, 706.559998, 744.809998, 783.239990, 821.849976, 564.000000, 602.250000, 640.679993, 679.289978, 718.080017, 757.050049, 796.199951, 835.530029, 572.640015, 611.609985, 650.760010, 690.089966, 729.600037, 769.289978, 809.160034, 849.210083, 581.279968, 620.970032, 660.839966, 700.889954, 741.119995, 781.529968, 822.119995, 862.890015, 589.919983, 630.330017, 670.919983, 711.690002, 752.640015, 793.770020, 835.079956, 876.570007, 598.559998, 639.690002, 681.000000, 722.490051, 764.160034, 806.010010, 848.039978, 890.250061, 292.799988, 320.220001, 347.760010, 375.419983, 403.200012, 431.100006, 459.119995, 487.260010, 523.199951, 552.059998, 581.040039, 610.139954, 639.360046, 668.699951, 698.159973, 727.739990, 624.479980, 667.770020, 711.239990, 754.890015, 798.719971, 842.729980, 886.919983, 931.290039, 633.119995, 677.130005, 721.319946, 765.690002, 810.239990, 854.969971, 899.880005, 944.969971, 641.760010, 686.489990, 731.400024, 776.489990, 821.760010, 867.209961, 912.839966, 958.650024, 650.400024, 695.849976, 741.479980, 787.290039, 833.279968, 879.449951, 925.799927, 972.330017, 
659.040039, 705.210022, 751.559998, 798.089966, 844.800049, 891.690002, 938.760010, 986.010010, 667.679993, 714.569946, 761.640015, 808.890015, 856.320007, 903.929993, 951.719971, 999.690063, 676.320007, 723.929993, 771.719971, 819.690002, 867.839966, 916.169922, 964.679932, 1013.369995, 684.959961, 733.290039, 781.800049, 830.489990, 879.359985, 928.410034, 977.640015, 1027.050049, 331.199982, 363.419983, 395.760010, 428.220001, 460.799988, 493.500000, 526.320007, 559.260010, 600.000000, 633.660034, 667.440002, 701.339966, 735.359985, 769.500000, 803.759949, 838.140015, 710.880005, 761.369995, 812.039978, 862.889893, 913.919983, 965.130005, 1016.520020, 1068.090088, 719.520020, 770.729980, 822.119934, 873.689941, 925.440063, 977.369995, 1029.479980, 1081.770020, 728.160034, 780.090088, 832.199951, 884.489990, 936.960022, 989.610046, 1042.439941, 1095.449951, 736.799927, 789.449951, 842.280029, 895.290039, 948.480042, 1001.849976, 1055.399902, 1109.129883, 745.439941, 798.810059, 852.359985, 906.089966, 960.000000, 1014.089966, 1068.359985, 1122.810059, 754.080017, 808.170044, 862.440002, 916.890015, 971.520020, 1026.330078, 1081.319946, 1136.489990, 762.720032, 817.530029, 872.520020, 927.689941, 983.040039, 1038.569946, 1094.280029, 1150.169922, 771.359985, 826.890015, 882.599976, 938.489990, 994.559998, 1050.810059, 1107.239990, 1163.849976, 369.599976, 406.619995, 443.760010, 481.020020, 518.400024, 555.900024, 593.520020, 631.260010, 113.279999, 136.839996, 160.480011, 184.199982, 208.000015, 231.880005, 255.839996, 279.880005, 31.359985, 66.699989, 102.160004, 137.740005, 173.440002, 209.260010, 245.199982, 281.260010, 31.359993, 67.179993, 103.120003, 139.179993, 175.360016, 211.660004, 248.079987, 284.619995, 31.359993, 67.659996, 104.080009, 140.619995, 177.280014, 214.060013, 250.959991, 287.980011, 31.359993, 68.139999, 105.039993, 142.059982, 179.200027, 216.459991, 253.839996, 291.339996, 31.360008, 68.619995, 106.000000, 143.499985, 181.119995, 
218.860001, 256.719971, 294.700012, 31.360001, 69.099991, 106.959984, 144.939987, 183.040009, 221.260010, 259.600006, 298.059998, 31.360008, 69.579971, 107.920006, 146.379990, 184.960007, 223.660004, 262.479980, 301.419983, 31.360001, 70.059975, 108.880020, 147.819977, 186.880020, 226.059998, 265.359985, 304.779999, -83.840004, -58.040001, -32.159988, -6.200012, 19.840012, 45.959984, 72.159996, 98.440010}, sd::DataType::FLOAT32); input.linspace(-10, 0.1); weights.linspace(-2, 0.1); - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); // output->printBuffer(); @@ -2389,8 +2389,8 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_9) { int paddingMode = 1; // 1-SAME, 0-VALID; int dataFormat = 0; // 1-NHWC, 0-NCHW - NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32); - NDArray weights('c', {kH, kW, iC, mC}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iC, iH, iW}, sd::DataType::FLOAT32); + NDArray weights('c', {kH, kW, iC, mC}, sd::DataType::FLOAT32); NDArray expOutput('c', {bS, oC, oH, oW}, {-103.360001, -131.440002, -130.000000, -128.559998, -127.120003, -125.680000, -124.240005, -122.799995, -121.360001, -66.720001,-76.199997, -81.239998, -80.160004, -79.080002, -78.000000, -76.919998, -75.840004, -74.760002, -73.680000, -29.400002, -66.599998, -70.440002, -69.360001, -68.279999, -67.199997, -66.120003, -65.040001, -63.959999, -62.879997, -24.599997, -57.000000, -59.639999, -58.560005, -57.479996, -56.399998, -55.320000, -54.240002, -53.159996, -52.080002, -19.799997, -47.400002, -48.840000, -47.760002, -46.680000, -45.599998, -44.520000, -43.440002, -42.360001, -41.279999, -15.000000, -37.799999, -38.040001, @@ -2402,12 +2402,12 @@ TEST_F(ConvolutionTests2, depthwise_conv2d_9) { 170.839996, 172.399994, 173.960007, 175.520004, 177.080002, 178.639999, 180.199982, 102.360001, 129.059998, 154.739990, 
156.000000, 157.259995, 158.520004, 159.779999, 161.039993, 162.300003, 163.559998, 80.820000, 139.860001, 167.340012, 168.600006, 169.860001, 171.119995, 172.380005, 173.639999, 174.899994, 176.160004, 86.820000, 150.660004, 179.940002, 181.200012, 182.459991, 183.720001, 184.980011, 186.239990, 187.500000, 188.759995, 92.820007, 161.459991, 192.540009, 193.799988, 195.059998, 196.319992, 197.579987, 198.839996, 200.100006, 201.360001, 98.820000, 172.259995, 205.139999, 206.399994, 207.660004, 208.919983, 210.179993, 211.440002, 212.700012, 213.959991, 104.819992, 183.059998, 217.739990, 219.000000, 220.259995, 221.519989, 222.779999, 224.039993, 225.300018, 226.559998, 110.819992, 193.860016, 230.339996, 231.600006, 232.860001, 234.119995, 235.380005, 236.639999, 237.900009, 239.160004, 116.820000, 204.660004, 242.940002, 244.199982, 245.459991, 246.720001, 247.980011, - 249.239990, 250.500000, 251.759995, 122.819992, 47.000000, 26.240004, 26.360004, 26.479998, 26.600002, 26.720001, 26.840002, 26.959997, 27.080000, -12.999998, 257.299988, 337.640015, 339.260010, 340.879974, 342.499969, 344.119995, 345.740021, 347.359985, 348.979980, 198.899994, 249.690002, 299.729980, 301.079987, 302.429993, 303.779999, 305.130005, 306.480011, 307.829987, 309.179993, 153.929993, 261.089996, 313.230011, 314.580017, 315.929993, 317.279968, 318.630005, 319.979980, 321.329987, 322.679993, 160.529999, 272.489990, 326.729980, 328.079987, 329.429993, 330.779968, 332.130005, 333.479980, 334.829987, 336.179993, 167.130005, 283.889984, 340.230011, 341.580017, 342.929993, 344.279999, 345.630005, 346.980011, 348.330017, 349.679993, 173.729996, 295.289978, 353.729980, 355.079987, 356.429993, 357.779968, 359.130005, 360.480011, 361.829987, 363.179993, 180.329987, 306.690002, 367.230011, 368.580017, 369.929993, 371.279999, 372.630005, 373.980011, 375.330017, 376.679993, 186.929993, 318.089996, 380.729980, 382.080017, 383.429993, 384.779968, 386.130005, 387.479980, 388.829987, 390.179993, 
193.529984, 329.489990, 394.229980, 395.579987, 396.929993, 398.279999, 399.630005, 400.980011, 402.330017, 403.679993, 200.130005, 82.419998, 55.400005, 55.580002, 55.759995, 55.939999, 56.120003, 56.299995, 56.479996, 56.659996, -9.260002, 393.520020, 518.000000, 519.679993, 521.359985, 523.040039, 524.720032, 526.400024, 528.080017, 529.760010, 303.440002, 382.320007, 462.720032, 464.160004, 465.600037, 467.040009, 468.479980, 469.919983, 471.359985, 472.800018, 239.040009, 394.320007, 477.119995, 478.559998, 480.000000, 481.440002, 482.880005, 484.320007, 485.760010, 487.200012, 246.240005, 406.320007, 491.520020, 492.960022, 494.400024, 495.839996, 497.280029, 498.720032, 500.160004, 501.600037, 253.440002, 418.320007, 505.919983, 507.359985, 508.800018, 510.240051, 511.680023, 513.119995, 514.559998, 516.000000, 260.640015, 430.319977, 520.320007, 521.760010, 523.200012, 524.640015, 526.079956, 527.520020, 528.960022, 530.400024, 267.839996, 442.320007, 534.720032, 536.160034, 537.600037, 539.040039, 540.479980, 541.919983, 543.359985, 544.800049, 275.040009, 454.320007, 549.119995, 550.559998, 552.000000, 553.440002, 554.880005, 556.320007, 557.760010, 559.200012, 282.239990, 466.320007, 563.520020, 564.960022, 566.400024, 567.839966, 569.280029, 570.720032, 572.160034, 573.600037, 289.440002, 125.839996, 96.559998, 96.799995, 97.040009, 97.280014, 97.520004, 97.759995, 98.000000, 98.240013, 2.480007, 537.739990, 710.359985, 712.099976, 713.840027, 715.579956, 717.319946, 719.059998, 720.799988, 722.539978, 415.980011, 526.950012, 643.710022, 645.240051, 646.770020, 648.300049, 649.829956, 651.359985, 652.890015, 654.419983, 336.149994, 539.549988, 659.010010, 660.539978, 662.070007, 663.600037, 665.130005, 666.660034, 668.190002, 669.720032, 343.950012, 552.150024, 674.309998, 675.839966, 677.369995, 678.900024, 680.429993, 681.960022, 683.490051, 685.020020, 351.750000, 564.750000, 689.609985, 691.140015, 692.669983, 694.200012, 695.729980, 697.260010, 
698.789978, 700.320007, 359.549988, 577.349976, 704.910034, 706.440002, 707.970032, 709.500000, 711.029968, 712.559998, 714.089966, 715.619995, 367.350037, 589.950012, 720.210022, 721.740051, 723.270020, 724.800049, 726.329956, 727.859985, 729.390015, 730.919983, 375.149994, 602.549988, 735.510010, 737.039978, 738.570007, 740.100037, 741.630005, 743.160034, 744.690002, 746.220032, 382.950012, 615.150024, 750.809998, 752.339966, 753.869995, 755.399963, 756.929993, 758.460022, 759.990051, 761.520020, 390.750000, 177.260010, 149.720001, 150.020004, 150.319992, 150.619995, 150.919998, 151.220001, 151.520004, 151.819992, 22.220009, 689.959961, 914.720032, 916.519958, 918.319946, 920.119995, 921.919983, 923.719971, 925.520020, 927.320007, 536.519958, 683.579956, 842.699951, 844.319946, 845.940002, 847.559998, 849.179993, 850.799988, 852.419983, 854.039978, 445.260010, 696.779968, 858.900024, 860.520020, 862.140015, 863.760010, 865.380005, 867.000000, 868.619995, 870.239990, 453.659973, 709.979980, 875.099976, 876.719971, 878.339966, 879.959961, 881.579956, 883.199951, 884.819946, 886.440002, 462.059998, 723.179993, 891.299988, 892.919983, 894.539978, 896.159973, 897.779968, 899.400024, 901.020020, 902.640015, 470.459991, 736.380005, 907.500000, 909.119995, 910.739990, 912.359985, 913.979980, 915.599976, 917.219971, 918.839966, 478.859985, 749.579956, 923.699951, 925.319946, 926.940002, 928.559998, 930.179993, 931.799988, 933.419983, 935.039978, 487.260010, 762.779968, 939.900024, 941.520020, 943.140015, 944.760010, 946.380005, 948.000000, 949.619995, 951.239990, 495.659973, 775.979980, 956.099976, 957.719971, 959.339966, 960.959961, 962.579956, 964.199951, 965.819946, 967.440002, 504.059998, 236.679977, 214.880005, 215.239990, 215.599991, 215.959991, 216.319992, 216.679993, 217.040009, 217.399994, 49.959995, 850.180054, 1131.079956, 1132.939941, 1134.800049, 1136.660034, 1138.520020, 1140.380005, 1142.239990, 1144.100098, 665.060059, 852.209961, 1059.689941, 1061.399902, 
1063.110107, 1064.820068, 1066.530029, 1068.239990, 1069.950073, 1071.660034, 566.370056, 866.010010, 1076.790039, 1078.500000, 1080.209961, 1081.920044, 1083.630005, 1085.339966, 1087.050049, 1088.760010, 575.369995, 879.809998, 1093.890015, 1095.599976, 1097.310059, 1099.020020, 1100.729980, 1102.439941, 1104.149902, 1105.859985, 584.369995, 893.609985, 1110.989990, 1112.699951, 1114.410034, 1116.120117, 1117.830078, 1119.540039, 1121.250000, 1122.959961, 593.370056, 907.410034, 1128.089966, 1129.800049, 1131.510010, 1133.220093, 1134.929932, 1136.639893, 1138.349976, 1140.060059, 602.369995, 921.209961, 1145.189941, 1146.900024, 1148.609985, 1150.320068, 1152.030029, 1153.739990, 1155.449951, 1157.160034, 611.370056, 935.010010, 1162.290039, 1164.000000, 1165.709961, 1167.420044, 1169.130005, 1170.839966, 1172.550049, 1174.260010, 620.369995, 948.809998, 1179.390015, 1181.099976, 1182.810059, 1184.520020, 1186.229980, 1187.939941, 1189.650024, 1191.359985, 629.370056, 304.099976, 292.039978, 292.460022, 292.880005, 293.300018, 293.720001, 294.140015, 294.559998, 294.980042, 85.700005}, nd4j::DataType::FLOAT32); + 249.239990, 250.500000, 251.759995, 122.819992, 47.000000, 26.240004, 26.360004, 26.479998, 26.600002, 26.720001, 26.840002, 26.959997, 27.080000, -12.999998, 257.299988, 337.640015, 339.260010, 340.879974, 342.499969, 344.119995, 345.740021, 347.359985, 348.979980, 198.899994, 249.690002, 299.729980, 301.079987, 302.429993, 303.779999, 305.130005, 306.480011, 307.829987, 309.179993, 153.929993, 261.089996, 313.230011, 314.580017, 315.929993, 317.279968, 318.630005, 319.979980, 321.329987, 322.679993, 160.529999, 272.489990, 326.729980, 328.079987, 329.429993, 330.779968, 332.130005, 333.479980, 334.829987, 336.179993, 167.130005, 283.889984, 340.230011, 341.580017, 342.929993, 344.279999, 345.630005, 346.980011, 348.330017, 349.679993, 173.729996, 295.289978, 353.729980, 355.079987, 356.429993, 357.779968, 359.130005, 360.480011, 361.829987, 
363.179993, 180.329987, 306.690002, 367.230011, 368.580017, 369.929993, 371.279999, 372.630005, 373.980011, 375.330017, 376.679993, 186.929993, 318.089996, 380.729980, 382.080017, 383.429993, 384.779968, 386.130005, 387.479980, 388.829987, 390.179993, 193.529984, 329.489990, 394.229980, 395.579987, 396.929993, 398.279999, 399.630005, 400.980011, 402.330017, 403.679993, 200.130005, 82.419998, 55.400005, 55.580002, 55.759995, 55.939999, 56.120003, 56.299995, 56.479996, 56.659996, -9.260002, 393.520020, 518.000000, 519.679993, 521.359985, 523.040039, 524.720032, 526.400024, 528.080017, 529.760010, 303.440002, 382.320007, 462.720032, 464.160004, 465.600037, 467.040009, 468.479980, 469.919983, 471.359985, 472.800018, 239.040009, 394.320007, 477.119995, 478.559998, 480.000000, 481.440002, 482.880005, 484.320007, 485.760010, 487.200012, 246.240005, 406.320007, 491.520020, 492.960022, 494.400024, 495.839996, 497.280029, 498.720032, 500.160004, 501.600037, 253.440002, 418.320007, 505.919983, 507.359985, 508.800018, 510.240051, 511.680023, 513.119995, 514.559998, 516.000000, 260.640015, 430.319977, 520.320007, 521.760010, 523.200012, 524.640015, 526.079956, 527.520020, 528.960022, 530.400024, 267.839996, 442.320007, 534.720032, 536.160034, 537.600037, 539.040039, 540.479980, 541.919983, 543.359985, 544.800049, 275.040009, 454.320007, 549.119995, 550.559998, 552.000000, 553.440002, 554.880005, 556.320007, 557.760010, 559.200012, 282.239990, 466.320007, 563.520020, 564.960022, 566.400024, 567.839966, 569.280029, 570.720032, 572.160034, 573.600037, 289.440002, 125.839996, 96.559998, 96.799995, 97.040009, 97.280014, 97.520004, 97.759995, 98.000000, 98.240013, 2.480007, 537.739990, 710.359985, 712.099976, 713.840027, 715.579956, 717.319946, 719.059998, 720.799988, 722.539978, 415.980011, 526.950012, 643.710022, 645.240051, 646.770020, 648.300049, 649.829956, 651.359985, 652.890015, 654.419983, 336.149994, 539.549988, 659.010010, 660.539978, 662.070007, 663.600037, 665.130005, 
666.660034, 668.190002, 669.720032, 343.950012, 552.150024, 674.309998, 675.839966, 677.369995, 678.900024, 680.429993, 681.960022, 683.490051, 685.020020, 351.750000, 564.750000, 689.609985, 691.140015, 692.669983, 694.200012, 695.729980, 697.260010, 698.789978, 700.320007, 359.549988, 577.349976, 704.910034, 706.440002, 707.970032, 709.500000, 711.029968, 712.559998, 714.089966, 715.619995, 367.350037, 589.950012, 720.210022, 721.740051, 723.270020, 724.800049, 726.329956, 727.859985, 729.390015, 730.919983, 375.149994, 602.549988, 735.510010, 737.039978, 738.570007, 740.100037, 741.630005, 743.160034, 744.690002, 746.220032, 382.950012, 615.150024, 750.809998, 752.339966, 753.869995, 755.399963, 756.929993, 758.460022, 759.990051, 761.520020, 390.750000, 177.260010, 149.720001, 150.020004, 150.319992, 150.619995, 150.919998, 151.220001, 151.520004, 151.819992, 22.220009, 689.959961, 914.720032, 916.519958, 918.319946, 920.119995, 921.919983, 923.719971, 925.520020, 927.320007, 536.519958, 683.579956, 842.699951, 844.319946, 845.940002, 847.559998, 849.179993, 850.799988, 852.419983, 854.039978, 445.260010, 696.779968, 858.900024, 860.520020, 862.140015, 863.760010, 865.380005, 867.000000, 868.619995, 870.239990, 453.659973, 709.979980, 875.099976, 876.719971, 878.339966, 879.959961, 881.579956, 883.199951, 884.819946, 886.440002, 462.059998, 723.179993, 891.299988, 892.919983, 894.539978, 896.159973, 897.779968, 899.400024, 901.020020, 902.640015, 470.459991, 736.380005, 907.500000, 909.119995, 910.739990, 912.359985, 913.979980, 915.599976, 917.219971, 918.839966, 478.859985, 749.579956, 923.699951, 925.319946, 926.940002, 928.559998, 930.179993, 931.799988, 933.419983, 935.039978, 487.260010, 762.779968, 939.900024, 941.520020, 943.140015, 944.760010, 946.380005, 948.000000, 949.619995, 951.239990, 495.659973, 775.979980, 956.099976, 957.719971, 959.339966, 960.959961, 962.579956, 964.199951, 965.819946, 967.440002, 504.059998, 236.679977, 214.880005, 
215.239990, 215.599991, 215.959991, 216.319992, 216.679993, 217.040009, 217.399994, 49.959995, 850.180054, 1131.079956, 1132.939941, 1134.800049, 1136.660034, 1138.520020, 1140.380005, 1142.239990, 1144.100098, 665.060059, 852.209961, 1059.689941, 1061.399902, 1063.110107, 1064.820068, 1066.530029, 1068.239990, 1069.950073, 1071.660034, 566.370056, 866.010010, 1076.790039, 1078.500000, 1080.209961, 1081.920044, 1083.630005, 1085.339966, 1087.050049, 1088.760010, 575.369995, 879.809998, 1093.890015, 1095.599976, 1097.310059, 1099.020020, 1100.729980, 1102.439941, 1104.149902, 1105.859985, 584.369995, 893.609985, 1110.989990, 1112.699951, 1114.410034, 1116.120117, 1117.830078, 1119.540039, 1121.250000, 1122.959961, 593.370056, 907.410034, 1128.089966, 1129.800049, 1131.510010, 1133.220093, 1134.929932, 1136.639893, 1138.349976, 1140.060059, 602.369995, 921.209961, 1145.189941, 1146.900024, 1148.609985, 1150.320068, 1152.030029, 1153.739990, 1155.449951, 1157.160034, 611.370056, 935.010010, 1162.290039, 1164.000000, 1165.709961, 1167.420044, 1169.130005, 1170.839966, 1172.550049, 1174.260010, 620.369995, 948.809998, 1179.390015, 1181.099976, 1182.810059, 1184.520020, 1186.229980, 1187.939941, 1189.650024, 1191.359985, 629.370056, 304.099976, 292.039978, 292.460022, 292.880005, 293.300018, 293.720001, 294.140015, 294.559998, 294.980042, 85.700005}, sd::DataType::FLOAT32); input.linspace(-10, 0.1); weights.linspace(-2, 0.1); - nd4j::ops::depthwise_conv2d op; + sd::ops::depthwise_conv2d op; auto results = op.evaluate({&input, &weights}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto output = results->at(0); diff --git a/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu b/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu index 02e1040aa..47892c973 100644 --- a/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu +++ b/libnd4j/tests_cpu/layers_tests/CuDnnTests.cu @@ -21,7 +21,7 @@ #include "testlayers.h" #include -#include +#include #include #include #include @@ -32,14 
+32,14 @@ #endif -using namespace nd4j; +using namespace sd; class CuDnnTests : public testing::Test { public: }; -static void printer(std::initializer_list helpers) { +static void printer(std::initializer_list helpers) { for (auto v:helpers) { nd4j_printf("Initialized [%s]\n", v->name().c_str()); @@ -50,22 +50,22 @@ static void printer(std::initializer_list TEST_F(CuDnnTests, helpers_includer) { // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker #ifdef HAVE_CUDNN - nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CUDA conv2d; - nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CUDA conv2d_bp; - nd4j::ops::platforms::PLATFORM_conv3dnew_ENGINE_CUDA conv3dnew; - nd4j::ops::platforms::PLATFORM_conv3dnew_bp_ENGINE_CUDA conv3dnew_bp; - nd4j::ops::platforms::PLATFORM_depthwise_conv2d_ENGINE_CUDA depthwise_conv2d; - nd4j::ops::platforms::PLATFORM_depthwise_conv2d_bp_ENGINE_CUDA depthwise_conv2d_bp; - nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CUDA batchnorm; - nd4j::ops::platforms::PLATFORM_batchnorm_bp_ENGINE_CUDA batchnorm_bp; - nd4j::ops::platforms::PLATFORM_avgpool2d_ENGINE_CUDA avgpool2d; - nd4j::ops::platforms::PLATFORM_avgpool2d_bp_ENGINE_CUDA avgpool2d_bp; - nd4j::ops::platforms::PLATFORM_maxpool2d_ENGINE_CUDA maxpool2d; - nd4j::ops::platforms::PLATFORM_maxpool2d_bp_ENGINE_CUDA maxpool2d_bp; - nd4j::ops::platforms::PLATFORM_avgpool3dnew_ENGINE_CUDA avgpool3dnew; - nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp_ENGINE_CUDA avgpool3dnew_bp; - nd4j::ops::platforms::PLATFORM_maxpool3dnew_ENGINE_CUDA maxpool3dnew; - nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CUDA maxpool3dnew_bp; + sd::ops::platforms::PLATFORM_conv2d_ENGINE_CUDA conv2d; + sd::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CUDA conv2d_bp; + sd::ops::platforms::PLATFORM_conv3dnew_ENGINE_CUDA conv3dnew; + sd::ops::platforms::PLATFORM_conv3dnew_bp_ENGINE_CUDA conv3dnew_bp; + sd::ops::platforms::PLATFORM_depthwise_conv2d_ENGINE_CUDA 
depthwise_conv2d; + sd::ops::platforms::PLATFORM_depthwise_conv2d_bp_ENGINE_CUDA depthwise_conv2d_bp; + sd::ops::platforms::PLATFORM_batchnorm_ENGINE_CUDA batchnorm; + sd::ops::platforms::PLATFORM_batchnorm_bp_ENGINE_CUDA batchnorm_bp; + sd::ops::platforms::PLATFORM_avgpool2d_ENGINE_CUDA avgpool2d; + sd::ops::platforms::PLATFORM_avgpool2d_bp_ENGINE_CUDA avgpool2d_bp; + sd::ops::platforms::PLATFORM_maxpool2d_ENGINE_CUDA maxpool2d; + sd::ops::platforms::PLATFORM_maxpool2d_bp_ENGINE_CUDA maxpool2d_bp; + sd::ops::platforms::PLATFORM_avgpool3dnew_ENGINE_CUDA avgpool3dnew; + sd::ops::platforms::PLATFORM_avgpool3dnew_bp_ENGINE_CUDA avgpool3dnew_bp; + sd::ops::platforms::PLATFORM_maxpool3dnew_ENGINE_CUDA maxpool3dnew; + sd::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CUDA maxpool3dnew_bp; @@ -115,7 +115,7 @@ TEST_F(CuDnnTests, mixed_helpers_test_1) { weights.syncToHost(); bias.syncToHost(); - nd4j::ops::conv2d op; + sd::ops::conv2d op; // cuDNN part Context cuda(1); diff --git a/libnd4j/tests_cpu/layers_tests/CudaBasicsTests1.cu b/libnd4j/tests_cpu/layers_tests/CudaBasicsTests1.cu index 0a22272d8..a0104e637 100644 --- a/libnd4j/tests_cpu/layers_tests/CudaBasicsTests1.cu +++ b/libnd4j/tests_cpu/layers_tests/CudaBasicsTests1.cu @@ -19,25 +19,25 @@ // #include "testlayers.h" -#include -#include -#include -#include +#include +#include +#include +#include #include #include -#include -#include -#include +#include +#include +#include #include #include #include -#include -#include -#include +#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class CudaBasicsTests1 : public testing::Test { public: @@ -128,15 +128,15 @@ TEST_F(CudaBasicsTests1, TestPairwise_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execIndexReduceScalar_1) { - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT32); - NDArray x2('c', {2,2}, {0.5, 1.5, -4.5, 
3.5}, nd4j::DataType::BFLOAT16); - NDArray x3('c', {2,2}, {0, -1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT32); + NDArray x2('c', {2,2}, {0.5, 1.5, -4.5, 3.5}, sd::DataType::BFLOAT16); + NDArray x3('c', {2,2}, {0, -1, 0, 1}, sd::DataType::BOOL); - NDArray scalar('c', {}, std::vector{0}, nd4j::DataType::INT64); + NDArray scalar('c', {}, std::vector{0}, sd::DataType::INT64); - NDArray exp1('c', {}, std::vector{3}, nd4j::DataType::INT64); - NDArray exp2('c', {}, std::vector{2}, nd4j::DataType::INT64); - NDArray exp3('c', {}, std::vector{1}, nd4j::DataType::INT64); + NDArray exp1('c', {}, std::vector{3}, sd::DataType::INT64); + NDArray exp2('c', {}, std::vector{2}, sd::DataType::INT64); + NDArray exp3('c', {}, std::vector{1}, sd::DataType::INT64); void *dX1, *dX2, *dX3, *dZ; Nd4jLong *dX1ShapeInfo, *dX2ShapeInfo, *dX3ShapeInfo, *dZShapeInfo; @@ -180,7 +180,7 @@ TEST_F(CudaBasicsTests1, execIndexReduceScalar_1) { /***************************************/ NativeOpExecutioner::execIndexReduceScalar(&lc, - nd4j::indexreduce::IndexAbsoluteMax, + sd::indexreduce::IndexAbsoluteMax, x1.buffer(), x1.getShapeInfo(), dX1, dX1ShapeInfo, nullptr, @@ -202,7 +202,7 @@ TEST_F(CudaBasicsTests1, execIndexReduceScalar_1) { /***************************************/ NativeOpExecutioner::execIndexReduceScalar(&lc, - nd4j::indexreduce::IndexAbsoluteMax, + sd::indexreduce::IndexAbsoluteMax, nullptr, x2.getShapeInfo(), dX2, dX2ShapeInfo, nullptr, @@ -222,7 +222,7 @@ TEST_F(CudaBasicsTests1, execIndexReduceScalar_1) { // ************************************* NativeOpExecutioner::execIndexReduceScalar(&lc, - nd4j::indexreduce::IndexAbsoluteMax, + sd::indexreduce::IndexAbsoluteMax, nullptr, x3.getShapeInfo(), dX3, dX3ShapeInfo, nullptr, @@ -257,16 +257,16 @@ TEST_F(CudaBasicsTests1, execReduce3Scalar_1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray x2('c', {2,2}, 
{-1,-2,-3,-4}, nd4j::DataType::INT32); - NDArray x3('c', {2,2}, {1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {1,2,3,4}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray x2('c', {2,2}, {-1,-2,-3,-4}, sd::DataType::INT32); + NDArray x3('c', {2,2}, {1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {1,2,3,4}, sd::DataType::DOUBLE); - NDArray exp1('c', {}, std::vector{-30.f}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {}, std::vector{15.}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {}, std::vector{-30.f}, sd::DataType::FLOAT32); + NDArray exp2('c', {}, std::vector{15.}, sd::DataType::DOUBLE); - NDArray scalar1('c', {}, std::vector{100.f}, nd4j::DataType::FLOAT32); - NDArray scalar2('c', {}, std::vector{100.}, nd4j::DataType::DOUBLE); + NDArray scalar1('c', {}, std::vector{100.f}, sd::DataType::FLOAT32); + NDArray scalar2('c', {}, std::vector{100.}, sd::DataType::DOUBLE); void *dX1, *dX2, *dX3, *dX4, *dZ1, *dZ2; Nd4jLong *dX1ShapeInfo, *dX3ShapeInfo, *dZ1ShapeInfo, *dZ2ShapeInfo; @@ -316,7 +316,7 @@ TEST_F(CudaBasicsTests1, execReduce3Scalar_1) { /***************************************/ - NativeOpExecutioner::execReduce3Scalar(&lc, nd4j::reduce3::Dot,nullptr, x1.getShapeInfo(),dX1, dX1ShapeInfo, nullptr, nullptr, x2.getShapeInfo(),dX2, dX1ShapeInfo,nullptr, scalar1.getShapeInfo(),dZ1, dZ1ShapeInfo); + NativeOpExecutioner::execReduce3Scalar(&lc, sd::reduce3::Dot,nullptr, x1.getShapeInfo(),dX1, dX1ShapeInfo, nullptr, nullptr, x2.getShapeInfo(),dX2, dX1ShapeInfo,nullptr, scalar1.getShapeInfo(),dZ1, dZ1ShapeInfo); cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); @@ -333,7 +333,7 @@ TEST_F(CudaBasicsTests1, execReduce3Scalar_1) { /***************************************/ - NativeOpExecutioner::execReduce3Scalar(&lc, nd4j::reduce3::Dot,nullptr, x3.getShapeInfo(),dX3, dX3ShapeInfo, nullptr, nullptr, x4.getShapeInfo(),dX4, dX3ShapeInfo,nullptr, 
scalar2.getShapeInfo(),dZ2, dZ2ShapeInfo); + NativeOpExecutioner::execReduce3Scalar(&lc, sd::reduce3::Dot,nullptr, x3.getShapeInfo(),dX3, dX3ShapeInfo, nullptr, nullptr, x4.getShapeInfo(),dX4, dX3ShapeInfo,nullptr, scalar2.getShapeInfo(),dZ2, dZ2ShapeInfo); cudaResult = cudaStreamSynchronize(stream); ASSERT_EQ(0, cudaResult); @@ -360,11 +360,11 @@ TEST_F(CudaBasicsTests1, execReduce3Scalar_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3_1) { - NDArray x('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray y('c', {2,2}, {-1,-2,-3,-4}, nd4j::DataType::INT32); + NDArray x('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray y('c', {2,2}, {-1,-2,-3,-4}, sd::DataType::INT32); - NDArray exp('c', {}, std::vector{-30.f}, nd4j::DataType::FLOAT32); - NDArray z('c', {}, std::vector{100.f}, nd4j::DataType::FLOAT32); + NDArray exp('c', {}, std::vector{-30.f}, sd::DataType::FLOAT32); + NDArray z('c', {}, std::vector{100.f}, sd::DataType::FLOAT32); std::vector dimensions = {0, 1}; @@ -386,7 +386,7 @@ TEST_F(CudaBasicsTests1, execReduce3_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -412,11 +412,11 @@ TEST_F(CudaBasicsTests1, execReduce3_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3_2) { - NDArray x('c', {2,2}, {1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,2}, {1,2,3,4}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,2}, {1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray y('c', {2,2}, {1,2,3,4}, sd::DataType::DOUBLE); - NDArray exp('c', {}, std::vector{15.}, 
nd4j::DataType::DOUBLE); - NDArray z('c', {}, std::vector{100.}, nd4j::DataType::DOUBLE); + NDArray exp('c', {}, std::vector{15.}, sd::DataType::DOUBLE); + NDArray z('c', {}, std::vector{100.}, sd::DataType::DOUBLE); std::vector dimensions = {0, 1}; @@ -435,7 +435,7 @@ TEST_F(CudaBasicsTests1, execReduce3_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -461,11 +461,11 @@ TEST_F(CudaBasicsTests1, execReduce3_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3_3) { - NDArray x('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::INT32); - NDArray y('c', {2,3}, {-6,-5,-4,-3,-2,-1}, nd4j::DataType::INT32); + NDArray x('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::INT32); + NDArray y('c', {2,3}, {-6,-5,-4,-3,-2,-1}, sd::DataType::INT32); - NDArray exp('c', {3}, {-18,-20,-18}, nd4j::DataType::FLOAT32); - NDArray z('c', {3}, {100,100,100}, nd4j::DataType::FLOAT32); + NDArray exp('c', {3}, {-18,-20,-18}, sd::DataType::FLOAT32); + NDArray z('c', {3}, {100,100,100}, sd::DataType::FLOAT32); std::vector dimensions = {0}; @@ -501,7 +501,7 @@ TEST_F(CudaBasicsTests1, execReduce3_3) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -527,11 +527,11 @@ TEST_F(CudaBasicsTests1, execReduce3_3) { 
//////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3_4) { - NDArray x('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::DOUBLE); + NDArray y('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); - NDArray exp('c', {2}, {9,22.5}, nd4j::DataType::DOUBLE); - NDArray z('c', {2}, {100,100}, nd4j::DataType::DOUBLE); + NDArray exp('c', {2}, {9,22.5}, sd::DataType::DOUBLE); + NDArray z('c', {2}, {100,100}, sd::DataType::DOUBLE); std::vector dimensions = {1}; @@ -567,7 +567,7 @@ TEST_F(CudaBasicsTests1, execReduce3_4) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -593,11 +593,11 @@ TEST_F(CudaBasicsTests1, execReduce3_4) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3_5) { - NDArray x('c', {2,2,3}, {1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5}, nd4j::DataType::FLOAT32); - NDArray y('c', {2,2,3}, {1,2,3,4,5,6,7,8,9,10,11,12}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5,1.5}, sd::DataType::FLOAT32); + NDArray y('c', {2,2,3}, {1,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::FLOAT32); - NDArray exp('c', {2,3}, {7.5, 10.5, 13.5, 25.5, 28.5, 31.5}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,3}, {100,100,100,100,100,100}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2,3}, {7.5, 10.5, 13.5, 25.5, 28.5, 31.5}, sd::DataType::FLOAT32); + NDArray z('c', {2,3}, {100,100,100,100,100,100}, sd::DataType::FLOAT32); std::vector 
dimensions = {1}; @@ -633,7 +633,7 @@ TEST_F(CudaBasicsTests1, execReduce3_5) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -659,11 +659,11 @@ TEST_F(CudaBasicsTests1, execReduce3_5) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3All_1) { - NDArray x('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray y('c', {2,3}, {-1,1,-1,1,-1,1}, nd4j::DataType::INT32); + NDArray x('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray y('c', {2,3}, {-1,1,-1,1,-1,1}, sd::DataType::INT32); - NDArray exp('c', {2,3}, {2,-2,2,2,-2,2}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,3}, {100,100,100,100,100,100}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2,3}, {2,-2,2,2,-2,2}, sd::DataType::FLOAT32); + NDArray z('c', {2,3}, {100,100,100,100,100,100}, sd::DataType::FLOAT32); std::vector dimensions = {0}; @@ -699,7 +699,7 @@ TEST_F(CudaBasicsTests1, execReduce3All_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3All(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3All(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -725,11 +725,11 @@ TEST_F(CudaBasicsTests1, execReduce3All_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3All_2) { - NDArray x('c', {2,2}, {1,2,3,4}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, 
nd4j::DataType::DOUBLE); + NDArray x('c', {2,2}, {1,2,3,4}, sd::DataType::DOUBLE); + NDArray y('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); - NDArray exp('c', {2,3}, {6,6,6,9,9,9}, nd4j::DataType::DOUBLE); - NDArray z('c', {2,3}, {100,100,100,100,100,100,},nd4j::DataType::DOUBLE); + NDArray exp('c', {2,3}, {6,6,6,9,9,9}, sd::DataType::DOUBLE); + NDArray z('c', {2,3}, {100,100,100,100,100,100,},sd::DataType::DOUBLE); std::vector dimensions = {0}; @@ -765,7 +765,7 @@ TEST_F(CudaBasicsTests1, execReduce3All_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3All(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3All(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -791,10 +791,10 @@ TEST_F(CudaBasicsTests1, execReduce3All_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execIndexReduce_1) { - NDArray x('c', {2,3}, {100,100,100,100,100,100}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3}, {100,100,100,100,100,100}, sd::DataType::DOUBLE); x.linspace(-2.); x.syncToDevice(); - NDArray exp('c', {2}, {2, 2}, nd4j::DataType::INT64); - NDArray z('c', {2}, {100,100}, nd4j::DataType::INT64); + NDArray exp('c', {2}, {2, 2}, sd::DataType::INT64); + NDArray z('c', {2}, {100,100}, sd::DataType::INT64); std::vector dimensions = {1}; @@ -822,7 +822,7 @@ TEST_F(CudaBasicsTests1, execIndexReduce_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execIndexReduce(&lc, nd4j::indexreduce::IndexMax, + NativeOpExecutioner::execIndexReduce(&lc, sd::indexreduce::IndexMax, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), 
z.specialBuffer(), z.specialShapeInfo(), @@ -852,10 +852,10 @@ TEST_F(CudaBasicsTests1, execIndexReduce_2) { 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, - 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::FLOAT32); + 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::FLOAT32); x.linspace(-2.f); x.syncToDevice(); - NDArray exp('c', {2,5}, {11,11,11,11,11,11,11,11,11,11}, nd4j::DataType::INT64); - NDArray z('c', {2,5}, {100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT64); + NDArray exp('c', {2,5}, {11,11,11,11,11,11,11,11,11,11}, sd::DataType::INT64); + NDArray z('c', {2,5}, {100,100,100,100,100,100,100,100,100,100}, sd::DataType::INT64); std::vector dimensions = {1,2}; @@ -884,7 +884,7 @@ TEST_F(CudaBasicsTests1, execIndexReduce_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execIndexReduce(&lc, nd4j::indexreduce::IndexMax, + NativeOpExecutioner::execIndexReduce(&lc, sd::indexreduce::IndexMax, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -914,10 +914,10 @@ TEST_F(CudaBasicsTests1, execIndexReduce_3) { 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, - 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::DOUBLE); + 100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::DOUBLE); 
x.linspace(-2.); x.syncToDevice(); - NDArray exp('c', {3}, {39, 39, 39}, nd4j::DataType::INT64); - NDArray z('c', {3}, {100,100,100}, nd4j::DataType::INT64); + NDArray exp('c', {3}, {39, 39, 39}, sd::DataType::INT64); + NDArray z('c', {3}, {100,100,100}, sd::DataType::INT64); std::vector dimensions = {0,2,3}; @@ -945,7 +945,7 @@ TEST_F(CudaBasicsTests1, execIndexReduce_3) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execIndexReduce(&lc, nd4j::indexreduce::IndexMax, + NativeOpExecutioner::execIndexReduce(&lc, sd::indexreduce::IndexMax, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -973,10 +973,10 @@ TEST_F(CudaBasicsTests1, execScalar_1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x('c', {2,3}, {0,1,2,3,4,5}, nd4j::DataType::INT64); - NDArray exp('c',{2,3}, {0,0,1,1,2,2}, nd4j::DataType::INT64); - NDArray scalar('c',{}, std::vector{2.f}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,3}, {100,100,100,100,100,100}, nd4j::DataType::INT64); + NDArray x('c', {2,3}, {0,1,2,3,4,5}, sd::DataType::INT64); + NDArray exp('c',{2,3}, {0,0,1,1,2,2}, sd::DataType::INT64); + NDArray scalar('c',{}, std::vector{2.f}, sd::DataType::FLOAT32); + NDArray z('c', {2,3}, {100,100,100,100,100,100}, sd::DataType::INT64); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -985,7 +985,7 @@ TEST_F(CudaBasicsTests1, execScalar_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execScalar(&lc, nd4j::scalar::Divide, + NativeOpExecutioner::execScalar(&lc, sd::scalar::Divide, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, scalar.getShapeInfo(), scalar.specialBuffer(), scalar.specialShapeInfo(), @@ 
-1008,10 +1008,10 @@ TEST_F(CudaBasicsTests1, execScalar_2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x('c', {2,3}, {-1,-2,-3,-4,-5,-6}, nd4j::DataType::INT64); - NDArray exp('c',{2,3}, {10,10,10,10,10,10}, nd4j::DataType::FLOAT32); - NDArray scalar('c',{}, std::vector{10.f}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,3}, {100,100,100,100,100,100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3}, {-1,-2,-3,-4,-5,-6}, sd::DataType::INT64); + NDArray exp('c',{2,3}, {10,10,10,10,10,10}, sd::DataType::FLOAT32); + NDArray scalar('c',{}, std::vector{10.f}, sd::DataType::FLOAT32); + NDArray z('c', {2,3}, {100,100,100,100,100,100}, sd::DataType::FLOAT32); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -1020,7 +1020,7 @@ TEST_F(CudaBasicsTests1, execScalar_2) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execScalar(&lc, nd4j::scalar::CopyPws, + NativeOpExecutioner::execScalar(&lc, sd::scalar::CopyPws, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, scalar.getShapeInfo(), scalar.specialBuffer(), scalar.specialShapeInfo(), @@ -1044,10 +1044,10 @@ TEST_F(CudaBasicsTests1, execScalar_3) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x('c', {2,3,2}, {0,1,2,3,4,5,6,7,8,9,10,11}, nd4j::DataType::INT64); - NDArray scalars('c',{2,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,3,2}, {0,0,2,1,4,2, 2,1,2,2,3,2}, nd4j::DataType::INT64); - NDArray z('c', {2,3,2}, {100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT64); + NDArray x('c', {2,3,2}, {0,1,2,3,4,5,6,7,8,9,10,11}, sd::DataType::INT64); + NDArray scalars('c',{2,2}, {1,2,3,4}, sd::DataType::FLOAT32); + NDArray exp('c', {2,3,2}, {0,0,2,1,4,2, 2,1,2,2,3,2}, sd::DataType::INT64); + NDArray z('c', {2,3,2}, {100,100,100,100,100,100,100,100,100,100,100,100}, 
sd::DataType::INT64); std::vector dimensions = {1}; @@ -1075,7 +1075,7 @@ TEST_F(CudaBasicsTests1, execScalar_3) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execScalar(&lc, nd4j::scalar::Divide, + NativeOpExecutioner::execScalar(&lc, sd::scalar::Divide, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1102,10 +1102,10 @@ TEST_F(CudaBasicsTests1, execScalar_3) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execScalarBool_1) { - NDArray x('c', {2,3}, {-1,-2,0,1,2,3}, nd4j::DataType::BFLOAT16); - NDArray scalar('c',{}, std::vector{0}, nd4j::DataType::BFLOAT16); - NDArray exp('c',{2,3}, {0,0,0,1,1,1}, nd4j::DataType::BOOL); - NDArray z('c', {2,3}, {100,100,100,100,100,100,}, nd4j::DataType::BOOL); + NDArray x('c', {2,3}, {-1,-2,0,1,2,3}, sd::DataType::BFLOAT16); + NDArray scalar('c',{}, std::vector{0}, sd::DataType::BFLOAT16); + NDArray exp('c',{2,3}, {0,0,0,1,1,1}, sd::DataType::BOOL); + NDArray z('c', {2,3}, {100,100,100,100,100,100,}, sd::DataType::BOOL); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -1115,7 +1115,7 @@ TEST_F(CudaBasicsTests1, execScalarBool_1) { // call cuda kernel which calculates result // call cuda kernel which calculates result - NativeOpExecutioner::execScalarBool(&lc, nd4j::scalar::GreaterThan, + NativeOpExecutioner::execScalarBool(&lc, sd::scalar::GreaterThan, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, scalar.getShapeInfo(), scalar.specialBuffer(), scalar.specialShapeInfo(), @@ -1135,10 +1135,10 @@ TEST_F(CudaBasicsTests1, execScalarBool_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execScalarBool_2) 
{ - NDArray x('c', {2,3}, {0,1,2,3,4,5}, nd4j::DataType::FLOAT32); - NDArray scalars('c',{2}, {-1,4}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,3}, {1,1,1,0,0,1}, nd4j::DataType::BOOL); - NDArray z('c', {2,3}, {100,100,100,100,100,100}, nd4j::DataType::BOOL); + NDArray x('c', {2,3}, {0,1,2,3,4,5}, sd::DataType::FLOAT32); + NDArray scalars('c',{2}, {-1,4}, sd::DataType::FLOAT32); + NDArray exp('c', {2,3}, {1,1,1,0,0,1}, sd::DataType::BOOL); + NDArray z('c', {2,3}, {100,100,100,100,100,100}, sd::DataType::BOOL); std::vector dimensions = {1}; @@ -1165,7 +1165,7 @@ TEST_F(CudaBasicsTests1, execScalarBool_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execScalarBool(&lc, nd4j::scalar::GreaterThan, + NativeOpExecutioner::execScalarBool(&lc, sd::scalar::GreaterThan, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1195,10 +1195,10 @@ TEST_F(CudaBasicsTests1, execBroadcast_1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray y('c', {3}, {10, 20, 30}, nd4j::DataType::INT64); - NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray exp('c', {2,3,4}, {10, 11, 12, 13,24, 25, 26, 27,38, 39, 40, 41,22, 23, 24, 25,36, 37, 38, 39,50, 51, 52, 53}, nd4j::DataType::INT32); + NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::INT32); + NDArray y('c', {3}, {10, 20, 30}, sd::DataType::INT64); + NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, 
sd::DataType::INT32); + NDArray exp('c', {2,3,4}, {10, 11, 12, 13,24, 25, 26, 27,38, 39, 40, 41,22, 23, 24, 25,36, 37, 38, 39,50, 51, 52, 53}, sd::DataType::INT32); x.linspace(0); x.syncToDevice(); std::vector dimensions = {1}; @@ -1226,7 +1226,7 @@ TEST_F(CudaBasicsTests1, execBroadcast_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execBroadcast(&lc, nd4j::broadcast::Add, + NativeOpExecutioner::execBroadcast(&lc, sd::broadcast::Add, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1255,10 +1255,10 @@ TEST_F(CudaBasicsTests1, execBroadcast_2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray y('c', {2,4}, {10,20,30,40,50,60,70,80}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,3,4}, {10., 21., 32., 43., 14., 25., 36., 47., 18., 29., 40., 51., 62., 73., 84., 95., 66., 77., 88., 99., 70., 81., 92., 103}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::INT32); + NDArray y('c', {2,4}, {10,20,30,40,50,60,70,80}, sd::DataType::FLOAT32); + NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::FLOAT32); + NDArray exp('c', {2,3,4}, {10., 21., 32., 43., 14., 25., 36., 47., 18., 29., 40., 51., 62., 73., 84., 95., 66., 77., 88., 99., 70., 81., 92., 103}, sd::DataType::FLOAT32); x.linspace(0); 
x.syncToDevice(); std::vector dimensions = {0,2}; @@ -1286,7 +1286,7 @@ TEST_F(CudaBasicsTests1, execBroadcast_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execBroadcast(&lc, nd4j::broadcast::Add, + NativeOpExecutioner::execBroadcast(&lc, sd::broadcast::Add, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1312,10 +1312,10 @@ TEST_F(CudaBasicsTests1, execBroadcast_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execBroadcastBool_1) { - NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray y('c', {3}, {2, 12, 22}, nd4j::DataType::INT32); - NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,}, nd4j::DataType::BOOL); - NDArray exp('c', {2,3,4}, {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}, nd4j::DataType::BOOL); + NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::INT32); + NDArray y('c', {3}, {2, 12, 22}, sd::DataType::INT32); + NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,}, sd::DataType::BOOL); + NDArray exp('c', {2,3,4}, {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0}, sd::DataType::BOOL); x.linspace(1); x.syncToDevice(); std::vector dimensions = {1}; @@ -1343,7 +1343,7 @@ TEST_F(CudaBasicsTests1, execBroadcastBool_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - 
NativeOpExecutioner::execBroadcastBool(&lc, nd4j::broadcast::EqualTo, + NativeOpExecutioner::execBroadcastBool(&lc, sd::broadcast::EqualTo, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1370,10 +1370,10 @@ TEST_F(CudaBasicsTests1, execBroadcastBool_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execBroadcastBool_2) { - NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100},nd4j::DataType::FLOAT32); - NDArray y('c', {2,4}, {1,10,10,15,20,20,20,24}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::BOOL); - NDArray exp('c', {2,3,4}, {1, 0, 0, 0,0, 0, 0, 0,0, 1, 0, 0,0, 0, 0, 0,0, 0, 0, 0,0, 0, 0, 1}, nd4j::DataType::BOOL); + NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100},sd::DataType::FLOAT32); + NDArray y('c', {2,4}, {1,10,10,15,20,20,20,24}, sd::DataType::FLOAT32); + NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::BOOL); + NDArray exp('c', {2,3,4}, {1, 0, 0, 0,0, 0, 0, 0,0, 1, 0, 0,0, 0, 0, 0,0, 0, 0, 0,0, 0, 0, 1}, sd::DataType::BOOL); x.linspace(1); x.syncToDevice(); std::vector dimensions = {0,2}; @@ -1402,7 +1402,7 @@ TEST_F(CudaBasicsTests1, execBroadcastBool_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execBroadcastBool(&lc, nd4j::broadcast::EqualTo, + NativeOpExecutioner::execBroadcastBool(&lc, sd::broadcast::EqualTo, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), 
nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1432,10 +1432,10 @@ TEST_F(CudaBasicsTests1, execPairwiseTransform_1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x('c', {2,2,2}, {1,5,3,7,2,6,4,8}, nd4j::DataType::INT32); - NDArray y('c', {4,2}, {0.1,0.2,0.3,0.4,1.5,0.6,0.7,1.8}, nd4j::DataType::DOUBLE); - NDArray z('c', {8}, {100,100,100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray exp('c', {8}, {0,1,2,3,3,5,6,6}, nd4j::DataType::INT32); + NDArray x('c', {2,2,2}, {1,5,3,7,2,6,4,8}, sd::DataType::INT32); + NDArray y('c', {4,2}, {0.1,0.2,0.3,0.4,1.5,0.6,0.7,1.8}, sd::DataType::DOUBLE); + NDArray z('c', {8}, {100,100,100,100,100,100,100,100}, sd::DataType::INT32); + NDArray exp('c', {8}, {0,1,2,3,3,5,6,6}, sd::DataType::INT32); x.permutei({2,1,0}); // -> {1,2,3,4,5,6,7,8} x.syncShape(); @@ -1446,7 +1446,7 @@ TEST_F(CudaBasicsTests1, execPairwiseTransform_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execPairwiseTransform(&lc, nd4j::pairwise::Subtract, + NativeOpExecutioner::execPairwiseTransform(&lc, sd::pairwise::Subtract, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1466,10 +1466,10 @@ TEST_F(CudaBasicsTests1, execPairwiseTransform_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execPairwiseBoolTransform_1) { - NDArray x('c', {2,2,2}, {1,5,3,7,2,6,4,8}, nd4j::DataType::INT64); - NDArray y('c', {4,2}, {0,2,0,4,0,6,0,8}, nd4j::DataType::INT64); - NDArray z('c', {8}, {100,100,100,100,100,100,100,100}, nd4j::DataType::BOOL); - NDArray exp('c', {8}, {0,1,0,1,0,1,0,1}, nd4j::DataType::BOOL); + NDArray x('c', {2,2,2}, {1,5,3,7,2,6,4,8}, sd::DataType::INT64); + NDArray 
y('c', {4,2}, {0,2,0,4,0,6,0,8}, sd::DataType::INT64); + NDArray z('c', {8}, {100,100,100,100,100,100,100,100}, sd::DataType::BOOL); + NDArray exp('c', {8}, {0,1,0,1,0,1,0,1}, sd::DataType::BOOL); x.permutei({2,1,0}); // -> {1,2,3,4,5,6,7,8} x.syncShape(); @@ -1480,7 +1480,7 @@ TEST_F(CudaBasicsTests1, execPairwiseBoolTransform_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execPairwiseBoolTransform(&lc, nd4j::pairwise::EqualTo, + NativeOpExecutioner::execPairwiseBoolTransform(&lc, sd::pairwise::EqualTo, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1501,9 +1501,9 @@ TEST_F(CudaBasicsTests1, execPairwiseBoolTransform_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformFloat_1) { - NDArray x('c', {2,2}, {0, 6.25, 2.25, 12.25}, nd4j::DataType::DOUBLE); - NDArray z('c', {4}, {100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {4}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2}, {0, 6.25, 2.25, 12.25}, sd::DataType::DOUBLE); + NDArray z('c', {4}, {100,100,100,100}, sd::DataType::FLOAT32); + NDArray exp('c', {4}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); x.permutei({1,0}); x.syncShape(); @@ -1514,7 +1514,7 @@ TEST_F(CudaBasicsTests1, execTransformFloat_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformFloat(&lc, nd4j::transform::Sqrt, + NativeOpExecutioner::execTransformFloat(&lc, sd::transform::Sqrt, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1533,9 +1533,9 @@ TEST_F(CudaBasicsTests1, execTransformFloat_1) { 
//////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformFloat_2) { - NDArray x('c', {1,4}, {0, 4, 9, 16}, nd4j::DataType::INT64); - NDArray z('c', {2,2}, {100,100,100,100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {2,2}, {0, 2, 3, 4}, nd4j::DataType::DOUBLE); + NDArray x('c', {1,4}, {0, 4, 9, 16}, sd::DataType::INT64); + NDArray z('c', {2,2}, {100,100,100,100}, sd::DataType::DOUBLE); + NDArray exp('c', {2,2}, {0, 2, 3, 4}, sd::DataType::DOUBLE); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -1544,7 +1544,7 @@ TEST_F(CudaBasicsTests1, execTransformFloat_2) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformFloat(&lc, nd4j::transform::Sqrt, + NativeOpExecutioner::execTransformFloat(&lc, sd::transform::Sqrt, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1563,9 +1563,9 @@ TEST_F(CudaBasicsTests1, execTransformFloat_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformAny_1) { - NDArray x('c', {2,2}, {0, 6.25, 2.25, 12.25}, nd4j::DataType::DOUBLE); - NDArray z('c', {4,1}, {100,100,100,100}, nd4j::DataType::INT32); - NDArray exp('c', {4,1}, {0, 2, 6, 12}, nd4j::DataType::INT32); + NDArray x('c', {2,2}, {0, 6.25, 2.25, 12.25}, sd::DataType::DOUBLE); + NDArray z('c', {4,1}, {100,100,100,100}, sd::DataType::INT32); + NDArray exp('c', {4,1}, {0, 2, 6, 12}, sd::DataType::INT32); x.permutei({1,0}); // create cuda stream and LaunchContext @@ -1575,7 +1575,7 @@ TEST_F(CudaBasicsTests1, execTransformAny_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformAny(&lc, nd4j::transform::Assign, + NativeOpExecutioner::execTransformAny(&lc, sd::transform::Assign, nullptr, x.getShapeInfo(), 
x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1594,9 +1594,9 @@ TEST_F(CudaBasicsTests1, execTransformAny_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformAny_2) { - NDArray x('c', {1,4}, {0, 6.25, 2.25, 12.25}, nd4j::DataType::BFLOAT16); - NDArray z('c', {2,2}, {100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,2}, {0, 6.25, 2.25, 12.25}, nd4j::DataType::FLOAT32); + NDArray x('c', {1,4}, {0, 6.25, 2.25, 12.25}, sd::DataType::BFLOAT16); + NDArray z('c', {2,2}, {100,100,100,100}, sd::DataType::FLOAT32); + NDArray exp('c', {2,2}, {0, 6.25, 2.25, 12.25}, sd::DataType::FLOAT32); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -1605,7 +1605,7 @@ TEST_F(CudaBasicsTests1, execTransformAny_2) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformAny(&lc, nd4j::transform::Assign, + NativeOpExecutioner::execTransformAny(&lc, sd::transform::Assign, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1624,9 +1624,9 @@ TEST_F(CudaBasicsTests1, execTransformAny_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformStrict_1) { - NDArray x('c', {2,3}, {0,2,4,1,3,5}, nd4j::DataType::DOUBLE); - NDArray z('c', {3,2}, {100,100,100,100,100,100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {3,2}, {0, 3, 12, 27, 48, 75}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3}, {0,2,4,1,3,5}, sd::DataType::DOUBLE); + NDArray z('c', {3,2}, {100,100,100,100,100,100}, sd::DataType::DOUBLE); + NDArray exp('c', {3,2}, {0, 3, 12, 27, 48, 75}, sd::DataType::DOUBLE); x.permutei({1,0}); // create cuda stream and LaunchContext @@ -1636,7 +1636,7 @@ TEST_F(CudaBasicsTests1, 
execTransformStrict_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformStrict(&lc, nd4j::transform::CubeDerivative, + NativeOpExecutioner::execTransformStrict(&lc, sd::transform::CubeDerivative, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1655,9 +1655,9 @@ TEST_F(CudaBasicsTests1, execTransformStrict_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformStrict_2) { - NDArray x('c', {6}, {0,1,2,3,4,5}, nd4j::DataType::FLOAT32); - NDArray z('c', {3,2}, {100,100,100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {3,2}, {0, 3, 12, 27, 48, 75}, nd4j::DataType::FLOAT32); + NDArray x('c', {6}, {0,1,2,3,4,5}, sd::DataType::FLOAT32); + NDArray z('c', {3,2}, {100,100,100,100,100,100}, sd::DataType::FLOAT32); + NDArray exp('c', {3,2}, {0, 3, 12, 27, 48, 75}, sd::DataType::FLOAT32); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -1666,7 +1666,7 @@ TEST_F(CudaBasicsTests1, execTransformStrict_2) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformStrict(&lc, nd4j::transform::CubeDerivative, + NativeOpExecutioner::execTransformStrict(&lc, sd::transform::CubeDerivative, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1685,9 +1685,9 @@ TEST_F(CudaBasicsTests1, execTransformStrict_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformSame_1) { - NDArray x('c', {2,3}, {0,2.5,4.5,1.5,3.5,5.5}, nd4j::DataType::DOUBLE); - NDArray z('c', {1,6}, {100,100,100,100,100,100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {1,6}, {0,2.25,6.25,12.25,20.25,30.25}, 
nd4j::DataType::DOUBLE); + NDArray x('c', {2,3}, {0,2.5,4.5,1.5,3.5,5.5}, sd::DataType::DOUBLE); + NDArray z('c', {1,6}, {100,100,100,100,100,100}, sd::DataType::DOUBLE); + NDArray exp('c', {1,6}, {0,2.25,6.25,12.25,20.25,30.25}, sd::DataType::DOUBLE); x.permutei({1,0}); // create cuda stream and LaunchContext @@ -1697,7 +1697,7 @@ TEST_F(CudaBasicsTests1, execTransformSame_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformSame(&lc, nd4j::transform::Square, + NativeOpExecutioner::execTransformSame(&lc, sd::transform::Square, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1716,9 +1716,9 @@ TEST_F(CudaBasicsTests1, execTransformSame_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformSame_2) { - NDArray x('c', {6}, {0,1,2,3,4,5}, nd4j::DataType::INT32); - NDArray z('c', {3,2}, {100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray exp('c', {3,2}, {0,1,4,9,16,25}, nd4j::DataType::INT32); + NDArray x('c', {6}, {0,1,2,3,4,5}, sd::DataType::INT32); + NDArray z('c', {3,2}, {100,100,100,100,100,100}, sd::DataType::INT32); + NDArray exp('c', {3,2}, {0,1,4,9,16,25}, sd::DataType::INT32); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -1727,7 +1727,7 @@ TEST_F(CudaBasicsTests1, execTransformSame_2) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformSame(&lc, nd4j::transform::Square, + NativeOpExecutioner::execTransformSame(&lc, sd::transform::Square, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1746,9 +1746,9 @@ TEST_F(CudaBasicsTests1, execTransformSame_2) { 
//////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformBool_1) { - NDArray x('c', {2,3}, {0,2,4,-1,-3,-5}, nd4j::DataType::DOUBLE); - NDArray z('c', {1,6}, {100,100,100,100,100,100}, nd4j::DataType::BOOL); - NDArray exp('c', {1,6}, {0,0,1,0,1,0}, nd4j::DataType::BOOL); + NDArray x('c', {2,3}, {0,2,4,-1,-3,-5}, sd::DataType::DOUBLE); + NDArray z('c', {1,6}, {100,100,100,100,100,100}, sd::DataType::BOOL); + NDArray exp('c', {1,6}, {0,0,1,0,1,0}, sd::DataType::BOOL); x.permutei({1,0}); // create cuda stream and LaunchContext @@ -1758,7 +1758,7 @@ TEST_F(CudaBasicsTests1, execTransformBool_1) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformBool(&lc, nd4j::transform::IsPositive, + NativeOpExecutioner::execTransformBool(&lc, sd::transform::IsPositive, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1777,9 +1777,9 @@ TEST_F(CudaBasicsTests1, execTransformBool_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execTransformBool_2) { - NDArray x('c', {6}, {0,-1,2,-3,4,-5}, nd4j::DataType::INT32); - NDArray z('c', {3,2}, {100,100,100,100,100,100}, nd4j::DataType::BOOL); - NDArray exp('c', {3,2}, {0,0,1,0,1,0}, nd4j::DataType::BOOL); + NDArray x('c', {6}, {0,-1,2,-3,4,-5}, sd::DataType::INT32); + NDArray z('c', {3,2}, {100,100,100,100,100,100}, sd::DataType::BOOL); + NDArray exp('c', {3,2}, {0,0,1,0,1,0}, sd::DataType::BOOL); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -1788,7 +1788,7 @@ TEST_F(CudaBasicsTests1, execTransformBool_2) { LaunchContext lc(&stream); // call cuda kernel which calculates result - NativeOpExecutioner::execTransformBool(&lc, nd4j::transform::IsPositive, + NativeOpExecutioner::execTransformBool(&lc, sd::transform::IsPositive, nullptr, 
x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr, nullptr); @@ -1807,9 +1807,9 @@ TEST_F(CudaBasicsTests1, execTransformBool_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceFloat_1) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::INT32); - NDArray z('c', {3}, {100,100,100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {3}, {2.5, 6.5, 10.5}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::INT32); + NDArray z('c', {3}, {100,100,100}, sd::DataType::FLOAT32); + NDArray exp('c', {3}, {2.5, 6.5, 10.5}, sd::DataType::FLOAT32); x.permutei({2,1,0}); std::vector dimensions = {0,2}; @@ -1837,7 +1837,7 @@ TEST_F(CudaBasicsTests1, execReduceFloat_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceFloat(&lc, nd4j::reduce::Mean, + NativeOpExecutioner::execReduceFloat(&lc, sd::reduce::Mean, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1862,9 +1862,9 @@ TEST_F(CudaBasicsTests1, execReduceFloat_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceFloat_2) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::INT32); - NDArray z('c', {2,4}, {100,100,100,100,100,100,100,100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {2,4}, {-1., 0., 1., 2.,11., 12., 13., 14.}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::INT32); + NDArray z('c', {2,4}, 
{100,100,100,100,100,100,100,100}, sd::DataType::DOUBLE); + NDArray exp('c', {2,4}, {-1., 0., 1., 2.,11., 12., 13., 14.}, sd::DataType::DOUBLE); std::vector dimensions = {1}; @@ -1891,7 +1891,7 @@ TEST_F(CudaBasicsTests1, execReduceFloat_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceFloat(&lc, nd4j::reduce::Mean, + NativeOpExecutioner::execReduceFloat(&lc, sd::reduce::Mean, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1916,9 +1916,9 @@ TEST_F(CudaBasicsTests1, execReduceFloat_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceSame_1) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::INT32); - NDArray z('c', {3}, {100,100,100}, nd4j::DataType::INT32); - NDArray exp('c', {3}, {20, 52, 84}, nd4j::DataType::INT32); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::INT32); + NDArray z('c', {3}, {100,100,100}, sd::DataType::INT32); + NDArray exp('c', {3}, {20, 52, 84}, sd::DataType::INT32); x.permutei({2,1,0}); std::vector dimensions = {0,2}; @@ -1946,7 +1946,7 @@ TEST_F(CudaBasicsTests1, execReduceSame_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceSame(&lc, nd4j::reduce::Sum, + NativeOpExecutioner::execReduceSame(&lc, sd::reduce::Sum, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1971,9 +1971,9 @@ TEST_F(CudaBasicsTests1, execReduceSame_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, 
execReduceSame_2) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,4}, {100,100,100,100,100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,4}, {-3., 0., 3., 6.,33., 36., 39., 42.}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::FLOAT32); + NDArray z('c', {2,4}, {100,100,100,100,100,100,100,100}, sd::DataType::FLOAT32); + NDArray exp('c', {2,4}, {-3., 0., 3., 6.,33., 36., 39., 42.}, sd::DataType::FLOAT32); std::vector dimensions = {1}; @@ -2000,7 +2000,7 @@ TEST_F(CudaBasicsTests1, execReduceSame_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceSame(&lc, nd4j::reduce::Sum, + NativeOpExecutioner::execReduceSame(&lc, sd::reduce::Sum, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2025,9 +2025,9 @@ TEST_F(CudaBasicsTests1, execReduceSame_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceBool_1) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, nd4j::DataType::INT32); - NDArray z('c', {3}, {100,100,100}, nd4j::DataType::BOOL); - NDArray exp('c', {3}, {0, 1, 1}, nd4j::DataType::BOOL); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, sd::DataType::INT32); + NDArray z('c', {3}, {100,100,100}, sd::DataType::BOOL); + NDArray exp('c', {3}, {0, 1, 1}, sd::DataType::BOOL); x.permutei({2,1,0}); @@ -2056,7 +2056,7 @@ TEST_F(CudaBasicsTests1, execReduceBool_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - 
NativeOpExecutioner::execReduceBool(&lc, nd4j::reduce::IsPositive, + NativeOpExecutioner::execReduceBool(&lc, sd::reduce::IsPositive, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2081,9 +2081,9 @@ TEST_F(CudaBasicsTests1, execReduceBool_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceBool_2) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,4}, {100,100,100,100,100,100,100,100}, nd4j::DataType::BOOL); - NDArray exp('c', {2,4}, {1, 1, 1, 1, 0, 0, 0, 0}, nd4j::DataType::BOOL); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, sd::DataType::FLOAT32); + NDArray z('c', {2,4}, {100,100,100,100,100,100,100,100}, sd::DataType::BOOL); + NDArray exp('c', {2,4}, {1, 1, 1, 1, 0, 0, 0, 0}, sd::DataType::BOOL); std::vector dimensions = {1}; @@ -2110,7 +2110,7 @@ TEST_F(CudaBasicsTests1, execReduceBool_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceBool(&lc, nd4j::reduce::IsPositive, + NativeOpExecutioner::execReduceBool(&lc, sd::reduce::IsPositive, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2135,9 +2135,9 @@ TEST_F(CudaBasicsTests1, execReduceBool_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceLong_1) { - NDArray x('c', {2,3,4}, {-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, nd4j::DataType::INT32); - NDArray z('c', {3}, {100,100,100}, nd4j::DataType::INT64); - NDArray exp('c', {3}, {5,6,6}, nd4j::DataType::INT64); + NDArray x('c', {2,3,4}, 
{-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, sd::DataType::INT32); + NDArray z('c', {3}, {100,100,100}, sd::DataType::INT64); + NDArray exp('c', {3}, {5,6,6}, sd::DataType::INT64); x.permutei({2,1,0}); std::vector dimensions = {0,2}; @@ -2165,7 +2165,7 @@ TEST_F(CudaBasicsTests1, execReduceLong_1) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceLong(&lc, nd4j::reduce::CountNonZero, + NativeOpExecutioner::execReduceLong(&lc, sd::reduce::CountNonZero, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2190,9 +2190,9 @@ TEST_F(CudaBasicsTests1, execReduceLong_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceLong_2) { - NDArray x('c', {2,3,4}, {-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,4}, {100,100,100,100,100,100,100,100}, nd4j::DataType::INT64); - NDArray exp('c', {2,4}, {3, 1, 3, 2, 2, 1, 2, 3}, nd4j::DataType::INT64); + NDArray x('c', {2,3,4}, {-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, sd::DataType::FLOAT32); + NDArray z('c', {2,4}, {100,100,100,100,100,100,100,100}, sd::DataType::INT64); + NDArray exp('c', {2,4}, {3, 1, 3, 2, 2, 1, 2, 3}, sd::DataType::INT64); std::vector dimensions = {1}; @@ -2219,7 +2219,7 @@ TEST_F(CudaBasicsTests1, execReduceLong_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceLong(&lc, nd4j::reduce::CountNonZero, + NativeOpExecutioner::execReduceLong(&lc, sd::reduce::CountNonZero, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2244,9 +2244,9 @@ 
TEST_F(CudaBasicsTests1, execReduceLong_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceFloatScalar_1) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::INT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {}, std::vector{6.5}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::INT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::FLOAT32); + NDArray exp('c', {}, std::vector{6.5}, sd::DataType::FLOAT32); x.permutei({2,1,0}); // create cuda stream and LaunchContext @@ -2262,7 +2262,7 @@ TEST_F(CudaBasicsTests1, execReduceFloatScalar_1) { lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceFloatScalar(&lc, nd4j::reduce::Mean, + NativeOpExecutioner::execReduceFloatScalar(&lc, sd::reduce::Mean, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2281,9 +2281,9 @@ TEST_F(CudaBasicsTests1, execReduceFloatScalar_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceFloatScalar_2) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::INT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {}, std::vector{6.5}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::INT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::DOUBLE); + NDArray exp('c', {}, std::vector{6.5}, sd::DataType::DOUBLE); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -2298,7 +2298,7 @@ TEST_F(CudaBasicsTests1, execReduceFloatScalar_2) { 
lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceFloatScalar(&lc, nd4j::reduce::Mean, + NativeOpExecutioner::execReduceFloatScalar(&lc, sd::reduce::Mean, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2317,9 +2317,9 @@ TEST_F(CudaBasicsTests1, execReduceFloatScalar_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceSameScalar_1) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::INT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::INT32); - NDArray exp('c', {}, std::vector{156}, nd4j::DataType::INT32); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::INT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::INT32); + NDArray exp('c', {}, std::vector{156}, sd::DataType::INT32); x.permutei({2,1,0}); // create cuda stream and LaunchContext @@ -2335,7 +2335,7 @@ TEST_F(CudaBasicsTests1, execReduceSameScalar_1) { lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceSameScalar(&lc, nd4j::reduce::Sum, + NativeOpExecutioner::execReduceSameScalar(&lc, sd::reduce::Sum, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2354,9 +2354,9 @@ TEST_F(CudaBasicsTests1, execReduceSameScalar_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceSameScalar_2) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, nd4j::DataType::DOUBLE); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {}, std::vector{156}, 
nd4j::DataType::DOUBLE); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18}, sd::DataType::DOUBLE); + NDArray z('c', {}, std::vector{100}, sd::DataType::DOUBLE); + NDArray exp('c', {}, std::vector{156}, sd::DataType::DOUBLE); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -2371,7 +2371,7 @@ TEST_F(CudaBasicsTests1, execReduceSameScalar_2) { lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceSameScalar(&lc, nd4j::reduce::Sum, + NativeOpExecutioner::execReduceSameScalar(&lc, sd::reduce::Sum, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2390,9 +2390,9 @@ TEST_F(CudaBasicsTests1, execReduceSameScalar_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceBoolScalar_1) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, nd4j::DataType::INT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::BOOL); - NDArray exp('c', {}, std::vector{1}, nd4j::DataType::BOOL); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, sd::DataType::INT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::BOOL); + NDArray exp('c', {}, std::vector{1}, sd::DataType::BOOL); x.permutei({2,1,0}); x.syncShape(); @@ -2409,7 +2409,7 @@ TEST_F(CudaBasicsTests1, execReduceBoolScalar_1) { lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceBoolScalar(&lc, nd4j::reduce::IsPositive, + NativeOpExecutioner::execReduceBoolScalar(&lc, sd::reduce::IsPositive, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2428,9 +2428,9 @@ 
TEST_F(CudaBasicsTests1, execReduceBoolScalar_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceBoolScalar_2) { - NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, nd4j::DataType::DOUBLE); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::BOOL); - NDArray exp('c', {}, std::vector{1}, nd4j::DataType::BOOL); + NDArray x('c', {2,3,4}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}, sd::DataType::DOUBLE); + NDArray z('c', {}, std::vector{100}, sd::DataType::BOOL); + NDArray exp('c', {}, std::vector{1}, sd::DataType::BOOL); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -2445,7 +2445,7 @@ TEST_F(CudaBasicsTests1, execReduceBoolScalar_2) { lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceBoolScalar(&lc, nd4j::reduce::IsPositive, + NativeOpExecutioner::execReduceBoolScalar(&lc, sd::reduce::IsPositive, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2464,9 +2464,9 @@ TEST_F(CudaBasicsTests1, execReduceBoolScalar_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceLongScalar_1) { - NDArray x('c', {2,3,4}, {-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, nd4j::DataType::INT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::INT64); - NDArray exp('c', {}, std::vector{17}, nd4j::DataType::INT64); + NDArray x('c', {2,3,4}, {-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, sd::DataType::INT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::INT64); + NDArray exp('c', {}, std::vector{17}, sd::DataType::INT64); x.permutei({2,1,0}); x.syncShape(); @@ -2483,7 +2483,7 @@ TEST_F(CudaBasicsTests1, execReduceLongScalar_1) { 
lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceLongScalar(&lc, nd4j::reduce::CountNonZero, + NativeOpExecutioner::execReduceLongScalar(&lc, sd::reduce::CountNonZero, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2502,9 +2502,9 @@ TEST_F(CudaBasicsTests1, execReduceLongScalar_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduceLongScalar_2) { - NDArray x('c', {2,3,4}, {-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, nd4j::DataType::DOUBLE); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::INT64); - NDArray exp('c', {}, std::vector{17}, nd4j::DataType::INT64); + NDArray x('c', {2,3,4}, {-5,0,-3,0,-1,0,1,2,3,4,5,6,7,0,9,10,11,0,13,14,0,16,0,18}, sd::DataType::DOUBLE); + NDArray z('c', {}, std::vector{100}, sd::DataType::INT64); + NDArray exp('c', {}, std::vector{17}, sd::DataType::INT64); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -2519,7 +2519,7 @@ TEST_F(CudaBasicsTests1, execReduceLongScalar_2) { lc.setAllocationPointer(allocationPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execReduceLongScalar(&lc, nd4j::reduce::CountNonZero, + NativeOpExecutioner::execReduceLongScalar(&lc, sd::reduce::CountNonZero, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo()); @@ -2538,10 +2538,10 @@ TEST_F(CudaBasicsTests1, execReduceLongScalar_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3TAD_1) { - NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, nd4j::DataType::FLOAT32); - NDArray y('c', {2,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); - NDArray exp('c', {3}, {10,20,30}, nd4j::DataType::DOUBLE); - NDArray z('c', 
{3}, {100,100,100}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, sd::DataType::FLOAT32); + NDArray y('c', {2,2}, {1,2,3,4}, sd::DataType::FLOAT32); + NDArray exp('c', {3}, {10,20,30}, sd::DataType::DOUBLE); + NDArray z('c', {3}, {100,100,100}, sd::DataType::DOUBLE); std::vector dimensions = {0,1}; auto packX = ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions); @@ -2551,7 +2551,7 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_1) { y.syncToDevice(); PointersManager pm(context, "execReduce3TAD_1"); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3TAD(context, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3TAD(context, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -2571,10 +2571,10 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3TAD_2) { - NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, nd4j::DataType::INT64); - NDArray y('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::INT64); - NDArray exp('c', {2}, {10,73}, nd4j::DataType::FLOAT32); - NDArray z('c', {2}, {100,100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, sd::DataType::INT64); + NDArray y('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::INT64); + NDArray exp('c', {2}, {10,73}, sd::DataType::FLOAT32); + NDArray z('c', {2}, {100,100}, sd::DataType::FLOAT32); std::vector dimensions = {0,2}; @@ -2602,7 +2602,7 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3TAD(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3TAD(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), 
x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -2627,10 +2627,10 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3TAD_3) { - NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, nd4j::DataType::INT64); - NDArray y('c', {3}, {1,2,3}, nd4j::DataType::INT64); - NDArray exp('c', {2,2}, {-22,-4,14,32}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,2}, {100,100,100,100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, sd::DataType::INT64); + NDArray y('c', {3}, {1,2,3}, sd::DataType::INT64); + NDArray exp('c', {2,2}, {-22,-4,14,32}, sd::DataType::FLOAT32); + NDArray z('c', {2,2}, {100,100,100,100}, sd::DataType::FLOAT32); std::vector dimensions = {2}; @@ -2658,7 +2658,7 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_3) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3TAD(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3TAD(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -2683,10 +2683,10 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_3) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execReduce3TAD_4) { - NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,2,3}, {10,20,30,40,50,60,70,80,90,100,110,120}, nd4j::DataType::DOUBLE); - NDArray exp('c', {}, std::vector{1820}, nd4j::DataType::FLOAT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, sd::DataType::DOUBLE); + NDArray y('c', {2,2,3}, {10,20,30,40,50,60,70,80,90,100,110,120}, sd::DataType::DOUBLE); + 
NDArray exp('c', {}, std::vector{1820}, sd::DataType::FLOAT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::FLOAT32); std::vector dimensions = {0,1,2}; @@ -2713,7 +2713,7 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_4) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execReduce3TAD(&lc, nd4j::reduce3::Dot, + NativeOpExecutioner::execReduce3TAD(&lc, sd::reduce3::Dot, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), @@ -2738,9 +2738,9 @@ TEST_F(CudaBasicsTests1, execReduce3TAD_4) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execSummaryStats_1) { - NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, nd4j::DataType::INT64); - NDArray exp('c', {}, std::vector{3.605551}, nd4j::DataType::FLOAT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, sd::DataType::INT64); + NDArray exp('c', {}, std::vector{3.605551}, sd::DataType::FLOAT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::FLOAT32); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -2752,7 +2752,7 @@ TEST_F(CudaBasicsTests1, execSummaryStats_1) { lc.setReductionPointer(reductionPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execSummaryStats(&lc, nd4j::variance::SummaryStatsStandardDeviation, + NativeOpExecutioner::execSummaryStats(&lc, sd::variance::SummaryStatsStandardDeviation, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2772,9 +2772,9 @@ TEST_F(CudaBasicsTests1, execSummaryStats_1) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execSummaryStats_2) { - NDArray 
x('c', {2,2,3}, {-5,-4,-3,-20,-1,0,1,2,3,4,5,6}, nd4j::DataType::DOUBLE); - NDArray exp('c', {2}, {3.405877, 9.715966}, nd4j::DataType::FLOAT32); - NDArray z('c', {2}, {100,100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {-5,-4,-3,-20,-1,0,1,2,3,4,5,6}, sd::DataType::DOUBLE); + NDArray exp('c', {2}, {3.405877, 9.715966}, sd::DataType::FLOAT32); + NDArray z('c', {2}, {100,100}, sd::DataType::FLOAT32); std::vector dimensions = {0,2}; @@ -2801,7 +2801,7 @@ TEST_F(CudaBasicsTests1, execSummaryStats_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execSummaryStats(&lc, nd4j::variance::SummaryStatsStandardDeviation, + NativeOpExecutioner::execSummaryStats(&lc, sd::variance::SummaryStatsStandardDeviation, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2826,9 +2826,9 @@ TEST_F(CudaBasicsTests1, execSummaryStats_2) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execSummaryStats_3) { - NDArray x('c', {2,2,3}, {-5,-4,-3,-20,-1,0,1,2,3,4,5,6}, nd4j::DataType::DOUBLE); - NDArray exp('c', {2}, {10.606602, 2.121320}, nd4j::DataType::FLOAT32); - NDArray z('c', {2}, {100,100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {-5,-4,-3,-20,-1,0,1,2,3,4,5,6}, sd::DataType::DOUBLE); + NDArray exp('c', {2}, {10.606602, 2.121320}, sd::DataType::FLOAT32); + NDArray z('c', {2}, {100,100}, sd::DataType::FLOAT32); std::vector dimensions = {1}; @@ -2855,7 +2855,7 @@ TEST_F(CudaBasicsTests1, execSummaryStats_3) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execSummaryStats(&lc, nd4j::variance::SummaryStatsStandardDeviation, + NativeOpExecutioner::execSummaryStats(&lc, sd::variance::SummaryStatsStandardDeviation, 
nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2880,9 +2880,9 @@ TEST_F(CudaBasicsTests1, execSummaryStats_3) { //////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execSummaryStatsScalar_1) { - NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, nd4j::DataType::INT64); - NDArray exp('c', {}, std::vector{3.605551}, nd4j::DataType::FLOAT32); - NDArray z('c', {}, std::vector{100}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,3}, {-5,-4,-3,-2,-1,0,1,2,3,4,5,6}, sd::DataType::INT64); + NDArray exp('c', {}, std::vector{3.605551}, sd::DataType::FLOAT32); + NDArray z('c', {}, std::vector{100}, sd::DataType::FLOAT32); // create cuda stream and LaunchContext cudaError_t cudaResult; @@ -2894,7 +2894,7 @@ TEST_F(CudaBasicsTests1, execSummaryStatsScalar_1) { lc.setReductionPointer(reductionPointer); // call cuda kernel which calculates result - NativeOpExecutioner::execSummaryStatsScalar(&lc, nd4j::variance::SummaryStatsStandardDeviation, + NativeOpExecutioner::execSummaryStatsScalar(&lc, sd::variance::SummaryStatsStandardDeviation, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2914,11 +2914,11 @@ TEST_F(CudaBasicsTests1, execSummaryStatsScalar_1) { ////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execRandom_1) { -// NDArray z('c', {10}, {100,0,0,0,0,0,0,0,0,0}, nd4j::DataType::DOUBLE); - NDArray z('c', {10}, {100,0,0,0,0,0,0,0,0,100}, nd4j::DataType::FLOAT32); - NDArray exp('c', {10}, {0.050942, -0.183229, -0.093921, 0.075469, 0.257166, -0.254838, 0.342227, -0.682188, -0.004345, 0.464633}, nd4j::DataType::FLOAT32); +// NDArray z('c', {10}, {100,0,0,0,0,0,0,0,0,0}, sd::DataType::DOUBLE); + NDArray z('c', {10}, {100,0,0,0,0,0,0,0,0,100}, sd::DataType::FLOAT32); + NDArray 
exp('c', {10}, {0.050942, -0.183229, -0.093921, 0.075469, 0.257166, -0.254838, 0.342227, -0.682188, -0.004345, 0.464633}, sd::DataType::FLOAT32); - nd4j::graph::RandomGenerator gen(119,5); + sd::graph::RandomGenerator gen(119,5); cudaError_t cudaResult; NDArray* array = &z; @@ -2942,7 +2942,7 @@ TEST_F(CudaBasicsTests1, execRandom_1) { // // // ::execRandom(extraPointers, random::GaussianDistribution, &gen, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), &extra); // // call cuda kernel which calculates result -// NativeOpExecutioner::execRandom(&lc, nd4j::random::GaussianDistribution, +// NativeOpExecutioner::execRandom(&lc, sd::random::GaussianDistribution, // &gen, // nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), // nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -2968,12 +2968,12 @@ TEST_F(CudaBasicsTests1, execRandom_1) { ////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execRandom_2) { - NDArray x('c', {10}, {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1}, nd4j::DataType::DOUBLE); - NDArray z('c', {2,5}, {100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {10}, {0., 0., 0.3, 0., 0.5, 0., 0.7, 0., 0., 1.}, nd4j::DataType::DOUBLE); + NDArray x('c', {10}, {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1}, sd::DataType::DOUBLE); + NDArray z('c', {2,5}, {100,100,100,100,100,100,100,100,100,100}, sd::DataType::DOUBLE); + NDArray exp('c', {10}, {0., 0., 0.3, 0., 0.5, 0., 0.7, 0., 0., 1.}, sd::DataType::DOUBLE); ExtraArguments extraArguments({0.7}); - nd4j::graph::RandomGenerator gen(119,5); + sd::graph::RandomGenerator gen(119,5); // // prepare input arrays for prepareDataForCuda function // std::vector> hostData; @@ -2990,7 +2990,7 @@ TEST_F(CudaBasicsTests1, execRandom_2) { // cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - 
NativeOpExecutioner::execRandom(lc, nd4j::random::DropOut, + NativeOpExecutioner::execRandom(lc, sd::random::DropOut, &gen, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -3013,11 +3013,11 @@ TEST_F(CudaBasicsTests1, execRandom_2) { ////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execRandom_3) { - NDArray z('c', {10}, {100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::DOUBLE); - NDArray exp('c', {10}, {2.373649, 2.239791, 1.887353, 2.488636, 2.068904, 2.281399, 1.828228, 2.228222, 2.490847, 1.669537}, nd4j::DataType::DOUBLE); + NDArray z('c', {10}, {100,100,100,100,100,100,100,100,100,100}, sd::DataType::DOUBLE); + NDArray exp('c', {10}, {2.373649, 2.239791, 1.887353, 2.488636, 2.068904, 2.281399, 1.828228, 2.228222, 2.490847, 1.669537}, sd::DataType::DOUBLE); std::vector extraArguments = {1.5, 2.5}; - nd4j::graph::RandomGenerator gen(119,5); + sd::graph::RandomGenerator gen(119,5); // prepare input arrays for prepareDataForCuda function std::vector> hostData; @@ -3034,7 +3034,7 @@ TEST_F(CudaBasicsTests1, execRandom_3) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execRandom(&lc, nd4j::random::UniformDistribution, + NativeOpExecutioner::execRandom(&lc, sd::random::UniformDistribution, &gen, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), devicePtrs[0]); @@ -3056,12 +3056,12 @@ TEST_F(CudaBasicsTests1, execRandom_3) { ////////////////////////////////////////////////////////////////////////// TEST_F(CudaBasicsTests1, execRandom_4) { - NDArray z('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, nd4j::DataType::FLOAT32); - NDArray exp('c', {10}, {2.373649, 2.281399, 2.239791, 1.828228, 1.887353, 2.228222, 2.488636, 2.490847, 2.068904, 1.669537}, nd4j::DataType::FLOAT32); + NDArray z('c', {2,5}, 
{1,2,3,4,5,6,7,8,9,10}, sd::DataType::FLOAT32); + NDArray exp('c', {10}, {2.373649, 2.281399, 2.239791, 1.828228, 1.887353, 2.228222, 2.488636, 2.490847, 2.068904, 1.669537}, sd::DataType::FLOAT32); z.permutei({1,0}); ExtraArguments extraArguments({1.5, 2.5}); - nd4j::graph::RandomGenerator gen(119,5); + sd::graph::RandomGenerator gen(119,5); // // prepare input arrays for prepareDataForCuda function // std::vector> hostData; @@ -3079,7 +3079,7 @@ TEST_F(CudaBasicsTests1, execRandom_4) { auto context = z.getContext(); PointersManager pm(context, "execRandom4"); // call cuda kernel which calculates result - NativeOpExecutioner::execRandom(context, nd4j::random::UniformDistribution, + NativeOpExecutioner::execRandom(context, sd::random::UniformDistribution, &gen, nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), extraArguments.argumentsAsT(z.dataType())); diff --git a/libnd4j/tests_cpu/layers_tests/CudaBasicsTests2.cu b/libnd4j/tests_cpu/layers_tests/CudaBasicsTests2.cu index 027dcdd42..b425ffcbb 100644 --- a/libnd4j/tests_cpu/layers_tests/CudaBasicsTests2.cu +++ b/libnd4j/tests_cpu/layers_tests/CudaBasicsTests2.cu @@ -19,20 +19,20 @@ // #include "testlayers.h" -#include -#include -#include -#include +#include +#include +#include +#include #include #include -#include -#include -#include +#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class CudaBasicsTests2 : public testing::Test { public: @@ -51,13 +51,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_1) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray b('f', 
{K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); // c.printIndexedBuffer(); ASSERT_TRUE(c.equalsTo(&exp)); @@ -70,12 +70,12 @@ TEST_F(CudaBasicsTests2, mmulMxM_2) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('f', {M,N}, nd4j::DataType::DOUBLE); - NDArray exp('f', {M,N}, {-1.6, -0.7, 0.2, -0.8, 0.1, 1., -0., 0.9, 1.8, 0.8, 1.7, 2.6, 1.6, 2.5, 3.4}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('f', {M,N}, sd::DataType::DOUBLE); + NDArray exp('f', {M,N}, {-1.6, -0.7, 0.2, -0.8, 0.1, 1., -0., 0.9, 1.8, 0.8, 1.7, 2.6, 1.6, 2.5, 3.4}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -87,13 +87,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_3) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('f', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, 
sd::DataType::DOUBLE); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('f', {M,N}, sd::DataType::DOUBLE); - NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -105,13 +105,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_4) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {0.1, 2.5, 4.9, 7.3, 9.7,0.3, 2.7, 5.1, 7.5, 9.9,0.5, 2.9, 5.3, 7.7, 10.1}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {0.1, 2.5, 4.9, 7.3, 9.7,0.3, 2.7, 5.1, 7.5, 9.9,0.5, 2.9, 5.3, 7.7, 10.1}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); @@ -119,7 +119,7 @@ TEST_F(CudaBasicsTests2, mmulMxM_4) { // NDArray* pB = b.permute({1,0}); // NDArray* pC = c.permute({1,0}); - // nd4j::MmulHelper::mmul(pB, pA, pC, 1., 0.); + // sd::MmulHelper::mmul(pB, pA, pC, 1., 0.); // ASSERT_TRUE(c.equalsTo(&exp)); // delete pA; @@ -134,13 +134,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_5) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, 
{1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('f', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('f', {M,N}, sd::DataType::DOUBLE); - NDArray exp('f', {M,N}, {-8.8, -4.3, 0.2, 8.6, 4.1, -0.4, -8.4, -3.9, 0.6, 8.2, 3.7, -0.8, -8.0, -3.5, 1.}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M,N}, {-8.8, -4.3, 0.2, 8.6, 4.1, -0.4, -8.4, -3.9, 0.6, 8.2, 3.7, -0.8, -8.0, -3.5, 1.}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -152,13 +152,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_6) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {-1.6, -0.8, -0.0, 0.8, 1.6, -0.7, 0.1, 0.9, 1.7, 2.5, 0.2, 1.0, 1.8, 2.6, 3.4}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {-1.6, -0.8, -0.0, 0.8, 1.6, -0.7, 0.1, 0.9, 1.7, 2.5, 0.2, 1.0, 1.8, 2.6, 3.4}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -170,13 +170,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_7) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray 
a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {-1.9, 1.3, -0.7, 0.1, 0.5, -0.9, 0.3, 0.3, -0.9, 1.5, 0.1, -0.7, 1.3, -1.9, 2.5}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {-1.9, 1.3, -0.7, 0.1, 0.5, -0.9, 0.3, 0.3, -0.9, 1.5, 0.1, -0.7, 1.3, -1.9, 2.5}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -188,13 +188,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_8) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -206,13 +206,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_9) { const Nd4jLong K = 4; const Nd4jLong 
N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('c', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('c', {M,N}, sd::DataType::FLOAT32); - NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, nd4j::DataType::FLOAT32); + NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -224,13 +224,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_10) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); // c.printIndexedBuffer(); ASSERT_TRUE(c.equalsTo(&exp)); @@ -243,13 +243,13 @@ TEST_F(CudaBasicsTests2, 
mmulMxM_11) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -265,13 +265,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_12) { const Nd4jLong K = 4; const Nd4jLong N = 4; - NDArray a('f', {M,K}, {1.,2,3,4,5,6,7,8,9,2,3,2,1,0,4,7.}, nd4j::DataType::INT8); - NDArray b('f', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-1,2,-2,3,-4,5,-6.}, nd4j::DataType::INT8); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.,2,3,4,5,6,7,8,9,2,3,2,1,0,4,7.}, sd::DataType::INT8); + NDArray b('f', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-1,2,-2,3,-4,5,-6.}, sd::DataType::INT8); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {-16., -22., -23., -25., 30., -12., -38., -70., 20., 16., 18., 18., 22., -8., -28., -52.}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {-16., -22., -23., -25., 30., -12., -38., -70., 20., 16., 18., 18., 22., -8., -28., -52.}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); // c.printBuffer(); ASSERT_TRUE(c.equalsTo(&exp)); @@ -288,13 +288,13 @@ TEST_F(CudaBasicsTests2, 
mmulMxM_13) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, nd4j::DataType::INT8); - NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::INT8); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::INT8); + NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::INT8); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {-109., -122., -135., 111., 120., 129., -121., -134., -147., 129., 144., 159., -130., -140., -150.}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {-109., -122., -135., 111., 120., 129., -121., -134., -147., 129., 144., 159., -130., -140., -150.}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -310,13 +310,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_14) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, nd4j::DataType::INT8); - NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::INT8); - NDArray c('c', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::INT8); + NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::INT8); + NDArray c('c', {M,N}, sd::DataType::FLOAT32); - NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, nd4j::DataType::FLOAT32); + NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -332,13 +332,13 @@ TEST_F(CudaBasicsTests2, 
mmulMxM_15) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); // c.printBuffer(); ASSERT_TRUE(c.equalsTo(&exp, 0.01)); @@ -355,13 +355,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_16) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp, 0.01)); } @@ -377,13 +377,13 
@@ TEST_F(CudaBasicsTests2, mmulMxM_17) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('c', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('c', {M,N}, sd::DataType::FLOAT32); - NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, nd4j::DataType::FLOAT32); + NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp, 0.01)); } @@ -399,13 +399,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_18) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('f', {M,N}, nd4j::DataType::HALF); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('f', {M,N}, sd::DataType::HALF); - NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, nd4j::DataType::HALF); + NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, sd::DataType::HALF); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp, 1e-1)); } @@ -421,13 +421,13 
@@ TEST_F(CudaBasicsTests2, mmulMxM_19) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('f', {M,N}, nd4j::DataType::HALF); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('f', {M,N}, sd::DataType::HALF); - NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, nd4j::DataType::HALF); + NDArray exp('f', {M,N}, {-1.9, -0.9, 0.1, 1.3, 0.3, -0.7, -0.7, 0.3, 1.3, 0.1, -0.9, -1.9, 0.5, 1.5, 2.5}, sd::DataType::HALF); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp, 1e-1)); } @@ -443,13 +443,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_20) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('c', {M,N}, nd4j::DataType::HALF); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('c', {M,N}, sd::DataType::HALF); - NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, nd4j::DataType::HALF); + NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, sd::DataType::HALF); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp, 1e-1)); } @@ -461,13 +461,13 
@@ TEST_F(CudaBasicsTests2, mmulMxM_21) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, nd4j::DataType::INT8); - NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::INT8); + NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -479,13 +479,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_22) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, nd4j::DataType::FLOAT32); - NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('c', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::FLOAT32); + NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('c', {M,N}, sd::DataType::FLOAT32); - NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, nd4j::DataType::FLOAT32); + NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); // c.printBuffer(); ASSERT_TRUE(c.equalsTo(&exp)); @@ -498,13 
+498,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_23) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::HALF); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::HALF); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp, 0.01)); } @@ -516,13 +516,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_24) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); 
ASSERT_TRUE(c.equalsTo(&exp, 0.01)); } @@ -534,13 +534,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_25) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('c', {M,N}, nd4j::DataType::HALF); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray b('c', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('c', {M,N}, sd::DataType::HALF); - NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, nd4j::DataType::HALF); + NDArray exp('c', {M,N}, {-8.8, 8.6, -8.4, 8.2, -8.0, -4.3, 4.1, -3.9, 3.7, -3.5, 0.2, -0.4, 0.6, -0.8, 1.}, sd::DataType::HALF); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp, 0.01)); } @@ -553,13 +553,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_26) { const Nd4jLong N = 5; // 3x4 * 4x5 = 3x5 - NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, nd4j::DataType::INT64); - NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::FLOAT32); - NDArray c('c', {M,N}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,K}, {1.,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::INT64); + NDArray b('c', {K,N}, {-2,-3,0,1,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::FLOAT32); + NDArray c('c', {M,N}, sd::DataType::DOUBLE); - NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M,N}, {-45., 43., -49., 53., -50., -97., 79., -101., 113., -90., -149., 115., -153., 173., -130.}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 
0.); // c.printBuffer(); ASSERT_TRUE(c.equalsTo(&exp)); @@ -572,13 +572,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_27) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::HALF); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::HALF); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {0.1, 0.3, 0.5, 2.5, 2.7, 2.9, 4.9, 5.1, 5.3, 7.3, 7.5, 7.7, 9.7, 9.9, 10.1}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); // c.printBuffer(); ASSERT_TRUE(c.equalsTo(&exp, 0.01)); @@ -591,13 +591,13 @@ TEST_F(CudaBasicsTests2, mmulMxM_28) { const Nd4jLong K = 4; const Nd4jLong N = 5; - NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, nd4j::DataType::DOUBLE); - NDArray c('f', {M,N}, nd4j::DataType::FLOAT32); + NDArray a('c', {M,K}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray b('f', {K,N}, {1,-2,3,-4,5,-6,7,-8,9,-10,11,-12,13,-14,15,-16,17,-18,19,-20}, sd::DataType::DOUBLE); + NDArray c('f', {M,N}, sd::DataType::FLOAT32); - NDArray exp('f', {M,N}, {-1.6, -0.7, 0.2, -0.8, 0.1, 1., -0., 0.9, 1.8, 0.8, 1.7, 2.6, 1.6, 2.5, 3.4}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M,N}, {-1.6, -0.7, 0.2, -0.8, 0.1, 1., -0., 0.9, 1.8, 0.8, 1.7, 2.6, 1.6, 2.5, 3.4}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &b, &c, 
1., 0.); + sd::MmulHelper::mmul(&a, &b, &c, 1., 0.); ASSERT_TRUE(c.equalsTo(&exp)); } @@ -609,13 +609,13 @@ TEST_F(CudaBasicsTests2, mmulMxV_1) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray x('f', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray x('f', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {0.1, 0.3, 0.5}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {0.1, 0.3, 0.5}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -625,13 +625,13 @@ TEST_F(CudaBasicsTests2, mmulMxV_2) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray x('f', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray x('f', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {-1.6, -0.7, 0.2}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-1.6, -0.7, 0.2}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -641,13 +641,13 @@ TEST_F(CudaBasicsTests2, mmulMxV_3) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray x('c', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, 
sd::DataType::DOUBLE); + NDArray x('c', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {-1.6, -0.7, 0.2}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-1.6, -0.7, 0.2}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -657,13 +657,13 @@ TEST_F(CudaBasicsTests2, mmulMxV_4) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray x('c', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('c', {M}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray x('c', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('c', {M}, sd::DataType::DOUBLE); - NDArray exp('c', {M}, {-1.6, -0.7, 0.2}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M}, {-1.6, -0.7, 0.2}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -673,13 +673,13 @@ TEST_F(CudaBasicsTests2, mmulMxV_5) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray x('c', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('c', {M}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray x('c', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('c', {M}, sd::DataType::DOUBLE); - NDArray exp('c', {M}, {0.1, 0.3, 0.5}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M}, {0.1, 0.3, 0.5}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -689,14 +689,14 @@ TEST_F(CudaBasicsTests2, mmulMxV_6) { const Nd4jLong M = 3; const 
Nd4jLong N = 4; - NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(6, {0,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {5.5, 5.1, 4.7}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {5.5, 5.1, 4.7}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -706,15 +706,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_7) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(6, {0,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {5.1, 3.3, 1.5}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {5.1, 3.3, 1.5}, sd::DataType::DOUBLE); - 
nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -724,15 +724,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_8) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {N,M,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {N,M,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(4, {1,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {6.2, 4.5, 1.7}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {6.2, 4.5, 1.7}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -742,15 +742,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_9) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(3, {0,1}); - NDArray y('f', 
{M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {1.5, 1.8, 1.5}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {1.5, 1.8, 1.5}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -760,15 +760,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_10) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(2, {0,1}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -778,15 +778,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_11) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('c', {5,N,M}, 
{16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(13, {0,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {-12.1, -10.9, -9.7}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-12.1, -10.9, -9.7}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -796,15 +796,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_12) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(10, {0,2}); - NDArray y('c', {M}, nd4j::DataType::DOUBLE); + NDArray y('c', {M}, sd::DataType::DOUBLE); - NDArray exp('c', {M}, {3.3, 3.3, 3.3}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M}, {3.3, 3.3, 3.3}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -814,15 +814,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_13) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray 
temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(2, {0,1}, true); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -832,15 +832,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_14) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(10, {0,2}, true); - NDArray y('c', {M}, nd4j::DataType::DOUBLE); + NDArray y('c', {M}, sd::DataType::DOUBLE); - NDArray exp('c', {M}, {3.3, 3.3, 3.3}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M}, {3.3, 3.3, 3.3}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -850,15 +850,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_15) { const Nd4jLong M = 3; const Nd4jLong N = 4; - 
NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(2, {0,1}); NDArray y = temp(17, {0,2}); - NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -868,16 +868,16 @@ TEST_F(CudaBasicsTests2, mmulMxV_16) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); - NDArray temp1('c', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); + NDArray temp1('c', {5,M,N}, 
{16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(2, {0,1}); NDArray y = temp1(17, {0,2}); - NDArray exp('c', {M}, {-0.3, 0.3, 0.9}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M}, {-0.3, 0.3, 0.9}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -887,16 +887,16 @@ TEST_F(CudaBasicsTests2, mmulMxV_17) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(2, {0,1}); NDArray y = temp(17, {0,2}, true); // y.printShapeInfo(); - NDArray exp('f', {1,M,1}, {-0.3, 0.3, 0.9}, nd4j::DataType::DOUBLE); + NDArray exp('f', {1,M,1}, {-0.3, 0.3, 0.9}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -906,16 +906,16 @@ TEST_F(CudaBasicsTests2, mmulMxV_18) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, 
{16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); - NDArray temp1('c', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); + NDArray temp1('c', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(2, {0,1},true); NDArray y = temp1(17, {0,2},true); - NDArray exp('c', {1,M,1}, {-0.3, 0.3, 0.9}, nd4j::DataType::DOUBLE); + NDArray exp('c', {1,M,1}, {-0.3, 0.3, 0.9}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -926,13 +926,13 @@ TEST_F(CudaBasicsTests2, mmulMxV_19) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray x('f', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('f', {M}, nd4j::DataType::FLOAT32); + NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray x('f', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::FLOAT32); - NDArray exp('f', {M}, {0.1, 0.3, 0.5}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M}, {0.1, 0.3, 0.5}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -942,12 +942,12 @@ TEST_F(CudaBasicsTests2, mmulMxV_20) { const Nd4jLong M = 3; const Nd4jLong N = 
4; - NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray x('f', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('f', {M}, nd4j::DataType::FLOAT32); - NDArray exp('f', {M}, {-1.6, -0.7, 0.2}, nd4j::DataType::FLOAT32); + NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray x('f', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::FLOAT32); + NDArray exp('f', {M}, {-1.6, -0.7, 0.2}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -957,12 +957,12 @@ TEST_F(CudaBasicsTests2, mmulMxV_21) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray x('c', {N}, {1,-2,3,-4}, nd4j::DataType::DOUBLE); - NDArray y('c', {M}, nd4j::DataType::FLOAT32); - NDArray exp('c', {M}, {-1.6, -0.7, 0.2}, nd4j::DataType::FLOAT32); + NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray x('c', {N}, {1,-2,3,-4}, sd::DataType::DOUBLE); + NDArray y('c', {M}, sd::DataType::FLOAT32); + NDArray exp('c', {M}, {-1.6, -0.7, 0.2}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -972,14 +972,14 @@ TEST_F(CudaBasicsTests2, mmulMxV_22) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray temp('f', {M,N,5}, 
{16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(6, {0,2}); - NDArray y('f', {M}, nd4j::DataType::FLOAT32); + NDArray y('f', {M}, sd::DataType::FLOAT32); - NDArray exp('f', {M}, {5.5, 5.1, 4.7}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M}, {5.5, 5.1, 4.7}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -989,15 +989,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_23) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(3, {0,1}); - NDArray y('f', {M}, nd4j::DataType::FLOAT32); + NDArray y('f', {M}, sd::DataType::FLOAT32); - NDArray exp('f', {M}, {1.5, 1.8, 1.5}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M}, {1.5, 1.8, 1.5}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -1007,14 +1007,14 @@ TEST_F(CudaBasicsTests2, mmulMxV_24) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray temp('f', {M,N,5}, 
{16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(6, {0,2},true); - NDArray y('f', {M}, nd4j::DataType::FLOAT32); + NDArray y('f', {M}, sd::DataType::FLOAT32); - NDArray exp('f', {M}, {5.5, 5.1, 4.7}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M}, {5.5, 5.1, 4.7}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -1024,15 +1024,15 @@ TEST_F(CudaBasicsTests2, mmulMxV_25) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(3, {0,1}, true); - NDArray y('f', {M}, nd4j::DataType::FLOAT32); + NDArray y('f', {M}, sd::DataType::FLOAT32); - NDArray exp('f', {M}, {1.5, 1.8, 1.5}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M}, {1.5, 1.8, 1.5}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -1042,16 +1042,16 
@@ TEST_F(CudaBasicsTests2, mmulMxV_26) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); - NDArray temp1('c', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::FLOAT32); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); + NDArray temp1('c', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::FLOAT32); NDArray x = temp(2, {0,1}); NDArray y = temp1(17, {0,2}); - NDArray exp('c', {M}, {-0.3, 0.3, 0.9}, nd4j::DataType::FLOAT32); + NDArray exp('c', {M}, {-0.3, 0.3, 0.9}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -1061,16 +1061,16 @@ TEST_F(CudaBasicsTests2, mmulMxV_27) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); - NDArray temp1('c', 
{5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::FLOAT32); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); + NDArray temp1('c', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::FLOAT32); NDArray x = temp(2, {0,1},true); NDArray y = temp1(17, {0,2},true); - NDArray exp('c', {1,M,1}, {-0.3, 0.3, 0.9}, nd4j::DataType::FLOAT32); + NDArray exp('c', {1,M,1}, {-0.3, 0.3, 0.9}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -1080,14 +1080,14 @@ TEST_F(CudaBasicsTests2, mmulMxV_28) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::FLOAT32); - NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray a('c', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::FLOAT32); + NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(6, {0,2}); - NDArray y('f', {M}, nd4j::DataType::FLOAT32); + NDArray y('f', {M}, sd::DataType::FLOAT32); - NDArray exp('f', {M}, {5.1, 3.3, 1.5}, nd4j::DataType::FLOAT32); + NDArray exp('f', {M}, {5.1, 3.3, 1.5}, sd::DataType::FLOAT32); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); 
ASSERT_TRUE(y.equalsTo(&exp)); } @@ -1096,13 +1096,13 @@ TEST_F(CudaBasicsTests2, mmulDot_1) { const Nd4jLong N = 4; - NDArray x('c', {N}, {1, 2, 3, 4}, nd4j::DataType::INT32); - NDArray y('f', {N}, {0.1, 0.2, 0.3, 0.4}, nd4j::DataType::FLOAT32); - NDArray z(nd4j::DataType::DOUBLE); + NDArray x('c', {N}, {1, 2, 3, 4}, sd::DataType::INT32); + NDArray y('f', {N}, {0.1, 0.2, 0.3, 0.4}, sd::DataType::FLOAT32); + NDArray z(sd::DataType::DOUBLE); - NDArray exp('c', {}, {3}, nd4j::DataType::DOUBLE); + NDArray exp('c', {}, {3}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&x, &y, &z); + sd::MmulHelper::mmul(&x, &y, &z); ASSERT_TRUE(z.equalsTo(&exp)); } @@ -1111,13 +1111,13 @@ TEST_F(CudaBasicsTests2, mmulDot_2) { const Nd4jLong N = 4; - NDArray x('c', {1,1,N}, {1,2, 3, 4}, nd4j::DataType::INT32); - NDArray y('f', {1,1,N,1,1,1}, {0.1, 0.2, 0.3, 0.4}, nd4j::DataType::FLOAT32); - NDArray z(nd4j::DataType::DOUBLE); + NDArray x('c', {1,1,N}, {1,2, 3, 4}, sd::DataType::INT32); + NDArray y('f', {1,1,N,1,1,1}, {0.1, 0.2, 0.3, 0.4}, sd::DataType::FLOAT32); + NDArray z(sd::DataType::DOUBLE); - NDArray exp('c', {}, {3}, nd4j::DataType::DOUBLE); + NDArray exp('c', {}, {3}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&x, &y, &z); + sd::MmulHelper::mmul(&x, &y, &z); ASSERT_TRUE(z.equalsTo(&exp)); } @@ -1126,15 +1126,15 @@ TEST_F(CudaBasicsTests2, mmulDot_3) { const Nd4jLong N = 4; - NDArray xBig('c', {4,2}, {1, 0, 2, 0, 3, 0, 4, 0}, nd4j::DataType::INT32); - NDArray yBig('c', {4,3}, {0.1, 0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.4, 0,0}, nd4j::DataType::FLOAT32); + NDArray xBig('c', {4,2}, {1, 0, 2, 0, 3, 0, 4, 0}, sd::DataType::INT32); + NDArray yBig('c', {4,3}, {0.1, 0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.4, 0,0}, sd::DataType::FLOAT32); NDArray x = xBig(0, {1}, true); NDArray y = yBig(0, {1}, true); - NDArray z(nd4j::DataType::DOUBLE); + NDArray z(sd::DataType::DOUBLE); - NDArray exp('c', {}, {3}, nd4j::DataType::DOUBLE); + NDArray exp('c', {}, {3}, sd::DataType::DOUBLE); - 
nd4j::MmulHelper::mmul(&x, &y, &z); + sd::MmulHelper::mmul(&x, &y, &z); ASSERT_TRUE(z.equalsTo(&exp)); } @@ -1143,15 +1143,15 @@ TEST_F(CudaBasicsTests2, mmulDot_4) { const Nd4jLong N = 4; - NDArray xBig('f', {4,2}, {1, 2, 3, 4, 0, 0, 0, 0}, nd4j::DataType::INT32); - NDArray yBig('c', {4,3}, {0.1, 0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.4, 0,0}, nd4j::DataType::FLOAT32); + NDArray xBig('f', {4,2}, {1, 2, 3, 4, 0, 0, 0, 0}, sd::DataType::INT32); + NDArray yBig('c', {4,3}, {0.1, 0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.4, 0,0}, sd::DataType::FLOAT32); NDArray x = xBig(0, {1}, true); NDArray y = yBig(0, {1}); - NDArray z(nd4j::DataType::DOUBLE); + NDArray z(sd::DataType::DOUBLE); - NDArray exp('c', {}, {3}, nd4j::DataType::DOUBLE); + NDArray exp('c', {}, {3}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&x, &y, &z); + sd::MmulHelper::mmul(&x, &y, &z); ASSERT_TRUE(z.equalsTo(&exp)); } */ \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/CudaExtraArgumentsTests.cu b/libnd4j/tests_cpu/layers_tests/CudaExtraArgumentsTests.cu index fd8493db8..30d58946c 100644 --- a/libnd4j/tests_cpu/layers_tests/CudaExtraArgumentsTests.cu +++ b/libnd4j/tests_cpu/layers_tests/CudaExtraArgumentsTests.cu @@ -24,7 +24,7 @@ #include #include -using namespace nd4j; +using namespace sd; class CudaExtraArgumentsTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/CudaLaunchHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/CudaLaunchHelperTests.cpp index 7d3753dbd..66cc024bf 100644 --- a/libnd4j/tests_cpu/layers_tests/CudaLaunchHelperTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/CudaLaunchHelperTests.cpp @@ -21,8 +21,8 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class CudaLaunchHelperTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp b/libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp index 03d7bb38e..42ab543b1 100644 
--- a/libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/DataBufferTests.cpp @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include @@ -29,9 +29,9 @@ #include #include -using namespace nd4j; -using namespace nd4j::graph; -using namespace nd4j::memory; +using namespace sd; +using namespace sd::graph; +using namespace sd::memory; class DataBufferTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu b/libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu index 4f309cff5..730ade824 100644 --- a/libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu +++ b/libnd4j/tests_cpu/layers_tests/DataBufferTestsCuda.cu @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include @@ -29,9 +29,9 @@ #include #include -using namespace nd4j; -using namespace nd4j::graph; -using namespace nd4j::memory; +using namespace sd; +using namespace sd::graph; +using namespace sd::memory; class DataBufferTestsCuda : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp index 9ee23c36d..830b861a4 100644 --- a/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/DataTypesValidationTests.cpp @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include @@ -29,8 +29,8 @@ #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DataTypesValidationTests : public testing::Test { public: @@ -45,7 +45,7 @@ TEST_F(DataTypesValidationTests, Basic_Test_1) { weights.assign(2.0); input.linspace(1); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto result = op.evaluate({&input, &weights}, 
{1, 1, 1, 1, 0, 0, 1, 1, 0, 0}); ASSERT_EQ(ND4J_STATUS_VALIDATION, result->status()); @@ -61,7 +61,7 @@ TEST_F(DataTypesValidationTests, Basic_Test_2) { weights.assign(2.0); input.linspace(1); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto result = op.evaluate({&input, &weights}, {1, 1, 1, 1, 0, 0, 1, 1, 0, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -82,7 +82,7 @@ TEST_F(DataTypesValidationTests, Basic_Test_3) { weights.assign(2.0); input.linspace(1); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto result = op.execute({&input, &weights}, {&out}, {}, {1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, {}); ASSERT_EQ(Status::OK(), result); @@ -98,7 +98,7 @@ TEST_F(DataTypesValidationTests, Basic_Test_4) { weights.assign(2.0); input.linspace(1); - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto result = op.execute({&input, &weights}, {&out}, {}, {1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, {}); ASSERT_EQ(ND4J_STATUS_VALIDATION, result); } @@ -138,7 +138,7 @@ TEST_F(DataTypesValidationTests, test_bits_hamming_distance_1) { ctx.setInputArray(1, &y); ctx.setOutputArray(0, &z); - nd4j::ops::bits_hamming_distance op; + sd::ops::bits_hamming_distance op; auto status = op.execute(&ctx); ASSERT_NE(Status::OK(), status); } @@ -153,7 +153,7 @@ TEST_F(DataTypesValidationTests, test_bits_hamming_distance_2) { ctx.setInputArray(1, &y); ctx.setOutputArray(0, &z); - nd4j::ops::bits_hamming_distance op; + sd::ops::bits_hamming_distance op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index 507a507af..dfd74d6ab 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -19,21 +19,21 @@ // #include "testlayers.h" -#include +#include #include -#include -#include +#include +#include #include #include #include -#include +#include #include -#include +#include 
#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DeclarableOpsTests1 : public testing::Test { public: @@ -55,11 +55,11 @@ public: const int oW = (iW - kW - (kW-1)*(dW-1) + 2*pW)/sW + 1; // output width DeclarableOpsTests1() { - nd4j::memory::MemoryTracker::getInstance()->reset(); + sd::memory::MemoryTracker::getInstance()->reset(); } ~DeclarableOpsTests1() { - nd4j::memory::MemoryTracker::getInstance()->summarize(); + sd::memory::MemoryTracker::getInstance()->summarize(); } }; @@ -93,7 +93,7 @@ TYPED_TEST_CASE(TypedDeclarableOpsTests1, TestingTypes); ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, BasicInitialization1) { - auto concat = new nd4j::ops::concat(); + auto concat = new sd::ops::concat(); std::string expName("concat"); ASSERT_EQ(expName, *(concat->getOpName())); @@ -144,7 +144,7 @@ TEST_F(DeclarableOpsTests1, BasicInitialization1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, BasicInitialization2) { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation("concat"); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation("concat"); ASSERT_TRUE(op != nullptr); std::string expName("concat"); @@ -160,7 +160,7 @@ TEST_F(DeclarableOpsTests1, ApplyGradientDescent_1) { auto y = NDArrayFactory::create('c', {3,4}, {0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2}); auto exp = NDArrayFactory::create('c', {3,4}); exp.linspace(0.9, 0.9); - nd4j::ops::apply_sgd op; + sd::ops::apply_sgd op; auto result = op.evaluate({&x, &y}, {1.}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); @@ -174,7 +174,7 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_1) { auto x = NDArrayFactory::create('c', {3,4}, {1,2,3,4,5,6,7,8,9,10,11,12}); auto y = NDArrayFactory::create('c', {1,4}, {0.1,0.2,0.3,0.4}); auto exp = NDArrayFactory::create('c', {3,4}, {0.1, 0.2, 
0.3, 0.4, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4}); - nd4j::ops::assign op; + sd::ops::assign op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); @@ -190,7 +190,7 @@ TEST_F(DeclarableOpsTests1, AssignBroadcastTest_2) { auto eps = NDArrayFactory::create('c', {3,4}, {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4}); auto exp1 = NDArrayFactory::create('c', {3,4}); // zero auto exp2 = NDArrayFactory::create('c', {1,4}, {3, 6, 9, 12}); - nd4j::ops::assign_bp op; + sd::ops::assign_bp op; auto result = op.evaluate({&x, &y, &eps}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z1 = result->at(0); @@ -207,7 +207,7 @@ TEST_F(DeclarableOpsTests1, AXpY_Test_1) { auto y = NDArrayFactory::create('c', {3,4}, {1,2,3,4,5,6,7,8,9,10,11,12}); auto exp = NDArrayFactory::create('c', {3,4}); exp.linspace(3, 3); - nd4j::ops::axpy op; + sd::ops::axpy op; auto result = op.evaluate({&x, &y}, {2.}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto z = result->at(0); @@ -217,19 +217,19 @@ TEST_F(DeclarableOpsTests1, AXpY_Test_1) { } TEST_F(DeclarableOpsTests1, BasicInitialization3) { - auto op1 = nd4j::ops::OpRegistrator::getInstance()->getOperation("concat"); + auto op1 = sd::ops::OpRegistrator::getInstance()->getOperation("concat"); std::string expName("concat"); - auto hash = nd4j::ops::HashHelper::getInstance()->getLongHash(expName); + auto hash = sd::ops::HashHelper::getInstance()->getLongHash(expName); - auto op2 = nd4j::ops::OpRegistrator::getInstance()->getOperation(hash); + auto op2 = sd::ops::OpRegistrator::getInstance()->getOperation(hash); ASSERT_TRUE(op1 == op2); } TEST_F(DeclarableOpsTests1, SynonymInitialization2) { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation("Mul"); - auto op2 = nd4j::ops::OpRegistrator::getInstance()->getOperation("multiply"); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation("Mul"); + auto op2 = sd::ops::OpRegistrator::getInstance()->getOperation("multiply"); 
ASSERT_TRUE(op != nullptr); std::string expName("multiply"); @@ -240,15 +240,15 @@ TEST_F(DeclarableOpsTests1, SynonymInitialization2) { TEST_F(DeclarableOpsTests1, TestTensorMmul1) { - NDArray x('c', {2, 3, 4}, nd4j::DataType::FLOAT32); - NDArray y('c', {2, 3, 4}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 3, 4}, sd::DataType::FLOAT32); + NDArray y('c', {2, 3, 4}, sd::DataType::FLOAT32); x.linspace(1); y.linspace(1); - NDArray exp('c', {2, 2}, {650.0, 1586.0, 1586.0, 4250.0}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 2}, {650.0, 1586.0, 1586.0, 4250.0}, sd::DataType::FLOAT32); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,1,2,2,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -263,12 +263,12 @@ TEST_F(DeclarableOpsTests1, TestTensorMmul1) { TEST_F(DeclarableOpsTests1, TestTensorDot2) { - NDArray x('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, nd4j::DataType::FLOAT32); - NDArray y('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, nd4j::DataType::FLOAT32); + NDArray x('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, sd::DataType::FLOAT32); + NDArray y('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, sd::DataType::FLOAT32); - NDArray exp('c', {2, 2}, {2300.0, 2444.0, 2444.0, 2600.0}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 2}, {2300.0, 2444.0, 2444.0, 2600.0}, sd::DataType::FLOAT32); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,1,2,2,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -283,12 +283,12 @@ TEST_F(DeclarableOpsTests1, TestTensorDot2) { TEST_F(DeclarableOpsTests1, TestTensorDot3) { - NDArray x('c', {2, 3, 
4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, nd4j::DataType::FLOAT32); - NDArray y('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, sd::DataType::FLOAT32); + NDArray y('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, sd::DataType::FLOAT32); - NDArray exp('f', {2, 2}, {1090.0, 2818.0, 1168.0, 3040.0}, nd4j::DataType::FLOAT32); + NDArray exp('f', {2, 2}, {1090.0, 2818.0, 1168.0, 3040.0}, sd::DataType::FLOAT32); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,1,2,2,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -303,12 +303,12 @@ TEST_F(DeclarableOpsTests1, TestTensorDot3) { TEST_F(DeclarableOpsTests1, TestTensorDot4) { - NDArray x('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, nd4j::DataType::FLOAT32); - NDArray y('c', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, nd4j::DataType::FLOAT32); + NDArray x('f', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, sd::DataType::FLOAT32); + NDArray y('c', {2, 3, 4}, {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.}, sd::DataType::FLOAT32); - NDArray exp('f', {2, 2}, {1090.0, 1168.0, 2818.0, 3040.0}, nd4j::DataType::FLOAT32); + NDArray exp('f', {2, 2}, {1090.0, 1168.0, 2818.0, 3040.0}, sd::DataType::FLOAT32); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = 
op.evaluate({&x, &y}, {}, {2,1,2,2,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -328,7 +328,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot5) { auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {2,4,2,4}, {44,110,160, 66,132, 38, 88,154, 68,170,224,102,204, 82,136,238, 92,230,288,138,276,126,184,322, 116,290,352,174,348,170,232,406, 76,190,160,114,228,182,152,266, 100,250,224,150,300,226,200,350, 124,310,288,186,372,270,248,434, 148,370,352,222,444,314,296,518}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -350,7 +350,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot6) { auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {2,4,2,4}, {22, 66,110,154, 44, 88,132,176, 34,102,170,238, 68,136,204,272, 46,138,230,322, 92,184,276,368, 58,174,290,406,116,232,348,464, 38,114,190,266, 76,152,228,304, 50,150,250,350,100,200,300,400, 62,186,310,434,124,248,372,496, 74,222,370,518,148,296,444,592}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -371,7 +371,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot7) { auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {2,4,2,4}, {76,166,112,106,196, 62,136,226, 60,174,208, 98,212,230,136,250, 76,214,336,122,260,174,168,306, 124,286,240,178,340,150,232,394, 100,226,176,142,268,106,184,310, 84,234,272,134,284,274,184,334, 100,274,400,158,332,218,216,390, 148,346,304,214,412,194,280,478}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); 
ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -392,7 +392,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot8) { auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {2,4,2,4}, {30, 90,150,210, 60,120,180,240, 38,114,190,266, 76,152,228,304, 46,138,230,322, 92,184,276,368, 54,162,270,378,108,216,324,432, 42,126,210,294, 84,168,252,336, 50,150,250,350,100,200,300,400, 58,174,290,406,116,232,348,464, 66,198,330,462,132,264,396,528}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {1,1,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -409,7 +409,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot8) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestTensorDot9) { - // NDArray z('f',{2,2,3}, nd4j::DataType::DOUBLE); + // NDArray z('f',{2,2,3}, sd::DataType::DOUBLE); // z.linspace(1); // z.printShapeInfo(); // z.printIndexedBuffer(); @@ -421,7 +421,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot9) { auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {3,4,4,3}, {14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422, 62, 62, 62,142,142,142,222,222,222,302,302,302, 62, 62, 62,142,142,142,222,222,222,302,302,302, 38, 38, 38, 86, 86, 86,134,134,134,182,182,182, 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 86, 86, 86,198,198,198,310,310,310,422,422,422}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {1,0,1,0}); ASSERT_EQ(ND4J_STATUS_OK, 
results->status()); @@ -442,7 +442,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot10) { auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {4,4}, {114,258,402,546, 138,314,490,666, 162,370,578,786, 186,426,666,906}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -464,7 +464,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot11) { auto y = NDArrayFactory::create('f', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {4,4}, {98,218,338,458, 134,302,470,638, 170,386,602,818, 206,470,734,998}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -485,7 +485,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot12) { auto y = NDArrayFactory::create('c', {2,4,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {4,4}, {272,292,312,332, 368,396,424,452, 464,500,536,572, 560,604,648,692}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,0,1, 2,0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -506,7 +506,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot13) { auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {3,3}, {640,560,640, 576,624,576, 640,560,640}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -527,7 +527,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot14) { auto y = NDArrayFactory::create('c', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 
2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {3,3}, {648,600,520, 648,536,648, 520,600,648}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -548,7 +548,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot15) { auto y = NDArrayFactory::create('f', {4,2,3}, {2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16, 2,4,6,8,10,12,14,16}); auto expected = NDArrayFactory::create('c', {3,3}, {624,624,624, 656,656,656, 624,624,624}); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {2,0,2, 2,1,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -565,11 +565,11 @@ TEST_F(DeclarableOpsTests1, TestTensorDot15) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestTensorDot16) { - NDArray x('c', {1}, std::vector{2}, nd4j::DataType::FLOAT32); - NDArray y('c', {2,1,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,2}, {2,4,6,8}, nd4j::DataType::FLOAT32); + NDArray x('c', {1}, std::vector{2}, sd::DataType::FLOAT32); + NDArray y('c', {2,1,2}, {1,2,3,4}, sd::DataType::FLOAT32); + NDArray exp('c', {2,2}, {2,4,6,8}, sd::DataType::FLOAT32); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto results = op.evaluate({&x, &y}, {}, {1,0, 1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -585,11 +585,11 @@ TEST_F(DeclarableOpsTests1, TestTensorDot16) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestTensorDot17) { - NDArray x('f', {16,16}, nd4j::DataType::FLOAT32); - NDArray y('f', {1000,16}, nd4j::DataType::FLOAT32); - NDArray z('c', {16,1000}, nd4j::DataType::FLOAT32); + NDArray x('f', {16,16}, sd::DataType::FLOAT32); + NDArray y('f', {1000,16}, sd::DataType::FLOAT32); + NDArray z('c', {16,1000}, sd::DataType::FLOAT32); - nd4j::ops::tensormmul op; + sd::ops::tensormmul op; auto status = op.execute({&x, 
&y}, {&z}, {}, {1,1, 1,1}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -597,7 +597,7 @@ TEST_F(DeclarableOpsTests1, TestTensorDot17) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, DivergentCheck1) { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation("switch"); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation("switch"); ASSERT_TRUE(op != nullptr); std::string expName("Switch"); @@ -622,7 +622,7 @@ TEST_F(DeclarableOpsTests1, AddMatrices1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::add addOp; + sd::ops::add addOp; addOp.execute(block); @@ -650,7 +650,7 @@ TEST_F(DeclarableOpsTests1, AddVectorVector1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::add addOp; + sd::ops::add addOp; addOp.execute(block); @@ -677,7 +677,7 @@ TEST_F(DeclarableOpsTests1, AddMatrixScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::add addOp; + sd::ops::add addOp; addOp.execute(block); @@ -703,7 +703,7 @@ TEST_F(DeclarableOpsTests1, AddScalarScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::add addOp; + sd::ops::add addOp; addOp.execute(block); @@ -729,7 +729,7 @@ TEST_F(DeclarableOpsTests1, SubtractMatrices1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::subtract subOp; + sd::ops::subtract subOp; subOp.execute(block); @@ -756,7 +756,7 @@ TEST_F(DeclarableOpsTests1, SubtractTest_1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::subtract subOp; + sd::ops::subtract subOp; subOp.execute(block); @@ -778,7 +778,7 @@ TEST_F(DeclarableOpsTests1, SubtractTest_2) { exp.assign(2); - nd4j::ops::subtract subOp; + sd::ops::subtract subOp; auto res = subOp.evaluate({&x, &y}); @@ -793,7 +793,7 @@ TEST_F(DeclarableOpsTests1, TestRng1) 
{ /* Nd4jLong *buffer = new Nd4jLong[100000]; - nd4j::random::RandomBuffer *rng = (nd4j::random::RandomBuffer *) initRandom(nullptr, 123, 100000, (Nd4jPointer) buffer); + sd::random::RandomBuffer *rng = (sd::random::RandomBuffer *) initRandom(nullptr, 123, 100000, (Nd4jPointer) buffer); if (rng == nullptr) throw std::runtime_error("RNG initialization failed"); @@ -807,7 +807,7 @@ TEST_F(DeclarableOpsTests1, TestRng1) { block->getTArguments()->push_back(0.0f); block->getTArguments()->push_back(1.0f); - nd4j::ops::randomuniform uniform; + sd::ops::randomuniform uniform; Nd4jStatus status = uniform.execute(block); @@ -843,7 +843,7 @@ TEST_F(DeclarableOpsTests1, MergeSumTest1) { auto block = new Context(1, variableSpace, false); block->fillInputs({-1, -2, -3}); - nd4j::ops::mergeadd merge; + sd::ops::mergeadd merge; merge.execute(block); @@ -876,7 +876,7 @@ TEST_F(DeclarableOpsTests1, ClipByValue1) { block->getTArguments()->push_back(3.0f); block->fillInputs({-1}); - nd4j::ops::clipbyvalue clip; + sd::ops::clipbyvalue clip; clip.execute(block); @@ -909,7 +909,7 @@ TEST_F(DeclarableOpsTests1, MergeAvgTest1) { auto block = new Context(1, variableSpace, false); block->fillInputs({-1, -2, -3}); - nd4j::ops::mergeavg merge; + sd::ops::mergeavg merge; merge.execute(block); @@ -938,7 +938,7 @@ TEST_F(DeclarableOpsTests1, SubtractVectorVector1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::subtract subOp; + sd::ops::subtract subOp; subOp.execute(block); @@ -966,7 +966,7 @@ TEST_F(DeclarableOpsTests1, SubtractMatrixScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::subtract subOp; + sd::ops::subtract subOp; subOp.execute(block); @@ -993,7 +993,7 @@ TEST_F(DeclarableOpsTests1, SubtractScalarScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::subtract subOp; + sd::ops::subtract subOp; subOp.execute(block); @@ -1019,7 +1019,7 @@ 
TEST_F(DeclarableOpsTests1, ReverseSubtractMatrices1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversesubtract subOp; + sd::ops::reversesubtract subOp; subOp.execute(block); @@ -1039,7 +1039,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_1) { y.assign(1.f); exp.assign(-2.f); - nd4j::ops::reversesubtract subOp; + sd::ops::reversesubtract subOp; auto res = subOp.evaluate({&x, &y}); @@ -1064,7 +1064,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_2) { ASSERT_TRUE(exp.equalsTo(&z)); - nd4j::ops::reversesubtract subOp; + sd::ops::reversesubtract subOp; auto res = subOp.evaluate({&x, &y}); @@ -1087,7 +1087,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractTest_3) { exp.assign(2); x.applyTrueBroadcast(BROADCAST(ReverseSubtract), y, z, true); ASSERT_TRUE(z.equalsTo(&exp)); - nd4j::ops::reversesubtract subOp; + sd::ops::reversesubtract subOp; auto res = subOp.evaluate({&x, &y}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); @@ -1113,7 +1113,7 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_1) { x.applyTrueBroadcast(BROADCAST(ReverseMod), y, exp, true); ASSERT_TRUE(exp.equalsTo(&z)); - nd4j::ops::reversemod subOp; + sd::ops::reversemod subOp; auto res = subOp.evaluate({&x, &y}); @@ -1140,7 +1140,7 @@ TEST_F(DeclarableOpsTests1, ReverseModTest_2) { x.applyTrueBroadcast(BROADCAST(ReverseMod), y, exp, true); ASSERT_TRUE(z.equalsTo(&exp)); - nd4j::ops::reversemod subOp; + sd::ops::reversemod subOp; auto res = subOp.evaluate({&x, &y}); @@ -1166,7 +1166,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractVectorVector1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversesubtract subOp; + sd::ops::reversesubtract subOp; subOp.execute(block); @@ -1194,7 +1194,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractMatrixScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversesubtract subOp; + sd::ops::reversesubtract subOp; 
subOp.execute(block); @@ -1222,7 +1222,7 @@ TEST_F(DeclarableOpsTests1, ReverseSubtractScalarScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversesubtract subOp; + sd::ops::reversesubtract subOp; subOp.execute(block); @@ -1249,7 +1249,7 @@ TEST_F(DeclarableOpsTests1, MultiplyMatrices1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::multiply mul; + sd::ops::multiply mul; mul.execute(block); @@ -1276,7 +1276,7 @@ TEST_F(DeclarableOpsTests1, MultiplyVectorVector1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::multiply mul; + sd::ops::multiply mul; mul.execute(block); @@ -1303,7 +1303,7 @@ TEST_F(DeclarableOpsTests1, MultiplyMatrixScalar) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::multiply mul; + sd::ops::multiply mul; mul.execute(block); @@ -1330,7 +1330,7 @@ TEST_F(DeclarableOpsTests1, MultiplyScalarScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::multiply mul; + sd::ops::multiply mul; mul.execute(block); @@ -1372,7 +1372,7 @@ TEST_F(DeclarableOpsTests1, TestSoftMax_bp_1) { auto block = new Context(1, variableSpace, false); block->fillInputs({-1, -2}); - nd4j::ops::softmax_bp op; + sd::ops::softmax_bp op; Nd4jStatus status = op.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1395,7 +1395,7 @@ TEST_F(DeclarableOpsTests1, BroadcastDivideTest_1) { y.assign(2); exp.assign(3); - nd4j::ops::divide div; + sd::ops::divide div; auto res = div.evaluate({&x, &y}); @@ -1415,7 +1415,7 @@ TEST_F(DeclarableOpsTests1, BroadcastDivideTest_2) { y.assign(2); exp.assign(3); - nd4j::ops::divide_no_nan div; + sd::ops::divide_no_nan div; auto res = div.evaluate({&x, &y}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1431,7 +1431,7 @@ TEST_F(DeclarableOpsTests1, BroadcastDivideTest_3) { auto y = 
NDArrayFactory::create({3,3,0,3,3}); auto exp = NDArrayFactory::create({2, 2, 0, 2, 2}); - nd4j::ops::divide_no_nan div; + sd::ops::divide_no_nan div; auto res = div.evaluate({&x, &y}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1450,7 +1450,7 @@ TEST_F(DeclarableOpsTests1, BroadcastReverseDivideTest_1) { y.assign(6.f); exp.assign(2.f); - nd4j::ops::reversedivide div; + sd::ops::reversedivide div; auto res = div.evaluate({&x, &y}); @@ -1482,7 +1482,7 @@ TEST_F(DeclarableOpsTests1, DivideMatrices1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::divide div; + sd::ops::divide div; div.execute(block); @@ -1509,7 +1509,7 @@ TEST_F(DeclarableOpsTests1, DivideVectorVector1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::divide div; + sd::ops::divide div; div.execute(block); @@ -1535,7 +1535,7 @@ TEST_F(DeclarableOpsTests1, DivideMatrixScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::divide div; + sd::ops::divide div; div.execute(block); @@ -1562,7 +1562,7 @@ TEST_F(DeclarableOpsTests1, DivideScalarScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::divide div; + sd::ops::divide div; div.execute(block); @@ -1588,7 +1588,7 @@ TEST_F(DeclarableOpsTests1, ReverseDivideMatrices1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversedivide div; + sd::ops::reversedivide div; div.execute(block); @@ -1614,7 +1614,7 @@ TEST_F(DeclarableOpsTests1, ReverseDivideVectorVector1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversedivide div; + sd::ops::reversedivide div; div.execute(block); @@ -1640,7 +1640,7 @@ TEST_F(DeclarableOpsTests1, ReverseDivideMatrixScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversedivide div; + 
sd::ops::reversedivide div; div.execute(block); @@ -1666,7 +1666,7 @@ TEST_F(DeclarableOpsTests1, ReverseDivideScalarScalar1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reversedivide div; + sd::ops::reversedivide div; div.execute(block); @@ -1691,7 +1691,7 @@ TEST_F(DeclarableOpsTests1, Reshapeas1) { auto block = new Context(1, variableSpace, true); block->fillInputs({-1, -2}); - nd4j::ops::reshapeas reshape; + sd::ops::reshapeas reshape; reshape.execute(block); @@ -1707,7 +1707,7 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { auto yExp = NDArrayFactory::create('c', {5, 5}); x.linspace(1); yExp.linspace(1); - nd4j::ops::cast op; + sd::ops::cast op; auto result = op.evaluate({&x}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1720,7 +1720,7 @@ TEST_F(DeclarableOpsTests1, Test_Cast_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, TestRegistrator1) { - auto res = nd4j::ops::OpRegistrator::getInstance()->getAllCustomOperations(); + auto res = sd::ops::OpRegistrator::getInstance()->getAllCustomOperations(); } // ////////////////////////////////////////////////////////////////////// @@ -1738,7 +1738,7 @@ TEST_F(DeclarableOpsTests1, TestRegistrator1) { // z->assign(120.0f); // std::string opName("add"); -// auto hash = nd4j::ops::HashHelper::getInstance()->getInstance()->getLongHash(opName); +// auto hash = sd::ops::HashHelper::getInstance()->getInstance()->getLongHash(opName); // auto inputBuffers = new Nd4jPointer[2]; // auto inputShapes = new Nd4jPointer[2]; @@ -1788,7 +1788,7 @@ TEST_F(DeclarableOpsTests1, TestRegistrator1) { // std::string opName("add"); -// auto hash = nd4j::ops::HashHelper::getInstance()->getInstance()->getLongHash(opName); +// auto hash = sd::ops::HashHelper::getInstance()->getInstance()->getLongHash(opName); // auto inputBuffers = new Nd4jPointer[2]; // auto inputShapes = new Nd4jPointer[2]; @@ -1824,12 +1824,12 @@ 
TEST_F(DeclarableOpsTests1, TestGemv1) { /* auto xBuffer = new float[15]{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}; auto xShape = new Nd4jLong[8] {2, 5, 3, 3, 1, 0, 1, 99}; - ArrayOptions::setDataType(xShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(xShape, sd::DataType::FLOAT32); auto x = new NDArray(xBuffer, xShape); auto yBuffer = new float[3]{2.f, 4.f, 6.f}; auto yShape = new Nd4jLong[8] {2, 3, 1, 1, 1, 0, 1, 99}; - ArrayOptions::setDataType(yShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(yShape, sd::DataType::FLOAT32); auto y = new NDArray(yBuffer, yShape); @@ -1838,7 +1838,7 @@ TEST_F(DeclarableOpsTests1, TestGemv1) { auto expBuffer = new float[5]{28.00f,64.00f,100.00f,136.00f,172.00f}; auto exp = new NDArray(expBuffer, z->getShapeInfo()); - nd4j::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); + sd::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); ASSERT_TRUE(z->equalsTo(exp)); @@ -1868,7 +1868,7 @@ TEST_F(DeclarableOpsTests1, Reshape2) { arguments->push_back(5); arguments->push_back(4); - nd4j::ops::reshape reshape; + sd::ops::reshape reshape; Nd4jStatus status = reshape.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1884,7 +1884,7 @@ TEST_F(DeclarableOpsTests1, Reshape2) { TEST_F(DeclarableOpsTests1, Reshape3) { auto x = NDArrayFactory::create('c', {3, 4, 5}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {-99, 3, 4, 5}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1899,7 +1899,7 @@ TEST_F(DeclarableOpsTests1, Reshape3) { TEST_F(DeclarableOpsTests1, Reshape4) { auto x = NDArrayFactory::create('c', {3, 4, 5}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {3, 4, 5}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1914,7 +1914,7 @@ TEST_F(DeclarableOpsTests1, 
Reshape4) { TEST_F(DeclarableOpsTests1, Reshape5) { auto x = NDArrayFactory::create('c', {3, 4, 5}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {5, 4, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1926,7 +1926,7 @@ TEST_F(DeclarableOpsTests1, Reshape6){ auto x = NDArrayFactory::create('c', {3, 4, 5}); auto exp = NDArrayFactory::create('c', {4, 15}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {4, -1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1943,7 +1943,7 @@ TEST_F(DeclarableOpsTests1, Reshape7){ auto x = NDArrayFactory::create('c', {3, 4, 5}); auto exp = NDArrayFactory::create('c', {60}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {-1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1967,7 +1967,7 @@ TEST_F(DeclarableOpsTests1, Transpose1) { auto block = new Context(1, variableSpace, false); // not-in-place block->fillInputs({-1}); - nd4j::ops::transpose transpose; + sd::ops::transpose transpose; Nd4jStatus status = transpose.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1992,8 +1992,8 @@ TEST_F(DeclarableOpsTests1, Permute1) { Nd4jLong shapeExp[] = {3, 15,5,10, 50,10,1, 0,1,99}; const std::vector perm = {2, 0, 1}; - ArrayOptions::setDataType(shapeX, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shapeExp, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeX, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shapeExp, sd::DataType::FLOAT32); auto x = new NDArray(shapeX, true); auto exp = new NDArray(shapeExp, true); @@ -2007,7 +2007,7 @@ TEST_F(DeclarableOpsTests1, Permute1) { auto arguments = block->getIArguments(); *arguments = perm; // set dimensions to be permuted - nd4j::ops::permute permute; + sd::ops::permute permute; Nd4jStatus status = permute.execute(block); auto result = variableSpace->getVariable(block->getNodeId())->getNDArray(); @@ -2024,8 +2024,8 @@ TEST_F(DeclarableOpsTests1, 
TestArgumentsValidation1) { Nd4jLong shapeX[] = {3, 5, 10, 15, 150, 15, 1, 0, 1, 99}; Nd4jLong shapeExp[] = {3, 15, 5, 10, 1, 150, 15, 0, -1, 99}; - ArrayOptions::setDataType(shapeX, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shapeExp, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeX, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shapeExp, sd::DataType::FLOAT32); const std::vector perm = {2, 0, 1}; auto x = new NDArray(shapeX); @@ -2038,7 +2038,7 @@ TEST_F(DeclarableOpsTests1, TestArgumentsValidation1) { auto block = new Context(1, variableSpace, false); // not-in-place block->fillInputs({-1}); - nd4j::ops::im2col permute; + sd::ops::im2col permute; Nd4jStatus status = permute.execute(block); ASSERT_TRUE(status != 0); @@ -2061,7 +2061,7 @@ TEST_F(DeclarableOpsTests1, TestReductionShape1) { // kernel params block->getIArguments()->push_back(MAX_INT); - nd4j::ops::testreduction testop; + sd::ops::testreduction testop; auto inP = new Nd4jLong[shape::shapeInfoLength(input->getShapeInfo())]; memcpy(inP, input->getShapeInfo(), shape::shapeInfoByteLength(input->rankOf())); @@ -2100,7 +2100,7 @@ TEST_F(DeclarableOpsTests1, TestReductionShape2) { block->getIArguments()->push_back(3); block->getIArguments()->push_back(4); - nd4j::ops::testreduction testop; + sd::ops::testreduction testop; auto inshapes = new ShapeList(input->getShapeInfo()); auto shapes = testop.calculateOutputShape(inshapes, *block); @@ -2125,7 +2125,7 @@ TEST_F(DeclarableOpsTests1, TestCustomShape1) { auto block = new Context(1, variableSpace, false); // not-in-place block->fillInputs({-1}); - nd4j::ops::testcustom test; + sd::ops::testcustom test; auto inshapes = new ShapeList(input->getShapeInfo()); auto shapes = test.calculateOutputShape(inshapes, *block); @@ -2167,7 +2167,7 @@ TEST_F(DeclarableOpsTests1, Sum1) { std::vector* arguments = block->getIArguments(); *arguments = dimensions; - nd4j::ops::sum sum; + sd::ops::sum sum; Nd4jStatus status = sum.execute(block); auto 
result = variableSpace->getVariable(block->getNodeId())->getNDArray(); @@ -2196,7 +2196,7 @@ TEST_F(DeclarableOpsTests1, Pnormpool2d1) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dW,dH, 0, 1, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; 9 - extraParam0 for pnorm case; - nd4j::ops::pnormpool2d pooling; + sd::ops::pnormpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -2213,7 +2213,7 @@ TEST_F(DeclarableOpsTests1, IsMax1) { float xBuff[] = {1,2,3,4,5,6,7,8,9}; Nd4jLong xShape[] = {2,3,3,3,1,0,1,99}; bool expBuff[] = {0,0,1,0,0,1,0,0,1}; - ArrayOptions::setDataType(xShape, nd4j::DataType::BOOL); + ArrayOptions::setDataType(xShape, sd::DataType::BOOL); auto x = new NDArray(xBuff, xShape); NDArray exp(expBuff, xShape); @@ -2227,7 +2227,7 @@ TEST_F(DeclarableOpsTests1, IsMax1) { // *argI = {1}; // dimensions argI->push_back(1); // = {1}; // dimensions - nd4j::ops::ismax ismaxOp; + sd::ops::ismax ismaxOp; Nd4jStatus status = ismaxOp.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -2242,15 +2242,15 @@ TEST_F(DeclarableOpsTests1, IsMax1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, IsMax1) { - NDArray x('c', {3, 3}, nd4j::DataType::FLOAT32); -// NDArray exp('c', {3, 3}, nd4j::DataType::BOOL); - NDArray exp('c', {3, 3}, nd4j::DataType::FLOAT32); + NDArray x('c', {3, 3}, sd::DataType::FLOAT32); +// NDArray exp('c', {3, 3}, sd::DataType::BOOL); + NDArray exp('c', {3, 3}, sd::DataType::FLOAT32); x.linspace(1); exp.p(0, 2, true); exp.p(1, 2, true); exp.p(2, 2, true); - nd4j::ops::ismax ismaxOp; + sd::ops::ismax ismaxOp; auto result = ismaxOp.evaluate({&x}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2264,15 +2264,15 @@ TEST_F(DeclarableOpsTests1, IsMax1) { ////////////////////////////////////////////////////////////////////// 
TEST_F(DeclarableOpsTests1, IsMax2) { - NDArray x('c', {3, 3}, nd4j::DataType::FLOAT32); -// NDArray exp('c', {3, 3}, nd4j::DataType::BOOL); - NDArray exp('c', {3, 3}, nd4j::DataType::FLOAT32); + NDArray x('c', {3, 3}, sd::DataType::FLOAT32); +// NDArray exp('c', {3, 3}, sd::DataType::BOOL); + NDArray exp('c', {3, 3}, sd::DataType::FLOAT32); x.linspace(1); //exp.p(0, 2, true); //exp.p(1, 2, true); exp.p(2, 2, true); - nd4j::ops::ismax ismaxOp; + sd::ops::ismax ismaxOp; auto result = ismaxOp.evaluate({&x}, {}, {0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2286,15 +2286,15 @@ TEST_F(DeclarableOpsTests1, IsMax2) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, IsMax3) { - NDArray x = NDArrayFactory::create(120.f); //('c', {3, 3}, nd4j::DataType::FLOAT32); -// NDArray exp('c', {3, 3}, nd4j::DataType::BOOL); - NDArray exp = NDArrayFactory::create(1.f);//, nd4j::DataType::FLOAT32); //'c', {3, 3}, nd4j::DataType::FLOAT32); + NDArray x = NDArrayFactory::create(120.f); //('c', {3, 3}, sd::DataType::FLOAT32); +// NDArray exp('c', {3, 3}, sd::DataType::BOOL); + NDArray exp = NDArrayFactory::create(1.f);//, sd::DataType::FLOAT32); //'c', {3, 3}, sd::DataType::FLOAT32); x.linspace(1); //exp.p(0, 2, true); //exp.p(1, 2, true); //exp.p(2, 2, true); - nd4j::ops::ismax ismaxOp; + sd::ops::ismax ismaxOp; auto result = ismaxOp.evaluate({&x}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2312,7 +2312,7 @@ TEST_F(DeclarableOpsTests1, IsMax4) { auto z = NDArrayFactory::create('c', {6}); auto e = NDArrayFactory::create('c', {6}, {false, false, false, true, false, false}); - nd4j::ops::ismax op; + sd::ops::ismax op; auto result = op.execute({&x}, {&z}); ASSERT_EQ(Status::OK(), result); @@ -2326,13 +2326,13 @@ TEST_F(DeclarableOpsTests1, IsMax4) { // const int K = 3; // const int N = 4; -// NDArray input('c', {bS,K,N}, nd4j::DataType::DOUBLE); -// NDArray weights('c', {3*K,K}, nd4j::DataType::DOUBLE); -// NDArray 
bias('c', {1,2*K}, nd4j::DataType::DOUBLE); -// NDArray init('c', {bS,K}, nd4j::DataType::DOUBLE); -// NDArray mask('c', {bS,K}, nd4j::DataType::DOUBLE); -// NDArray expState('c', {bS,K,N}, {0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715}, nd4j::DataType::DOUBLE); -// NDArray expOut('c', {bS,K,N}, {1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656}, nd4j::DataType::DOUBLE); +// NDArray input('c', {bS,K,N}, sd::DataType::DOUBLE); +// NDArray weights('c', {3*K,K}, sd::DataType::DOUBLE); +// NDArray bias('c', {1,2*K}, sd::DataType::DOUBLE); +// NDArray init('c', {bS,K}, sd::DataType::DOUBLE); +// NDArray mask('c', {bS,K}, sd::DataType::DOUBLE); +// NDArray expState('c', {bS,K,N}, {0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715}, sd::DataType::DOUBLE); +// NDArray expOut('c', {bS,K,N}, {1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656}, sd::DataType::DOUBLE); // input.assign(1.5); // weights.assign(0.5); @@ -2340,7 +2340,7 @@ TEST_F(DeclarableOpsTests1, IsMax4) { // init.assign(1.); // mask.assign(1.); -// nd4j::ops::sru_old op; +// sd::ops::sru_old op; // auto results = op.execute({&input, &weights, &bias, &init, &mask}, {}, {}); // ASSERT_TRUE(results->size() == 2); @@ -2362,13 +2362,13 @@ TEST_F(DeclarableOpsTests1, 
sru_test1) { const int K = 3; const int N = 4; - NDArray input('c', {bS,K,N}, nd4j::DataType::DOUBLE); - NDArray weights('c', {3*K,K}, nd4j::DataType::DOUBLE); - NDArray bias('c', {2*K}, nd4j::DataType::DOUBLE); - NDArray init('c', {bS,K}, nd4j::DataType::DOUBLE); - NDArray mask('c', {bS,K}, nd4j::DataType::DOUBLE); - NDArray expState('c', {bS,K,N}, {1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656}, nd4j::DataType::DOUBLE); - NDArray expOut('c', {bS,K,N}, {0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715}, nd4j::DataType::DOUBLE); + NDArray input('c', {bS,K,N}, sd::DataType::DOUBLE); + NDArray weights('c', {3*K,K}, sd::DataType::DOUBLE); + NDArray bias('c', {2*K}, sd::DataType::DOUBLE); + NDArray init('c', {bS,K}, sd::DataType::DOUBLE); + NDArray mask('c', {bS,K}, sd::DataType::DOUBLE); + NDArray expState('c', {bS,K,N}, {1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656, 1.090533, 1.174509, 1.252403, 1.324656}, sd::DataType::DOUBLE); + NDArray expOut('c', {bS,K,N}, {0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715, 0.847983, 0.874549, 0.896109, 0.913715}, sd::DataType::DOUBLE); input.assign(1.5); weights.assign(0.5); @@ -2376,7 +2376,7 @@ TEST_F(DeclarableOpsTests1, sru_test1) { init.assign(1.); mask.assign(1.); - nd4j::ops::sru op; + sd::ops::sru op; auto results = op.evaluate({&input, &weights, 
&bias, &init, &mask}); ASSERT_TRUE(results->size() == 2); @@ -2423,7 +2423,7 @@ TEST_F(DeclarableOpsTests1, sru_bp) { inGradCt.assign(0.5); inGradH.assign(0.5); - nd4j::ops::sru_bp bp; + sd::ops::sru_bp bp; auto resultsBP = bp.evaluate({&input, &weights, &bias, &init, &state, &inGradCt, &inGradH, &mask}, {}, {}); ASSERT_TRUE(resultsBP->size() == 4); @@ -2448,11 +2448,11 @@ TEST_F(DeclarableOpsTests1, sru_bi_1) { const int K = 3; const int N = 4; - NDArray input('c', {N,bS,2*K}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2*K,6*K}, nd4j::DataType::DOUBLE); - NDArray bias('c', {4*K}, nd4j::DataType::DOUBLE); - NDArray init('c', {bS,2*K}, nd4j::DataType::DOUBLE); - NDArray mask('c', {bS,2*K}, nd4j::DataType::DOUBLE); + NDArray input('c', {N,bS,2*K}, sd::DataType::DOUBLE); + NDArray weights('c', {2*K,6*K}, sd::DataType::DOUBLE); + NDArray bias('c', {4*K}, sd::DataType::DOUBLE); + NDArray init('c', {bS,2*K}, sd::DataType::DOUBLE); + NDArray mask('c', {bS,2*K}, sd::DataType::DOUBLE); NDArray expState('c', {N,bS,2*K}, {1.02857, 1.02857, 1.02857, 1.11288, 1.11288, 1.11288, 1.02857, 1.02857, 1.02857, 1.11288, 1.11288, 1.11288, 1.0569, 1.0569, 1.0569, 1.08501, 1.08501, 1.08501, 1.0569, 1.0569, 1.0569, 1.08501, 1.08501, 1.08501, 1.08501, 1.08501, 1.08501, 1.0569, 1.0569, 1.0569, 1.08501, 1.08501, 1.08501, 1.0569, 1.0569, 1.0569, 1.11288, 1.11288, 1.11288, 1.02857, 1.02857, 1.02857, 1.11288, 1.11288, 1.11288, 1.02857, 1.02857, 1.02857}); NDArray expOut('c', {N,bS,2*K}, {0.779265, 0.779265, 0.779265, 0.810752, 0.810752, 0.810752, 0.779265, 0.779265, 0.779265, 0.810752, 0.810752, 0.810752, 0.790317, 0.790317, 0.790317, 0.800804, 0.800804, 0.800804, 0.790317, 0.790317, 0.790317, 0.800804, 0.800804, 0.800804, 0.800804, 0.800804, 0.800804, 0.790317, 0.790317, 0.790317, 0.800804, 0.800804, 0.800804, 0.790317, 0.790317, 0.790317, 0.810752, 0.810752, 0.810752, 0.779265, 0.779265, 0.779265, 0.810752, 0.810752, 0.810752, 0.779265, 0.779265, 0.779265}); @@ -2462,7 +2462,7 @@ 
TEST_F(DeclarableOpsTests1, sru_bi_1) { init.assign(1.); mask.assign(1.); - nd4j::ops::sru_bi op; + sd::ops::sru_bi op; auto results = op.evaluate({&input, &weights, &bias, &init, &mask}, {}, {}); ASSERT_TRUE(results->size() == 2); @@ -2513,7 +2513,7 @@ TEST_F(DeclarableOpsTests1, sru_bi_bp_1) { inGradCt.assign(0.5); inGradH.assign(0.5); - nd4j::ops::sru_bi_bp bp; + sd::ops::sru_bi_bp bp; auto resultsBP = bp.evaluate({&input, &weights, &bias, &init, &state, &inGradCt, &inGradH, &mask}, {}, {}); ASSERT_TRUE(resultsBP->size() == 4); @@ -2536,7 +2536,7 @@ TEST_F(DeclarableOpsTests1, ArgMax1) { auto exp = NDArrayFactory::create('c', {3}); exp.assign(4); - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.evaluate({&x}, {}, {1}); @@ -2557,7 +2557,7 @@ TEST_F(DeclarableOpsTests1, ArgMax2) { auto exp = NDArrayFactory::create('c', {5}); exp.assign(2); - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.evaluate({&x}, {}, {0}); @@ -2579,7 +2579,7 @@ TEST_F(DeclarableOpsTests1, ArgMax3) { auto exp = NDArrayFactory::create('c', {5}); exp.assign(2); - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.evaluate({&x, &dim}, {}, {}); @@ -2600,7 +2600,7 @@ TEST_F(DeclarableOpsTests1, ArgMax4) { auto exp = NDArrayFactory::create('c', {3}); exp.assign(4); - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.evaluate({&x, &dim}, {}, {}); @@ -2622,7 +2622,7 @@ TEST_F(DeclarableOpsTests1, ArgMax5) { auto exp = NDArrayFactory::create(14); - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.evaluate({&x, &dim}, {}, {}); @@ -2642,7 +2642,7 @@ TEST_F(DeclarableOpsTests1, ArgMax6) { x.linspace(1); - nd4j::ops::argmax op; + sd::ops::argmax op; auto expected = op.evaluate({&x}, {}, {2}); ASSERT_EQ(Status::OK(), expected->status()); @@ -2668,7 +2668,7 @@ TEST_F(DeclarableOpsTests1, ArgMin1) { auto exp = NDArrayFactory::create('c', {3}); exp.assign(0.0f); - nd4j::ops::argmin op; + sd::ops::argmin op; auto result = op.evaluate({&x}, {}, {1}); 
@@ -2691,7 +2691,7 @@ TEST_F(DeclarableOpsTests1, SquareTests1) { exp.linspace(1); exp *= exp; - nd4j::ops::square op; + sd::ops::square op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2709,7 +2709,7 @@ TEST_F(DeclarableOpsTests1, OneHotTests_1) { auto exp = NDArrayFactory::create('c', {1, 4, 3}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f}); - nd4j::ops::onehot op; + sd::ops::onehot op; auto result = op.evaluate({&indices}, {1.0f, 0.0f}, {-1, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2728,7 +2728,7 @@ TEST_F(DeclarableOpsTests1, OneHotTests_2) { auto exp = NDArrayFactory::create('c', {2, 2, 3}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::onehot op; + sd::ops::onehot op; auto result = op.evaluate({&indices}, {1.0f, 0.0f}, {-1, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2747,7 +2747,7 @@ TEST_F(DeclarableOpsTests1, OneHotTests_3) { auto exp = NDArrayFactory::create('c', {4, 3}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f}); - nd4j::ops::onehot op; + sd::ops::onehot op; auto result = op.evaluate({&indices}, {1.0f, 0.0f}, {-1, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2768,7 +2768,7 @@ TEST_F(DeclarableOpsTests1, OneHotTests_4) { auto exp = NDArrayFactory::create('c', {4, 3}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f}); - nd4j::ops::onehot op; + sd::ops::onehot op; auto result = op.evaluate({&indices, &depth}, {1.0f, 0.0f}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2789,7 +2789,7 @@ TEST_F(DeclarableOpsTests1, OneHotTests_5) { auto exp = NDArrayFactory::create('c', {4, 3}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f}); - nd4j::ops::onehot op; + sd::ops::onehot op; auto result = op.evaluate({&indices, &depth, &on, &off}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2806,7 +2806,7 @@ TEST_F(DeclarableOpsTests1, OneHotTests_6) { auto indices = NDArrayFactory::create('c', 
{3}, {0.f, 1.f, 2.f}); auto e = NDArrayFactory::create('c', {3, 3}, {1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f}); - nd4j::ops::onehot op; + sd::ops::onehot op; auto result = op.evaluate({&indices}, {1.0, 0.0}, {0, 3}); auto z = result->at(0); @@ -2819,8 +2819,8 @@ TEST_F(DeclarableOpsTests1, OneHotTests_7) { auto indices = NDArrayFactory::create('c', {3}, {0, 1, 2}); auto e = NDArrayFactory::create('c', {3, 3}, {1., 0., 0., 0., 1., 0., 0., 0., 1.}); - nd4j::ops::onehot op; - auto result = op.evaluate({&indices}, {1.0, 0.0}, {0, 3}, {}, {nd4j::DataType::HALF}, false); + sd::ops::onehot op; + auto result = op.evaluate({&indices}, {1.0, 0.0}, {0, 3}, {}, {sd::DataType::HALF}, false); auto z = result->at(0); ASSERT_EQ(e, *z); @@ -2834,7 +2834,7 @@ TEST_F(DeclarableOpsTests1, FillAs_1) { float scalar = 119.f; - nd4j::ops::fill_as op; + sd::ops::fill_as op; auto result = op.evaluate({&x}, {scalar}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2848,7 +2848,7 @@ TEST_F(DeclarableOpsTests1, FillAs_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, LRN1) { - nd4j::ops::lrn lrn; + sd::ops::lrn lrn; lrn.getOpName(); } @@ -2862,15 +2862,15 @@ TEST_F(DeclarableOpsTests1, Stack_1) { Nd4jLong shape1[] = {2, 3, 4, 4, 1, 0, 1, 99}; Nd4jLong shape2[] = {2, 3, 4, 4, 1, 0, 1, 99}; Nd4jLong expShape[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape2, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape2, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray input2(buff2, shape2); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input2}, {}, {0}); auto output = 
results->at(0); @@ -2890,15 +2890,15 @@ TEST_F(DeclarableOpsTests1, Stack_2) { Nd4jLong shape1[] = {2, 3, 4, 4, 1, 0, 1, 99}; Nd4jLong shape2[] = {2, 3, 4, 4, 1, 0, 1, 99}; Nd4jLong expShape[] = {3, 3, 2, 4, 8, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape2, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape2, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray input2(buff2, shape2); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input2}, {}, {1}); auto output = results->at(0); @@ -2918,15 +2918,15 @@ TEST_F(DeclarableOpsTests1, Stack_3) { Nd4jLong shape1[] = {2, 1, 12, 12, 1, 0, 1, 99}; Nd4jLong shape2[] = {2, 1, 12, 12, 1, 0, 1, 99}; Nd4jLong expShape[] = {3, 2, 1, 12, 12, 12, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape2, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape2, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray input2(buff2, shape2); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input2}, {}, {0}); auto output = results->at(0); @@ -2945,15 +2945,15 @@ TEST_F(DeclarableOpsTests1, Stack_4) { Nd4jLong shape1[] = {2, 1, 12, 12, 1, 0, 1, 99}; Nd4jLong shape2[] = {2, 1, 12, 12, 1, 0, 1, 99}; Nd4jLong expShape[] = {3, 1, 2, 12, 24, 12, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape2, nd4j::DataType::FLOAT32); - 
ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape2, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray input2(buff2, shape2); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input2}, {}, {1}); auto output = results->at(0); @@ -2972,15 +2972,15 @@ TEST_F(DeclarableOpsTests1, Stack_5) { Nd4jLong shape1[] = {2, 12, 1, 1,1, 0, 1, 99}; Nd4jLong shape2[] = {2, 12, 1, 1,1, 0, 1, 99}; Nd4jLong expShape[] = {3, 2, 12, 1, 12, 1, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape2, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape2, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray input2(buff2, shape2); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input2}, {}, {0}); auto output = results->at(0); @@ -2999,15 +2999,15 @@ TEST_F(DeclarableOpsTests1, Stack_6) { Nd4jLong shape1[] = {2, 12, 1, 1, 12, 0, 1, 99}; Nd4jLong shape2[] = {2, 12, 1, 1, 12, 0, 1, 99}; Nd4jLong expShape[] = {3, 12, 2, 1, 2, 1, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(shape2, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(shape2, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray input2(buff2, shape2); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + 
sd::ops::stack op; auto results = op.evaluate({&input1, &input2}, {}, {1}); auto output = results->at(0); @@ -3025,13 +3025,13 @@ TEST_F(DeclarableOpsTests1, Stack_7) { float expBuff[] = {1, 1, 1}; Nd4jLong shape1[] = {2, 1, 1, 1, 1, 0, 1, 99}; Nd4jLong expShape[] = {3, 3, 1, 1, 1, 1, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input1, &input1}, {}, {0}); auto output = results->at(0); @@ -3048,13 +3048,13 @@ TEST_F(DeclarableOpsTests1, Stack_8) { float expBuff[] = {1, 1, 1}; Nd4jLong shape1[] = {1, 1, 1, 0, 1, 99}; Nd4jLong expShape[] = {2, 3, 1, 1, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input1, &input1}, {}, {0}); auto output = results->at(0); @@ -3071,13 +3071,13 @@ TEST_F(DeclarableOpsTests1, Stack_9) { float expBuff[] = {1, 1, 1}; Nd4jLong shape1[] = {2, 1, 1, 1, 1, 0, 1, 99}; Nd4jLong expShape[] = {3, 1, 3, 1, 3, 1, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input1, 
&input1}, {}, {1}); auto output = results->at(0); @@ -3094,13 +3094,13 @@ TEST_F(DeclarableOpsTests1, Stack_10) { float expBuff[] = {1, 1, 1}; Nd4jLong shape1[] = {1, 1, 1, 0, 1, 99}; Nd4jLong expShape[] = {2, 1, 3, 3, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input1, &input1}, {}, {1}); auto output = results->at(0); @@ -3119,13 +3119,13 @@ TEST_F(DeclarableOpsTests1, Stack_11) { float expBuff[] = {1, 1, 1}; Nd4jLong shape1[] = {1, 1, 1, 0, 1, 99}; Nd4jLong expShape[] = {2, 3, 1, 1, 1, 0, 1, 99}; - ArrayOptions::setDataType(shape1, nd4j::DataType::FLOAT32); - ArrayOptions::setDataType(expShape, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shape1, sd::DataType::FLOAT32); + ArrayOptions::setDataType(expShape, sd::DataType::FLOAT32); NDArray input1(buff1, shape1); NDArray expected(expBuff, expShape); - nd4j::ops::stack op; + sd::ops::stack op; auto results = op.evaluate({&input1, &input1, &input1}, {}, {}); auto output = results->at(0); @@ -3140,7 +3140,7 @@ TEST_F(DeclarableOpsTests1, Test_Range_Integer_1) { auto exp = NDArrayFactory::create('c', {4}); exp.linspace(1); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {}, {1, 5, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -3167,7 +3167,7 @@ TEST_F(DeclarableOpsTests1, Test_Range_Integer_2) { stop.p(0, 5.f); step.p(0, 1.f); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&start, &stop, &step}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -3187,7 +3187,7 @@ TEST_F(DeclarableOpsTests1, Test_Range_Integer_3) { auto exp = NDArrayFactory::create('c', {4}); exp.linspace(1); - 
nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {1.f, 5.f, 1.f}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -3207,7 +3207,7 @@ TEST_F(DeclarableOpsTests1, softmax_test1) { auto input = NDArrayFactory::create('c', {3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, 5}); auto expOutput = NDArrayFactory::create('c', {3, 3}, {1.14195199e-01, 8.43794734e-01, 4.20100661e-02, 2.68454951e-01, 1.80883523e-03, 7.29736214e-01, 9.02116571e-05, 2.68917160e-01, 7.30992629e-01}); - nd4j::ops::softmax op; + sd::ops::softmax op; auto results = op.evaluate({&input}, {}, {}, {}); auto z = results->at(0); @@ -3223,7 +3223,7 @@ TEST_F(DeclarableOpsTests1, softmax_test2) { auto input = NDArrayFactory::create('c', {3, 3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, -5,5 ,-6,6, -7,7, -8,8, -9,9, -10,10, -11,11, -12,12, -13,13, 14}); auto expOutput = NDArrayFactory::create('c', {3, 3, 3}, {4.73142e-02,4.73847e-02,6.69062e-03, 9.50330e-01,8.67881e-04,9.92976e-01, 2.35563e-03,9.51747e-01,3.33106e-04, 4.74259e-02,2.26032e-06,4.74259e-02, 2.91395e-07,9.99998e-01,3.94360e-08, 9.52574e-01,1.12535e-07,9.52574e-01, 7.58256e-10,4.74259e-02,1.22325e-11, 1.00000e+00,1.32293e-11,1.19203e-01, 3.77513e-11,9.52574e-01,8.80797e-01}); - nd4j::ops::softmax op; + sd::ops::softmax op; auto results = op.evaluate({&input}, {}, {1}, {}); auto z = results->at(0); @@ -3239,7 +3239,7 @@ TEST_F(DeclarableOpsTests1, softmax_test3) { auto input = NDArrayFactory::create('c', {3, 3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, -5,5 ,-6,6, -7,7, -8,8, -9,9, -10,10, -11,11, -12,12, -13,13, 14}); auto expOutput = NDArrayFactory::create('c', {3, 3, 3}, {2.47262e-03,1.23395e-04,3.35350e-04, 1.23395e-04,4.53979e-05,1.23395e-04, 6.14417e-06,1.23395e-04,5.56530e-09, 9.97527e-01,1.12521e-07,9.99665e-01, 1.52281e-08,9.99955e-01,2.06090e-09, 9.99994e-01,2.78912e-10,6.69285e-03, 3.05146e-07,9.99876e-01,4.13855e-08, 9.99877e-01,5.60254e-09,9.99877e-01, 7.58251e-10,9.99877e-01,9.93307e-01}); - nd4j::ops::softmax op; + sd::ops::softmax op; 
auto results = op.evaluate({&input}, {}, {0}, {}); auto z = results->at(0); @@ -3255,7 +3255,7 @@ TEST_F(DeclarableOpsTests1, softmax_test4) { auto input = NDArrayFactory::create('c', {1, 5}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {1, 5}, {0.01198,0.08855,0.00441,0.24072,0.65434}); - nd4j::ops::softmax op; + sd::ops::softmax op; auto results = op.evaluate({&input}, {}, {1}, {}); auto z = results->at(0); @@ -3271,7 +3271,7 @@ TEST_F(DeclarableOpsTests1, softmax_test5) { auto input = NDArrayFactory::create('c', {1, 5}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {1, 5}, {1,1,1,1,1}); - nd4j::ops::softmax op; + sd::ops::softmax op; auto results = op.evaluate({&input}, {}, {0}); auto z = results->at(0); @@ -3287,7 +3287,7 @@ TEST_F(DeclarableOpsTests1, softmax_test6) { auto input = NDArrayFactory::create('c', {5, 1}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {5, 1}, {0.01198,0.08855,0.00441,0.24072,0.65434}); - nd4j::ops::softmax op; + sd::ops::softmax op; auto results = op.evaluate({&input}, {}, {0}, {}); auto z = results->at(0); @@ -3303,7 +3303,7 @@ TEST_F(DeclarableOpsTests1, softmax_test7) { auto input = NDArrayFactory::create('c', {5, 1}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {5, 1}, {1,1,1,1,1}); - nd4j::ops::softmax op; + sd::ops::softmax op; auto results = op.evaluate({&input}, {}, {1}, {}); auto z = results->at(0); @@ -3319,7 +3319,7 @@ TEST_F(DeclarableOpsTests1, softmax_test8) { auto input = NDArrayFactory::create('c', {5}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {5}, {0.01198,0.08855,0.00441,0.24072,0.65434}); - nd4j::ops::softmax op; + sd::ops::softmax op; auto results = op.evaluate({&input}, {}, {}, {}); auto z = results->at(0); @@ -3339,7 +3339,7 @@ TEST_F(DeclarableOpsTests1, Test_Stack_Edge_1) { auto exp = NDArrayFactory::create(expBuff, 'c', {1, 1, 3}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = 
op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -3361,7 +3361,7 @@ TEST_F(DeclarableOpsTests1, Test_Stack_Edge_2) { auto exp = NDArrayFactory::create(expBuff, 'c', {1, 1, 1, 3}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -3383,7 +3383,7 @@ TEST_F(DeclarableOpsTests1, Test_Stack_Edge_3) { auto exp = NDArrayFactory::create(expBuff, 'c', {1, 1, 3}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -3404,13 +3404,13 @@ TEST_F(DeclarableOpsTests1, Reverse_1 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {24., 23., 22., 21., 20., 19., 18., 17., 16., 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1.}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {0,1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3429,13 +3429,13 @@ TEST_F(DeclarableOpsTests1, Reverse_2 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {}, {}, {}, true); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ 
-3454,13 +3454,13 @@ TEST_F(DeclarableOpsTests1, Reverse_3 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 24., 23., 22., 21., 20., 19., 18., 17., 16., 15., 14., 13.}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {1,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3480,13 +3480,13 @@ TEST_F(DeclarableOpsTests1, Reverse_4 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {16,15,14,13,20,19,18,17,24,23,22,21,4,3,2,1,8,7,6,5,12,11,10,9,}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3506,13 +3506,13 @@ TEST_F(DeclarableOpsTests1, Reverse_5 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {21., 22., 23., 24., 17., 18., 19., 20., 13., 14., 15., 16., 9., 10., 11., 12., 5., 6., 7., 8., 1., 2., 3., 4.}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = 
op.evaluate({&input}, {}, {0,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3531,13 +3531,13 @@ TEST_F(DeclarableOpsTests1, Reverse_6 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {4., 3., 2., 1., 8., 7., 6., 5., 12., 11., 10., 9., 16., 15., 14., 13., 20., 19., 18., 17., 24., 23., 22., 21.}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {2}, {}, {}, true); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3558,13 +3558,13 @@ TEST_F(DeclarableOpsTests1, Reverse_7 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {9., 10., 11., 12., 5., 6., 7., 8., 1., 2., 3., 4., 21., 22., 23., 24., 17., 18., 19., 20., 13., 14., 15., 16.}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3587,13 +3587,13 @@ TEST_F(DeclarableOpsTests1, Reverse_8 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 24., 23., 22., 21., 20., 19., 18., 17., 16., 15., 14., 13.}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, 
shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {2,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3613,13 +3613,13 @@ TEST_F(DeclarableOpsTests1, Reverse_9 ) { float inBuff[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}; float expBuff[] = {13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.}; Nd4jLong shapeInfo[] = {3, 2, 3, 4, 12, 4, 1, 0, 1, 99}; - ArrayOptions::setDataType(shapeInfo, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(shapeInfo, sd::DataType::FLOAT32); NDArray input(inBuff, shapeInfo); NDArray expected(expBuff, shapeInfo); NDArray output(shapeInfo); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3637,7 +3637,7 @@ TEST_F(DeclarableOpsTests1, Reverse_10 ) { auto i = NDArrayFactory::create('c', {1}, {-1}); auto e = NDArrayFactory::create('c', {4, 3}, {0.09966054, 0.1592365, 1.5375735, -1.0355669, 1.144433, 0.677872,0.85020787, -0.67863184, 0.48456487, -1.1660044, 0.20998026, 0.13950661}); - nd4j::ops::reverse op; + sd::ops::reverse op; auto result = op.evaluate({&x, &i}, {}, {}, {}); auto z = result->at(0); @@ -3658,7 +3658,7 @@ TEST_F(DeclarableOpsTests1, Reverse_11 ) { 6.f, 5.f, 4.f, 3.f, 2.f, 1.f}); input.linspace(1); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {0, 1, 2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3679,7 +3679,7 @@ TEST_F(DeclarableOpsTests1, Reverse_12 ) { auto expected = NDArrayFactory::create({4.f, 3.f, 2.f, 1.f, 0.f}); //input.linspace(1); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3701,7 +3701,7 @@ TEST_F(DeclarableOpsTests1, Reverse_13 ) { auto expected = 
NDArrayFactory::create({4.f, 3.f, 2.f, 1.f, 0.f}); //input.linspace(1); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {-1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3722,7 +3722,7 @@ TEST_F(DeclarableOpsTests1, Reverse_14 ) { auto expected = NDArrayFactory::create({0.f, 1.f, 2.f, 3.f, 4.f}); //input.linspace(1); - nd4j::ops::reverse op; + sd::ops::reverse op; auto results = op.evaluate({&input}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3739,7 +3739,7 @@ TEST_F(DeclarableOpsTests1, Test_Expose_1) { auto input0 = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 6, 5, 4}); auto input1 = NDArrayFactory::create('c', {2, 3}, {3, 2, 1, 4, 5, 6}); - nd4j::ops::expose op; + sd::ops::expose op; auto result = op.evaluate({&input0, &input1}); @@ -3767,7 +3767,7 @@ TEST_F(DeclarableOpsTests1, Test_Expose_2) { Context block(1, &variableSpace); block.pickInput(-1); - nd4j::ops::expose op; + sd::ops::expose op; auto result = op.execute(&block); ASSERT_EQ(ND4J_STATUS_OK, result); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp index 484719a45..0ea90f2ce 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp @@ -22,12 +22,12 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests10 : public testing::Test { @@ -59,7 +59,7 @@ TEST_F(DeclarableOpsTests10, Test_ArgMax_1) { x.linspace(1.0); - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.evaluate({&x}); ASSERT_EQ(Status::OK(), result->status()); @@ -78,7 +78,7 @@ TEST_F(DeclarableOpsTests10, Test_ArgMax_2) { x.linspace(1.0); - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -97,7 +97,7 @@ TEST_F(DeclarableOpsTests10, 
Test_And_1) { auto y = NDArrayFactory::create('c', {4}, {0, 0, 0, 1}); auto e = NDArrayFactory::create('c', {4}, {0, 0, 0, 1}); - nd4j::ops::boolean_and op; + sd::ops::boolean_and op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -111,7 +111,7 @@ TEST_F(DeclarableOpsTests10, Test_Or_1) { auto y = NDArrayFactory::create('c', {4}, {0, 0, 0, 1}); auto e = NDArrayFactory::create('c', {4}, {1, 1, 0, 1}); - nd4j::ops::boolean_or op; + sd::ops::boolean_or op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -126,7 +126,7 @@ TEST_F(DeclarableOpsTests10, Test_Not_1) { // auto e = NDArrayFactory::create('c', {4}, {1, 1, 1, 0}); auto e = NDArrayFactory::create('c', {4}, {false, false, true, false}); - nd4j::ops::boolean_not op; + sd::ops::boolean_not op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); auto res = result->at(0); @@ -140,7 +140,7 @@ TEST_F(DeclarableOpsTests10, Test_Size_at_1) { auto x = NDArrayFactory::create('c', {10, 20, 30}); auto e = NDArrayFactory::create(20); - nd4j::ops::size_at op; + sd::ops::size_at op; auto result = op.evaluate({&x}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -159,7 +159,7 @@ TEST_F(DeclarableOpsTests10, MirrorPad_SGO_Test_1) { auto exp = NDArrayFactory::create({2., 1., 2., 3., 4., 5., 4.}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto res = op.evaluate({&in, &pad}, {10.0}, {0}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -174,7 +174,7 @@ TEST_F(DeclarableOpsTests10, Unique_SGO_Test_1) { auto expIdx = NDArrayFactory::create({0, 1, 0, 2, 0, 3, 4, 1, 4, 1}); auto exp = NDArrayFactory::create({3., 4., 1., 0., 2.}); - nd4j::ops::unique op; + sd::ops::unique op; auto res = op.evaluate({&input}, {}, {}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); auto res1 = res->at(0); @@ -191,7 +191,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_1) { //auto expIdx({0., 1., 0., 2., 0., 3., 4., 1., 4., 1.}); auto exp = 
NDArrayFactory::create('c', {6, 2}, {0LL, 0LL, 1LL, 0LL, 1LL, 1LL, 2LL, 0LL, 2LL, 1LL, 2LL, 2LL}); - nd4j::ops::Where op; + sd::ops::Where op; auto res = op.evaluate({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -208,7 +208,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_02) { //auto expIdx({0., 1., 0., 2., 0., 3., 4., 1., 4., 1.}); auto exp = NDArrayFactory::create('c', {5, 3}, {0LL, 0LL, 0LL, 0LL, 1LL, 1LL, 1LL, 0LL, 0LL, 1LL, 0LL, 1LL, 1LL, 1LL, 0LL}); - nd4j::ops::Where op; + sd::ops::Where op; auto res = op.evaluate({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -226,7 +226,7 @@ TEST_F(DeclarableOpsTests10, WhereNP_SGO_Test_1) { auto exp1 = NDArrayFactory::create({0, 0, 1, 1, 1}); auto exp2 = NDArrayFactory::create({0, 1, 0, 0, 1}); auto exp3 = NDArrayFactory::create({0, 1, 0, 1, 0}); - nd4j::ops::where_np op; + sd::ops::where_np op; auto res = op.evaluate({&cond3d}, {}, {}); ASSERT_TRUE(res->size() == 3); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -250,7 +250,7 @@ TEST_F(DeclarableOpsTests10, WhereNP_SGO_Test_2) { // auto expIdx({0, 1, 0, 2, 0, 3, 4, 1, 4, 1}); auto exp1 = NDArrayFactory::create({0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2}); auto exp2 = NDArrayFactory::create({0, 1, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4}); - nd4j::ops::where_np op; + sd::ops::where_np op; auto res = op.evaluate({&cond2d}, {}, {}); ASSERT_TRUE(res->size() == 2); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); @@ -266,7 +266,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_2) { //auto expIdx({0., 1., 0., 2., 0., 3., 4., 1., 4., 1.}); auto exp = NDArrayFactory::create('c', {4,1}, {0, 2, 3, 4}); - nd4j::ops::Where op; + sd::ops::Where op; auto res = op.evaluate({&input}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -284,7 +284,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_3) { //auto expIdx({0., 1., 0., 2., 0., 3., 4., 1., 4., 1.}); auto exp = NDArrayFactory::create('c', {4, 
2}, {0, 0, 2, 0, 3, 0, 4, 0}); - nd4j::ops::Where op; + sd::ops::Where op; auto res = op.evaluate({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -302,7 +302,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_4) { //auto expIdx({0., 1., 0., 2., 0., 3., 4., 1., 4., 1.}); auto exp = NDArrayFactory::create('c', {4, 2}, {0, 0, 2, 0, 3, 0, 4, 0}); - nd4j::ops::Where op; + sd::ops::Where op; auto res = op.evaluate({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -321,7 +321,7 @@ TEST_F(DeclarableOpsTests10, Where_SGO_Test_5) { //auto expIdx({0., 1., 0., 2., 0., 3., 4., 1., 4., 1.}); auto exp = NDArrayFactory::create('c', {3, 1}, {0, 3, 4}); - nd4j::ops::Where op; + sd::ops::Where op; auto res = op.evaluate({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -339,7 +339,7 @@ TEST_F(DeclarableOpsTests10, WhereNP_SGO_Test_4) { //auto expIdx({0., 1., 0., 2., 0., 3., 4., 1., 4., 1.}); auto exp = NDArrayFactory::create('c', {4, 2}, {0, 0, 2, 0, 3, 0, 4, 0}); - nd4j::ops::where_np op; + sd::ops::where_np op; auto res = op.evaluate({&input}, {}, {}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -360,7 +360,7 @@ TEST_F(DeclarableOpsTests10, CosineDistance_SGO_Test_1) { auto weights = NDArrayFactory::create('c', {2, 1}, {0., 1.}); auto exp = NDArrayFactory::create(0.6); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto res = op.evaluate({&predictions, &weights, &labels}, {}, {3, 1}); ASSERT_TRUE(res->status() == ND4J_STATUS_OK); auto resA = res->at(0); @@ -378,7 +378,7 @@ TEST_F(DeclarableOpsTests10, CosineDistance_SGO_Test_2) { auto weights = NDArrayFactory::create('c', {2, 1}, {0., 1.}); auto exp = NDArrayFactory::create(0.6); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto res = op.evaluate({&predictions, &weights, &labels}, {}, {2, 1}); ASSERT_TRUE(res->status() == 
ND4J_STATUS_OK); auto resA = res->at(0); @@ -401,7 +401,7 @@ TEST_F(DeclarableOpsTests10, TestMarixBandPart_Test_1) { exp.p(0, 2, 0, 0.); exp.p(1, 2, 0, 0.); - nd4j::ops::matrix_band_part op; + sd::ops::matrix_band_part op; auto results = op.evaluate({&x}, {}, {1, 1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -421,7 +421,7 @@ TEST_F(DeclarableOpsTests10, atan2_test1) { auto exp = NDArrayFactory::create('c', {2,3,4}, {-2.04201, -2.03663, -2.03009, -2.02199,-2.01166, -1.99808, -1.97941, -1.95217,-1.90875, -1.8292 , -1.6416 , -0.942 , 0.33172, 0.69614, 0.81846, 0.87776, 0.91253, 0.93533, 0.95141, 0.96336, 0.97259, 0.97993, 0.98591, 1.01266,}); - nd4j::ops::tf_atan2 op; + sd::ops::tf_atan2 op; auto result = op.evaluate({&y, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -441,13 +441,13 @@ TEST_F(DeclarableOpsTests10, atan2_test2) { auto exp = NDArrayFactory::create('c', {2,3,4}, {-2.38008, -2.30149, -2.22748, -2.1232 ,-1.96979, -1.73736, -1.3973 , -0.98279,-0.61088, -0.34685, -0.17256, -0.0555 , 3.11208, 2.99987, 2.83399, 2.57869, 2.207 , 1.77611, 1.41664, 1.17298, 1.01458, 0.90829, 0.8336 , 0.77879}); - nd4j::ops::tf_atan2 op; + sd::ops::tf_atan2 op; auto result = op.evaluate({&y, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); // z->printIndexedBuffer(); - // x.applyTrueBroadcast(nd4j::BroadcastOpsTuple::custom(scalar::Atan2, pairwise::Atan2, broadcast::Atan2), &y, &z, true); + // x.applyTrueBroadcast(sd::BroadcastOpsTuple::custom(scalar::Atan2, pairwise::Atan2, broadcast::Atan2), &y, &z, true); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -464,7 +464,7 @@ TEST_F(DeclarableOpsTests10, atan2_test3) { auto exp = NDArrayFactory::create('c', {2,3,4}, {-2.33231, -2.41089, -2.48491, -2.58919,-2.74259, -2.97502, 2.9681 , 2.55359, 2.18167, 1.91765, 1.74335, 1.62629, -1.54128, -1.42907, -1.2632 , -1.00789,-0.63621, -0.20531, 0.15416, 0.39782, 0.55622, 0.6625 , 0.7372 , 
0.79201}); - nd4j::ops::tf_atan2 op; + sd::ops::tf_atan2 op; auto result = op.evaluate({&x, &y}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -484,7 +484,7 @@ TEST_F(DeclarableOpsTests10, atan2_test4) { auto exp = NDArrayFactory::create('c', {2,3,4}, {-2.45527, -2.36165, -2.24628, -2.10492,-2.1703 , -1.86945, -1.50321, -1.15359,-0.25062, -0.17373, -0.13273, -0.10733, 3.05688, 3.03942, 3.01293, 2.9681 , 2.18167, 1.87635, 1.50156, 1.14451, 1.13674, 0.97626, 0.84423, 0.7372 }); - nd4j::ops::tf_atan2 op; + sd::ops::tf_atan2 op; auto result = op.evaluate({&x, &y}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -504,7 +504,7 @@ TEST_F(DeclarableOpsTests10, atan2_test5) { auto exp = NDArrayFactory::create('c', {2,3,4}, {-2.25712, -2.35074, -2.46611, -2.60747,-2.54209, -2.84294, 3.07401, 2.72438, 1.82141, 1.74453, 1.70353, 1.67813, -1.48608, -1.46862, -1.44214, -1.3973 ,-0.61088, -0.30556, 0.06924, 0.42629, 0.43405, 0.59453, 0.72657, 0.8336 }); - nd4j::ops::tf_atan2 op; + sd::ops::tf_atan2 op; auto result = op.evaluate({&y, &x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -523,7 +523,7 @@ TEST_F(DeclarableOpsTests10, atan2_test6) { auto exp = NDArrayFactory::create('c', {1,3,4}, {-2.25712, -1.68608, -1.44214, -0.54006,-2.77695, -2.16855, 0.34972, 0.24585, 2.71267, 1.74453, 1.45312, 0.8336 }); - nd4j::ops::tf_atan2 op; + sd::ops::tf_atan2 op; auto result = op.evaluate({&y, &x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -545,7 +545,7 @@ TEST_F(DeclarableOpsTests10, IGamma_Test1) { 0.0066205109, 0.022211598, 0.040677428, 0.059117373, 0.0000039433403, 0.000086064574, 0.000436067, 0.0012273735}); - nd4j::ops::igamma op; + sd::ops::igamma op; auto result = op.evaluate({&y, &x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -567,7 +567,7 @@ TEST_F(DeclarableOpsTests10, IGamma_Test2) 
{ 0.993379, 0.977788, 0.959323, 0.940883, 0.999996, 0.999914, 0.999564, 0.998773}); - nd4j::ops::igammac op; + sd::ops::igammac op; auto result = op.evaluate({&y, &x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -590,7 +590,7 @@ TEST_F(DeclarableOpsTests10, LGamma_Test1) { 0.28468287, 0.4348206 , 0.6931472 }); - nd4j::ops::lgamma op; + sd::ops::lgamma op; auto result = op.evaluate({&x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -609,7 +609,7 @@ TEST_F(DeclarableOpsTests10, range_test10) { limit = 5.; auto exp = NDArrayFactory::create('c', {5}, {0.,1.,2.,3.,4.}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&limit}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -631,7 +631,7 @@ TEST_F(DeclarableOpsTests10, range_test11) { start = 0.5; auto exp = NDArrayFactory::create('c', {5}, {0.5,1.5,2.5,3.5,4.5}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&start, &limit}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -649,7 +649,7 @@ TEST_F(DeclarableOpsTests10, range_test12) { auto exp = NDArrayFactory::create('c', {9}, {0.5f, 1.f , 1.5f, 2.f , 2.5f, 3.f , 3.5f, 4.f , 4.5f}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {0.5, 5, 0.5}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -670,7 +670,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test1) { auto expSorted = NDArrayFactory::create({9., 8., 7., 6., 5.}); // Sorted = False - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {4}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -703,7 +703,7 @@ TEST_F(DeclarableOpsTests10, top_k_permuted_test2) { auto expSorted = NDArrayFactory::create({9., 8., 7., 6., 5.}); // Sorted = False - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {5}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -737,7 +737,7 @@ 
TEST_F(DeclarableOpsTests10, sparse_softmax_cross_entropy_loss_with_logits_test1 logits.linspace(0.1, 0.1); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&labels, &logits}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -759,7 +759,7 @@ TEST_F(DeclarableOpsTests10, sparse_softmax_cross_entropy_loss_with_logits_test2 logits.linspace(0.1, 0.1); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&labels, &logits}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -775,13 +775,13 @@ TEST_F(DeclarableOpsTests10, sparse_softmax_cross_entropy_loss_with_logits_test2 /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, sparse_softmax_cross_entropy_loss_with_logits_test3) { - NDArray labels('c', {1}, std::vector{0}, nd4j::DataType::INT32); + NDArray labels('c', {1}, std::vector{0}, sd::DataType::INT32); auto logits = NDArrayFactory::create('c', {1,3}); auto expected = NDArrayFactory::create('c', {1}, {1.20194}); logits.linspace(0.1, 0.1); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&labels, &logits}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -803,7 +803,7 @@ TEST_F(DeclarableOpsTests10, sparse_softmax_cross_entropy_loss_with_logits_test4 logits.linspace(0.1, 0.1); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&labels, &logits}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -824,7 +824,7 @@ TEST_F(DeclarableOpsTests10, split_test4) { auto exp1 = NDArrayFactory::create('c', {5}, {1.f,2.f,3.f,4.f,5.f}); auto exp2 = NDArrayFactory::create('c', {5}, {6.f,7.f,8.f,9.f,10.f}); - nd4j::ops::split op; + 
sd::ops::split op; auto results = op.evaluate({&input, &axis}, {}, {2}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -848,7 +848,7 @@ TEST_F(DeclarableOpsTests10, split_test5) { auto exp1 = NDArrayFactory::create('c', {3,4}, {1.f,2.f,3.f,4.f, 9.f,10.f,11.f,12.f, 17.f,18.f,19.f,20.f}); auto exp2 = NDArrayFactory::create('c', {3,4}, {5.f,6.f,7.f,8.f, 13.f,14.f,15.f,16.f, 21.f,22.f,23.f,24.f}); - nd4j::ops::split op; + sd::ops::split op; auto results = op.evaluate({&input}, {}, {2,-1},{}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -871,7 +871,7 @@ TEST_F(DeclarableOpsTests10, histogram_fixed_width_test1) { auto range = NDArrayFactory::create('c', {2}, {0, 5}); auto exp = NDArrayFactory::create('c', {5}, {2, 1, 1, 0, 2}); - nd4j::ops::histogram_fixed_width op; + sd::ops::histogram_fixed_width op; auto results = op.evaluate({&input, &range}, {}, {5}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -891,7 +891,7 @@ TEST_F(DeclarableOpsTests10, histogram_fixed_width_test2) { auto range = NDArrayFactory::create('c', {2}, {0, 5}); auto exp = NDArrayFactory::create('c', {5}, {5, 2, 5, 3, 9}); - nd4j::ops::histogram_fixed_width op; + sd::ops::histogram_fixed_width op; auto results = op.evaluate({&input, &range}, {}, {5}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -911,7 +911,7 @@ TEST_F(DeclarableOpsTests10, histogram_fixed_width_test3) { auto range = NDArrayFactory::create('c', {1,2,1}, {0, 5}); auto exp = NDArrayFactory::create('c', {5}, {5, 2, 5, 4, 8}); - nd4j::ops::histogram_fixed_width op; + sd::ops::histogram_fixed_width op; auto results = op.evaluate({&input, &range}, {}, {5}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -936,7 +936,7 @@ TEST_F(DeclarableOpsTests10, histogram_fixed_width_test4) { auto range = NDArrayFactory::create('c', {1,2}, {0, 50}); auto exp = NDArrayFactory::create('c', {5}, {22, 17, 24, 19, 18}); - nd4j::ops::histogram_fixed_width op; + sd::ops::histogram_fixed_width op; auto results = 
op.evaluate({&input, &range}, {}, {5}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -962,7 +962,7 @@ TEST_F(DeclarableOpsTests10, histogram_fixed_width_test5) { // auto exp = NDArrayFactory::create('c', {5}, {23, 19, 20, 23, 15}); // 23, 15, 24, 17, 21 auto exp = NDArrayFactory::create('c', {5}, {23, 15, 24, 17, 21}); - nd4j::ops::histogram_fixed_width op; + sd::ops::histogram_fixed_width op; auto results = op.evaluate({&input, &range}, {}, {5}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -985,7 +985,7 @@ TEST_F(DeclarableOpsTests10, histogram_fixed_width_test6) { auto exp = NDArrayFactory::create('c', {5}, {3, 1, 2, 0, 1}); - nd4j::ops::histogram_fixed_width op; + sd::ops::histogram_fixed_width op; auto results = op.evaluate({&input, &range, &bins}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1009,7 +1009,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_1) { //input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1031,7 +1031,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_2) { // input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1053,7 +1053,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_3) { //input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {1}); // with reverse = true ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1075,7 +1075,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_4) { //input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1096,7 +1096,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_04) { input.linspace(1.f); - nd4j::ops::nth_element op; + 
sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1117,7 +1117,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_5) { // input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1139,7 +1139,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_6) { // input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1159,7 +1159,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_06) { // input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1185,7 +1185,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_7) { //input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1212,7 +1212,7 @@ TEST_F(DeclarableOpsTests10, NTH_Element_Test_8) { //input.linspace(1.f); - nd4j::ops::nth_element op; + sd::ops::nth_element op; auto results = op.evaluate({&input, &n}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1234,7 +1234,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test1) { input.linspace(1.f); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1256,7 +1256,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test2) { input.linspace(1.f); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1278,7 +1278,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test3) { input.linspace(1.f); - 
nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1298,7 +1298,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test4) { auto shape = NDArrayFactory::create('c', {2}, {3.f, 3.f}); auto exp = NDArrayFactory::create('c', {3,3}, {10.f, 10.f, 10.f,10.f, 10.f, 10.f, 10.f, 10.f, 10.f}); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1318,7 +1318,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test5) { auto shape = NDArrayFactory::create('c', {1}, {3.f}); auto exp = NDArrayFactory::create('c', {3}, {10.f, 10.f, 10.f}); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1338,7 +1338,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test6) { auto shape = NDArrayFactory::create(1.f); auto exp = NDArrayFactory::create('c', {1}, {10.f}); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1358,7 +1358,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test7) { auto shape = NDArrayFactory::create(1); auto exp = NDArrayFactory::create('c', {1}, {10.}); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1380,7 +1380,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test8) { input.linspace(1.f); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1402,7 +1402,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test9) { 1.f, 1.f, 1.f,2.f, 2.f, 2.f,3.f, 3.f, 3.f,4.f, 4.f, 4.f,5.f, 5.f, 5.f}); input.linspace(1.f); - 
nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1424,7 +1424,7 @@ TEST_F(DeclarableOpsTests10, broadcast_to_test10) { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f,10.f, 11.f, 12.f,13.f, 14.f, 15.f}); input.linspace(1.f); - nd4j::ops::broadcast_to op; + sd::ops::broadcast_to op; auto results = op.evaluate({&input, &shape}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1480,7 +1480,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test1) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input}, {}, {10, 10}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1502,7 +1502,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test_11) { input.assign(0.8f); //linspace(1); auto size = NDArrayFactory::create({65,65}); auto ex = NDArrayFactory::create('c', {1,65,65,256}); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input, &size}, {}, {}, {false}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1521,7 +1521,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test_12) { input.assign(0.8f); //linspace(1); auto size = NDArrayFactory::create({65,65}); auto ex = NDArrayFactory::create('c', {1,65,65,256}); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input, &size}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1565,7 +1565,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test1_1) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input}, {}, {4, 5}, {false, true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1612,7 +1612,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test1_2) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_bilinear op; + 
sd::ops::resize_bilinear op; auto results = op.evaluate({&input}, {}, {4, 5}, {false, true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1668,7 +1668,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test01) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input}, {}, {10, 10}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1823,7 +1823,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test02) { }); //input.linspace(1); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input}, {}, {9, 9}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1882,7 +1882,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test2) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2012,7 +2012,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test3) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input}, {}, {10, 10}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2141,7 +2141,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeBilinear_Test4) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_bilinear op; + sd::ops::resize_bilinear op; auto results = op.evaluate({&input, &size}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2165,7 +2165,7 @@ TEST_F(DeclarableOpsTests10, LinSpace_Test1) { NDArray expect = NDArrayFactory::create({1., 1.5, 2., 2.5, 3., 3.5, 4., 4.5, 5., 5.5, 6., 6.5, 7., 7.5, 8., 8.5, 9., 9.5, 10., 10.5, 11., 11.5, 12.}); - nd4j::ops::lin_space op; + sd::ops::lin_space op; auto result = op.evaluate({&start, &finish, &num}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); auto res = result->at(0); @@ -2207,7 +2207,7 @@ TEST_F(DeclarableOpsTests10, 
ImageResizeNeighbor_Test1) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_nearest_neighbor op; + sd::ops::resize_nearest_neighbor op; auto results = op.evaluate({&input}, {}, {4, 5}, {false, false}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2255,7 +2255,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeNeighbor_Test1_1) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_nearest_neighbor op; + sd::ops::resize_nearest_neighbor op; auto results = op.evaluate({&input}, {}, {4, 5}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2303,7 +2303,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeNeighbor_Test1_1_1) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_nearest_neighbor op; + sd::ops::resize_nearest_neighbor op; auto results = op.evaluate({&input}, {}, {4,5}, {false, true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2350,7 +2350,7 @@ TEST_F(DeclarableOpsTests10, ImageResizeNeighbor_Test01) { //input = 1.f; input.linspace(1); - nd4j::ops::resize_nearest_neighbor op; + sd::ops::resize_nearest_neighbor op; auto results = op.evaluate({&input}, {}, {4, 5}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2372,7 +2372,7 @@ TEST_F(DeclarableOpsTests10, ReduceLogSumExpTest_1) { NDArray expected = NDArrayFactory::create(2.5206409f); - nd4j::ops::reduce_logsumexp op; + sd::ops::reduce_logsumexp op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2393,7 +2393,7 @@ TEST_F(DeclarableOpsTests10, ReduceLogSumExpTest_2) { NDArray expected = NDArrayFactory::create({1.0986123f, 1.8619947f, 1.0986123f}); - nd4j::ops::reduce_logsumexp op; + sd::ops::reduce_logsumexp op; auto results = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2413,7 +2413,7 @@ TEST_F(DeclarableOpsTests10, ReduceLogSumExpTest_3) { NDArray expected = NDArrayFactory::create('c', {1,3}, {1.0986123f, 1.8619947f, 1.0986123f}); - nd4j::ops::reduce_logsumexp op; + sd::ops::reduce_logsumexp op; auto results 
= op.evaluate({&input}, {1.f}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2434,7 +2434,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_1) { NDArray expected = NDArrayFactory::create('c', {3}, {2, 1, 0}); boxes.linspace(1.f); - nd4j::ops::non_max_suppression op; + sd::ops::non_max_suppression op; auto results = op.evaluate({&boxes, &scores}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2456,7 +2456,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_2) { NDArray scales = NDArrayFactory::create('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); //3, 0, 1, 2, 4, 5 NDArray expected = NDArrayFactory::create('c', {3}, {3,0,5}); - nd4j::ops::non_max_suppression op; + sd::ops::non_max_suppression op; auto results = op.evaluate({&boxes, &scales}, {0.5}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2478,7 +2478,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_3) { NDArray scales = NDArrayFactory::create('c', {3}, {0.0029f, 0.8135f, 0.4873f}); //3, 0, 1, 2, 4, 5 NDArray expected = NDArrayFactory::create('c', {1}, {1}); - nd4j::ops::non_max_suppression op; + sd::ops::non_max_suppression op; auto results = op.evaluate({&boxes, &scales}, {0.5, 0.5}, {2}); ASSERT_EQ(Status::OK(), results->status()); @@ -2501,7 +2501,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_4) { NDArray maxSize = NDArrayFactory::create(2); NDArray threshold = NDArrayFactory::create(0.5f); NDArray scoreThreshold = NDArrayFactory::create(0.5); - nd4j::ops::non_max_suppression op; + sd::ops::non_max_suppression op; auto results = op.evaluate({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2523,7 +2523,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_5) { NDArray maxSize = NDArrayFactory::create(2); NDArray threshold = NDArrayFactory::create(0.5f); NDArray scoreThreshold = NDArrayFactory::create(-DataTypeUtils::infOrMax()); - nd4j::ops::non_max_suppression op; + 
sd::ops::non_max_suppression op; auto results = op.evaluate({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2546,7 +2546,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_6) { NDArray maxSize = NDArrayFactory::create(2); NDArray threshold = NDArrayFactory::create(0.5f); NDArray scoreThreshold = NDArrayFactory::create(-DataTypeUtils::infOrMax()); - nd4j::ops::non_max_suppression_v3 op; + sd::ops::non_max_suppression_v3 op; auto results = op.evaluate({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2570,7 +2570,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_06) { NDArray maxSize = NDArrayFactory::create(2); NDArray threshold = NDArrayFactory::create(0.5f); NDArray scoreThreshold = NDArrayFactory::create(-DataTypeUtils::infOrMax()); - nd4j::ops::non_max_suppression_v3 op; + sd::ops::non_max_suppression_v3 op; auto results = op.evaluate({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2593,7 +2593,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_7) { NDArray maxSize = NDArrayFactory::create(0); NDArray threshold = NDArrayFactory::create(0.5f); NDArray scoreThreshold = NDArrayFactory::create(0.5f); - nd4j::ops::non_max_suppression_v3 op; + sd::ops::non_max_suppression_v3 op; auto results = op.evaluate({&boxes, &scales, &maxSize, &threshold, &scoreThreshold}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2618,7 +2618,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressingOverlap_1) { NDArray max_num = NDArrayFactory::create(3); NDArray expected = NDArrayFactory::create('c', {1,}, {3}); - nd4j::ops::non_max_suppression_overlaps op; + sd::ops::non_max_suppression_overlaps op; auto results = op.evaluate({&boxes, &scores, &max_num}, {0.5, 0.}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2643,7 +2643,7 @@ 
TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressingOverlap_2) { NDArray max_num = NDArrayFactory::create(3); NDArray expected = NDArrayFactory::create('c', {3,}, {1,1,1}); - nd4j::ops::non_max_suppression_overlaps op; + sd::ops::non_max_suppression_overlaps op; auto results = op.evaluate({&boxes, &scores, &max_num}, {0.5, 0.}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2668,7 +2668,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressingOverlap_3) { NDArray max_num = NDArrayFactory::create(5); NDArray expected = NDArrayFactory::create('c', {5,}, {1,1,1,1,1}); - nd4j::ops::non_max_suppression_overlaps op; + sd::ops::non_max_suppression_overlaps op; auto results = op.evaluate({&boxes, &scores, &max_num}, {0.5, 0.}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2692,7 +2692,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_1) { //NDArray ('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); NDArray expected = NDArrayFactory::create('c', {1,1,1,1}, {2.5f}); - nd4j::ops::crop_and_resize op; + sd::ops::crop_and_resize op; auto results = op.evaluate({&images, &boxes, &boxI, &cropSize}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2717,7 +2717,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_2) { //NDArray ('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); NDArray expected = NDArrayFactory::create('c', {1,1,1,1}, {4.f}); - nd4j::ops::crop_and_resize op; + sd::ops::crop_and_resize op; auto results = op.evaluate({&images, &boxes, &boxI, &cropSize}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2733,15 +2733,15 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, Image_CropAndResize_3) { - NDArray images ('c', {1,2,2,1}, {1,2,3,4}, nd4j::DataType::FLOAT32); - NDArray boxes('c', {1,4}, {0,0,1,1}, nd4j::DataType::FLOAT32); - NDArray boxI('c', {1}, std::vector{0}, nd4j::DataType::INT64); + NDArray images ('c', {1,2,2,1}, 
{1,2,3,4}, sd::DataType::FLOAT32); + NDArray boxes('c', {1,4}, {0,0,1,1}, sd::DataType::FLOAT32); + NDArray boxI('c', {1}, std::vector{0}, sd::DataType::INT64); NDArray cropSize = NDArrayFactory::create({3, 3}); //NDArray ('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); - NDArray expected('c', {1,3,3,1}, {1.f, 1.5f, 2., 2.f, 2.5f, 3.f, 3.f, 3.5f, 4.f}, nd4j::DataType::FLOAT32); + NDArray expected('c', {1,3,3,1}, {1.f, 1.5f, 2., 2.f, 2.5f, 3.f, 3.f, 3.5f, 4.f}, sd::DataType::FLOAT32); - nd4j::ops::crop_and_resize op; + sd::ops::crop_and_resize op; auto results = op.evaluate({&images, &boxes, &boxI, &cropSize}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2757,15 +2757,15 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, Image_CropAndResize_4) { - NDArray images('c', {1,2,2,1}, {1, 2, 3, 4}, nd4j::DataType::FLOAT32); - NDArray boxes('c', {1,4}, {0,0,1,1}, nd4j::DataType::FLOAT32); - NDArray boxI('c', {1}, std::vector({0.}), nd4j::DataType::INT32); + NDArray images('c', {1,2,2,1}, {1, 2, 3, 4}, sd::DataType::FLOAT32); + NDArray boxes('c', {1,4}, {0,0,1,1}, sd::DataType::FLOAT32); + NDArray boxI('c', {1}, std::vector({0.}), sd::DataType::INT32); NDArray cropSize = NDArrayFactory::create({3, 3}); //NDArray ('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); - NDArray expected('c', {1,3,3,1}, {1.f, 2.f, 2.f, 3.f, 4, 4.f, 3.f, 4.f, 4.f}, nd4j::DataType::FLOAT32); + NDArray expected('c', {1,3,3,1}, {1.f, 2.f, 2.f, 3.f, 4, 4.f, 3.f, 4.f, 4.f}, sd::DataType::FLOAT32); - nd4j::ops::crop_and_resize op; + sd::ops::crop_and_resize op; auto results = op.evaluate({&images, &boxes, &boxI, &cropSize}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2781,15 +2781,15 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_4) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, Image_CropAndResize_5) { - NDArray 
images('c', {1, 100, 100, 3}, nd4j::DataType::FLOAT32); - NDArray boxes('c', {1,4}, {0,0,1,1}, nd4j::DataType::FLOAT32); - NDArray boxI('c', {2}, {1,1}, nd4j::DataType::INT32); + NDArray images('c', {1, 100, 100, 3}, sd::DataType::FLOAT32); + NDArray boxes('c', {1,4}, {0,0,1,1}, sd::DataType::FLOAT32); + NDArray boxI('c', {2}, {1,1}, sd::DataType::INT32); NDArray cropSize = NDArrayFactory::create({10, 10}); //NDArray ('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); - NDArray expected('c', {1, 10, 10,3}, nd4j::DataType::FLOAT32); + NDArray expected('c', {1, 10, 10,3}, sd::DataType::FLOAT32); - nd4j::ops::crop_and_resize op; + sd::ops::crop_and_resize op; auto results = op.evaluate({&images, &boxes, &boxI, &cropSize}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2825,7 +2825,7 @@ TEST_F(DeclarableOpsTests10, Image_DrawBoundingBoxes_1) { 106.f, 107.f, 108.f, 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f, 117.f, 118.f, 119.f, 120.f }); images.linspace(1.); - nd4j::ops::draw_bounding_boxes op; + sd::ops::draw_bounding_boxes op; auto results = op.evaluate({&images, &boxes, &colors}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2858,7 +2858,7 @@ TEST_F(DeclarableOpsTests10, Image_DrawBoundingBoxes_2) { 64.1f , 65.1f, 66.1f, 67.1f, 68.1f , 69.1f, 70.1f , 71.1f , 72.1f , 73.1f , 74.1f, 75.1f, 76.1f, 77.1f , 78.1f, 79.1f , 80.1f , 81.1f }); images.linspace(1.1); - nd4j::ops::draw_bounding_boxes op; + sd::ops::draw_bounding_boxes op; auto results = op.evaluate({&images, &boxes, &colors}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2911,7 +2911,7 @@ TEST_F(DeclarableOpsTests10, Image_DrawBoundingBoxes_3) { 0.7234f, 0.269f , 0.0062f, 0.0327f, 0.0644f, 0.8428f, 0.9441f, 0.9441f, 0.9441f, 0.3491f, 0.5793f, 0.573f , 0.1822f, 0.642f , 0.9143f}); - nd4j::ops::draw_bounding_boxes op; + sd::ops::draw_bounding_boxes op; auto results = op.evaluate({&images, &boxes, &colors}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ 
-2931,12 +2931,12 @@ TEST_F(DeclarableOpsTests10, Image_DrawBoundingBoxes_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_1) { - NDArray x('c', {2,3}, {-63.80f, -63.75f, -63.70f, -63.5f, 0.0f, 0.1f}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,3}, {-63.75f, -63.75f, -63.75f, -63.5f, 0.f, 0.f}, nd4j::DataType::FLOAT32); - NDArray min('c', {}, std::vector{-63.65f}, nd4j::DataType::FLOAT32); - NDArray max('c', {}, std::vector{0.1f}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3}, {-63.80f, -63.75f, -63.70f, -63.5f, 0.0f, 0.1f}, sd::DataType::FLOAT32); + NDArray exp('c', {2,3}, {-63.75f, -63.75f, -63.75f, -63.5f, 0.f, 0.f}, sd::DataType::FLOAT32); + NDArray min('c', {}, std::vector{-63.65f}, sd::DataType::FLOAT32); + NDArray max('c', {}, std::vector{0.1f}, sd::DataType::FLOAT32); - nd4j::ops::fake_quant_with_min_max_vars op; + sd::ops::fake_quant_with_min_max_vars op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2957,7 +2957,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_2) { NDArray min = NDArrayFactory::create(-63.65); NDArray max = NDArrayFactory::create(0.1); - nd4j::ops::fake_quant_with_min_max_vars op; + sd::ops::fake_quant_with_min_max_vars op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2978,7 +2978,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_3) { NDArray min = NDArrayFactory::create('c', {1},{-63.65}); NDArray max = NDArrayFactory::create('c', {1}, {0.1}); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3002,7 +3002,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03) { NDArray min = NDArrayFactory::create({-0.2283f, -0.0719f, -0.0154f, 
-0.5162f, -0.3567f}); NDArray max = NDArrayFactory::create({0.9441f, 0.5957f, 0.8669f, 0.3502f, 0.5100f}); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3025,7 +3025,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03_1) { NDArray min = NDArrayFactory::create({-0.2283f, -0.0719f, -0.0154f, -0.5162f, -0.3567f}); NDArray max = NDArrayFactory::create({0.9441f, 0.5957f, 0.8669f, 0.3502f, 0.5100f}); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {8}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3049,7 +3049,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03_2) { NDArray min = NDArrayFactory::create({-0.2283f, -0.0719f, -0.0154f, -0.5162f, -0.3567f}); NDArray max = NDArrayFactory::create({0.9441f, 0.5957f, 0.8669f, 0.3502f, 0.5100f}); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {6}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3072,7 +3072,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_03_3) { NDArray min = NDArrayFactory::create({-0.2283f, -0.0719f, -0.0154f, -0.5162f, -0.3567f}); NDArray max = NDArrayFactory::create({0.9441f, 0.5957f, 0.8669f, 0.3502f, 0.5100f}); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {6}, {false}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3111,7 +3111,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_4) { NDArray min = NDArrayFactory::create({20.f, 20.f, 20.f}); NDArray max = NDArrayFactory::create({65.f, 70.f, 90.f}); x.linspace(1.); - 
nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3164,7 +3164,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_5) { NDArray min = NDArrayFactory::create({-20.f, -19.f, -18.f, -17.f}); NDArray max = NDArrayFactory::create({20.f, 21.f, 22.f, 23.f}); x.linspace(-60.); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3198,7 +3198,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_6) { NDArray min = NDArrayFactory::create('c', {5}, {-0.2283f, -0.0719f, -0.0154f, -0.5162f, -0.3567f}); NDArray max = NDArrayFactory::create('c', {5}, {0.9441f, 0.5957f, 0.8669f, 0.3502f, 0.5100f}); // x.linspace(-60.); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3244,7 +3244,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_7) { NDArray min = NDArrayFactory::create('c', {1},{0.0f}); NDArray max = NDArrayFactory::create('c', {1}, {1.f}); x.linspace(0., 0.01); - nd4j::ops::fake_quant_with_min_max_vars op; + sd::ops::fake_quant_with_min_max_vars op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3269,7 +3269,7 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_8) { NDArray min = NDArrayFactory::create('c', {1},{0.0f}); NDArray max = NDArrayFactory::create('c', {1}, {1.f}); x.linspace(0., 0.1); - nd4j::ops::fake_quant_with_min_max_vars op; + sd::ops::fake_quant_with_min_max_vars op; auto results = op.evaluate({&x, &min, &max}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ 
-3287,14 +3287,14 @@ TEST_F(DeclarableOpsTests10, FakeQuantWithMinMaxVars_Test_8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, bool_broadcast_test_1) { - NDArray arr1('c', {2,2,1}, {1, 2, 3, 4}, nd4j::DataType::INT32); - NDArray arr2('c', { 2,2}, {0, 1, 0, 4}, nd4j::DataType::INT32); + NDArray arr1('c', {2,2,1}, {1, 2, 3, 4}, sd::DataType::INT32); + NDArray arr2('c', { 2,2}, {0, 1, 0, 4}, sd::DataType::INT32); - NDArray expd('c', {2,2,2}, {false, true, false, false, false, false, false, true}, nd4j::DataType::BOOL); + NDArray expd('c', {2,2,2}, {false, true, false, false, false, false, false, true}, sd::DataType::BOOL); - NDArray result('c', {2,2,2}, nd4j::DataType::BOOL); + NDArray result('c', {2,2,2}, sd::DataType::BOOL); - arr1.applyTrueBroadcast(nd4j::BroadcastBoolOpsTuple::custom(scalar::EqualTo, pairwise::EqualTo, broadcast::EqualTo), arr2, result, true); + arr1.applyTrueBroadcast(sd::BroadcastBoolOpsTuple::custom(scalar::EqualTo, pairwise::EqualTo, broadcast::EqualTo), arr2, result, true); // result.printIndexedBuffer(); // expd.printIndexedBuffer(); @@ -3305,14 +3305,14 @@ TEST_F(DeclarableOpsTests10, bool_broadcast_test_1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, printIndexedTest_1) { - NDArray arr('c', {2,2,2,2}, {1, 2, 3, 4, 5, 6, 7, 8,9, 10, 11, 12, 13, 14, 15, 16}, nd4j::DataType::INT32); -// NDArray arr2('c', { 2,2}, {0, 1, 0, 4}, nd4j::DataType::INT32); + NDArray arr('c', {2,2,2,2}, {1, 2, 3, 4, 5, 6, 7, 8,9, 10, 11, 12, 13, 14, 15, 16}, sd::DataType::INT32); +// NDArray arr2('c', { 2,2}, {0, 1, 0, 4}, sd::DataType::INT32); -// NDArray expd('c', {2,2,2}, {0,1,0,0, 0,0,0,1}, nd4j::DataType::BOOL); +// NDArray expd('c', {2,2,2}, {0,1,0,0, 0,0,0,1}, sd::DataType::BOOL); -// NDArray result('c', {2,2,2}, nd4j::DataType::BOOL); +// NDArray result('c', {2,2,2}, sd::DataType::BOOL); -// 
arr1.applyTrueBroadcast(nd4j::BroadcastBoolOpsTuple::custom(scalar::EqualTo, pairwise::EqualTo, broadcast::EqualTo), &arr2, &result, true, nullptr); +// arr1.applyTrueBroadcast(sd::BroadcastBoolOpsTuple::custom(scalar::EqualTo, pairwise::EqualTo, broadcast::EqualTo), &arr2, &result, true, nullptr); // result.printIndexedBuffer(); // expd.printIndexedBuffer(); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp index fc9ccebfc..177893ba4 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp @@ -21,12 +21,12 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests11 : public testing::Test { @@ -43,7 +43,7 @@ TEST_F(DeclarableOpsTests11, test_listdiff_1) { auto x = NDArrayFactory::create('c', {4}, {0, 1, 2, 3}); auto y = NDArrayFactory::create('c',{2}, {3, 1}); - nd4j::ops::listdiff op; + sd::ops::listdiff op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -53,9 +53,9 @@ TEST_F(DeclarableOpsTests11, test_listdiff_1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-12.49997,-13.04346, -13.63635, -14.28571,-14.99999,-15.78947, -16.66666, -17.64705,-18.75 ,-20. 
, -21.42857, -23.07692, -24.99999,-27.27272, -29.99999, -33.33332,-37.49999,-42.85713, -49.99998, -59.99998,-74.99995,-99.99992,-149.99986,-299.99911}); @@ -68,7 +68,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {0}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -90,9 +90,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,1,4}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {2,1,4}, {15.99805, 16.72406, 16.27746, 14.83754,-44.97147,-59.99582,-79.28771,-107.35497}); @@ -100,7 +100,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -116,9 +116,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-12.49997,-13.04346, -13.63635, -14.28571,-14.99999,-15.78947, 
-16.66666, -17.64705,-18.75 ,-20. , -21.42857, -23.07692, -24.99999,-27.27272, -29.99999, -33.33332,-37.49999,-42.85713, -49.99998, -59.99998,-74.99995,-99.99992,-149.99986,-299.99911}); @@ -130,7 +130,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -152,9 +152,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test4) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {4.8876 , -46.29156, -186.36887}); @@ -162,7 +162,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test4) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -180,9 +180,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-1.04166,-1.08696, -1.13636, -1.19048,-1.25 ,-1.31579, 
-1.38889, -1.47059,-1.5625 ,-1.66667, -1.78571, -1.92308, -2.08333,-2.27273, -2.5 , -2.77778,-3.125 ,-3.57143, -4.16667, -5. ,-6.25 ,-8.33333,-12.49999,-24.99993}); @@ -195,7 +195,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -217,9 +217,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {6.73432, 2.46939,-9.20372}); @@ -227,7 +227,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -243,9 +243,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test7) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdwExp('c', {}, std::vector{0.}); @@ -253,7 +253,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test7) { labels.linspace(1); 
weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -269,9 +269,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test7) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. ,-1.5 ,-1.57895, -1.66667, -1.76471,-1.875 ,-2. , -2.14286, -2.30769, -2.5 ,-2.72727, -3. , -3.33333,-3.75 ,-4.28571, -5. , -6. ,-7.49999,-9.99999,-14.99999,-29.99991}); @@ -288,7 +288,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -310,9 +310,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.52083,-0.54348,-0.56818, -0.59524,-0.625 ,-0.65789,-0.69444, -0.73529,-0.78125,-0.83333,-0.89286, -0.96154, -1.04167,-1.13636,-1.25 , -1.38889,-1.5625 ,-1.78571,-2.08333, -2.5 ,-3.125 
,-4.16666,-6.24999,-12.49996}); @@ -325,7 +325,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -347,9 +347,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,1}, std::vector{-9.49054}); @@ -357,7 +357,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -373,9 +373,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {0.20365,-1.92882,-7.76537}); @@ -383,7 +383,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = 
op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -399,9 +399,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, { 0. , 0. , 0. , 0. ,-0.75 ,-0.789473,-0.833333, -0.882353,-0.9375 ,-1. ,-1.071428, -1.153846, -1.25 ,-1.363636,-1.5 , -1.666666,-1.875 ,-2.142857,-2.499999, -2.999999,-3.749997,-4.999997,-7.499993,-14.999956}); @@ -420,7 +420,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { weights.t(3) = 0.; - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -442,9 +442,9 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , -2.08333,-2.27273, -2.5 , -2.77778,-3.125 ,-3.57143, -4.16667, -5. 
,-6.25 ,-8.33333,-12.49999,-24.99993}); @@ -459,7 +459,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { weights.t(1) = 0.; weights.t(2) = 0.; - nd4j::ops::log_loss_grad op; + sd::ops::log_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -642,7 +642,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test1) { 49.902905f, 50.092834f, 50.262653f, 50.329483f, 50.30638f, 50.25057f}); auto size = NDArrayFactory::create({30, 30}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -716,7 +716,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test2) { }); //input = 1.f; input.linspace(1); auto size = NDArrayFactory::create({10, 8}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -753,7 +753,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test3) { }); input.linspace(1); auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -790,7 +790,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test4) { }); input.linspace(1); auto size = NDArrayFactory::create({6, 8}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -833,7 +833,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test5) { }); input.linspace(1); auto size = NDArrayFactory::create({8, 8}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -963,7 +963,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test6) 
{ }); auto size = NDArrayFactory::create({30, 30}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1021,7 +1021,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test7) { 0.9139262f, 0.92068815f }); auto size = NDArrayFactory::create({9, 9}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1074,7 +1074,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeBicubic_Test8) { }); auto size = NDArrayFactory::create({9, 9}); - nd4j::ops::resize_bicubic op; + sd::ops::resize_bicubic op; auto results = op.evaluate({&input, &size}, {}, {}, {true, false}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1135,7 +1135,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test1) { 33.f, 34.f, 35.f, 36.f }); input.linspace(1); auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1162,7 +1162,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test2) { }); input.linspace(1); auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1190,7 +1190,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test3) { }); input.linspace(1); auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1228,7 +1228,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test4) { }); //input.linspace(1); auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {}); 
ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1266,7 +1266,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test5) { }); //input.linspace(1); auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1304,7 +1304,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test6) { }); //input.linspace(1); auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1342,7 +1342,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test7) { }); //input.linspace(1); // auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input}, {}, {6, 6}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1372,7 +1372,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test8) { }); //input.linspace(1); // auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input}, {}, {6, 6}, {true}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1399,7 +1399,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test9) { }); //input.linspace(1); auto size = NDArrayFactory::create({10, 10}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1426,7 +1426,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test10) { }); //input.linspace(1); //auto size = NDArrayFactory::create({10, 10}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input}, {}, {10, 10}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1453,7 +1453,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test11) { // }); //input.linspace(1); //auto size = 
NDArrayFactory::create({10, 10}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input}, {}, {6, 9}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1480,7 +1480,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test12) { // }); //input.linspace(1); //auto size = NDArrayFactory::create({10, 10}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input}, {}, {10, 15}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1507,7 +1507,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test13) { // }); //input.linspace(1); //auto size = NDArrayFactory::create({10, 10}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input}, {}, {9, 9}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1538,7 +1538,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test14) { 25.f }); //input.linspace(1); // auto size = NDArrayFactory::create({6, 6}); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input, &size}, {}, {false}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1567,7 +1567,7 @@ TEST_F(DeclarableOpsTests11, ImageResizeArea_Test15) { 22.999998f , 23.800001f , 24.399984f , 25.f }); - nd4j::ops::resize_area op; + sd::ops::resize_area op; auto results = op.evaluate({&input}, {}, {8, 7}, {false}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1616,7 +1616,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_1) { 7.625f, 3.25f, 5.f }); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1646,7 +1646,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_2) { -3.3333333f, 3.6666666f, 0.333333f, 1.3333333f }); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1683,7 +1683,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_3) { 1.333333f, -0.6666667f, 2.6666667f, -1.3333333f }); - nd4j::ops::solve op; + 
sd::ops::solve op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1715,7 +1715,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4) { 0.81915987f, 0.72049433f, 0.2643504f, 0.44472617f }); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1744,7 +1744,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4_1) { 0.4400987f, 0.2766527f, 0.6394467f, 0.79696566f }); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}, {true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1777,7 +1777,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4_2) { -0.30013534f, -0.53690606f, -0.47959247f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}, {true, false}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1811,7 +1811,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4_3) { 1.1904413f, 1.3938838f, 1.3926021f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}, {true, true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1845,7 +1845,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4_4) { -0.4398522f, -1.1899745f, -1.1392052f }); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}, {false}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1879,7 +1879,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4_5) { -0.8774802f, -1.2155888f, -1.8049058f }); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}, {true, true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1913,7 +1913,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4_6) { 0.01692283f, -0.04538865f, -0.09868701f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}, {false, true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1950,7 +1950,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_4_7) { 0.01692283f, -0.04538865f, -0.09868701f }); - 
nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}, {true, false}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -1984,7 +1984,7 @@ TEST_F(DeclarableOpsTests11, Solve_Test_5) { -0.8774802f, -1.2155888f, -1.8049058f }); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}, {true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2012,7 +2012,7 @@ TEST_F(DeclarableOpsTests11, SolveLS_Test_1) { 0.8311695f, 1.0909086f, 0.9205573f, 1.0630057f }); - nd4j::ops::lstsq op; + sd::ops::lstsq op; auto res = op.evaluate({&a, &b}, {0.5}, {}, {true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2040,7 +2040,7 @@ TEST_F(DeclarableOpsTests11, SolveLS_Test_2) { 0.8311695f, 1.0909086f, 0.9205573f, 1.0630057f }); - nd4j::ops::lstsq op; + sd::ops::lstsq op; auto res = op.evaluate({&a, &b}, {0.5}, {}, {true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2069,7 +2069,7 @@ TEST_F(DeclarableOpsTests11, Cholesky_Test_2x2x2) { 8.602325f, 0.f, 9.997296f, 0.23252854f }); - nd4j::ops::cholesky op; + sd::ops::cholesky op; auto res = op.evaluate({&a}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2098,7 +2098,7 @@ TEST_F(DeclarableOpsTests11, Cholesky_Test_2x2x2_2) { 8.631338f, 0.f, 9.963693f, 1.1067207f }); - nd4j::ops::cholesky op; + sd::ops::cholesky op; auto res = op.evaluate({&a}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2114,9 +2114,9 @@ TEST_F(DeclarableOpsTests11, Cholesky_Test_2x2x2_2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.96, -1.92, -2.88, -3.84, -4.8 , 
-5.76, -6.72, -7.68, -8.64, -9.6 ,-10.56,-11.52, -12.48,-13.44,-14.4 ,-15.36,-16.32,-17.28,-18.24,-19.2 ,-20.16,-21.12,-22.08,-23.04}); @@ -2127,7 +2127,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2149,9 +2149,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,1,4}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {2,1,4}, {98.61121,129.024 , 164.9664 , 206.4384 , 828.51837,925.28644,1027.58398,1135.41113}); @@ -2159,7 +2159,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2175,9 +2175,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test3) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.96, -1.92, 
-2.88, -3.84, -4.8 , -5.76, -6.72, -7.68, -8.64, -9.6 ,-10.56,-11.52, -12.48,-13.44,-14.4 ,-15.36,-16.32,-17.28,-18.24,-19.2 ,-20.16,-21.12,-22.08,-23.04}); @@ -2187,7 +2187,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test3) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2209,9 +2209,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test4) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {807.32153, 1426.63684, 2281.88159}); @@ -2219,7 +2219,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test4) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2235,9 +2235,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, 
{-0.08,-0.16,-0.24,-0.32,-0.4 ,-0.48,-0.56,-0.64,-0.72,-0.8 ,-0.88,-0.96, -1.04,-1.12,-1.2 ,-1.28,-1.36,-1.44,-1.52,-1.6 ,-1.68,-1.76,-1.84,-1.92}); @@ -2248,7 +2248,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2270,9 +2270,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {-58.16319, -6.5536 , 64.71682}); @@ -2280,7 +2280,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2296,9 +2296,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test7) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdwExp('c', {}, std::vector{0.}); @@ -2306,7 +2306,7 @@ 
TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test7) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2322,9 +2322,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test7) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0. ,0. ,0. ,0. ,-0.48 ,-0.576,-0.672,-0.768,-0.864,-0.96 ,-1.056,-1.152, -1.248,-1.344,-1.44 ,-1.536,-1.632,-1.728,-1.824,-1.92 ,-2.016,-2.112,-2.208,-2.304}); @@ -2339,7 +2339,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2361,9 +2361,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.04,-0.08,-0.12,-0.16,-0.2 ,-0.24,-0.28,-0.32,-0.36,-0.4 ,-0.44,-0.48, 
-0.52,-0.56,-0.6 ,-0.64,-0.68,-0.72,-0.76,-0.8 ,-0.84,-0.88,-0.92,-0.96}); @@ -2374,7 +2374,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2396,9 +2396,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,1}, std::vector{188.16}); @@ -2406,7 +2406,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2422,9 +2422,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {33.6384 ,59.4432 ,95.07841}); @@ -2432,7 +2432,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { 
labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2448,9 +2448,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0.,0.,0.,0., -0.24 ,-0.288,-0.336,-0.384,-0.432,-0.48 ,-0.528,-0.576, -0.624,-0.672,-0.72 ,-0.768,-0.816,-0.864,-0.912,-0.96 ,-1.008,-1.056,-1.104,-1.152}); @@ -2465,7 +2465,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { weights.t(2) = 0.; weights.t(3) = 0.; - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2487,9 +2487,9 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test13) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -1.04,-1.12,-1.2 ,-1.28,-1.36,-1.44,-1.52,-1.6 ,-1.68,-1.76,-1.84,-1.92}); @@ 
-2502,7 +2502,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test13) { weights.t(1) = 0.; weights.t(2) = 0.; - nd4j::ops::mean_sqerr_loss_grad op; + sd::ops::mean_sqerr_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2525,7 +2525,7 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test1) { auto x = NDArrayFactory::create('c', {4}, {0, 1, 2, 3}); auto y = NDArrayFactory::create('c',{4}, {3, 2, 1, 0}); auto exp = NDArrayFactory::create('c', {4}, {9, 1,1, 9}); - nd4j::ops::squaredsubtract op; + sd::ops::squaredsubtract op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -2537,7 +2537,7 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test2) { auto x = NDArrayFactory::create('c', {2, 4}, {0, 1, 2, 3, 0, 1, 2, 3}); auto y = NDArrayFactory::create('c',{4}, {3, 2, 1, 0}); auto exp = NDArrayFactory::create('c', {2, 4}, {9, 1,1, 9, 9, 1, 1, 9}); - nd4j::ops::squaredsubtract op; + sd::ops::squaredsubtract op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -2549,7 +2549,7 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test3) { auto y = NDArrayFactory::create('c',{4}, {3, 2, 1, 0}); auto exp = NDArrayFactory::create('c', {2, 4}, {-6, -4, 6, 24, -30, -12, 14, 48}); auto eps = NDArrayFactory::create('c', {2, 4}, {1,2,3,4,5,6,7,8}); - nd4j::ops::squaredsubtract_bp op; + sd::ops::squaredsubtract_bp op; auto result = op.evaluate({&x, &y, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_TRUE(exp.equalsTo(result->at(0))); @@ -2559,9 +2559,9 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - 
NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5, -0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5}); @@ -2572,7 +2572,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2594,9 +2594,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,1,4}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {2,1,4}, {14.4 , 17.28, 20.16, 23.04, 48.96, 51.84, 54.72, 57.6}); @@ -2604,7 +2604,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2620,9 +2620,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, 
absolute_difference_loss_grad_test3) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5, -0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5}); @@ -2632,7 +2632,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test3) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2654,9 +2654,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test4) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {65.28, 96., 126.72001}); @@ -2664,7 +2664,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test4) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2680,9 +2680,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test4) { /////////////////////////////////////////////////////////////////// 
TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167, -0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167}); @@ -2693,7 +2693,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2715,9 +2715,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {-2.56, 0., 2.56}); @@ -2725,7 +2725,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2741,9 +2741,9 @@ 
TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test7) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdwExp('c', {}, std::vector{0.}); @@ -2751,7 +2751,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test7) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2767,9 +2767,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test7) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0. ,-0. ,-0. ,-0. 
,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05, -0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05}); @@ -2784,7 +2784,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2806,9 +2806,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083}); @@ -2819,7 +2819,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2841,9 +2841,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray 
weights('c', {1,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,1}, std::vector{12.}); @@ -2851,7 +2851,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2867,9 +2867,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {2.72, 4., 5.28}); @@ -2877,7 +2877,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2893,9 +2893,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, 
sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., -0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025}); @@ -2910,7 +2910,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { weights.t(2) = 0.; weights.t(3) = 0.; - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2932,9 +2932,9 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., -0.04167, -0.04167, -0.04167, -0.04167,-0.04167, -0.04167, -0.04167, -0.04167,-0.04167, -0.04167, -0.04167, -0.04167}); @@ -2947,7 +2947,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { weights.t(1) = 0.; weights.t(2) = 0.; - nd4j::ops::absolute_difference_loss_grad op; + sd::ops::absolute_difference_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2970,13 +2970,13 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { NDArray x = NDArrayFactory::create('c', {2,3,4}); - NDArray y = 
NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); - NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); + NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); + NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); x.linspace(1); y.linspace(1); exp.linspace(2,2); - nd4j::ops::add op; + sd::ops::add op; auto results = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2991,13 +2991,13 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { TEST_F(DeclarableOpsTests11, BFloat16_Test_2) { NDArray x = NDArrayFactory::create('c', {2,3,4}); - NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); - NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); + NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); + NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); x.linspace(1); y.linspace(1); exp.linspace(2,2); - nd4j::ops::add op; + sd::ops::add op; auto results = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3011,14 +3011,14 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, BFloat16_Test_3) { - NDArray x('c', {2,3,4}, nd4j::DataType::BFLOAT16); - NDArray y('c', {2,3,4}, nd4j::DataType::BFLOAT16); - NDArray exp('c', {2,3,4}, nd4j::DataType::BFLOAT16); + NDArray x('c', {2,3,4}, sd::DataType::BFLOAT16); + NDArray y('c', {2,3,4}, sd::DataType::BFLOAT16); + NDArray exp('c', {2,3,4}, sd::DataType::BFLOAT16); x.linspace(1); y.linspace(1); exp.linspace(2,2); - nd4j::ops::add op; + sd::ops::add op; auto results = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3032,9 +3032,9 @@ 
TEST_F(DeclarableOpsTests11, BFloat16_Test_3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.25999, -0.755 , -1.25 , -1.745 , -2.24001, -2.73502, -3.23004, -3.72508, -4.22014, -4.71523, -5.21034, -5.70548, -6.20066, -6.69587, -7.19113, -7.68643, -8.18177, -8.67717, -9.17262, -9.66813,-10.1637 ,-10.65932,-11.15501,-11.65077}); @@ -3047,7 +3047,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3069,9 +3069,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,1,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.18499,-0.53 ,-0.875 ,-1.22 ,-1.56501,-1.91002,-2.25504,-2.60008,-2.94514,-3.29023,-3.63534,-3.98048, -4.32566,-4.67087,-5.01613,-5.36143,-5.70677,-6.05217,-6.39762,-6.74313,-7.0887 ,-7.43432,-7.78001,-8.12577}); @@ -3083,7 +3083,7 @@ TEST_F(DeclarableOpsTests11, 
sigm_cross_entropy_loss_grad_test2) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3105,9 +3105,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.18499,-0.53 ,-0.875 ,-1.22 ,-1.56501,-1.91002,-2.25504,-2.60008,-2.94514,-3.29023,-3.63534,-3.98048, -4.32566,-4.67087,-5.01613,-5.36143,-5.70677,-6.05217,-6.39762,-6.74313,-7.0887 ,-7.43432,-7.78001,-8.12577}); @@ -3119,7 +3119,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3141,9 +3141,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test4) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, 
{-12.54779,-28.13393,-50.83936}); @@ -3151,7 +3151,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test4) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3167,9 +3167,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.01542,-0.04417,-0.07292,-0.10167,-0.13042,-0.15917,-0.18792,-0.21667,-0.24543,-0.27419,-0.30294,-0.33171, -0.36047,-0.38924,-0.41801,-0.44679,-0.47556,-0.50435,-0.53314,-0.56193,-0.59072,-0.61953,-0.64833,-0.67715}); @@ -3182,7 +3182,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3204,9 +3204,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, 
sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {1.4966 , 0.19776,-1.69436}); @@ -3214,7 +3214,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3230,9 +3230,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test7) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); NDArray dLdwExp('c', {}, std::vector{0.}); @@ -3240,7 +3240,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test7) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3256,9 +3256,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test7) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, { 0. , 0. , 0. , 0. 
,-0.1565 ,-0.191 ,-0.2255 ,-0.26001,-0.29451,-0.32902,-0.36353,-0.39805, -0.43257,-0.46709,-0.50161,-0.53614,-0.57068,-0.60522,-0.63976,-0.67431,-0.70887,-0.74343,-0.778 ,-0.81258}); @@ -3274,7 +3274,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3296,9 +3296,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.00771, -0.02208, -0.03646, -0.05083,-0.06521, -0.07958, -0.09396, -0.10834,-0.12271, -0.13709, -0.15147, -0.16585, -0.18024, -0.19462, -0.20901, -0.22339,-0.23778, -0.25217, -0.26657, -0.28096,-0.29536, -0.30976, -0.32417, -0.33857}); @@ -3310,7 +3310,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3332,9 +3332,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, 
nd4j::DataType::DOUBLE); - NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,1}, std::vector{-3.81338}); @@ -3342,7 +3342,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3358,9 +3358,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test11) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdwExp('c', {1,3,1}, {-0.52282,-1.17225,-2.11831}); @@ -3368,7 +3368,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test11) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3384,9 +3384,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray 
labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. ,-0.07825, -0.0955 , -0.11275, -0.13 ,-0.14726, -0.16451, -0.18177, -0.19902, -0.21628, -0.23354, -0.25081, -0.26807,-0.28534, -0.30261, -0.31988, -0.33716,-0.35443, -0.37172, -0.389 , -0.40629}); @@ -3403,7 +3403,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { weights.t(3) = 0.; - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3425,9 +3425,9 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test13) { - NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. 
, -0.36047, -0.38924, -0.41801, -0.44679,-0.47556, -0.50435, -0.53314, -0.56193,-0.59072, -0.61953, -0.64833, -0.67715}); @@ -3441,7 +3441,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test13) { weights.t(1) = 0.; weights.t(2) = 0.; - nd4j::ops::sigm_cross_entropy_loss_grad op; + sd::ops::sigm_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3464,13 +3464,13 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test13) { TEST_F(DeclarableOpsTests11, BFloat16_Test_4) { NDArray x = NDArrayFactory::create('c', {2,3,4}); - NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); - NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); + NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); + NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); x.linspace(1); y.linspace(1); exp.linspace(2,2); - nd4j::ops::add op; + sd::ops::add op; auto results = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3485,13 +3485,13 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_4) { TEST_F(DeclarableOpsTests11, BFloat16_Test_5) { NDArray x = NDArrayFactory::create('c', {2,3,4}); - NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); - NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); + NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); + NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); x.linspace(2, 2); y.linspace(1); exp.linspace(1); - nd4j::ops::subtract op; + sd::ops::subtract op; auto results = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3506,13 +3506,13 @@ 
TEST_F(DeclarableOpsTests11, BFloat16_Test_5) { TEST_F(DeclarableOpsTests11, BFloat16_Test_6) { NDArray x = NDArrayFactory::create('c', {2,3,4}); - NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); - NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); + NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); + NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, sd::DataType::BFLOAT16); x.linspace(2, 2); y.linspace(1); exp.linspace(1); - nd4j::ops::subtract op; + sd::ops::subtract op; auto results = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3526,9 +3526,9 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_6) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test1) { - NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, nd4j::DataType::INT32); - NDArray logits('c', {2,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, sd::DataType::INT32); + NDArray logits('c', {2,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,4}, {0.1176, 0.1224, -0.3726, 0.1326, 0.1176, -0.3776, 0.1274, 0.1326}); NDArray dLdwExp('c', {2}, {1.36729, 1.40729}); @@ -3536,7 +3536,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test1) { logits.linspace(-0.08, 0.04); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {0}); @@ -3557,9 +3557,9 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test2) { - NDArray labels('c', {4}, {0,0,1,0}, 
nd4j::DataType::INT32); - NDArray logits('c', {4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {4}, {0,0,1,0}, sd::DataType::INT32); + NDArray logits('c', {4}, sd::DataType::DOUBLE); + NDArray weights('c', {1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {4}, {0.125, 0.125, -0.375, 0.125}); NDArray dLdwExp('c', {1}, std::vector{1.38629}); @@ -3567,7 +3567,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test2) { logits = 2.; weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {1}); @@ -3588,9 +3588,9 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test2) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test3) { - NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); - NDArray logits('c', {4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {}, std::vector{0}, nd4j::DataType::DOUBLE); + NDArray labels('c', {4}, {0,0,1,0}, sd::DataType::INT32); + NDArray logits('c', {4}, sd::DataType::DOUBLE); + NDArray weights('c', {}, std::vector{0}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {4}, {0.125, 0.125, -0.375, 0.125}); NDArray dLdwExp('c', {}, std::vector{1.38629}); @@ -3598,7 +3598,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test3) { logits = 2.; weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {1}); @@ -3619,9 +3619,9 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test3) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test4) { - NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); - NDArray logits('c', {4}, 
nd4j::DataType::DOUBLE); - NDArray weights('c', {}, std::vector{0}, nd4j::DataType::DOUBLE); + NDArray labels('c', {4}, {0,0,1,0}, sd::DataType::INT32); + NDArray logits('c', {4}, sd::DataType::DOUBLE); + NDArray weights('c', {}, std::vector{0}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {4}, {0.23521, 0.2448 , -0.7452 , 0.26519}); NDArray dLdwExp('c', {}, std::vector{0.}); @@ -3629,7 +3629,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test4) { logits.linspace(-0.08, 0.04); weights = 0.5; - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {2}); @@ -3650,9 +3650,9 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test4) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test5) { - NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); - NDArray logits('c', {4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {4}, {0,0,1,0}, sd::DataType::INT32); + NDArray logits('c', {4}, sd::DataType::DOUBLE); + NDArray weights('c', {1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {4}, {0.1176, 0.1224, -0.3726, 0.1326}); NDArray dLdwExp('c', {1}, std::vector{1.36729}); @@ -3660,7 +3660,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test5) { logits.linspace(-0.08, 0.04); weights = 0.5; - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {3}); @@ -3681,9 +3681,9 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test5) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { - NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, nd4j::DataType::INT32); - NDArray logits('c', {2,4}, 
nd4j::DataType::DOUBLE); - NDArray weights('c', {2}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, sd::DataType::INT32); + NDArray logits('c', {2,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,4}, {0.0801, 0.0849, -0.2601, 0.0951, 0.0801, -0.2651, 0.0899, 0.0951}); NDArray dLdwExp('c', {2}, {-0.014000, 0.014000}); @@ -3691,7 +3691,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { logits.linspace(-0.08, 0.04); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.3}, {2}); @@ -3712,8 +3712,8 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test7) { - NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1, 1,0,0,0, 0,1,0,0}, nd4j::DataType::INT32); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1, 1,0,0,0, 0,1,0,0}, sd::DataType::INT32); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); NDArray weights('c', {1,3}, {0.5, 0., 1.5}); NDArray dLdpExp('c', {2,3,4}, {-0.0956 , 0.0306 , 0.03185, 0.03315, 0.,-0., 0., 0., 0.0882 , 0.0918 ,-0.27945, 0.09945, @@ -3722,7 +3722,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test7) { logits.linspace(-0.08, 0.04); - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {3}); @@ -3745,10 +3745,10 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test8) { NDArray labels('c', {2,3,4,5}, {1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0, 
0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1, - 0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0}, nd4j::DataType::INT32); + 0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0}, sd::DataType::INT32); - NDArray logits('c', {2,3,4,5}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,1,4}, nd4j::DataType::DOUBLE); + NDArray logits('c', {2,3,4,5}, sd::DataType::DOUBLE); + NDArray weights('c', {1,1,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4,5}, {-0.03399, 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, @@ -3764,7 +3764,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test8) { logits.linspace(-0.08, 0.04); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss_grad op; + sd::ops::softmax_cross_entropy_loss_grad op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {2}); @@ -3790,7 +3790,7 @@ TEST_F(DeclarableOpsTests11, SafeDivideMixed_Test1) { NDArray labels('c', {2, 3}, {1.0, 2.0, 3.0, -1.0, 2.0, 1.0}); auto sumDiff = labels.reduceAlongDimension(reduce::Sum, {1}, true); - NDArray numOfNonZero(sumDiff.getShapeInfo(), nd4j::DataType::INT64, false); + NDArray numOfNonZero(sumDiff.getShapeInfo(), sd::DataType::INT64, false); numOfNonZero.assign(1); sumDiff.applyPairwiseTransform(pairwise::SafeDivide, numOfNonZero, sumDiff); } @@ -3799,13 +3799,13 @@ TEST_F(DeclarableOpsTests11, SafeDivideMixed_Test1) { TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test1) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1, 1,0,0,0, 0,1,0,0}); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.76479, 0.2448, 0.2548, 0.26519, 0.23521,-0.7552, 0.2548, 
0.26519, 0.23521, 0.2448,-0.7452, 0.26519, 0.23521, 0.2448, 0.2548,-0.73481,-0.76479, 0.2448, 0.2548, 0.26519, 0.23521,-0.7552, 0.2548, 0.26519}); logits.linspace(-0.08, 0.04); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {}); @@ -3823,13 +3823,13 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test1) { TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test2) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,1, 0,0,1,0, 0,0,0,1, 1,0,1,0, 0,1,0,0}); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.71836, 0.28164, 0.28164, 0.28164, 0.33051, -0.66949, 0.33051, -0.66949, 0.38785, 0.38785, -0.61215, 0.38785, 0.28164, 0.28164, 0.28164, -0.71836,-0.66949, 0.33051, -0.66949, 0.33051, 0.38785, -0.61215, 0.38785, 0.38785}); logits.linspace(-0.08, 0.04); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {1}); @@ -3847,12 +3847,12 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test2) { TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test3) { NDArray labels('c', {2,3}, {1,0,0, 0,1,1}); - NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); + NDArray logits('c', {2,3}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3}, {-0.52996, 0.47004, 0.47004, 0.52996, -0.47004, -0.47004}); logits.linspace(-0.08, 0.04); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {0}); @@ -3874,7 +3874,7 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test4) { NDArray dLdpExp('c', {2,1}, {0., 0.}); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + 
sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {1}); @@ -3896,7 +3896,7 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test5) { NDArray dLdpExp('c', {2,1}, {-0.51999, 0.51999}); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {0}); @@ -3918,7 +3918,7 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test6) { NDArray dLdpExp('c', {1,2}, {0, 0.}); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {0}); @@ -3940,7 +3940,7 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test7) { NDArray dLdpExp('c', {2}, {0.48001, -0.48001}); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {0}); @@ -3962,7 +3962,7 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test8) { NDArray dLdpExp('c', {1}, std::vector{0}); - nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&logits, &labels}, {}, {0}); @@ -3979,17 +3979,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test8) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, Multiply_BP_Test1) { - NDArray x('c', {3,4,5}, nd4j::DataType::DOUBLE); - NDArray y('c', {1,1,1}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4,5}, sd::DataType::DOUBLE); + NDArray y('c', {1,1,1}, sd::DataType::DOUBLE); - NDArray dLdp('c', {3,4,5}, nd4j::DataType::DOUBLE); - NDArray dLdpExp('c', {3,4,5}, nd4j::DataType::DOUBLE); + NDArray dLdp('c', {3,4,5}, sd::DataType::DOUBLE); + NDArray dLdpExp('c', {3,4,5}, 
sd::DataType::DOUBLE); x.assign(1.0);//linspace(0.1, 0.1); y.assign(1.0); dLdp.assign(1.0); dLdpExp.assign(1.0); - nd4j::ops::multiply_bp op; + sd::ops::multiply_bp op; auto results = op.evaluate({&x, &y, &dLdp}, {}, {}); @@ -4005,14 +4005,14 @@ TEST_F(DeclarableOpsTests11, Multiply_BP_Test1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test1) { - NDArray labels('c', {2}, {2,1}, nd4j::DataType::INT64); - NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2}, {2,1}, sd::DataType::INT64); + NDArray logits('c', {2,3}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3}, {0.30061, 0.33222, -0.63283, 0.30061, -0.66778, 0.36717}); logits.linspace(0.1, 0.1); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&labels, &logits}, {}, {}); @@ -4029,14 +4029,14 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test2) { - NDArray labels('c', {2}, {0,1}, nd4j::DataType::INT64); - NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2}, {0,1}, sd::DataType::INT64); + NDArray logits('c', {2,3}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3}, {-0.69939, 0.33222, 0.36717, 0.30061, -0.66778, 0.36717}); logits.linspace(-0.1, 0.1); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&labels, &logits}, {}, {}); @@ -4053,12 +4053,12 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test2) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test3) { - NDArray 
labels('c', {}, std::vector{1}, nd4j::DataType::INT64); + NDArray labels('c', {}, std::vector{1}, sd::DataType::INT64); NDArray logits('c', {2}, {-0.2, 0.3}); NDArray dLdpExp('c', {2}, {0.37754, -0.37754}); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&labels, &logits}, {}, {}); @@ -4075,14 +4075,14 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test3) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test4) { - NDArray labels('c', {2,3}, {0,1,1, 3,3,2}, nd4j::DataType::INT64); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray labels('c', {2,3}, {0,1,1, 3,3,2}, sd::DataType::INT64); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {-0.78616, 0.23633, 0.26118, 0.28865, 0.21384, -0.76367, 0.26118, 0.28865, 0.21384, -0.76367, 0.26118, 0.28865, 0.21384, 0.23633, 0.26118, -0.71135, 0.21384, 0.23633, 0.26118, -0.71135, 0.21384, 0.23633, -0.73882, 0.28865}); logits.linspace(-0.5, 0.1); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.evaluate({&labels, &logits}, {}, {}); @@ -4099,12 +4099,12 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test4) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test5) { - NDArray labels('c', {1,1}, std::vector({0}), nd4j::DataType::INT64); + NDArray labels('c', {1,1}, std::vector({0}), sd::DataType::INT64); NDArray logits('c', {1,1,2}, {-0.3,0.2}); NDArray dLdpExp('c', {1,1,2}, {-0.62246, 0.62246}); - nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; + sd::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = 
op.evaluate({&labels, &logits}, {}, {}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp index aa4f7177a..2b07f352d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests12.cpp @@ -21,14 +21,14 @@ #include "testlayers.h" #include -#include +#include #include -#include -#include +#include +#include #include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests12 : public testing::Test { @@ -44,7 +44,7 @@ TEST_F(DeclarableOpsTests12, test_any_validation_1) { auto x = NDArrayFactory::create('c', {2, 1}, {1.0, 2.0}); auto y = NDArrayFactory::create('c', {2}, {1, 0}); - nd4j::ops::transpose op; + sd::ops::transpose op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -59,8 +59,8 @@ TEST_F(DeclarableOpsTests12, test_any_validation_1) { TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test1) { NDArray labels('c', {2,4}, {0,1,1,0,1,0,1,0}); - NDArray predictions('c', {2,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,1}, nd4j::DataType::DOUBLE); + NDArray predictions('c', {2,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,4}, {-0. , -0.5, -0.5, -0., -0.5, -0. 
, -0.5, -0.}); NDArray dLdwExp('c', {2,1}, {1.2, -0.2}); @@ -68,7 +68,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test1) { predictions.linspace(-0.4, 0.2); weights.assign(0.5); - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0, -1}); @@ -90,8 +90,8 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test1) { TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test2) { NDArray labels('c', {2,4}, {-0.1, 0.3, 2, -1.4, 2.5, -3, 1.2, 2.2}); - NDArray predictions('c', {2,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,4}, nd4j::DataType::DOUBLE); + NDArray predictions('c', {2,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,4}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,4}, {0.05, -0.15, -1. , 0.7 ,-1.25, 1.5 , -0.6 , -1.1 }); NDArray dLdwExp('c', {1,4}, {-0.04, 2.86, 0.04, -0.92}); @@ -100,7 +100,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test2) { predictions.linspace(-0.4, 0.2); weights.assign(0.5); - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0, 0}); @@ -124,8 +124,8 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test2) { TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test3) { NDArray labels('c', {4}, {-0.1, 0.3, 2, -1.4}); - NDArray predictions('c', {4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1}, nd4j::DataType::DOUBLE); + NDArray predictions('c', {4}, sd::DataType::DOUBLE); + NDArray weights('c', {1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {4}, {0.05, -0.15, -1., 0.7}); NDArray dLdwExp('c', {1}, std::vector{1.3}); @@ -134,7 +134,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test3) { predictions.linspace(-0.4, 0.2); weights.assign(0.5); - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = 
op.evaluate({&predictions, &weights, &labels}, {}, {0, 0}); @@ -158,8 +158,8 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test3) { TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test4) { NDArray labels('c', {1,4}, {-0.1, 0.3, 2, -1.4}); - NDArray predictions('c', {1,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {}, std::vector{0.}, nd4j::DataType::DOUBLE); + NDArray predictions('c', {1,4}, sd::DataType::DOUBLE); + NDArray weights('c', {}, std::vector{0.}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {1,4}, {0.05, -0.15, -1., 0.7}); NDArray dLdwExp('c', {}, std::vector{1.3}); @@ -168,7 +168,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test4) { predictions.linspace(-0.4, 0.2); weights.assign(0.5); - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1, 1}); @@ -192,9 +192,9 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test4) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test5) { - NDArray labels('c', {4}, {-0.1, 0.3, 2, -1.4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {4}, {-0.1, 0.3, 2, -1.4}, sd::DataType::DOUBLE); + NDArray predictions('c', {4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {4}, {0.1, -0.3, -2. 
, 1.4}); NDArray dLdwExp('c', {1,1}, std::vector{0.}); @@ -203,7 +203,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test5) { predictions.linspace(-0.4, 0.2); weights = 0.5; - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2, 0}); @@ -226,9 +226,9 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test5) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test6) { - NDArray labels('c', {4,1}, {-0.1, 0.3, 2, -1.4}, nd4j::DataType::DOUBLE); - NDArray predictions('c', {4,1}, nd4j::DataType::DOUBLE); - NDArray weights('c', {4,1}, nd4j::DataType::DOUBLE); + NDArray labels('c', {4,1}, {-0.1, 0.3, 2, -1.4}, sd::DataType::DOUBLE); + NDArray predictions('c', {4,1}, sd::DataType::DOUBLE); + NDArray weights('c', {4,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {4,1}, {0.0125, -0.0375, -0.25 , 0.175}); NDArray dLdwExp('c', {4,1}, {0.24 , 0.265, 0.25 , 0.32}); @@ -237,7 +237,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test6) { predictions.linspace(-0.4, 0.2); weights = 0.5; - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3, 1}); @@ -261,8 +261,8 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test6) { TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test7) { NDArray labels('c', {2,3,4}, {-0.1, 0.3, 2, -1.4, 2.5, -3, 1.2, 2.2,-0.1, 0.3, 2, -3.4, 2.5, -3, 1.2, 2.2,-0.2, 0.3, 2, -1.4, 2.7, -3, 1.2, 4.2}); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {1,3,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0.00833, -0.025 , -0.16667, 0.11667,-0.20833, 0.25 , -0.1 , -0.18333, 0.00833, -0.025 , 
-0.16667, 0.28333, -0.20833, 0.25 , -0.1 , -0.18333, 0.01667, -0.025 , -0.16667, 0.11667,-0.225 , 0.25 , -0.1 , -0.35 }); @@ -273,7 +273,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test7) { predictions.linspace(-0.4, 0.2); weights = 0.5; - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2, 0}); @@ -297,8 +297,8 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test7) { TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test8) { NDArray labels('c', {2,3,4}, {-0.1, 0.3, 2, -1.4, 2.5, -3, 1.2, 2.2,-0.1, 0.3, 2, -3.4, 2.5, -3, 1.2, 2.2,-0.2, 0.3, 2, -1.4, 2.7, -3, 1.2, 4.2}); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,1,1}, nd4j::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,1,1}, sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0.00625, -0.01875, -0.125 , 0.0875,-0.15625, 0.1875 , -0.075 , -0.1375, 0.00625, -0.01875, -0.125 , 0.2125, -0.15625, 0.1875 , -0.075 , -0.1375, 0.0125 , -0.01875, -0.125 , 0.0875,-0.16875, 0.1875 , -0.075 , -0.2625}); @@ -309,7 +309,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test8) { predictions.linspace(-0.4, 0.2); weights = 0.5; - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3, 1}); @@ -333,8 +333,8 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test8) { TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test9) { NDArray labels('c', {2,3,4}, {-0.1, 0.3, 2, -1.4, 2.5, -3, 1.2, 2.2,-0.1, 0.3, 2, -3.4, 2.5, -3, 1.2, 2.2,-0.2, 0.3, 2, -1.4, 2.7, -3, 1.2, 4.2}); - NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); + NDArray predictions('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray weights('c', {2,3,1}, 
sd::DataType::DOUBLE); NDArray dLdpExp('c', {2,3,4}, {0.05, -0.15, -1. , 0.7,-1.25, 1.5 , -0.6 , -1.1, 0.05, -0.15, -1. , 1.7, -1.25, 1.5 , -0.6 , -1.1, 0.1 , -0.15, -1. , 0.7,-1.35, 1.5 , -0.6 , -2.1}); @@ -345,7 +345,7 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test9) { predictions.linspace(-0.4, 0.2); weights = 0.5; - nd4j::ops::cosine_distance_loss_grad op; + sd::ops::cosine_distance_loss_grad op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0, 2}); @@ -369,16 +369,16 @@ TEST_F(DeclarableOpsTests12, cosine_distance_loss_grad_test9) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, hinge_loss_14) { - NDArray logits('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray logits('c', {3,4}, sd::DataType::DOUBLE); NDArray weights('c', {}, std::vector{1.}); NDArray labels('c', {3,4}, {0,1,1,0,1,0,1,0,1,0,1,0}); - NDArray output('c', {}, std::vector{0.}, nd4j::DataType::DOUBLE); + NDArray output('c', {}, std::vector{0.}, sd::DataType::DOUBLE); logits.linspace(1.); weights.assign(1.); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; Nd4jStatus status = op.execute({&logits, &weights, &labels}, {&output}, {}, {1}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -389,17 +389,17 @@ TEST_F(DeclarableOpsTests12, hinge_loss_14) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestDivideBP_1) { - NDArray x('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4}, sd::DataType::DOUBLE); NDArray y = NDArrayFactory::create(2.); - NDArray eps('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray eps('c', {3,4}, sd::DataType::DOUBLE); - NDArray output1('c', {3, 4}, nd4j::DataType::DOUBLE); - NDArray output2(nd4j::DataType::DOUBLE); + NDArray output1('c', {3, 4}, sd::DataType::DOUBLE); + NDArray output2(sd::DataType::DOUBLE); x.linspace(2., 2.); eps.linspace(1.); - nd4j::ops::divide_bp op; + sd::ops::divide_bp op; Nd4jStatus status = op.execute({&x, &y, &eps}, 
{&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -409,20 +409,20 @@ TEST_F(DeclarableOpsTests12, TestDivideBP_1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestDivideBP_2) { - NDArray x('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4}, sd::DataType::DOUBLE); NDArray y = NDArrayFactory::create('c', {3,4}); - NDArray eps('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray exp1('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray output1('c', {3, 4}, nd4j::DataType::DOUBLE); - NDArray output2('c', {3, 4}, nd4j::DataType::DOUBLE); + NDArray eps('c', {3,4}, sd::DataType::DOUBLE); + NDArray exp1('c', {3,4}, sd::DataType::DOUBLE); + NDArray exp2('c', {3,4}, sd::DataType::DOUBLE); + NDArray output1('c', {3, 4}, sd::DataType::DOUBLE); + NDArray output2('c', {3, 4}, sd::DataType::DOUBLE); exp1.assign(1.); exp2.assign(-2.); x.linspace(2., 2.); y.linspace(1.); eps.linspace(1.); - nd4j::ops::divide_bp op; + sd::ops::divide_bp op; Nd4jStatus status = op.execute({&x, &y, &eps}, std::vector{&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -433,17 +433,17 @@ TEST_F(DeclarableOpsTests12, TestDivideBP_2) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestReverseDivideBP_1) { - NDArray x('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4}, sd::DataType::DOUBLE); NDArray y = NDArrayFactory::create(2.); - NDArray eps('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray eps('c', {3,4}, sd::DataType::DOUBLE); - NDArray output1('c', {3, 4}, nd4j::DataType::DOUBLE); - NDArray output2(nd4j::DataType::DOUBLE); + NDArray output1('c', {3, 4}, sd::DataType::DOUBLE); + NDArray output2(sd::DataType::DOUBLE); x.linspace(2., 2.); eps.linspace(1.); - nd4j::ops::reversedivide_bp op; + sd::ops::reversedivide_bp op; Nd4jStatus status = op.execute({&y, &x, &eps}, std::vector{&output2, &output1}, {}, {}, 
{}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -453,21 +453,21 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestReverseDivideBP_2) { - NDArray x('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4}, sd::DataType::DOUBLE); NDArray y = NDArrayFactory::create('c', {3,4}); - NDArray eps('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray exp1('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray eps('c', {3,4}, sd::DataType::DOUBLE); + NDArray exp1('c', {3,4}, sd::DataType::DOUBLE); + NDArray exp2('c', {3,4}, sd::DataType::DOUBLE); - NDArray output1('c', {3, 4}, nd4j::DataType::DOUBLE); - NDArray output2('c', {3, 4}, nd4j::DataType::DOUBLE); + NDArray output1('c', {3, 4}, sd::DataType::DOUBLE); + NDArray output2('c', {3, 4}, sd::DataType::DOUBLE); x.linspace(2., 2.); y.linspace(1.); eps.linspace(1.); exp1.assign(1.); exp2.assign(-2.); - nd4j::ops::reversedivide_bp op; + sd::ops::reversedivide_bp op; Nd4jStatus status = op.execute({&y, &x, &eps}, std::vector{&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -478,19 +478,19 @@ TEST_F(DeclarableOpsTests12, TestReverseDivideBP_2) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestSliceBP_1) { - NDArray x('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray eps('c', {2,2}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4}, sd::DataType::DOUBLE); + NDArray eps('c', {2,2}, sd::DataType::DOUBLE); NDArray exp('c', {3,4}, {0., 0., 0., 0., 0., 1.,1., 0., 0., 1., 1., 0.}); - //NDArray exp2('c', {3,4}, nd4j::DataType::DOUBLE); + //NDArray exp2('c', {3,4}, sd::DataType::DOUBLE); - NDArray output('c', {3, 4}, nd4j::DataType::DOUBLE); - //NDArray output2('c', {3, 4}, nd4j::DataType::DOUBLE); + NDArray output('c', {3, 4}, sd::DataType::DOUBLE); + //NDArray output2('c', {3, 4}, sd::DataType::DOUBLE); output.assign(119.113); 
x.linspace(1.); eps.assign(1.); //exp1.assign(1.); //exp2.assign(-2.); - nd4j::ops::slice_bp op; + sd::ops::slice_bp op; Nd4jStatus status = op.execute({&x, &eps}, {&output}, {}, {1,1,2,2}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -501,20 +501,20 @@ TEST_F(DeclarableOpsTests12, TestSliceBP_1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestConfusionZero_1) { - NDArray x('c', {2}, {1,2}, nd4j::DataType::INT64); - NDArray i('c', {2}, {0,2}, nd4j::DataType::INT64); - //NDArray eps('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray exp('c', {4,4}, {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, nd4j::DataType::INT64); - //NDArray exp2('c', {3,4}, nd4j::DataType::DOUBLE); + NDArray x('c', {2}, {1,2}, sd::DataType::INT64); + NDArray i('c', {2}, {0,2}, sd::DataType::INT64); + //NDArray eps('c', {2,2}, sd::DataType::DOUBLE); + NDArray exp('c', {4,4}, {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, sd::DataType::INT64); + //NDArray exp2('c', {3,4}, sd::DataType::DOUBLE); - NDArray output('c', {4, 4}, nd4j::DataType::INT64); - //NDArray output2('c', {3, 4}, nd4j::DataType::DOUBLE); + NDArray output('c', {4, 4}, sd::DataType::INT64); + //NDArray output2('c', {3, 4}, sd::DataType::DOUBLE); output.assign(119.113); x.linspace(1.); //eps.assign(1.); //exp1.assign(1.); //exp2.assign(-2.); - nd4j::ops::confusion_matrix op; + sd::ops::confusion_matrix op; Nd4jStatus status = op.execute({&x, &i}, {&output}, {}, {4}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -525,21 +525,21 @@ TEST_F(DeclarableOpsTests12, TestConfusionZero_1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { - NDArray x('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray y('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray eps('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray exp1('c', {3,4}, {0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {3,4}, {1, 2, 3, 4, 5, 6, 0, 0, 
0, 0, 0, 0}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4}, sd::DataType::DOUBLE); + NDArray y('c', {3,4}, sd::DataType::DOUBLE); + NDArray eps('c', {3,4}, sd::DataType::DOUBLE); + NDArray exp1('c', {3,4}, {0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12}, sd::DataType::DOUBLE); + NDArray exp2('c', {3,4}, {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0}, sd::DataType::DOUBLE); - NDArray output1('c', {3, 4}, nd4j::DataType::DOUBLE); - NDArray output2('c', {3, 4}, nd4j::DataType::DOUBLE); + NDArray output1('c', {3, 4}, sd::DataType::DOUBLE); + NDArray output2('c', {3, 4}, sd::DataType::DOUBLE); output1.assign(119); x.linspace(1.); y.linspace(12., -1.); eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); - nd4j::ops::maximum_bp op; + sd::ops::maximum_bp op; Nd4jStatus status = op.execute({&x, &y, &eps}, std::vector{&output1, &output2}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -550,21 +550,21 @@ TEST_F(DeclarableOpsTests12, TestMaximumBP_1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { - NDArray x('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray y('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray eps('c', {3,4}, nd4j::DataType::DOUBLE); - NDArray exp1('c', {3,4}, {0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {3,4}, {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0}, nd4j::DataType::DOUBLE); + NDArray x('c', {3,4}, sd::DataType::DOUBLE); + NDArray y('c', {3,4}, sd::DataType::DOUBLE); + NDArray eps('c', {3,4}, sd::DataType::DOUBLE); + NDArray exp1('c', {3,4}, {0, 0, 0, 0, 0, 0, 7, 8, 9, 10, 11, 12}, sd::DataType::DOUBLE); + NDArray exp2('c', {3,4}, {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0}, sd::DataType::DOUBLE); - NDArray output1('c', {3, 4}, nd4j::DataType::DOUBLE); - NDArray output2('c', {3, 4}, nd4j::DataType::DOUBLE); + NDArray output1('c', {3, 4}, sd::DataType::DOUBLE); + NDArray output2('c', {3, 4}, sd::DataType::DOUBLE); output1.assign(119); x.linspace(1.); y.linspace(12., -1.); 
eps.linspace(1.); //exp1.assign(1.); //exp2.assign(-2.); - nd4j::ops::minimum_bp op; + sd::ops::minimum_bp op; Nd4jStatus status = op.execute({&x, &y, &eps}, std::vector{&output2, &output1}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -576,13 +576,13 @@ TEST_F(DeclarableOpsTests12, TestMinimumBP_1) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, reverse_test15) { - NDArray x('c', {5}, {1,2,3,4,5}, nd4j::DataType::DOUBLE); - NDArray axis('c', {}, std::vector{0}, nd4j::DataType::INT32); - NDArray z('c', {5}, nd4j::DataType::DOUBLE); - NDArray exp('c', {5}, {5,4,3,2,1}, nd4j::DataType::DOUBLE); + NDArray x('c', {5}, {1,2,3,4,5}, sd::DataType::DOUBLE); + NDArray axis('c', {}, std::vector{0}, sd::DataType::INT32); + NDArray z('c', {5}, sd::DataType::DOUBLE); + NDArray exp('c', {5}, {5,4,3,2,1}, sd::DataType::DOUBLE); - nd4j::ops::reverse op; + sd::ops::reverse op; // auto result = op.execute({&x, &axis}, {}, {1}, {}); Nd4jStatus status = op.execute({&x, &axis}, {&z}, {}, {1}, {}); // auto z = result->at(0); @@ -597,13 +597,13 @@ TEST_F(DeclarableOpsTests12, reverse_test15) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, mirrorPad_test17) { - NDArray x('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::DOUBLE); - NDArray padding('c', {2,2}, {1,1,2,2}, nd4j::DataType::INT64); - NDArray z('c', {4,7}, nd4j::DataType::DOUBLE); - NDArray exp1('c', {4,7}, {6, 5, 4, 5, 6, 5, 4,3, 2, 1, 2, 3, 2, 1,6, 5, 4, 5, 6, 5, 4,3, 2, 1, 2, 3, 2, 1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {4,7}, {2, 1, 1, 2, 3, 3, 2,2, 1, 1, 2, 3, 3, 2,5, 4, 4, 5, 6, 6, 5,5, 4, 4, 5, 6, 6, 5}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::DOUBLE); + NDArray padding('c', {2,2}, {1,1,2,2}, sd::DataType::INT64); + NDArray z('c', {4,7}, sd::DataType::DOUBLE); + NDArray exp1('c', {4,7}, {6, 5, 4, 5, 6, 5, 4,3, 2, 1, 2, 3, 2, 1,6, 5, 4, 5, 6, 5, 4,3, 2, 1, 2, 3, 2, 1}, 
sd::DataType::DOUBLE); + NDArray exp2('c', {4,7}, {2, 1, 1, 2, 3, 3, 2,2, 1, 1, 2, 3, 3, 2,5, 4, 4, 5, 6, 6, 5,5, 4, 4, 5, 6, 6, 5}, sd::DataType::DOUBLE); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; Nd4jStatus status = op.execute({&x, &padding}, {&z}, {}, {0}, {}); // reflect ASSERT_EQ(Status::OK(), status); @@ -621,12 +621,12 @@ TEST_F(DeclarableOpsTests12, mirrorPad_test17) { ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, mirrorPad_test18) { - NDArray x('c', {3}, {1,2,3}, nd4j::DataType::DOUBLE); - NDArray padding('c', {1, 2}, {1,1}, nd4j::DataType::INT32); - NDArray z('c', {5}, nd4j::DataType::DOUBLE); - NDArray exp('c', {5}, {2,1,2,3,2}, nd4j::DataType::DOUBLE); + NDArray x('c', {3}, {1,2,3}, sd::DataType::DOUBLE); + NDArray padding('c', {1, 2}, {1,1}, sd::DataType::INT32); + NDArray z('c', {5}, sd::DataType::DOUBLE); + NDArray exp('c', {5}, {2,1,2,3,2}, sd::DataType::DOUBLE); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; Nd4jStatus status = op.execute({&x, &padding}, {&z}, {}, {0}, {}); // reflect ASSERT_EQ(Status::OK(), status); @@ -649,7 +649,7 @@ TEST_F(DeclarableOpsTests12, relu_1) { 0.133276, 0.326284, 0.102804, -0.133276, -0.326284, -0.102804,0.426913, 0.256251, 0.305241, -0.426913, -0.256251, -0.305241, 0.177977, 0.841799, 0.800615, -0.177977, -0.841799, -0.800615,0.001991, 0.518389, 0.439322, -0.001991, -0.518389, -0.439322, 0.166846, 0.508224, 0.486687, -0.166846, -0.508224, -0.486687,0.167493, 0.930932, 0.868717, -0.167493, -0.930932, -0.868717, - 0.174864, 0.444607, 0.445000, -0.174864, -0.444607, -0.445000}, nd4j::DataType::FLOAT32); + 0.174864, 0.444607, 0.445000, -0.174864, -0.444607, -0.445000}, sd::DataType::FLOAT32); NDArray expected('c', {1,5,5,6}, { 0.557449, 0.768277, 1.094015, 0., 0., 0., 0.563735, 0.900299, 0.789979, 0., 0., 0., 0.142528, 0.959611, 0.877506, 0., 0., 0., 0.448742, 0.995377, 1.171543, 0., 0., 0., @@ -663,11 +663,11 @@ TEST_F(DeclarableOpsTests12, relu_1) { 
0.133276, 0.326284, 0.102804, 0., 0., 0., 0.426913, 0.256251, 0.305241, 0., 0., 0., 0.177977, 0.841799, 0.800615, 0., 0., 0., 0.001991, 0.518389, 0.439322, 0., 0., 0., 0.166846, 0.508224, 0.486687, 0., 0., 0., 0.167493, 0.930932, 0.868717, 0., 0., 0., - 0.174864, 0.444607, 0.445000, 0., 0., 0.}, nd4j::DataType::FLOAT32); + 0.174864, 0.444607, 0.445000, 0., 0., 0.}, sd::DataType::FLOAT32); - NDArray z('c', {1,5,5,6}, nd4j::DataType::FLOAT32); + NDArray z('c', {1,5,5,6}, sd::DataType::FLOAT32); - nd4j::ops::relu op; + sd::ops::relu op; Nd4jStatus status = op.execute({&input}, {&z}, {0}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -679,47 +679,47 @@ TEST_F(DeclarableOpsTests12, relu_1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, multiUnique_1) { - NDArray input1('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, nd4j::DataType::INT32); - NDArray input2('c', {3,4}, {1,2,3,4,5,6,7,8,9,10,11,12}, nd4j::DataType::INT32); - NDArray input3('c', {2,3}, {10,11,12,13,14,15}, nd4j::DataType::INT32); - NDArray input4('c', {1,5}, {7,8,9,10,11}, nd4j::DataType::INT32); - NDArray input5('c', {5,3}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, nd4j::DataType::INT32); + NDArray input1('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, sd::DataType::INT32); + NDArray input2('c', {3,4}, {1,2,3,4,5,6,7,8,9,10,11,12}, sd::DataType::INT32); + NDArray input3('c', {2,3}, {10,11,12,13,14,15}, sd::DataType::INT32); + NDArray input4('c', {1,5}, {7,8,9,10,11}, sd::DataType::INT32); + NDArray input5('c', {5,3}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, sd::DataType::INT32); - //NDArray indices('c', {1}, {2}, nd4j::DataType::INT32); - //NDArray expected('c', {1,5}, {11, 12, 13, 14, 15.}, nd4j::DataType::FLOAT32); + //NDArray indices('c', {1}, {2}, sd::DataType::INT32); + //NDArray expected('c', {1,5}, {11, 12, 13, 14, 15.}, sd::DataType::FLOAT32); std::vector arrayList({&input1, &input2, &input3, &input4, &input5}); - 
ASSERT_FALSE(nd4j::ops::helpers::multiUnique(arrayList)); + ASSERT_FALSE(sd::ops::helpers::multiUnique(arrayList)); } //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, multiUnique_2) { - NDArray input1('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, nd4j::DataType::INT32); - NDArray input2('c', {3,4}, {21,22,23,24,25,26,27,28,29,210,211,212}, nd4j::DataType::INT32); - NDArray input3('c', {2,3}, {310,311,312,313,314,315}, nd4j::DataType::INT32); - NDArray input4('c', {1,5}, {47,48,49,410,411}, nd4j::DataType::INT32); - NDArray input5('c', {5,3}, {51,52,53,54,55,56,57,58,59,510,511,512,513,514,515}, nd4j::DataType::INT32); + NDArray input1('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, sd::DataType::INT32); + NDArray input2('c', {3,4}, {21,22,23,24,25,26,27,28,29,210,211,212}, sd::DataType::INT32); + NDArray input3('c', {2,3}, {310,311,312,313,314,315}, sd::DataType::INT32); + NDArray input4('c', {1,5}, {47,48,49,410,411}, sd::DataType::INT32); + NDArray input5('c', {5,3}, {51,52,53,54,55,56,57,58,59,510,511,512,513,514,515}, sd::DataType::INT32); - //NDArray indices('c', {1}, {2}, nd4j::DataType::INT32); - //NDArray expected('c', {1,5}, {11, 12, 13, 14, 15.}, nd4j::DataType::FLOAT32); + //NDArray indices('c', {1}, {2}, sd::DataType::INT32); + //NDArray expected('c', {1,5}, {11, 12, 13, 14, 15.}, sd::DataType::FLOAT32); std::vector arrayList({&input1, &input2, &input3, &input4, &input5}); - ASSERT_TRUE(nd4j::ops::helpers::multiUnique(arrayList)); + ASSERT_TRUE(sd::ops::helpers::multiUnique(arrayList)); } //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, reduceMeanBp_4) { NDArray x('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}); - NDArray gradO('c', {5}, nd4j::DataType::DOUBLE); - NDArray exp('c', {3,5}, nd4j::DataType::DOUBLE); + NDArray gradO('c', {5}, sd::DataType::DOUBLE); + NDArray exp('c', {3,5}, sd::DataType::DOUBLE); gradO = 1.; exp = 
0.333333; - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO}, {}, {0}); auto output = result->at(0); @@ -735,13 +735,13 @@ TEST_F(DeclarableOpsTests12, reduceMeanBp_4) { TEST_F(DeclarableOpsTests12, reduceMeanBp_5) { NDArray x('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}); - NDArray gradO('c', {3}, nd4j::DataType::DOUBLE); - NDArray exp('c', {3,5}, nd4j::DataType::DOUBLE); + NDArray gradO('c', {3}, sd::DataType::DOUBLE); + NDArray exp('c', {3,5}, sd::DataType::DOUBLE); gradO = 1.; exp = 0.2; - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO}, {}, {1}); auto output = result->at(0); @@ -756,10 +756,10 @@ TEST_F(DeclarableOpsTests12, reduceMeanBp_5) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, reduceSqnormBp_1) { - NDArray x('c', {8,6,4}, nd4j::DataType::DOUBLE); - NDArray gradO('c', {8,6,1}, nd4j::DataType::DOUBLE); + NDArray x('c', {8,6,4}, sd::DataType::DOUBLE); + NDArray gradO('c', {8,6,1}, sd::DataType::DOUBLE); - nd4j::ops::reduce_sqnorm_bp op; + sd::ops::reduce_sqnorm_bp op; auto result = op.evaluate({&x, &gradO}, {1}, {2}); ASSERT_EQ(Status::OK(), result->status()); @@ -770,7 +770,7 @@ TEST_F(DeclarableOpsTests12, reduceSqnormBp_1) { TEST_F(DeclarableOpsTests12, pullRows_1) { NDArray x('c', {5, 1}, {0,1,2,3,4}); - NDArray z('c', {4, 1}, nd4j::DataType::DOUBLE); + NDArray z('c', {4, 1}, sd::DataType::DOUBLE); NDArray exp('c', {4, 1}, {0,2,3,4}); Nd4jLong indexes[] = {0,2,3,4}; @@ -779,8 +779,8 @@ TEST_F(DeclarableOpsTests12, pullRows_1) { std::vector dims = {1}; - auto xTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), dims); - auto zTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z.getShapeInfo(), dims); + auto xTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), dims); + auto zTadPack = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(z.getShapeInfo(), dims); Nd4jPointer nativeStart[2]; @@ -806,7 +806,7 @@ TEST_F(DeclarableOpsTests12, pullRows_2) { NDArray* y = new NDArray(arr.dup('c')); NDArray x = (*y)({0,0, 0,1}, true); // view, points on first column of y, shape is {5,1} - NDArray z('c', {4, 1}, nd4j::DataType::DOUBLE); + NDArray z('c', {4, 1}, sd::DataType::DOUBLE); NDArray exp('c', {4, 1}, {0,2,3,4}); Nd4jLong indexes[] = {0,2,3,4}; @@ -815,8 +815,8 @@ TEST_F(DeclarableOpsTests12, pullRows_2) { std::vector dims = {1}; - auto xTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), dims); - auto zTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z.getShapeInfo(), dims); + auto xTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), dims); + auto zTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(z.getShapeInfo(), dims); Nd4jPointer nativeStart[2]; #ifdef __CUDABLAS__ @@ -837,15 +837,15 @@ TEST_F(DeclarableOpsTests12, pullRows_2) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, softmax_9) { - NDArray arrC('c', {5,2}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 1}, nd4j::DataType::FLOAT32); + NDArray arrC('c', {5,2}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 1}, sd::DataType::FLOAT32); NDArray* arrF = new NDArray(arrC.dup('f')); - NDArray outCC('c', {5,2}, nd4j::DataType::FLOAT32); - NDArray outCF('f', {5,2}, nd4j::DataType::FLOAT32); - NDArray outFC('c', {5,2}, nd4j::DataType::FLOAT32); - NDArray outFF('c', {5,2}, nd4j::DataType::FLOAT32); + NDArray outCC('c', {5,2}, sd::DataType::FLOAT32); + NDArray outCF('f', {5,2}, sd::DataType::FLOAT32); + NDArray outFC('c', {5,2}, sd::DataType::FLOAT32); + NDArray outFF('c', {5,2}, sd::DataType::FLOAT32); - nd4j::ops::softmax op; + sd::ops::softmax op; auto status1 = op.execute({&arrC}, {&outCC}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status1); 
auto status2 = op.execute({&arrC}, {&outCF}, {}, {}, {}); @@ -872,7 +872,7 @@ TEST_F(DeclarableOpsTests12, maxpool_bp_half_1) { auto y = NDArrayFactory::create('c', {2, 3, 10, 1}, {0.0f, -0.13391113f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.1751709f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.51904297f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5107422f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); auto z = NDArrayFactory::create('c', {2, 3, 10, 1}); - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; Context ctx(1); Nd4jLong iArgs[] = {5,1,1, 2,2,0, 1,1,1, 0,0}; ctx.setIArguments(iArgs, 11); @@ -912,7 +912,7 @@ TEST_F(DeclarableOpsTests12, lrn_bp_1) { input.linspace(1); gradO = 1; - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 1., 1}, {5}); auto gradI = results->at(0); @@ -943,7 +943,7 @@ TEST_F(DeclarableOpsTests12, lrn_bp_2) { input.linspace(-10, 0.1); gradO = 1; - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 1., 1}, {2}); auto gradI = results->at(0); @@ -974,7 +974,7 @@ TEST_F(DeclarableOpsTests12, lrn_bp_3) { input.linspace(-10, 0.1); gradO = 1; - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 1., 1}, {7}); auto gradI = results->at(0); @@ -1005,7 +1005,7 @@ TEST_F(DeclarableOpsTests12, lrn_bp_4) { input.linspace(-10, 0.1); gradO = 1; - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 1., 1}, {12}); auto gradI = results->at(0); @@ -1028,7 +1028,7 @@ TEST_F(DeclarableOpsTests12, lrn_bp_5) { // gradO.linspace(0.1, 0.1); gradO = 1; - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 1., 0.5}, {2}); auto gradI = results->at(0); @@ -1047,7 +1047,7 @@ 
TEST_F(DeclarableOpsTests12, lrn_bp_6) { // gradO.linspace(-1.5, 0.1); gradO = 1; - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 2., 0.5}, {10}); auto gradI = results->at(0); @@ -1069,8 +1069,8 @@ TEST_F(DeclarableOpsTests12, lrn_bp_7) { const OpArgsHolder argsHolderFF({&input}, {1,2,0.5}, {2}); const OpArgsHolder argsHolderBP({&input, &gradO}, {1,2,0.5}, {2}); - nd4j::ops::lrn opFF; - nd4j::ops::lrn_bp opBP; + sd::ops::lrn opFF; + sd::ops::lrn_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1086,8 +1086,8 @@ TEST_F(DeclarableOpsTests12, lrn_bp_8) { const OpArgsHolder argsHolderFF({&input}, {1,2,0.5}, {2}); const OpArgsHolder argsHolderBP({&input, &gradO}, {1,2,0.5}, {2}); - nd4j::ops::lrn opFF; - nd4j::ops::lrn_bp opBP; + sd::ops::lrn opFF; + sd::ops::lrn_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1101,7 +1101,7 @@ TEST_F(DeclarableOpsTests12, lrn_bp_9) { NDArray gradO('c', {1,1,1,5}, {1, 1, 1, 1, 1}); NDArray exp('c', {1,1,1,5}, {0.1084472 , 0.03816165, 0.00978456, -0.01859251,-0.02511311}); - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 2., 0.5}, {3}); auto gradI = results->at(0); @@ -1121,7 +1121,7 @@ TEST_F(DeclarableOpsTests12, lrn_bp_10) { NDArray gradO('c', {1,1,1,1}, std::vector{1}); NDArray exp('c', {1,1,1,1}, std::vector{0.19245008}); - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&input, &gradO}, {1., 2., 0.5}, {1}); auto gradI = results->at(0); @@ -1142,7 +1142,7 @@ TEST_F(DeclarableOpsTests12, lrn_1) { input.linspace(-20, 1); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&input}, {1., 2., 0.5}, {2}); auto output = results->at(0); @@ -1158,7 +1158,7 @@ TEST_F(DeclarableOpsTests12, lrn_2) { NDArray input('c', {1,1,1,5}, {1, 2., 3, 4, 5}); NDArray exp('c', {1,1,1,5}, {0.09530295, 0.1906059 , 
0.28590885, 0.3812118 , 0.47651473}); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&input}, {0.1, 2., 0.5}, {5}); auto output = results->at(0); @@ -1173,7 +1173,7 @@ TEST_F(DeclarableOpsTests12, lrn_3) { NDArray input('c', {1,1,1,1}, std::vector{1.}); NDArray exp('c', {1,1,1,1}, std::vector{0.69006556}); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&input}, {0.1, 2., 0.5}, {5}); auto output = results->at(0); @@ -1188,7 +1188,7 @@ TEST_F(DeclarableOpsTests12, lrn_4) { NDArray input('c', {1,1,1,1}, std::vector{1.}); NDArray exp('c', {1,1,1,1}, std::vector{0.69006556}); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&input}, {0.1, 2., 0.5}, {0}); auto output = results->at(0); @@ -1203,7 +1203,7 @@ TEST_F(DeclarableOpsTests12, lrn_5) { NDArray input('c', {1,1,1,5}, {1, 2., 3, 4, 5}); NDArray exp('c', {1,1,1,5}, {0.69006556, 0.70272833, 0.7051508 , 0.7060045 , 0.7064008}); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&input}, {0.1, 2., 0.5}, {0}); auto output = results->at(0); @@ -1216,12 +1216,12 @@ TEST_F(DeclarableOpsTests12, lrn_5) { TEST_F(DeclarableOpsTests12, inTopK_1) { NDArray x('c', {4, 5}, {11.0, 14.0, 6.0, 9.0, 3.5, 7.0, 21.0, 3.0, 15.0, 6.0, 9.0, 3.5, 7.0, 11.0, 13.0, 5.0, 16.0, 9.0, 13.5, 7.0}); - NDArray y('c', {4}, {0., 0, 0, 0}, nd4j::DataType::INT64); - NDArray z('c', {4}, {1., 1, 1, 1}, nd4j::DataType::BOOL); + NDArray y('c', {4}, {0., 0, 0, 0}, sd::DataType::INT64); + NDArray z('c', {4}, {1., 1, 1, 1}, sd::DataType::BOOL); - NDArray expV('c', {4}, {1., 0, 0, 0}, nd4j::DataType::BOOL); + NDArray expV('c', {4}, {1., 0, 0, 0}, sd::DataType::BOOL); - nd4j::ops::in_top_k op; + sd::ops::in_top_k op; Nd4jStatus status = op.execute({&x, &y, }, {&z}, {}, {2}, {}); // z.printIndexedBuffer(); @@ -1243,7 +1243,7 @@ TEST_F(DeclarableOpsTests12, inTopK_2) { input.linspace(1); idx.linspace(1); - nd4j::ops::in_top_k op; + sd::ops::in_top_k op; auto res = op.evaluate({&input, 
&idx}, {}, {1}); @@ -1259,7 +1259,7 @@ TEST_F(DeclarableOpsTests12, inTopK_3) { auto y = NDArrayFactory::create('c', {2}, {1, 1}); auto expV = NDArrayFactory::create('c', {2}, {true, false}); - nd4j::ops::in_top_k op; + sd::ops::in_top_k op; auto result = op.evaluate({&x, &y}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1279,7 +1279,7 @@ TEST_F(DeclarableOpsTests12, inTopK_4) { auto y = NDArrayFactory::create('c', {6}, {0, 0, 0, 0, 0, 0}); auto expV = NDArrayFactory::create('c', {6}, {true, false, true, false, false, true}); - nd4j::ops::in_top_k op; + sd::ops::in_top_k op; auto result = op.evaluate({&x, &y}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1300,7 +1300,7 @@ TEST_F(DeclarableOpsTests12, inTopK_5) { auto y = NDArrayFactory::create('f', {6}, {0, 0, 0, 0, 0, 0}); auto expV = NDArrayFactory::create('f', {6}, {true, false, false, false, false, false }); - nd4j::ops::in_top_k op; + sd::ops::in_top_k op; auto result = op.evaluate({&x, &y}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1320,7 +1320,7 @@ TEST_F(DeclarableOpsTests12, cube_1) { NDArray x('c', {2, 3}, {1., 2., 3., 4., 5, 6}); NDArray exp('c', {2, 3}, {1., 8., 27., 64., 125, 216}); - nd4j::ops::cube op; + sd::ops::cube op; auto result = op.evaluate({&x}); @@ -1338,12 +1338,12 @@ TEST_F(DeclarableOpsTests12, cube_1) { TEST_F(DeclarableOpsTests12, cube_bp_1) { NDArray x('c', {2, 3}, {1., 2., 3., 4., 5, 6}); - NDArray gradO('c', {2, 3}, nd4j::DataType::DOUBLE); + NDArray gradO('c', {2, 3}, sd::DataType::DOUBLE); NDArray exp('c', {2, 3}, {1.5, 6., 13.5, 24., 37.5, 54}); gradO = 0.5; - nd4j::ops::cube_bp op; + sd::ops::cube_bp op; auto result = op.evaluate({&x, &gradO}); @@ -1363,11 +1363,11 @@ TEST_F(DeclarableOpsTests12, cube_bp_1) { TEST_F(DeclarableOpsTests12, pad_tests1) { - NDArray input('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::FLOAT32); - NDArray paddings('c', {2,2}, {1,1,2,2}, nd4j::DataType::INT32); - NDArray expected('c', {4,7}, {0,0,0,0,0,0,0, 
0,0,1,2,3,0,0, 0,0,4,5,6,0,0, 0,0,0,0,0,0,0}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::FLOAT32); + NDArray paddings('c', {2,2}, {1,1,2,2}, sd::DataType::INT32); + NDArray expected('c', {4,7}, {0,0,0,0,0,0,0, 0,0,1,2,3,0,0, 0,0,4,5,6,0,0, 0,0,0,0,0,0,0}, sd::DataType::FLOAT32); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1394,7 +1394,7 @@ TEST_F(DeclarableOpsTests12, pad_tests2) { auto paddings = NDArrayFactory::create(padBuff, 'c', {2,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1421,7 +1421,7 @@ TEST_F(DeclarableOpsTests12, pad_tests3) { auto paddings = NDArrayFactory::create(padBuff, 'c', {2,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1452,7 +1452,7 @@ TEST_F(DeclarableOpsTests12, pad_tests4) { auto paddings = NDArrayFactory::create(padBuff, 'c', {3,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1486,7 +1486,7 @@ TEST_F(DeclarableOpsTests12, pad_tests5) { auto paddings = NDArrayFactory::create(padBuff, 'c', {3,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1513,7 +1513,7 @@ TEST_F(DeclarableOpsTests12, pad_tests6) { auto paddings = NDArrayFactory::create(padBuff, 'c', {3,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7,7}); - nd4j::ops::pad 
op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1539,7 +1539,7 @@ TEST_F(DeclarableOpsTests12, pad_tests7) auto paddings = NDArrayFactory::create(padBuff, 'c', {4, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4, 4, 4, 4}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1565,7 +1565,7 @@ TEST_F(DeclarableOpsTests12, pad_tests8) auto paddings = NDArrayFactory::create(padBuff, 'c', {4, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4, 4, 4, 4}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1591,7 +1591,7 @@ TEST_F(DeclarableOpsTests12, pad_tests9) auto paddings = NDArrayFactory::create(padBuff, 'c', {4, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4, 4, 4, 4}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1614,7 +1614,7 @@ TEST_F(DeclarableOpsTests12, pad_tests10) { input = 1.f; //input.assign(1.); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1636,7 +1636,7 @@ TEST_F(DeclarableOpsTests12, pad_tests11) { input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1664,7 +1664,7 @@ TEST_F(DeclarableOpsTests12, pad_tests12) { 101.,102.,103.,104.,105.,106.,107.,108.,109.,110.,111.,112.,113.,114.,115.,116.,117.,118.,119.,120.,116.,117.,118.,119.,120.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1686,7 +1686,7 @@ 
TEST_F(DeclarableOpsTests12, pad_tests13) { auto expected = NDArrayFactory::create('c', {10}, {3., 2., 1., 2., 3., 4., 5., 4., 3., 2.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1708,7 +1708,7 @@ TEST_F(DeclarableOpsTests12, pad_tests14) { auto expected = NDArrayFactory::create('c', {1,10}, {2., 1., 1., 2., 3., 4., 5., 5., 4., 3.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1729,7 +1729,7 @@ TEST_F(DeclarableOpsTests12, pad_tests15) { auto expected = NDArrayFactory::create('c', {3,5}, {1., 2., 3., 4., 5., 1., 2., 3., 4., 5., 1., 2., 3., 4., 5.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1750,7 +1750,7 @@ TEST_F(DeclarableOpsTests12, pad_tests16) { auto expected = NDArrayFactory::create('c', {10,1}, {3., 2., 1., 2., 3., 4., 5., 4., 3., 2.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1771,7 +1771,7 @@ TEST_F(DeclarableOpsTests12, pad_tests17) { auto expected = NDArrayFactory::create('c', {5,2}, {1.,1., 2.,2., 3.,3., 4.,4., 5.,5.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1792,7 +1792,7 @@ TEST_F(DeclarableOpsTests12, pad_tests18) { auto expected = NDArrayFactory::create('c', {5}, {1.,2.,3.,4.,5.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1813,7 +1813,7 @@ TEST_F(DeclarableOpsTests12, pad_tests19) { auto expected = 
NDArrayFactory::create('c', {5,1}, {1., 2., 3., 4., 5.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1834,7 +1834,7 @@ TEST_F(DeclarableOpsTests12, pad_tests20) { auto expected = NDArrayFactory::create('c', {1,5}, {1., 2., 3., 4., 5.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1856,7 +1856,7 @@ TEST_F(DeclarableOpsTests12, pad_tests21) { 11.,12.,13.,14.,15.,11.,12.,13.,14.,15.,11.,12.,13.,14.,15.,11.,12.,13.,14.,15.}); input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1879,7 +1879,7 @@ TEST_F(DeclarableOpsTests12, pad_tests22) { input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1902,7 +1902,7 @@ TEST_F(DeclarableOpsTests12, pad_tests23) { input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1926,7 +1926,7 @@ TEST_F(DeclarableOpsTests12, pad_tests24) { input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1948,7 +1948,7 @@ TEST_F(DeclarableOpsTests12, pad_tests25) { input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1970,7 +1970,7 @@ TEST_F(DeclarableOpsTests12, pad_tests26) { input.linspace(1.f); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ 
-1986,13 +1986,13 @@ TEST_F(DeclarableOpsTests12, pad_tests26) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, pad_tests27) { - NDArray input('c', {2,3}, nd4j::DataType::FLOAT32); - NDArray paddings('c', {2,2}, {0,0,0,1}, nd4j::DataType::INT32); - NDArray exp('c', {2,4}, {1,1,1,0,1,1,1,0}, nd4j::DataType::FLOAT32); - NDArray z('c', {2,4}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,3}, sd::DataType::FLOAT32); + NDArray paddings('c', {2,2}, {0,0,0,1}, sd::DataType::INT32); + NDArray exp('c', {2,4}, {1,1,1,0,1,1,1,0}, sd::DataType::FLOAT32); + NDArray z('c', {2,4}, sd::DataType::FLOAT32); input = 1.; - nd4j::ops::pad op; + sd::ops::pad op; Nd4jStatus status = op.execute({&input, &paddings}, {&z}, {0}, {0}, {}); // constant // z.printIndexedBuffer(); @@ -2004,16 +2004,16 @@ TEST_F(DeclarableOpsTests12, pad_tests27) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, pad_tests28) { - NDArray input('c', {1,111,111,32}, nd4j::DataType::FLOAT32); - NDArray paddings('c', {4,2}, {0,0,0,1,0,1,0,0}, nd4j::DataType::INT32); - NDArray z('c', {1,112,112,32}, nd4j::DataType::FLOAT32); + NDArray input('c', {1,111,111,32}, sd::DataType::FLOAT32); + NDArray paddings('c', {4,2}, {0,0,0,1,0,1,0,0}, sd::DataType::INT32); + NDArray z('c', {1,112,112,32}, sd::DataType::FLOAT32); input = 1.; - nd4j::ops::pad op; + sd::ops::pad op; Nd4jStatus status = op.execute({&input, &paddings}, {&z}, {0}, {0}, {}); // constant // z.printIndexedBuffer(); - NDArray sum = z.reduceNumber(nd4j::reduce::Sum); + NDArray sum = z.reduceNumber(sd::reduce::Sum); ASSERT_EQ(ND4J_STATUS_OK, status); ASSERT_EQ(sum.e(0), 111*111*32); @@ -2029,7 +2029,7 @@ TEST_F(DeclarableOpsTests12, pad_tests29) { auto exp = NDArrayFactory::create({10., 1., 1., 1., 1., 1., 10.}); - nd4j::ops::pad op; + sd::ops::pad op; auto res = op.evaluate({&in, &pad}, {10.0}, {0}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2046,7 +2046,7 
@@ TEST_F(DeclarableOpsTests12, pad_tests30) { auto exp = NDArrayFactory::create({1., 1., 11., 111., 11., 1., 1.}); - nd4j::ops::pad op; + sd::ops::pad op; auto res = op.evaluate({&in, &pad}, {10.0}, {2}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2064,7 +2064,7 @@ TEST_F(DeclarableOpsTests12, pad_tests31) { auto exp = NDArrayFactory::create({11., 1., 11., 111., 1111., 11111., 1111.}); - nd4j::ops::pad op; + sd::ops::pad op; auto res = op.evaluate({&in, &pad}, {10.0}, {1}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2080,7 +2080,7 @@ TEST_F(DeclarableOpsTests12, pad_tests32) { auto exp = NDArrayFactory::create('c', {6,8}, {2, 1, 1, 2, 3, 3, 2, 1, 2, 1, 1, 2, 3, 3, 2, 1, 5, 4, 4, 5, 6, 6, 5, 4, 8, 7, 7, 8, 9, 9, 8, 7, 8, 7, 7, 8, 9, 9, 8, 7, 5, 4, 4, 5, 6, 6, 5, 4}); - nd4j::ops::pad op; + sd::ops::pad op; auto res = op.evaluate({&in, &pad}, {10.0}, {2}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2103,7 +2103,7 @@ TEST_F(DeclarableOpsTests12, pad_tests33) { 19,18,17,17,18,19,20,20,19,18., 23,22,21,21,22,23,24,24,23,22., 23,22,21,21,22,23,24,24,23,22., 19,18,17,17,18,19,20,20,19,18., 15,14,13,13,14,15,16,16,15,14., 7,6,5,5,6,7,8,8,7,6., 3,2,1,1,2,3,4,4,3,2., 3,2,1,1,2,3,4,4,3,2., 7,6,5,5,6,7,8,8,7,6., 11,10,9,9,10,11,12,12,11,10., 11,10,9,9,10,11,12,12,11,10., 7,6,5,5,6,7,8,8,7,6., 3,2,1,1,2,3,4,4,3,2.}); - nd4j::ops::pad op; + sd::ops::pad op; auto res = op.evaluate({&in, &pad}, {10.0}, {2}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2114,12 +2114,12 @@ TEST_F(DeclarableOpsTests12, pad_tests33) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests12, pad_tests34) { - NDArray input('c', {5}, {0.778786, 0.801198, 0.724375, 0.230894, 0.727141}, nd4j::DataType::FLOAT32); - NDArray paddings('c', {1,2}, {1,1}, nd4j::DataType::INT32); - NDArray expected('c', {7}, {10., 0.778786, 0.801198, 0.724375, 0.230894, 0.727141, 10.}, nd4j::DataType::FLOAT32); - NDArray z('c', {7}, nd4j::DataType::FLOAT32); + NDArray 
input('c', {5}, {0.778786, 0.801198, 0.724375, 0.230894, 0.727141}, sd::DataType::FLOAT32); + NDArray paddings('c', {1,2}, {1,1}, sd::DataType::INT32); + NDArray expected('c', {7}, {10., 0.778786, 0.801198, 0.724375, 0.230894, 0.727141, 10.}, sd::DataType::FLOAT32); + NDArray z('c', {7}, sd::DataType::FLOAT32); - nd4j::ops::pad op; + sd::ops::pad op; Nd4jStatus status = op.execute({&input, &paddings}, {&z}, {10}, {0}, {}); // constant ASSERT_EQ(ND4J_STATUS_OK, status); @@ -2139,7 +2139,7 @@ TEST_F(DeclarableOpsTests12, Pad_1) { auto paddings = NDArrayFactory::create(padBuff, 'c', {2,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2166,7 +2166,7 @@ TEST_F(DeclarableOpsTests12, Pad_2) { auto paddings = NDArrayFactory::create(padBuff, 'c', {2,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2193,7 +2193,7 @@ TEST_F(DeclarableOpsTests12, Pad_3) { auto paddings = NDArrayFactory::create(padBuff, 'c', {2,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2220,7 +2220,7 @@ TEST_F(DeclarableOpsTests12, Pad_4) { auto paddings = NDArrayFactory::create(padBuff, 'c', {3,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2247,7 +2247,7 @@ TEST_F(DeclarableOpsTests12, Pad_5) { auto paddings = NDArrayFactory::create(padBuff, 'c', {3,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7,7}); - nd4j::ops::pad op; + sd::ops::pad 
op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2274,7 +2274,7 @@ TEST_F(DeclarableOpsTests12, Pad_6) { auto paddings = NDArrayFactory::create(padBuff, 'c', {3,2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4,7,7}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2300,7 +2300,7 @@ TEST_F(DeclarableOpsTests12, Pad_7) auto paddings = NDArrayFactory::create(padBuff, 'c', {4, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4, 4, 4, 4}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2326,7 +2326,7 @@ TEST_F(DeclarableOpsTests12, Pad_8) auto paddings = NDArrayFactory::create(padBuff, 'c', {4, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4, 4, 4, 4}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2352,7 +2352,7 @@ TEST_F(DeclarableOpsTests12, Pad_9) auto paddings = NDArrayFactory::create(padBuff, 'c', {4, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {4, 4, 4, 4}); - nd4j::ops::pad op; + sd::ops::pad op; auto results = op.evaluate({&input, &paddings}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2370,7 +2370,7 @@ TEST_F(DeclarableOpsTests12, Test_Expose_1) { auto input0 = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 6, 5, 4}); auto input1 = NDArrayFactory::create('c', {2, 3}, {3, 2, 1, 4, 5, 6}); - nd4j::ops::expose op; + sd::ops::expose op; auto result = op.evaluate({&input0, &input1}); @@ -2395,7 +2395,7 @@ TEST_F(DeclarableOpsTests12, Pad_SGO_Test_1) { auto exp = NDArrayFactory::create({10., 1., 1., 1., 1., 1., 10.}); - nd4j::ops::pad op; + sd::ops::pad op; auto res = op.evaluate({&in, &pad}, {10.0}, {0}); ASSERT_EQ(res->status(), 
ND4J_STATUS_OK); @@ -2411,7 +2411,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_1) { auto in = NDArrayFactory::create('c', {3,3}, {1., 2., 3., 0., 2., 3., 0., 0., 7.}); auto exp = NDArrayFactory::create('c', {3,3}, {1., 2., 3., 0., 2., 3., 0., 0., 7}); auto pExp = NDArrayFactory::create('c', {3}, {0, 1, 2}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2432,7 +2432,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_2) { auto expLU = NDArrayFactory::create('c', {3,3}, {4., 5., 6., 0.25, -1.25, -1.5, 0.5, -0.4, -3.6}); auto expP = NDArrayFactory::create({2, 0, 1}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2455,7 +2455,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_3) { 0.09090909, 0.3448276, 0.34482753}); auto expP = NDArrayFactory::create({2, 1, 0}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2497,7 +2497,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_4) { }); auto expP = NDArrayFactory::create({1, 2, 7, 3, 6, 8, 5, 4, 0, 9}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2567,7 +2567,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_5) { 1, 2, 7, 3, 6, 8, 5, 4, 0, 9, 1, 2, 7, 3, 6, 8, 5, 4, 0, 9 }); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2588,7 +2588,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_1_2) { auto in = NDArrayFactory::create('c', {2, 3,3}, {1., 2., 3., 0., 2., 3., 0., 0., 7.,1., 2., 3., 0., 2., 3., 0., 0., 7.}); auto exp = NDArrayFactory::create('c', {2, 3,3}, {1., 2., 3., 0., 2., 3., 0., 0., 7, 1., 2., 3., 0., 2., 3., 0., 0., 7.}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2616,7 +2616,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_3_2) { }); auto expP = 
NDArrayFactory::create('c', {2,3}, {2, 1, 0, 2, 1, 0}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2644,7 +2644,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_3_3) { 0.30769232, 0.619403, 9.029851}); auto expP = NDArrayFactory::create('c', {2,3}, {2, 1, 0, 0, 2, 1}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2672,7 +2672,7 @@ TEST_F(DeclarableOpsTests12, LU_Test_4_1) { }); auto expP = NDArrayFactory::create('c', {2,2}, {0, 1, 0, 1}); - nd4j::ops::lu op; + sd::ops::lu op; auto res = op.evaluate({&in}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2700,9 +2700,9 @@ TEST_F(DeclarableOpsTests12, LU_Test_4_2) { }); auto expP = NDArrayFactory::create('c', {2,2}, {0, 1, 0, 1}); - nd4j::ops::lu op; + sd::ops::lu op; - auto res = op.evaluate({&in}, {}, {nd4j::DataType::INT64}); + auto res = op.evaluate({&in}, {}, {sd::DataType::INT64}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); auto z = res->at(0); auto p = res->at(1); @@ -2726,7 +2726,7 @@ TEST_F(DeclarableOpsTests12, QR_Test_1) { auto expR = NDArrayFactory::create('c', {5,3}, { -14.177447, -20.666622, 13.401566, 0., -175.04254, 70.080315, 0., 0., 35.201546, 0., 0., 0., 0., 0., 0. 
}); - nd4j::ops::qr op; + sd::ops::qr op; auto res = op.evaluate({&in}, {}, {}, {true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2738,7 +2738,7 @@ TEST_F(DeclarableOpsTests12, QR_Test_1) { // expR.printBuffer("Upper triangular Exp"); // q->printShapeInfo("Q shape"); // r->printShapeInfo("R shape"); - nd4j::ops::matmul opMul; + sd::ops::matmul opMul; auto res2 = opMul.evaluate({q, r}); //MmulHelper::matmul(q, r, &in, false, false); auto exp = res2->at(0);//->printIndexedBuffer("Result as result"); ASSERT_TRUE(exp->isSameShape(in)); @@ -2773,7 +2773,7 @@ TEST_F(DeclarableOpsTests12, QR_Test_1_1) { -14.177447, -20.666622, 13.401566, 0., -175.04254, 70.080315, 0., 0., 35.201546, 0., 0., 0., 0., 0., 0., -14.177447, -20.666622, 13.401566, 0., -175.04254, 70.080315, 0., 0., 35.201546, 0., 0., 0., 0., 0., 0. }); - nd4j::ops::qr op; + sd::ops::qr op; auto res = op.evaluate({&in}, {}, {}, {true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2785,7 +2785,7 @@ TEST_F(DeclarableOpsTests12, QR_Test_1_1) { // expR.printBuffer("Upper triangular Exp"); // q->printShapeInfo("Q shape"); // r->printShapeInfo("R shape"); - nd4j::ops::matmul opMul; + sd::ops::matmul opMul; auto res2 = opMul.evaluate({q, r}); //MmulHelper::matmul(q, r, &in, false, false); auto exp = res2->at(0);//->printIndexedBuffer("Result as result"); ASSERT_TRUE(exp->isSameShape(in)); @@ -2805,7 +2805,7 @@ TEST_F(DeclarableOpsTests12, QR_Test_2) { auto expQ = NDArrayFactory::create('c', {5, 3}, {0.8464148,0.3912908,-0.3431241,-0.42320737, -0.9040873,0.02927014,0.28213826, -0.17042054, -0.93285596,0.07053456, -0.01404065,0.00109937,-0.14106913,0.0166551,0.10577161}); auto expR = NDArrayFactory::create('c', {3,3}, {-14.177447,-20.666622,13.401566,0.,-175.04254,70.080315,0.,0.,35.201546}); - nd4j::ops::qr op; + sd::ops::qr op; auto res = op.evaluate({&in}, {}, {}, {false}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2814,7 +2814,7 @@ TEST_F(DeclarableOpsTests12, QR_Test_2) { ASSERT_TRUE(q->isSameShape(expQ)); 
ASSERT_TRUE(r->isSameShape(expR)); - nd4j::ops::matmul opMul; + sd::ops::matmul opMul; auto res2 = opMul.evaluate({q, r}); //MmulHelper::matmul(q, r, &in, false, false); auto exp = res2->at(0);//->printIndexedBuffer("Result as result"); ASSERT_TRUE(exp->isSameShape(in)); @@ -2840,7 +2840,7 @@ TEST_F(DeclarableOpsTests12, TriangularSolve_Test_1) { auto exp = NDArrayFactory::create('c', {4, 1}, { 1.333333f, -0.6666667f, 2.6666667f, -1.3333333f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2869,7 +2869,7 @@ TEST_F(DeclarableOpsTests12, TriangularSolve_Test_2) { auto exp = NDArrayFactory::create('c', {4, 1}, { 2.f, 4.f, 1.f, 1.3333333f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2906,7 +2906,7 @@ TEST_F(DeclarableOpsTests12, TriangularSolve_Test_3) { 1.333333f, -0.6666667f, 2.6666667f, -1.3333333f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2935,7 +2935,7 @@ TEST_F(DeclarableOpsTests12, TriangularSolve_Test_4) { -3.3333333f, 3.6666666f, 0.333333f, 1.3333333f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}, {false}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2965,7 +2965,7 @@ TEST_F(DeclarableOpsTests12, TriangularSolve_Test_5) { 1.f, 1.f, 1.f, 1.f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}, {false, true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -2994,7 +2994,7 @@ TEST_F(DeclarableOpsTests12, SolveLs_Test_1) { auto exp = NDArrayFactory::create('c', {4, 1}, { 1.333333f, -0.6666667f, 2.6666667f, -1.3333333f }); - nd4j::ops::lstsq op; + sd::ops::lstsq op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -3018,7 
+3018,7 @@ TEST_F(DeclarableOpsTests12, SolveLs_Test_2) { auto exp = NDArrayFactory::create('c', {3, 1}, { -0.24999914f, 0.4999994f, 0.08333314f }); - nd4j::ops::lstsq op; + sd::ops::lstsq op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -3043,7 +3043,7 @@ TEST_F(DeclarableOpsTests12, SolveLs_Test_3) { auto exp = NDArrayFactory::create('c', {3, 1}, { -0.5f, 1.5f, -2.f }); - nd4j::ops::lstsq op; + sd::ops::lstsq op; auto res = op.evaluate({&a, &b}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -3066,7 +3066,7 @@ TEST_F(DeclarableOpsTests12, SolveLs_Test_4) { auto exp = NDArrayFactory::create('c', {4, 1}, { -0.5f, 1.5f, -2.f, 0.f}); - nd4j::ops::lstsq op; + sd::ops::lstsq op; auto res = op.evaluate({&a, &b}, {false}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -3087,7 +3087,7 @@ TEST_F(DeclarableOpsTests12, SolveLs_Test_5) { auto a = NDArrayFactory::create('c', {1, 0, 3, 4}); auto b = NDArrayFactory::create('c', {1, 0, 3, 1}); - nd4j::ops::lstsq op; + sd::ops::lstsq op; auto res = op.evaluate({&a, &b}, {false}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -3103,7 +3103,7 @@ TEST_F(DeclarableOpsTests12, Solve_Test_6) { auto a = NDArrayFactory::create('c', {1, 0, 3, 3}); auto b = NDArrayFactory::create('c', {1, 0, 3, 1}); - nd4j::ops::solve op; + sd::ops::solve op; auto res = op.evaluate({&a, &b}, {true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); @@ -3131,7 +3131,7 @@ TEST_F(DeclarableOpsTests12, TriangularSolve_Test_6) { 1.f,0.2f, 1.f,0.8f, 1.f,0.4f, 1.f,1.2f }); - nd4j::ops::triangular_solve op; + sd::ops::triangular_solve op; auto res = op.evaluate({&a, &b}, {}, {}, {false, true}); ASSERT_EQ(res->status(), ND4J_STATUS_OK); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp index e964d397d..dcdffac6a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp @@ -21,13 +21,13 @@ 
#include "testlayers.h" #include -#include +#include #include -#include +#include #include -#include +#include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests13 : public testing::Test { @@ -57,7 +57,7 @@ TEST_F(DeclarableOpsTests13, test_pow_1) { auto y = NDArrayFactory::create('c', {2}, {3, 3}); auto e = NDArrayFactory::create('c', {2, 2}, {8.f, 8.f, 8.f, 8.f}); - nd4j::ops::Pow op; + sd::ops::Pow op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -72,7 +72,7 @@ TEST_F(DeclarableOpsTests13, test_empty_range_1) { auto start = NDArrayFactory::create(0); auto limit = NDArrayFactory::create(0); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&start, &limit}); ASSERT_EQ(Status::OK(), result->status()); @@ -84,7 +84,7 @@ TEST_F(DeclarableOpsTests13, test_empty_range_1) { TEST_F(DeclarableOpsTests13, test_empty_range_2) { - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {1.0, 1.0}); ASSERT_EQ(Status::OK(), result->status()); @@ -96,7 +96,7 @@ TEST_F(DeclarableOpsTests13, test_empty_range_2) { TEST_F(DeclarableOpsTests13, test_empty_range_3) { - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -115,7 +115,7 @@ TEST_F(DeclarableOpsTests13, test_argmax_edge_1) { ctx->setInputArray(1, NDArrayFactory::create_(0), true); //Axis 0 - nd4j::ops::argmax op; + sd::ops::argmax op; auto result = op.execute(ctx); ASSERT_EQ(Status::OK(), result); @@ -142,7 +142,7 @@ TEST_F(DeclarableOpsTests13, test_listdiff_1) { auto od = NDArrayFactory::create('c', {2}); auto oi = NDArrayFactory::create('c', {2}); - nd4j::ops::listdiff op; + sd::ops::listdiff op; auto result = op.execute({&x, &y}, std::vector{&od, &oi}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); } @@ -151,7 +151,7 @@ TEST_F(DeclarableOpsTests13, test_greater_1) { auto x = NDArrayFactory::create('c', {3, 1}); auto y = NDArrayFactory::create('c', {1, 
4}); - nd4j::ops::greater op; + sd::ops::greater op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -164,7 +164,7 @@ TEST_F(DeclarableOpsTests13, test_eval_reduction_shape_1) { auto y = NDArrayFactory::create('c', {1}, {axis}); auto exp = NDArrayFactory::create('c', {2}, {1, 2}); - nd4j::ops::evaluate_reduction_shape op; + sd::ops::evaluate_reduction_shape op; auto result = op.evaluate({&x, &y}, {true}); ASSERT_EQ(Status::OK(), result->status()); @@ -177,11 +177,11 @@ TEST_F(DeclarableOpsTests13, test_eval_reduction_shape_1) { TEST_F(DeclarableOpsTests13, test_or_1) { - NDArray x('c', {4}, {false, true, false, true}, nd4j::DataType::BOOL); - NDArray y('c', {4}, {false, false, true, true}, nd4j::DataType::BOOL); - NDArray e('c', {4}, {false, true, true, true}, nd4j::DataType::BOOL); + NDArray x('c', {4}, {false, true, false, true}, sd::DataType::BOOL); + NDArray y('c', {4}, {false, false, true, true}, sd::DataType::BOOL); + NDArray e('c', {4}, {false, true, true, true}, sd::DataType::BOOL); - NDArray z('c', {4}, nd4j::DataType::BOOL); + NDArray z('c', {4}, sd::DataType::BOOL); x.applyPairwiseTransform(pairwise::Or, y, z); @@ -217,7 +217,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_GainsTest_1) { auto y = NDArrayFactory::create('c', {2,3}, {1,-2,3, -4, 5, -6}); auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); auto exp = NDArrayFactory::create('c', {2,3}, {1.2,2.2,3.2,4.2,5.2,6.2}); - nd4j::ops::barnes_gains op; + sd::ops::barnes_gains op; auto result = op.evaluate({&x, &y, &eps}); ASSERT_EQ(result->status(), Status::OK()); //result->at(0)->printBuffer("Gains out"); @@ -231,7 +231,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_GainsTest_2) { auto y = NDArrayFactory::create('c', {2,3}, {1, -2, 3, -4, 5, -6}); auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); auto exp = NDArrayFactory::create('c', {2,3}, {1.2, 0.01, 3.2, 0.01, 5.2, 0.01}); - nd4j::ops::barnes_gains 
op; + sd::ops::barnes_gains op; auto result = op.evaluate({&x, &y, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); //result->at(0)->printBuffer("Gains out"); @@ -246,7 +246,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_GainsTest_3) { auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); auto exp = NDArrayFactory::create('c', {2,3}, {0.01, 2.2, 0.01, 4.2, 0.01, 6.2}); - nd4j::ops::barnes_gains op; + sd::ops::barnes_gains op; auto result = op.evaluate({&x, &y, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); //result->at(0)->printBuffer("Gains out"); @@ -268,7 +268,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_EdgeForceTest_1) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::barnes_edge_forces op; + sd::ops::barnes_edge_forces op; auto result = op.evaluate({&rows, &cols, &vals, &data}, {}, {1}); @@ -292,7 +292,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_EdgeForceTest_2) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::barnes_edge_forces op; + sd::ops::barnes_edge_forces op; auto result = op.evaluate({&rows, &cols, &vals, &data}, {}, {2}); @@ -316,7 +316,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_EdgeForceTest_3) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::barnes_edge_forces op; + sd::ops::barnes_edge_forces op; auto result = op.evaluate({&rows, &cols, 
&vals, &data}, {}, {11}); //nd4j_printf("rows %lld, cols %lld, vals %lld, res full %lld\n", rows.lengthOf(), cols.lengthOf(), vals.lengthOf(), exp1.lengthOf()); @@ -339,7 +339,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_symmetrized_1) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::barnes_symmetrized op; + sd::ops::barnes_symmetrized op; auto result = op.evaluate({&rows, &cols, &vals}, {}, {1}); ASSERT_EQ(result->status(), Status::OK()); //result->at(2)->printBuffer("Symmetrized1"); @@ -358,7 +358,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_symmetrized_2) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::barnes_symmetrized op; + sd::ops::barnes_symmetrized op; auto result = op.evaluate({&rows, &cols, &vals}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); //result->at(2)->printBuffer("Symmetrized2"); @@ -377,7 +377,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_symmetrized_3) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::barnes_symmetrized op; + sd::ops::barnes_symmetrized op; auto result = op.evaluate({&rows, &cols, &vals}, {}, {11}); ASSERT_EQ(result->status(), Status::OK()); //result->at(2)->printBuffer("Symmetrized3"); @@ -401,7 +401,7 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_symmetrized_4) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = 
NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::barnes_symmetrized op; + sd::ops::barnes_symmetrized op; auto result = op.evaluate({&rows, &cols, &vals}, {}, {11}); ASSERT_EQ(result->status(), Status::OK()); auto res = result->at(2); @@ -427,7 +427,7 @@ TEST_F(DeclarableOpsTests13, CellContains_test_1) { // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); - nd4j::ops::cell_contains op; + sd::ops::cell_contains op; auto result = op.evaluate({&corners, &width, &point}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); ASSERT_TRUE(result->at(0)->e(0)); @@ -441,12 +441,12 @@ TEST_F(DeclarableOpsTests13, CellContains_test_1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustHue_1) { - NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, sd::DataType::FLOAT32); NDArray factor = NDArrayFactory::create(0.5); - NDArray exp ('c', {2,2,3}, {100,0,44, 208,5,220, 177,230,97, 2,255,244}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {100,0,44, 208,5,220, 177,230,97, 2,255,244}, sd::DataType::FLOAT32); - nd4j::ops::adjust_hue op; - std::unique_ptr results (op.evaluate({&input, &factor}, {}, {2})); + sd::ops::adjust_hue op; + std::unique_ptr results (op.evaluate({&input, &factor}, {}, {2})); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -462,12 +462,12 @@ TEST_F(DeclarableOpsTests13, adjustHue_1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustHue_2) { - NDArray input('c', { 2,2,3 }, { 0.f,100.f / 255.f,56.f / 255.f, 17.f / 255.f,220.f / 255.f,5.f / 255.f, 150.f / 255.f,97.f / 255.f,230.f / 255.f, 255.f / 255.f,2.f / 255.f,13.f / 255.f }, 
nd4j::DataType::FLOAT32); - NDArray exp('c', { 2,2,3 }, { 4.f / 255.f,100.f / 255.f,0.f, 146.f / 255.f,220.f / 255.f,5.f / 255.f, 97.f / 255.f,123.8f / 255.f,230.f / 255.f, 255.f / 255.f,2.f / 255.f,164.8f / 255.f }, nd4j::DataType::FLOAT32); + NDArray input('c', { 2,2,3 }, { 0.f,100.f / 255.f,56.f / 255.f, 17.f / 255.f,220.f / 255.f,5.f / 255.f, 150.f / 255.f,97.f / 255.f,230.f / 255.f, 255.f / 255.f,2.f / 255.f,13.f / 255.f }, sd::DataType::FLOAT32); + NDArray exp('c', { 2,2,3 }, { 4.f / 255.f,100.f / 255.f,0.f, 146.f / 255.f,220.f / 255.f,5.f / 255.f, 97.f / 255.f,123.8f / 255.f,230.f / 255.f, 255.f / 255.f,2.f / 255.f,164.8f / 255.f }, sd::DataType::FLOAT32); - nd4j::ops::adjust_hue op; - std::unique_ptr results(op.evaluate({&input}, {0.9}, {2})); + sd::ops::adjust_hue op; + std::unique_ptr results(op.evaluate({&input}, {0.9}, {2})); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -483,11 +483,11 @@ TEST_F(DeclarableOpsTests13, adjustHue_2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustHue_3) { - NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); - NDArray exp ('c', {2,2,3}, {0.,84.,100., 5.,220.,122.0001, 229.8,97.,230., 255.,142.8002,2.}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, sd::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {0.,84.,100., 5.,220.,122.0001, 229.8,97.,230., 255.,142.8002,2.}, sd::DataType::FLOAT32); - nd4j::ops::adjust_hue op; - std::unique_ptr results(op.evaluate({&input}, {-0.9}, {2})); + sd::ops::adjust_hue op; + std::unique_ptr results(op.evaluate({&input}, {-0.9}, {2})); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -502,11 +502,11 @@ TEST_F(DeclarableOpsTests13, adjustHue_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustHue_4) { - NDArray input('c', {2,3,2}, {0,17, 100,220, 56,5, 150,255, 97,2, 230,13}, 
nd4j::DataType::FLOAT32); - NDArray exp ('c', {2,3,2}, {100,208, 0,5, 44,220, 177,2, 230,255, 97,244}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,3,2}, {0,17, 100,220, 56,5, 150,255, 97,2, 230,13}, sd::DataType::FLOAT32); + NDArray exp ('c', {2,3,2}, {100,208, 0,5, 44,220, 177,2, 230,255, 97,244}, sd::DataType::FLOAT32); - nd4j::ops::adjust_hue op; - std::unique_ptr results(op.evaluate({&input}, {0.5}, {1})); + sd::ops::adjust_hue op; + std::unique_ptr results(op.evaluate({&input}, {0.5}, {1})); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -521,11 +521,11 @@ TEST_F(DeclarableOpsTests13, adjustHue_4) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustHue_5) { - NDArray input('c', {3,2,2}, {0,17, 150,255, 100,220, 97,2, 56,5, 230,13}, nd4j::DataType::FLOAT32); - NDArray exp ('c', {3,2,2}, {100,208, 177,2, 0,5, 230,255, 44,220, 97,244}, nd4j::DataType::FLOAT32); + NDArray input('c', {3,2,2}, {0,17, 150,255, 100,220, 97,2, 56,5, 230,13}, sd::DataType::FLOAT32); + NDArray exp ('c', {3,2,2}, {100,208, 177,2, 0,5, 230,255, 44,220, 97,244}, sd::DataType::FLOAT32); - nd4j::ops::adjust_hue op; - std::unique_ptr results(op.evaluate({&input}, {0.5}, {0})); + sd::ops::adjust_hue op; + std::unique_ptr results(op.evaluate({&input}, {0.5}, {0})); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -540,11 +540,11 @@ TEST_F(DeclarableOpsTests13, adjustHue_5) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustSaturation_1) { - NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, sd::DataType::FLOAT32); NDArray factor = NDArrayFactory::create(0.5); - NDArray exp ('c', {2,2,3}, {50,100,78, 118.5,220,112.5, 190,163.5,230, 255,128.5,134}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {50,100,78, 118.5,220,112.5, 190,163.5,230, 255,128.5,134}, 
sd::DataType::FLOAT32); - nd4j::ops::adjust_saturation op; + sd::ops::adjust_saturation op; auto results = op.evaluate({&input, &factor}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -560,10 +560,10 @@ TEST_F(DeclarableOpsTests13, adjustSaturation_1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustSaturation_2) { - NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::DOUBLE); - NDArray exp ('c', {2,2,3}, {0.,100.,56., 12.279087,220.,0., 91.654228,0.,230., 255.,0.,11.087015}, nd4j::DataType::DOUBLE); + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, sd::DataType::DOUBLE); + NDArray exp ('c', {2,2,3}, {0.,100.,56., 12.279087,220.,0., 91.654228,0.,230., 255.,0.,11.087015}, sd::DataType::DOUBLE); - nd4j::ops::adjust_saturation op; + sd::ops::adjust_saturation op; auto results = op.evaluate({&input}, {10}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -581,10 +581,10 @@ TEST_F(DeclarableOpsTests13, adjustSaturation_2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustSaturation_3) { - NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); - NDArray exp ('c', {2,2,3}, {100.,100.,100., 220.,220.,220., 230.,230.,230., 255., 255., 255.}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, sd::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {100.,100.,100., 220.,220.,220., 230.,230.,230., 255., 255., 255.}, sd::DataType::FLOAT32); - nd4j::ops::adjust_saturation op; + sd::ops::adjust_saturation op; auto results = op.evaluate({&input}, {-10}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -601,10 +601,10 @@ TEST_F(DeclarableOpsTests13, adjustSaturation_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustSaturation_4) { - NDArray 
input('c', {2,3,2}, {0,17, 100,220, 56,5, 150,255, 97,2, 230,13}, nd4j::DataType::FLOAT32); - NDArray exp ('c', {2,3,2}, {50,118.5, 100,220, 78,112.5, 190,255, 163.5,128.5, 230,134}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,3,2}, {0,17, 100,220, 56,5, 150,255, 97,2, 230,13}, sd::DataType::FLOAT32); + NDArray exp ('c', {2,3,2}, {50,118.5, 100,220, 78,112.5, 190,255, 163.5,128.5, 230,134}, sd::DataType::FLOAT32); - nd4j::ops::adjust_saturation op; + sd::ops::adjust_saturation op; auto results = op.evaluate({&input}, {0.5}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -621,10 +621,10 @@ TEST_F(DeclarableOpsTests13, adjustSaturation_4) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, adjustSaturation_5) { - NDArray input('c', {3,2,2}, {0,17, 150,255, 100,220, 97,2, 56,5, 230,13}, nd4j::DataType::FLOAT32); - NDArray exp ('c', {3,2,2}, {50,118.5, 190,255, 100,220, 163.5,128.5, 78,112.5, 230,134}, nd4j::DataType::FLOAT32); + NDArray input('c', {3,2,2}, {0,17, 150,255, 100,220, 97,2, 56,5, 230,13}, sd::DataType::FLOAT32); + NDArray exp ('c', {3,2,2}, {50,118.5, 190,255, 100,220, 163.5,128.5, 78,112.5, 230,134}, sd::DataType::FLOAT32); - nd4j::ops::adjust_saturation op; + sd::ops::adjust_saturation op; auto results = op.evaluate({&input}, {0.5}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -645,7 +645,7 @@ TEST_F(DeclarableOpsTests13, shift_bits_1) { x.assign(32); e.assign(512); - nd4j::ops::shift_bits op; + sd::ops::shift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -663,7 +663,7 @@ TEST_F(DeclarableOpsTests13, rshift_bits_1) { x.assign(512); e.assign(32); - nd4j::ops::rshift_bits op; + sd::ops::rshift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -681,7 +681,7 @@ TEST_F(DeclarableOpsTests13, cyclic_shift_bits_1) { x.assign(32); e.assign(512); - nd4j::ops::cyclic_shift_bits op; + 
sd::ops::cyclic_shift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -699,7 +699,7 @@ TEST_F(DeclarableOpsTests13, cyclic_rshift_bits_1) { x.assign(512); e.assign(32); - nd4j::ops::cyclic_rshift_bits op; + sd::ops::cyclic_rshift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -718,7 +718,7 @@ TEST_F(DeclarableOpsTests13, shift_bits_2) { y.assign(4); e.assign(512); - nd4j::ops::shift_bits op; + sd::ops::shift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -737,7 +737,7 @@ TEST_F(DeclarableOpsTests13, rshift_bits_2) { y.assign(4); e.assign(32); - nd4j::ops::rshift_bits op; + sd::ops::rshift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -756,7 +756,7 @@ TEST_F(DeclarableOpsTests13, cyclic_shift_bits_2) { y.assign(4); e.assign(512); - nd4j::ops::cyclic_shift_bits op; + sd::ops::cyclic_shift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -775,7 +775,7 @@ TEST_F(DeclarableOpsTests13, cyclic_rshift_bits_2) { y.assign(4); e.assign(32); - nd4j::ops::cyclic_rshift_bits op; + sd::ops::cyclic_rshift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -793,7 +793,7 @@ TEST_F(DeclarableOpsTests13, shift_bits_3) { y.assign(4); e.assign(512); - nd4j::ops::shift_bits op; + sd::ops::shift_bits op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -807,16 +807,16 @@ TEST_F(DeclarableOpsTests13, shift_bits_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, space_to_batch_nd_1) { - NDArray x('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32); - NDArray blockShape('c', {3}, {2, 2, 2} , nd4j::DataType::INT32); // three spatial dimensions - NDArray paddings('c', {3, 2}, std::vector{0, 0, 0, 0, 
0, 0} , nd4j::DataType::INT32); + NDArray x('c', {1, 2, 2, 2, 3}, sd::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 2} , sd::DataType::INT32); // three spatial dimensions + NDArray paddings('c', {3, 2}, std::vector{0, 0, 0, 0, 0, 0} , sd::DataType::INT32); - NDArray exp('c', {8, 1, 1, 1, 3}, nd4j::DataType::FLOAT32); + NDArray exp('c', {8, 1, 1, 1, 3}, sd::DataType::FLOAT32); x.linspace(1); exp.linspace(1); - nd4j::ops::space_to_batch_nd op; + sd::ops::space_to_batch_nd op; auto result = op.evaluate({&x, &blockShape, &paddings}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -831,19 +831,19 @@ TEST_F(DeclarableOpsTests13, space_to_batch_nd_1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, space_to_batch_nd_2) { - NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32); - NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions - NDArray paddings('c', {3, 2}, {0,0, 0,2, 2,1} , nd4j::DataType::INT32); + NDArray x('c', {2, 2,4,3, 1}, sd::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , sd::DataType::INT32); // three spatial dimensions + NDArray paddings('c', {3, 2}, {0,0, 0,2, 2,1} , sd::DataType::INT32); NDArray exp('c', {24, 1,3,2, 1}, { 0, 2, 0, 8, 0, 0, 0, 26, 0, 32, 0, 0, 0, 3, 0, 9, 0, 0, 0, 27, 0, 33, 0, 0, 1, 0, 7, 0, 0, 0, 25, 0, 31, 0, 0, 0, 0, 5, 0, 11, 0, 0, 0, 29, 0, 35, 0, 0, 0, 6, 0, 12, 0, 0, 0, 30, 0, 36, 0, 0, 4, 0, 10, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 14, 0, 20, 0, 0, 0, 38, 0, 44, 0, 0, 0, 15, 0, 21, 0, 0, 0, 39, 0, 45, 0, 0, 13, 0, 19, 0, 0, 0, 37, 0, 43, 0, 0, 0, 0, 17, 0, 23, 0, 0, 0, 41, 0, 47, 0, 0, 0, 18, - 0, 24, 0, 0, 0, 42, 0, 48, 0, 0, 16, 0, 22, 0, 0, 0, 40, 0, 46, 0, 0, 0}, nd4j::DataType::FLOAT32); + 0, 24, 0, 0, 0, 42, 0, 48, 0, 0, 16, 0, 22, 0, 0, 0, 40, 0, 46, 0, 0, 0}, sd::DataType::FLOAT32); x.linspace(1); - nd4j::ops::space_to_batch_nd op; + sd::ops::space_to_batch_nd op; auto result = op.evaluate({&x, 
&blockShape, &paddings}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -859,9 +859,9 @@ TEST_F(DeclarableOpsTests13, space_to_batch_nd_2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, space_to_batch_nd_3) { - NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32); - NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions - NDArray paddings('c', {3, 2}, {1,1, 0,2, 2,1} , nd4j::DataType::INT32); + NDArray x('c', {2, 2,4,3, 1}, sd::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , sd::DataType::INT32); // three spatial dimensions + NDArray paddings('c', {3, 2}, {1,1, 0,2, 2,1} , sd::DataType::INT32); NDArray exp('c', {24, 2,3,2, 1}, { 0, 0, 0, 0, 0, 0, 0, 14, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 45, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 19, 0, 0, 0, 0, 0, 0, 0, @@ -871,10 +871,10 @@ TEST_F(DeclarableOpsTests13, space_to_batch_nd_3) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 0, 36, 0, 0, - 0, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0}, nd4j::DataType::FLOAT32); + 0, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0}, sd::DataType::FLOAT32); x.linspace(1); - nd4j::ops::space_to_batch_nd op; + sd::ops::space_to_batch_nd op; auto result = op.evaluate({&x, &blockShape, &paddings}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -890,17 +890,17 @@ TEST_F(DeclarableOpsTests13, space_to_batch_nd_3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batch_to_space_nd_1) { - NDArray x('c', {8, 1, 1, 1, 3}, 
nd4j::DataType::FLOAT32); + NDArray x('c', {8, 1, 1, 1, 3}, sd::DataType::FLOAT32); - NDArray blockShape('c', {3}, {2., 2, 2} , nd4j::DataType::INT32); // three spatial dimensions - NDArray crop('c', {3, 2}, {0., 0, 0, 0, 0, 0} , nd4j::DataType::INT32); + NDArray blockShape('c', {3}, {2., 2, 2} , sd::DataType::INT32); // three spatial dimensions + NDArray crop('c', {3, 2}, {0., 0, 0, 0, 0, 0} , sd::DataType::INT32); - NDArray exp('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32); + NDArray exp('c', {1, 2, 2, 2, 3}, sd::DataType::FLOAT32); x.linspace(1); exp.linspace(1); - nd4j::ops::batch_to_space_nd op; + sd::ops::batch_to_space_nd op; auto result = op.evaluate({&x, &blockShape, &crop}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -915,15 +915,15 @@ TEST_F(DeclarableOpsTests13, batch_to_space_nd_1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batch_to_space_nd_2) { - NDArray x('c', {24, 1,3,2, 1}, nd4j::DataType::FLOAT32); - NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions - NDArray crop('c', {3, 2}, {0,0, 0,2, 2,1} , nd4j::DataType::INT32); + NDArray x('c', {24, 1,3,2, 1}, sd::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , sd::DataType::INT32); // three spatial dimensions + NDArray crop('c', {3, 2}, {0,0, 0,2, 2,1} , sd::DataType::INT32); NDArray exp('c', {2, 2,4,3, 1}, {25, 2, 14, 61, 38, 50, 27, 4, 16, 63, 40, 52, 97, 74, 86, 133, 110, 122, 99, 76, 88, 135, 112, 124, - 31, 8, 20, 67, 44, 56, 33, 10, 22, 69, 46, 58, 103, 80, 92, 139, 116, 128, 105, 82, 94, 141, 118, 130}, nd4j::DataType::FLOAT32); + 31, 8, 20, 67, 44, 56, 33, 10, 22, 69, 46, 58, 103, 80, 92, 139, 116, 128, 105, 82, 94, 141, 118, 130}, sd::DataType::FLOAT32); x.linspace(1); - nd4j::ops::batch_to_space_nd op; + sd::ops::batch_to_space_nd op; auto result = op.evaluate({&x, &blockShape, &crop}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -939,15 +939,15 @@ 
TEST_F(DeclarableOpsTests13, batch_to_space_nd_2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batch_to_space_nd_3) { - NDArray x('c', {24, 2,3,2, 1}, nd4j::DataType::FLOAT32); - NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions - NDArray crop('c', {3, 2}, {1,1, 0,2, 2,1} , nd4j::DataType::INT32); + NDArray x('c', {24, 2,3,2, 1}, sd::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , sd::DataType::INT32); // three spatial dimensions + NDArray crop('c', {3, 2}, {1,1, 0,2, 2,1} , sd::DataType::INT32); NDArray exp('c', {2, 2,4,3, 1}, {193, 146, 170, 265, 218, 242, 195, 148, 172, 267, 220, 244, 55, 8, 32, 127, 80, 104, 57, 10, 34, 129, 82, - 106, 205, 158, 182, 277, 230, 254, 207, 160, 184, 279, 232, 256, 67, 20, 44, 139, 92, 116, 69, 22, 46, 141, 94, 118}, nd4j::DataType::FLOAT32); + 106, 205, 158, 182, 277, 230, 254, 207, 160, 184, 279, 232, 256, 67, 20, 44, 139, 92, 116, 69, 22, 46, 141, 94, 118}, sd::DataType::FLOAT32); x.linspace(1); - nd4j::ops::batch_to_space_nd op; + sd::ops::batch_to_space_nd op; auto result = op.evaluate({&x, &blockShape, &crop}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -963,17 +963,17 @@ TEST_F(DeclarableOpsTests13, batch_to_space_nd_3) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, mergemax_1) { - NDArray x1('c', {5, 5}, nd4j::DataType::FLOAT32); - NDArray x2('c', {5, 5}, nd4j::DataType::FLOAT32); - NDArray x3('c', {5, 5}, nd4j::DataType::FLOAT32); - NDArray e('c', {5, 5}, nd4j::DataType::FLOAT32); + NDArray x1('c', {5, 5}, sd::DataType::FLOAT32); + NDArray x2('c', {5, 5}, sd::DataType::FLOAT32); + NDArray x3('c', {5, 5}, sd::DataType::FLOAT32); + NDArray e('c', {5, 5}, sd::DataType::FLOAT32); x1.assign(3); x2.assign(1); x3.assign(2); e.assign(3); - nd4j::ops::mergemax op; + sd::ops::mergemax op; auto result = op.evaluate({&x1, &x2, &x3}, {}, {}); 
ASSERT_EQ(Status::OK(), result->status()); @@ -989,11 +989,11 @@ TEST_F(DeclarableOpsTests13, mergemax_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, mergemax_2) { - NDArray x1('c', {1, 3}, {0., 1, 2}, nd4j::DataType::FLOAT32); - NDArray x2('c', {1, 1}, std::vector{1.}, nd4j::DataType::FLOAT32); - NDArray out('c', {1, 3}, {-1., -1, -1}, nd4j::DataType::FLOAT32); + NDArray x1('c', {1, 3}, {0., 1, 2}, sd::DataType::FLOAT32); + NDArray x2('c', {1, 1}, std::vector{1.}, sd::DataType::FLOAT32); + NDArray out('c', {1, 3}, {-1., -1, -1}, sd::DataType::FLOAT32); - nd4j::ops::mergemax op; + sd::ops::mergemax op; auto status = op.execute({&x1, &x2}, {&out}, {}, {}, {}); ASSERT_EQ(20, status); @@ -1026,12 +1026,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_1) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx = 0.003; @@ -1052,7 +1052,7 @@ TEST_F(DeclarableOpsTests13, lstmLayer_1) { auto expClast = NDArrayFactory::create('c', {bS, nOut}, {1.1589154f, 1.1589154f, 1.1589154f, 1.1892855f, 1.1892855f, 1.1892855f, 1.219861f, 1.219861f, 1.219861f}); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1096,12 
+1096,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_2) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {bS, sL, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {bS, sL, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx = 0.003; @@ -1120,7 +1120,7 @@ TEST_F(DeclarableOpsTests13, lstmLayer_2) { auto expClast = NDArrayFactory::create('c', {bS, nOut}, {0.996965f, 0.996965f, 0.996965f, 1.146756f, 1.146756f, 1.146756f, 1.301922f, 1.301922f, 1.301922f}); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1164,12 +1164,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_3) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL,bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL,bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, 
sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx = 0.003; @@ -1184,12 +1184,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_3) { NDArray expH('c', {sL, bS, nOut}, {0.493883f, 0.493883f, 0.493883f, 0.510990f, 0.510990f, 0.510990f, 0.534701f, 0.534701f, 0.534701f, 0.549139f, 0.549139f, 0.549139f, 0.571900f, 0.571900f, 0.571900f, 0.583561f, 0.583561f, 0.583561f, 0.605106f, 0.605106f, - 0.605106f, 0.614114f, 0.614114f, 0.614114f, 0.635354f, 0.635354f, 0.635354f, 0.642045f, 0.642045f, 0.642045f}, nd4j::DataType::FLOAT32); + 0.605106f, 0.614114f, 0.614114f, 0.614114f, 0.635354f, 0.635354f, 0.635354f, 0.642045f, 0.642045f, 0.642045f}, sd::DataType::FLOAT32); - NDArray expHL('c', {bS, nOut}, {0.493883f, 0.493883f, 0.493883f, 0.510990f, 0.510990f, 0.510990f}, nd4j::DataType::FLOAT32); - NDArray expCL('c', {bS, nOut}, {1.061274f, 1.061274f, 1.061274f, 1.115888f, 1.115888f, 1.115888f}, nd4j::DataType::FLOAT32); + NDArray expHL('c', {bS, nOut}, {0.493883f, 0.493883f, 0.493883f, 0.510990f, 0.510990f, 0.510990f}, sd::DataType::FLOAT32); + NDArray expCL('c', {bS, nOut}, {1.061274f, 1.061274f, 1.061274f, 1.115888f, 1.115888f, 1.115888f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1237,12 +1237,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_4) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {2,nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {2,nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {2,4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {2,nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {2,nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray 
b('c', {2,4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {2,bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {2,bS, nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx({0,1, 0,0, 0,0}) = 0.003f; @@ -1267,14 +1267,14 @@ TEST_F(DeclarableOpsTests13, lstmLayer_4) { -0.123305f, -0.123305f, -0.123305f, 0.560640f, 0.560640f, 0.560640f, -0.120862f, -0.120862f, -0.120862f, 0.550714f, 0.550714f, 0.550714f, -0.156223f, -0.156223f, -0.156223f, 0.565308f, 0.565308f, 0.565308f, -0.152313f, -0.152313f, -0.152313f, 0.563741f, 0.563741f, 0.563741f, -0.234128f, -0.234128f, -0.234128f, - 0.578676f, 0.578676f, 0.578676f, -0.228917f, -0.228917f, -0.228917f}, nd4j::DataType::FLOAT32); + 0.578676f, 0.578676f, 0.578676f, -0.228917f, -0.228917f, -0.228917f}, sd::DataType::FLOAT32); NDArray expHL('c', {2,bS, nOut}, {0.563741f, 0.563741f, 0.563741f, 0.578676f, 0.578676f, 0.578676f, -0.107642f, - -0.107642f, -0.107642f, -0.106937f, -0.106937f, -0.106937f}, nd4j::DataType::FLOAT32); + -0.107642f, -0.107642f, -0.106937f, -0.106937f, -0.106937f}, sd::DataType::FLOAT32); NDArray expCL('c', {2,bS, nOut}, {1.217757f, 1.217757f, 1.217757f, 1.272398f, 1.272398f, 1.272398f, -0.295768f, - -0.295768f, -0.295768f, -0.298453f, -0.298453f, -0.298453f}, nd4j::DataType::FLOAT32); + -0.295768f, -0.295768f, -0.298453f, -0.298453f, -0.298453f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1321,12 +1321,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_5) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {bS, sL, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {2,nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {2,nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {2,4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {2,bS, nOut}, 
nd4j::DataType::FLOAT32); + NDArray x('c', {bS, sL, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {2,nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {2,nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {2,4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {2,bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {2,bS, nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx({0,1, 0,0, 0,0}) = 0.003; @@ -1349,14 +1349,14 @@ TEST_F(DeclarableOpsTests13, lstmLayer_5) { 0.526881f, 0.526881f, 0.526881f, -0.12883f, -0.12883f, -0.12883f, 0.515882f, 0.515882f, 0.515882f, -0.16868f, -0.16868f, -0.16868f, 0.51409f, 0.51409f, 0.51409f, -0.255185f, -0.255185f, -0.255185f, 0.614599f, 0.614599f, 0.614599f, -0.102739f, -0.102739f, -0.102739f, 0.599572f, 0.599572f, 0.599572f, -0.105802f, -0.105802f, -0.105802f, 0.591089f, 0.591089f, 0.591089f, -0.116681f, -0.116681f, -0.116681f, - 0.588694f, 0.588694f, 0.588694f, -0.149201f, -0.149201f, -0.149201f, 0.591492f, 0.591492f, 0.591492f, -0.228917f, -0.228917f, -0.228917f}, nd4j::DataType::FLOAT32); + 0.588694f, 0.588694f, 0.588694f, -0.149201f, -0.149201f, -0.149201f, 0.591492f, 0.591492f, 0.591492f, -0.228917f, -0.228917f, -0.228917f}, sd::DataType::FLOAT32); NDArray expHL('c', {2,bS, nOut}, {0.51409f, 0.51409f, 0.51409f, 0.591492f, 0.591492f, 0.591492f, - -0.107659f, -0.107659f, -0.107659f, -0.102739f, -0.102739f, -0.102739f}, nd4j::DataType::FLOAT32); + -0.107659f, -0.107659f, -0.107659f, -0.102739f, -0.102739f, -0.102739f}, sd::DataType::FLOAT32); NDArray expCL('c', {2,bS, nOut}, {1.07293f , 1.07293f , 1.07293f, 1.346609f, 1.346609f, 1.346609f, - -0.295811f, -0.295811f, -0.295811f, -0.305394f, -0.305394f, -0.305394f}, nd4j::DataType::FLOAT32); + -0.295811f, -0.295811f, -0.295811f, -0.305394f, -0.305394f, -0.305394f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ 
-1407,12 +1407,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_6) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {2,nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {2,nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {2,4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {2,nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {2,nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {2,4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {2,bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {2,bS, nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx({0,1, 0,0, 0,0}) = 0.003f; @@ -1433,16 +1433,16 @@ TEST_F(DeclarableOpsTests13, lstmLayer_6) { NDArray expH('c', {sL, bS, nOut}, { 0.470019f, 0.470019f, 0.470019f, 0.478352f, 0.478352f, 0.478352f, 0.444871f, 0.444871f, 0.444871f, 0.457060f, 0.457060f, 0.457060f, 0.424090f, 0.424090f, 0.424090f, 0.439778f, 0.439778f, 0.439778f, 0.394491f, 0.394491f, - 0.394491f, 0.412995f, 0.412995f, 0.412995f, 0.329613f, 0.329613f, 0.329613f, 0.349760f, 0.349760f, 0.349760f}, nd4j::DataType::FLOAT32); + 0.394491f, 0.412995f, 0.412995f, 0.412995f, 0.329613f, 0.329613f, 0.329613f, 0.349760f, 0.349760f, 0.349760f}, sd::DataType::FLOAT32); NDArray expHL('c', {2,bS, nOut}, {0.563741f, 0.563741f, 0.563741f, 0.578676f, 0.578676f, 0.578676f, -0.107642f, -0.107642f, -0.107642f, -0.106937f, -0.106937f, -0.106937f}, - nd4j::DataType::FLOAT32); + sd::DataType::FLOAT32); NDArray expCL('c', {2,bS, nOut}, {1.217757f, 1.217757f, 1.217757f, 1.272398f, 1.272398f, 1.272398f, -0.295768f, -0.295768f, -0.295768f, -0.298453f, -0.298453f, -0.298453f}, - nd4j::DataType::FLOAT32); + sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, 
&Wx, &Wr, &b, &hI, &cI}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1491,13 +1491,13 @@ TEST_F(DeclarableOpsTests13, lstmLayer_7) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {3*nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray Wp('c', {3*nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx = 0.003; @@ -1513,12 +1513,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_7) { NDArray expH('c', {sL, bS, nOut}, {0.55533 , 0.55533 , 0.55533 , 0.562925, 0.562925, 0.562925, 0.531795, 0.531795, 0.531795, 0.542556, 0.542556, 0.542556, 0.521466, 0.521466, 0.521466, 0.534638, 0.534638, 0.534638, 0.524805, 0.524805, - 0.524805, 0.539187, 0.539187, 0.539187, 0.538309, 0.538309, 0.538309, 0.552923, 0.552923, 0.552923}, nd4j::DataType::FLOAT32); + 0.524805, 0.539187, 0.539187, 0.539187, 0.538309, 0.538309, 0.538309, 0.552923, 0.552923, 0.552923}, sd::DataType::FLOAT32); - NDArray expHL('c', {bS, nOut}, {0.538309, 0.538309, 0.538309,0.552923, 0.552923, 0.552923}, nd4j::DataType::FLOAT32); - NDArray expCL('c', {bS, nOut}, {1.147089, 1.147089, 1.147089,1.197228, 1.197228, 1.197228}, nd4j::DataType::FLOAT32); + NDArray expHL('c', {bS, nOut}, {0.538309, 0.538309, 0.538309,0.552923, 0.552923, 0.552923}, sd::DataType::FLOAT32); + NDArray expCL('c', {bS, nOut}, {1.147089, 1.147089, 
1.147089,1.197228, 1.197228, 1.197228}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI, &Wp}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1568,13 +1568,13 @@ TEST_F(DeclarableOpsTests13, lstmLayer_8) { const double cellClip = 1.; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {3*nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray Wp('c', {3*nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx = 0.003; @@ -1591,12 +1591,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_8) { NDArray expH('c', {sL, bS, nOut}, { 0.436221f, 0.436221f, 0.436221f, 0.450573f, 0.450573f, 0.450573f, 0.463602f, 0.463602f, 0.463602f, 0.474674f, 0.474674f, 0.474674f, 0.484039f, 0.484039f, 0.484039f, 0.490679f, 0.490679f, 0.490679f, 0.494871f, 0.494871f, 0.494871f, 0.499028f, 0.499028f, 0.499028f, - 0.504649f, 0.504649f, 0.504649f, 0.508719f, 0.508719f, 0.508719f}, nd4j::DataType::FLOAT32); + 0.504649f, 0.504649f, 0.504649f, 0.508719f, 0.508719f, 0.508719f}, sd::DataType::FLOAT32); - NDArray expHL('c', {bS, nOut}, {0.436221f, 0.436221f, 0.436221f, 0.450573f, 0.450573f, 0.450573f}, nd4j::DataType::FLOAT32); - NDArray expCL('c', {bS, nOut}, {0.879804f, 0.879804f, 0.879804f, 0.914666f, 0.914666f, 0.914666f}, nd4j::DataType::FLOAT32); + NDArray 
expHL('c', {bS, nOut}, {0.436221f, 0.436221f, 0.436221f, 0.450573f, 0.450573f, 0.450573f}, sd::DataType::FLOAT32); + NDArray expCL('c', {bS, nOut}, {0.879804f, 0.879804f, 0.879804f, 0.914666f, 0.914666f, 0.914666f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI, &Wp}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1645,13 +1645,13 @@ TEST_F(DeclarableOpsTests13, lstmLayer_9) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {2,nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {2,nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {2,4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {2,3*nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {2,nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {2,nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {2,4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {2,bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {2,bS, nOut}, sd::DataType::FLOAT32); + NDArray Wp('c', {2,3*nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx({0,1, 0,0, 0,0}) = 0.003; @@ -1676,14 +1676,14 @@ TEST_F(DeclarableOpsTests13, lstmLayer_9) { 0.531795f, 0.531795f, 0.531795f, -0.107456f, -0.107456f, -0.107456f, 0.542556f, 0.542556f, 0.542556f, -0.106139f, -0.106139f, -0.106139f, 0.521466f, 0.521466f, 0.521466f, -0.11681f, -0.11681f, -0.11681f, 0.534638f, 0.534638f, 0.534638f, -0.11458f, -0.11458f, -0.11458f, 0.524805f, 0.524805f, 0.524805f, -0.145177f, -0.145177f, -0.145177f, 0.539187f, 0.539187f, 0.539187f, -0.14157f, -0.14157f, -0.14157f, - 0.538309f, 0.538309f, 0.538309f, -0.218056f, -0.218056f, -0.218056f, 0.552923f, 0.552923f, 0.552923f, -0.213068f, -0.213068f, 
-0.213068f}, nd4j::DataType::FLOAT32); + 0.538309f, 0.538309f, 0.538309f, -0.218056f, -0.218056f, -0.218056f, 0.552923f, 0.552923f, 0.552923f, -0.213068f, -0.213068f, -0.213068f}, sd::DataType::FLOAT32); NDArray expHL('c', {2,bS, nOut}, {0.538309f, 0.538309f, 0.538309f, 0.552923f, 0.552923f, 0.552923f, -0.104502f, -0.104502f, -0.104502f, - -0.103843f, -0.103843f, -0.103843f}, nd4j::DataType::FLOAT32); + -0.103843f, -0.103843f, -0.103843f}, sd::DataType::FLOAT32); NDArray expCL('c', {2,bS, nOut}, {1.147089f, 1.147089f, 1.147089f, 1.197228f, 1.197228f, 1.197228f, -0.289425f, -0.289425f, -0.289425f, - -0.292174f, -0.292174f, -0.292174f}, nd4j::DataType::FLOAT32); + -0.292174f, -0.292174f, -0.292174f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &hI, &cI, &Wp}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1732,14 +1732,14 @@ TEST_F(DeclarableOpsTests13, lstmLayer_10) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray seqLen('c', {bS}, {0,1,2,3,5}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {3*nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray seqLen('c', {bS}, {0,1,2,3,5}, sd::DataType::FLOAT32); + NDArray Wp('c', {3*nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx = 0.003; @@ -1763,12 +1763,12 
@@ TEST_F(DeclarableOpsTests13, lstmLayer_10) { 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.692315f, 0.692315f, 0.692315f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}, - nd4j::DataType::FLOAT32); + sd::DataType::FLOAT32); - NDArray expHL('c', {bS, nOut}, {0.f, 0.f, 0.f, 0.562925f, 0.562925f, 0.562925f, 0.576568f, 0.576568f, 0.576568f, 0.611224f, 0.611224f, 0.611224f, 0.692315f, 0.692315f, 0.692315f}, nd4j::DataType::FLOAT32); - NDArray expCL('c', {bS, nOut}, {0.f, 0.f, 0.f, 1.534275f, 1.534275f, 1.534275f, 1.40183f, 1.40183f, 1.40183f, 1.449675f, 1.449675f, 1.449675f, 1.767702f, 1.767702f, 1.767702f}, nd4j::DataType::FLOAT32); + NDArray expHL('c', {bS, nOut}, {0.f, 0.f, 0.f, 0.562925f, 0.562925f, 0.562925f, 0.576568f, 0.576568f, 0.576568f, 0.611224f, 0.611224f, 0.611224f, 0.692315f, 0.692315f, 0.692315f}, sd::DataType::FLOAT32); + NDArray expCL('c', {bS, nOut}, {0.f, 0.f, 0.f, 1.534275f, 1.534275f, 1.534275f, 1.40183f, 1.40183f, 1.40183f, 1.449675f, 1.449675f, 1.449675f, 1.767702f, 1.767702f, 1.767702f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1817,14 +1817,14 @@ TEST_F(DeclarableOpsTests13, lstmLayer_11) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray seqLen('c', {bS}, {0,1,2,3,5}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {3*nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', 
{nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray seqLen('c', {bS}, {0,1,2,3,5}, sd::DataType::FLOAT32); + NDArray Wp('c', {3*nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx = 0.003f; @@ -1843,12 +1843,12 @@ TEST_F(DeclarableOpsTests13, lstmLayer_11) { 0.61209f, 0.61209f,0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.652042f, 0.652042f, 0.652042f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.677708f, 0.677708f, 0.677708f, 0.684177f, 0.684177f, 0.684177f, 0.f, 0.f, 0.f,0.f, 0.f, 0.f, 0.699627f, 0.699627f, 0.699627f, 0.705371f, 0.705371f, 0.705371f, 0.710989f, 0.710989f, 0.710989f, 0., 0., 0., 0.719014, 0.719014, 0.719014, 0.724087, - 0.724087f, 0.724087f, 0.729084f, 0.729084f, 0.729084f, 0.734004f, 0.734004f, 0.734004f }, nd4j::DataType::FLOAT32); + 0.724087f, 0.724087f, 0.729084f, 0.729084f, 0.729084f, 0.734004f, 0.734004f, 0.734004f }, sd::DataType::FLOAT32); - NDArray expHL('c', {bS, nOut}, {0.f, 0.f, 0.f, 0.719014f, 0.719014f, 0.719014f, 0.699627f, 0.699627f, 0.699627f, 0.677708f, 0.677708f, 0.677708f, 0.61209f, 0.61209f, 0.61209f}, nd4j::DataType::FLOAT32); - NDArray expCL('c', {bS, nOut}, {0.f, 0.f, 0.f, 2.092814f, 2.092814f, 2.092814f, 2.08832f, 2.08832f, 2.08832f, 2.009851f, 2.009851f, 2.009851f, 1.646034f, 1.646034f, 1.646034f}, nd4j::DataType::FLOAT32); + NDArray expHL('c', {bS, nOut}, {0.f, 0.f, 0.f, 0.719014f, 0.719014f, 0.719014f, 0.699627f, 0.699627f, 0.699627f, 0.677708f, 0.677708f, 0.677708f, 0.61209f, 0.61209f, 0.61209f}, sd::DataType::FLOAT32); + NDArray expCL('c', {bS, nOut}, {0.f, 0.f, 0.f, 2.092814f, 2.092814f, 2.092814f, 2.08832f, 2.08832f, 2.08832f, 2.009851f, 2.009851f, 2.009851f, 1.646034f, 1.646034f, 1.646034f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp}, 
tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1897,14 +1897,14 @@ TEST_F(DeclarableOpsTests13, lstmLayer_12) { const double cellClip = 0; // do not apply clipping - NDArray x('c', {sL, bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {2,nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {2,nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b('c', {2,4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {2,bS, nOut}, nd4j::DataType::FLOAT32); - NDArray seqLen('c', {bS}, {0,1,2,3,5}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {2,3*nOut}, nd4j::DataType::FLOAT32); + NDArray x('c', {sL, bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {2,nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {2,nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b('c', {2,4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {2,bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {2,bS, nOut}, sd::DataType::FLOAT32); + NDArray seqLen('c', {bS}, {0,1,2,3,5}, sd::DataType::FLOAT32); + NDArray Wp('c', {2,3*nOut}, sd::DataType::FLOAT32); x.linspace(0.5, 0.5); Wx({0,1, 0,0, 0,0}) = 0.003f; @@ -1932,14 +1932,14 @@ TEST_F(DeclarableOpsTests13, lstmLayer_12) { 0.621298, 0.621298, 0.621298, -0.090626, -0.090626, -0.090626, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.655858, 0.655858, 0.655858, -0.098015, -0.098015, -0.098015, 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.692315, 0.692315, 0.692315, -0.143704, -0.143704, -0.143704, 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, nd4j::DataType::FLOAT32); + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, sd::DataType::FLOAT32); NDArray expHL('c', {2,bS, nOut}, {0.f, 0.f, 0.f, 0.562925f, 0.562925f, 0.562925f, 0.576568f, 
0.576568f, 0.576568f, 0.611224f, 0.611224f, 0.611224f, 0.692315f, 0.692315f, 0.692315f, - 0.f, 0.f, 0.f, -0.25361f, -0.25361f, -0.25361f, -0.157103f, -0.157103f, -0.157103f, -0.116502f, -0.116502f, -0.116502f, -0.100025f, -0.100025f, -0.100025f}, nd4j::DataType::FLOAT32); + 0.f, 0.f, 0.f, -0.25361f, -0.25361f, -0.25361f, -0.157103f, -0.157103f, -0.157103f, -0.116502f, -0.116502f, -0.116502f, -0.100025f, -0.100025f, -0.100025f}, sd::DataType::FLOAT32); NDArray expCL('c', {2,bS, nOut}, {0.f, 0.f, 0.f, 1.534275f, 1.534275f, 1.534275f, 1.40183f, 1.40183f, 1.40183f, 1.449675f, 1.449675f, 1.449675f, 1.767702f, 1.767702f, 1.767702f, - 0.f, 0.f, 0.f, -0.86636f, -0.86636f, -0.86636f, -0.470245f, -0.470245f, -0.470245f, -0.341856f, -0.341856f, -0.341856f, -0.294986f, -0.294986f, -0.294986f}, nd4j::DataType::FLOAT32); + 0.f, 0.f, 0.f, -0.86636f, -0.86636f, -0.86636f, -0.470245f, -0.470245f, -0.470245f, -0.341856f, -0.341856f, -0.341856f, -0.294986f, -0.294986f, -0.294986f}, sd::DataType::FLOAT32); - nd4j::ops::lstmLayer op; + sd::ops::lstmLayer op; auto results = op.evaluate({&x, &Wx, &Wr, &b, &seqLen, &hI, &cI, &Wp}, tArgs, iArgs, bArgs); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1965,17 +1965,17 @@ TEST_F(DeclarableOpsTests13, lstmLayer_12) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_test1) { - NDArray input ('c', {2,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, 
{-1.2f, 1.3f, -1.4f, 1.5f}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, sd::DataType::FLOAT32); - NDArray expected('c', {2,4}, {11.61218734f, 18.52390321f, -8.67185076f, -21.28716864f, 10.93337162f, 19.14541765f, -9.26213931f, -20.71509369f}, nd4j::DataType::FLOAT32); + NDArray expected('c', {2,4}, {11.61218734f, 18.52390321f, -8.67185076f, -21.28716864f, 10.93337162f, 19.14541765f, -9.26213931f, -20.71509369f}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1}); @@ -2008,7 +2008,7 @@ TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test2) { gamma.assign(1.2); beta.assign(1.); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1}); @@ -2037,7 +2037,7 @@ TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test3) { input.linspace(0.1, 0.1); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,1}); @@ -2065,7 +2065,7 @@ TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test4) { input.linspace(0.1, 0.1); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,0,2}); @@ -2082,18 +2082,18 @@ TYPED_TEST(TypedDeclarableOpsTests13, batchnorm_test4) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_test5) { - NDArray input ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,4,2,2}, 
sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5f, 0.7f, 0.9f, 1.1f}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, sd::DataType::FLOAT32); NDArray expected('c', {2,4,2,2}, { 11.612187f, 11.442483f, 11.272779f, 11.103076f, 18.990039f, 19.145418f, 19.300796f, 19.456175f, -9.557284f, -9.704856f, -9.852428f, -10.f, -20.f, -19.856981f, -19.713963f, -19.570944f, 8.896924f, 8.727221f, 8.557517f, 8.387813f, 21.476097f, 21.631475f, 21.786854f, 21.942233f, -11.918438f, - -12.06601f, -12.213582f, -12.361154f, -17.7117f, -17.568681f, -17.425663f, -17.282644f}, nd4j::DataType::FLOAT32); + -12.06601f, -12.213582f, -12.361154f, -17.7117f, -17.568681f, -17.425663f, -17.282644f}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1, 1, 1}); @@ -2111,18 +2111,18 @@ TEST_F(DeclarableOpsTests13, batchnorm_test5) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_test6) { - NDArray input ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5f, 0.7f, 0.9, 1.1f}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,2,2,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05f, 1.15f, 1.2f, 1.3f}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5f, 0.7f, 0.9, 1.1f}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2f, 1.3f, -1.4f, 1.5f}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, {10.f, 20.f, -10.f, -20.f}, 
sd::DataType::FLOAT32); NDArray expected('c', {2,2,2,4}, {11.612187f, 18.523903f, -8.671851f, -21.287169f, 10.933372f, 19.145418f, -9.262139f, -20.715094f, 10.254556f, 19.766932f, -9.852428f, -20.143019f, 9.57574f, 20.388447f, -10.442716f, -19.570944f, 8.896924f, 21.009961f, -11.033005f, -18.998869f, 8.218109f, 21.631475f, -11.623294f, -18.426794f, 7.539293f, 22.25299f, - -12.213582f, -17.854719f, 6.860477f, 22.874504f, -12.803871f, -17.282644f}, nd4j::DataType::FLOAT32); + -12.213582f, -17.854719f, 6.860477f, 22.874504f, -12.803871f, -17.282644f}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1,3}); @@ -2139,22 +2139,22 @@ TEST_F(DeclarableOpsTests13, batchnorm_test6) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_test7) { - NDArray input1('c', {3,3,15,15}, nd4j::DataType::FLOAT32); - NDArray input2('c', {3,15,15,3}, nd4j::DataType::FLOAT32); + NDArray input1('c', {3,3,15,15}, sd::DataType::FLOAT32); + NDArray input2('c', {3,15,15,3}, sd::DataType::FLOAT32); input2.permutei({0,3,1,2}); - NDArray mean ('c', {3}, {0., 0, 0}, nd4j::DataType::FLOAT32); - NDArray variance('c', {3}, {1., 1, 1}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {3}, {1., 1, 1}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {3}, {0., 0, 0}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {3}, {0., 0, 0}, sd::DataType::FLOAT32); + NDArray variance('c', {3}, {1., 1, 1}, sd::DataType::FLOAT32); + NDArray gamma ('c', {3}, {1., 1, 1}, sd::DataType::FLOAT32); + NDArray beta ('c', {3}, {0., 0, 0}, sd::DataType::FLOAT32); - NDArray out1('c', {3,3,15,15}, nd4j::DataType::FLOAT32); - NDArray out2('c', {3,3,15,15}, nd4j::DataType::FLOAT32); + NDArray out1('c', {3,3,15,15}, sd::DataType::FLOAT32); + NDArray out2('c', {3,3,15,15}, sd::DataType::FLOAT32); input1.linspace(-1012, 1); input2.assign(input1); - 
nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto res1 = op.execute({&input1, &mean, &variance, &gamma, &beta}, {&out1}, {1e-5}, {1,1,1}, {}); ASSERT_EQ(ND4J_STATUS_OK, res1); @@ -2168,12 +2168,12 @@ TEST_F(DeclarableOpsTests13, batchnorm_test7) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_test8) { - NDArray input('c', {2,3,4,5}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,3,4,5}, sd::DataType::FLOAT32); - NDArray mean ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); - NDArray variance('c', {1,3,4,5}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {1,3,4,5}, sd::DataType::FLOAT32); + NDArray variance('c', {1,3,4,5}, sd::DataType::FLOAT32); + NDArray gamma ('c', {1,3,4,5}, sd::DataType::FLOAT32); + NDArray beta ('c', {1,3,4,5}, sd::DataType::FLOAT32); NDArray expected('c', {2,3,4,5}, {-105.019394, -103.322357, -101.625313, -99.928276, -98.231239, -96.534195, -94.837158, -93.140121, -91.443077, -89.746040, -88.049004, -86.351959, -84.654922, -82.957886, -81.260841, -79.563805, -77.866768, -76.169724, -74.472687, -72.775650, -71.078606, -69.381569, -67.684532, -65.987488, -64.290451, -62.593414, @@ -2183,7 +2183,7 @@ TEST_F(DeclarableOpsTests13, batchnorm_test8) { 6.985196, 8.682236, 10.379274, 12.076314, 13.773354, 15.470392, 17.167431, 18.864471, 20.561510, 22.258549, 23.955589, 25.652628, 27.349667, 29.046707, 30.743744, 32.440784, 34.137825, 35.834862, 37.531902, 39.228943, 40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333, 57.896374, 59.593414, 61.290451, 62.987488, 64.684532, 66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 79.957886, 81.654922, - 83.351959, 85.049004, 86.746040, 88.443077, 90.140121, 91.837158, 93.534195, 95.231239, 96.928276}, nd4j::DataType::FLOAT32); + 
83.351959, 85.049004, 86.746040, 88.443077, 90.140121, 91.837158, 93.534195, 95.231239, 96.928276}, sd::DataType::FLOAT32); input.linspace(-60, 1); mean.assign(1.); @@ -2191,7 +2191,7 @@ TEST_F(DeclarableOpsTests13, batchnorm_test8) { gamma.assign(1.2); beta.assign(-1.5); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3}); @@ -2208,12 +2208,12 @@ TEST_F(DeclarableOpsTests13, batchnorm_test8) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_test9) { - NDArray input('c', {2,3,3,3,3}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,3,3,3,3}, sd::DataType::FLOAT32); - NDArray mean ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); - NDArray variance('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {1,3,3,3,3}, nd4j::DataType::FLOAT32); + NDArray mean ('c', {1,3,3,3,3}, sd::DataType::FLOAT32); + NDArray variance('c', {1,3,3,3,3}, sd::DataType::FLOAT32); + NDArray gamma ('c', {1,3,3,3,3}, sd::DataType::FLOAT32); + NDArray beta ('c', {1,3,3,3,3}, sd::DataType::FLOAT32); NDArray expected('c', {2,3,3,3,3}, {-138.960175, -137.263138, -135.566101, -133.869064, -132.172028, -130.474976, -128.777954, -127.080902, -125.383865, -123.686829, -121.989784, -120.292747, -118.595711, -116.898666, -115.201630, -113.504593, -111.807549, -110.110512, -108.413475, -106.716431, -105.019394, -103.322357, -101.625313, -99.928276, @@ -2226,7 +2226,7 @@ TEST_F(DeclarableOpsTests13, batchnorm_test9) { 40.925980, 42.623020, 44.320061, 46.017097, 47.714138, 49.411179, 51.108215, 52.805256, 54.502296, 56.199333, 57.896374, 59.593414, 61.290451, 62.987488, 64.684532, 66.381569, 68.078606, 69.775650, 71.472687, 73.169724, 74.866768, 76.563805, 78.260841, 79.957886, 81.654922, 83.351959, 85.049004, 86.746040, 88.443077, 90.140121, 91.837158, 93.534195, 95.231239, 96.928276, 
98.625313, 100.322357, 102.019394, 103.716431, 105.413475, 107.110512, 108.807549, 110.504593, 112.201630, 113.898666, - 115.595711, 117.292747, 118.989784, 120.686829, 122.383865, 124.080902, 125.777946, 127.474976, 129.172028, 130.869064, 132.566101, 134.263138}, nd4j::DataType::FLOAT32); + 115.595711, 117.292747, 118.989784, 120.686829, 122.383865, 124.080902, 125.777946, 127.474976, 129.172028, 130.869064, 132.566101, 134.263138}, sd::DataType::FLOAT32); input.linspace(-80, 1); mean.assign(1.); @@ -2234,7 +2234,7 @@ TEST_F(DeclarableOpsTests13, batchnorm_test9) { gamma.assign(1.2); beta.assign(-1.5); - nd4j::ops::batchnorm op; + sd::ops::batchnorm op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta}, {1e-5}, {1,1, 1,2,3,4}); @@ -2252,16 +2252,16 @@ TEST_F(DeclarableOpsTests13, batchnorm_test9) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_bp_test1) { - NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.1, 1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,3,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.1, 1.2, 1.3, 1.4}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, sd::DataType::FLOAT32); - NDArray expdLdI('c', {2,3,4}, {-0.000056, -0.000056, -0.000056, -0.000056, -0.000034, -0.000034, -0.000034, -0.000034, -0.000011, -0.000011, -0.000011, -0.000011, 0.000011, 0.000011, 0.000011, 0.000011, 0.000034, 0.000034, 0.000034, 0.000034, 0.000056, 0.000056, 0.000056, 0.000056}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {6.148104, 
6.148104, 6.148105, 6.148105}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {3.6, 4.5, 5.4, 6.3}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,3,4}, {-0.000056, -0.000056, -0.000056, -0.000056, -0.000034, -0.000034, -0.000034, -0.000034, -0.000011, -0.000011, -0.000011, -0.000011, 0.000011, 0.000011, 0.000011, 0.000011, 0.000034, 0.000034, 0.000034, 0.000034, 0.000056, 0.000056, 0.000056, 0.000056}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {6.148104, 6.148104, 6.148105, 6.148105}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {3.6, 4.5, 5.4, 6.3}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); variance.assign(0.46666667); @@ -2269,7 +2269,7 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test1) { beta.assign(1.); // has no effect on gradient calculations gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); @@ -2295,24 +2295,24 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_bp_test2) { - NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {3}, {1.05, 1.1, 1.15}, nd4j::DataType::FLOAT32); - NDArray variance('c', {3}, {0.5, 0.6, 0.7}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {3}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,3,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {3}, {1.05, 1.1, 1.15}, sd::DataType::FLOAT32); + NDArray variance('c', {3}, {0.5, 0.6, 0.7}, sd::DataType::FLOAT32); + NDArray gamma ('c', {3}, {1.2, 1.3, 1.4}, sd::DataType::FLOAT32); + NDArray beta ('c', {3}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,3,4}, {-0.601415, -0.521226, -0.441037, -0.360849, 
-0.456306, -0.395465, -0.334624, -0.273784, 0.396631, 0.343747, 0.290863, 0.237978, 0.360849, 0.441037, 0.521226, 0.601415, 0.273784, 0.334625, 0.395465, 0.456306, -0.237978, - -0.290863, -0.343746, -0.396631}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}, nd4j::DataType::FLOAT32); + -0.290863, -0.343746, -0.396631}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {3}, {5.81236 , 7.048771, 12.155388}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {3}, {1.8, 6.6, 11.4}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); @@ -2337,24 +2337,24 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test2) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_bp_test3) { - NDArray input ('c', {2,3,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}, nd4j::DataType::FLOAT32); - NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {2,1,4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,3,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {2,1,4}, {1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4}, sd::DataType::FLOAT32); + NDArray variance('c', {2,1,4}, {0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1, 1.2}, sd::DataType::FLOAT32); + NDArray gamma ('c', {2,1,4}, {1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9}, sd::DataType::FLOAT32); + NDArray beta ('c', {2,1,4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4}, 
sd::DataType::FLOAT32); NDArray expdLdI('c', {2,3,4}, {-0.577002, -0.744041, -0.850999, -0.922373, -0.000000, -0.000000, -0.000000, -0.000000, 0.577002, 0.744041, 0.850999, 0.922373, -0.386037, -0.350205, -0.312047, -0.271737, -0.000000, -0.000000, - -0.000000, -0.000000, 0.386037, 0.350205, 0.312047, 0.271736}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. , 0.45, 4.5 , 4.95, 5.4 , 5.85}, nd4j::DataType::FLOAT32); + -0.000000, -0.000000, 0.386037, 0.350205, 0.312047, 0.271736}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {2,1,4}, {1.378844, 0.910144, 0.573706, 0.335408, 2.640487, 2.954985, 3.289431, 3.64234 }, sd::DataType::FLOAT32); + NDArray expdLdB('c', {2,1,4}, {-0.9 , -0.45, 0. , 0.45, 4.5 , 4.95, 5.4 , 5.85}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); // beta.assign(1.); // has no effect on gradient calculations gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,0,2}); @@ -2379,21 +2379,21 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test3) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_bp_test4) { - NDArray input ('c', {2,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,4}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, sd::DataType::FLOAT32); + NDArray gamma 
('c', {4}, {-1.2, 1.3, -1.4, 1.5}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,4}, sd::DataType::FLOAT32); - NDArray expdLdI('c', {2,4}, {0.162923, -0.289673, 0.354174, -0.386151, -0.162923, 0.289673, -0.354174, 0.386151}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {1.442483, 0.950200, 0.569207, 0.314641}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {-1.2, -0.9, -0.6, -0.3}, nd4j::DataType::FLOAT32); + NDArray expdLdI('c', {2,4}, {0.162923, -0.289673, 0.354174, -0.386151, -0.162923, 0.289673, -0.354174, 0.386151}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {1.442483, 0.950200, 0.569207, 0.314641}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {-1.2, -0.9, -0.6, -0.3}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1}); @@ -2421,23 +2421,23 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test5) { #if defined(HAVE_CUDNN) return; #endif - NDArray input ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,4,2,2}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,4,2,2}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,4,2,2}, {-0.737512, -0.659880, -0.582247, -0.504614, 0.561404, 0.502309, 
0.443214, 0.384118, -1.168243, -1.045270, -0.922297, -0.799324, 1.899026, 1.699128, 1.499231, 1.299333, 0.504614, 0.582247, 0.659880, 0.737512, -0.384118, - -0.443214, -0.502308, -0.561404, 0.799324, 0.922297, 1.045270, 1.168243, -1.299334, -1.499231, -1.699129, -1.899026}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {11.073181, 12.585667, 17.708657, 24.313186}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {4.2, 9. , 13.8, 18.6}, nd4j::DataType::FLOAT32); + -0.443214, -0.502308, -0.561404, 0.799324, 0.922297, 1.045270, 1.168243, -1.299334, -1.499231, -1.699129, -1.899026}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {11.073181, 12.585667, 17.708657, 24.313186}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {4.2, 9. , 13.8, 18.6}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); @@ -2466,23 +2466,23 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test6) { return; #endif - NDArray input ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,2,2,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,2,2,4}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,2,2,4}, {-4.989124, 2.540357, -1.515022, 0.791769, -3.563660, 1.814540, -1.082159, 0.565549, 
-2.138196, 1.088724, -0.649295, 0.339329, -0.712732, 0.362908, -0.216432, 0.113110, 0.712732, -0.362908, 0.216432, -0.113110, 2.138195, -1.088724, 0.649295, - -0.339330, 3.563660,-1.814540, 1.082159, -0.565549, 4.989125, -2.540356, 1.515022, -0.791770}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {20.364472, 17.856588, 16.949714, 15.903684}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {9.6, 10.8, 12. , 13.2}, nd4j::DataType::FLOAT32); + -0.339330, 3.563660,-1.814540, 1.082159, -0.565549, 4.989125, -2.540356, 1.515022, -0.791770}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {20.364472, 17.856588, 16.949714, 15.903684}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {9.6, 10.8, 12. , 13.2}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,3}); @@ -2511,26 +2511,26 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test7) { return; #endif - NDArray input ('c', {2,2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,2,2,2,4}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,2,2,2,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,2,2,2,4}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,2,2,2,4}, {-119.435059, 78.159744, -58.732986, 46.630123, -103.510391, 67.738441, -50.901920, 40.412773, -87.585716, 
57.317142, -43.070854, 34.195419, -71.661041, 46.895844, -35.239792, 27.978071, -55.736359, 36.474548, -27.408726, 21.760721, -39.811687, 26.053242, -19.577662, 15.543370, -23.887009, 15.631950, -11.746595, 9.326023, -7.962326, 5.210644, -3.915531, 3.108671, 7.962341, -5.210655, 3.915535, -3.108677, 23.887032, -15.631958, 11.746601, -9.326031, 39.811691, -26.053246, 19.577671, -15.543377, 55.736382, -36.474548, 27.408726, -21.760731, 71.661064, -46.895851, 35.239788, - -27.978077, 87.585732, -57.317154, 43.070866, -34.195431, 103.510384, -67.738464, 50.901920, -40.412777, 119.435097, -78.159744, 58.732998, -46.630131}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {282.38734 , 244.542027, 224.140995, 207.548793}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {57.6, 60. , 62.4, 64.8}, nd4j::DataType::FLOAT32); + -27.978077, 87.585732, -57.317154, 43.070866, -34.195431, 103.510384, -67.738464, 50.901920, -40.412777, 119.435097, -78.159744, 58.732998, -46.630131}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {282.38734 , 244.542027, 224.140995, 207.548793}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {57.6, 60. 
, 62.4, 64.8}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,4}); @@ -2561,25 +2561,25 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test8) { return; #endif - NDArray input ('c', {2,4,2,2,2}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,4,2,2,2}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,4,2,2,2}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, {1.05, 1.15, 1.2, 1.3}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, {0.5, 0.7, 0.9, 1.1}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,4,2,2,2}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,4,2,2,2}, {-34.373802, -32.611046, -30.848286, -29.085529, -27.322769, -25.560009, -23.797251, -22.034491, 36.146996, 34.293301, 32.439610, 30.585917, 28.732227, 26.878534, 25.024841, 23.171150, -42.876553, -40.677757, -38.478958, -36.280159, -34.081367, -31.882565, -29.683767, -27.484968, 50.674446, 48.075760, 45.477066, 42.878380, 40.279686, 37.681000, 35.082310, 32.483616, 22.034489, 23.797249, 25.560009, 27.322765, 29.085526, 30.848286, 32.611046, 34.373802, -23.171146, -25.024837, -26.878536, -28.732231, -30.585918, -32.439613, -34.293297, -36.146996, 27.484982, 29.683773, - 31.882572, 34.081364, 36.280178, 38.478970, 40.677776, 42.876560, -32.483627, -35.082329, -37.681023, -40.279701, -42.878403, -45.477081, -48.075775, -50.674484}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {134.490365, 179.785003, 248.933114, 
330.087248}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {32.4, 51.6, 70.8, 90.}, nd4j::DataType::FLOAT32); + 31.882572, 34.081364, 36.280178, 38.478970, 40.677776, 42.876560, -32.483627, -35.082329, -37.681023, -40.279701, -42.878403, -45.477081, -48.075775, -50.674484}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {134.490365, 179.785003, 248.933114, 330.087248}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {32.4, 51.6, 70.8, 90.}, sd::DataType::FLOAT32); input.linspace(0.1, 0.1); gradO.linspace(-0.9, 0.15); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); @@ -2606,18 +2606,18 @@ return; //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_bp_test9) { - NDArray input ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {4}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,4,2,2}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,4,2,2}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,4,2,2}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,4,2,2}, {0.032378, 0.028967, 0.025558, 0.022147, -0.035056, -0.031364, -0.027669, -0.024006, 0.037742, 0.033766, 0.029791, 0.025818, -0.040429, -0.036172, -0.031913, -0.027656, -0.022155, -0.025564, -0.028974, -0.032359, 0.023982, 0.027677, 0.031373, 0.035063, - -0.025822, -0.029794, -0.033770, -0.037747, 0.027653, 0.031913, 0.036168, 0.040426}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {9.685875, 9.685880, 9.685887, 
9.685891}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {4.2, 9. , 13.8, 18.6}, nd4j::DataType::FLOAT32); + -0.025822, -0.029794, -0.033770, -0.037747, 0.027653, 0.031913, 0.036168, 0.040426}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {9.685875, 9.685880, 9.685887, 9.685891}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {4.2, 9. , 13.8, 18.6}, sd::DataType::FLOAT32); input.linspace(1,0.01); gradO.linspace(-0.9, 0.15); @@ -2626,14 +2626,14 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test9) { PointersManager manager(input.getContext(), "DeclarableOpsTests13.batchnorm_bp_test9"); std::vector dimensions = {0,2,3}; int* dims = reinterpret_cast(manager.replicatePointer(dimensions.data(), dimensions.size() * sizeof(int))); - input.reduceAlongDimension(nd4j::reduce::Mean, mean, dimensions); + input.reduceAlongDimension(sd::reduce::Mean, mean, dimensions); NDArray::prepareSpecialUse({&variance}, {&input}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); NativeOpExecutioner::execSummaryStats(input.getContext(), 0,input.getBuffer(), input.getShapeInfo(),input.getSpecialBuffer(), input.getSpecialShapeInfo(),nullptr,variance.getBuffer(), variance.getShapeInfo(),variance.getSpecialBuffer(), variance.getSpecialShapeInfo(), dims, dimensions.size(),packX.platformShapeInfo(), packX.platformOffsets(),false); manager.synchronize(); NDArray::registerSpecialUse({&variance}, {&input}); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,1}); @@ -2658,18 +2658,18 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test9) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_bp_test10) { - NDArray input ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); - NDArray 
mean ('c', {4}, nd4j::DataType::FLOAT32); - NDArray variance('c', {4}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,2,2,4}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,2,2,4}, sd::DataType::FLOAT32); + NDArray mean ('c', {4}, sd::DataType::FLOAT32); + NDArray variance('c', {4}, sd::DataType::FLOAT32); + NDArray gamma ('c', {4}, {-1.2, 1.3, -1.4, 1.5}, sd::DataType::FLOAT32); + NDArray beta ('c', {4}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,2,2,4}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,2,2,4}, {0.032634, -0.035423, 0.038110, -0.040864, 0.023302, -0.025294, 0.027213, -0.029205, 0.013996, -0.015192, 0.016343, -0.017519, 0.004664, -0.005062, 0.005445, -0.005833, -0.004668, 0.005067, -0.005452, 0.005824, -0.013974, 0.015171, - -0.016325, 0.017508, -0.023309, 0.025301, -0.027221, 0.029197, -0.032639, 0.035428, -0.038118, 0.040878}, nd4j::DataType::FLOAT32); - NDArray expdLdG('c', {4}, {10.991656, 10.991631, 10.991643, 10.991632}, nd4j::DataType::FLOAT32); - NDArray expdLdB('c', {4}, {9.6, 10.8, 12., 13.2}, nd4j::DataType::FLOAT32); + -0.016325, 0.017508, -0.023309, 0.025301, -0.027221, 0.029197, -0.032639, 0.035428, -0.038118, 0.040878}, sd::DataType::FLOAT32); + NDArray expdLdG('c', {4}, {10.991656, 10.991631, 10.991643, 10.991632}, sd::DataType::FLOAT32); + NDArray expdLdB('c', {4}, {9.6, 10.8, 12., 13.2}, sd::DataType::FLOAT32); input.linspace(1,0.01); gradO.linspace(-0.9, 0.15); @@ -2678,14 +2678,14 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test10) { PointersManager manager(input.getContext(), "DeclarableOpsTests13.batchnorm_bp_test9"); std::vector dimensions = {0,1,2}; int* dims = reinterpret_cast(manager.replicatePointer(dimensions.data(), dimensions.size() * sizeof(int))); - input.reduceAlongDimension(nd4j::reduce::Mean, mean, dimensions); + input.reduceAlongDimension(sd::reduce::Mean, mean, 
dimensions); NDArray::prepareSpecialUse({&variance}, {&input}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); NativeOpExecutioner::execSummaryStats(input.getContext(), 0,input.getBuffer(), input.getShapeInfo(),input.getSpecialBuffer(), input.getSpecialShapeInfo(),nullptr,variance.getBuffer(), variance.getShapeInfo(),variance.getSpecialBuffer(), variance.getSpecialShapeInfo(), dims, dimensions.size(),packX.platformShapeInfo(), packX.platformOffsets(),false); manager.synchronize(); NDArray::registerSpecialUse({&variance}, {&input}); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1,3}); @@ -2710,12 +2710,12 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test10) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests13, batchnorm_bp_test11) { - NDArray input ('c', {2,3,4,5}, nd4j::DataType::FLOAT32); - NDArray mean ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); - NDArray variance('c', {1,3,4,5}, nd4j::DataType::FLOAT32); - NDArray gamma ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); - NDArray beta ('c', {1,3,4,5}, nd4j::DataType::FLOAT32); - NDArray gradO ('c', {2,3,4,5}, nd4j::DataType::FLOAT32); + NDArray input ('c', {2,3,4,5}, sd::DataType::FLOAT32); + NDArray mean ('c', {1,3,4,5}, sd::DataType::FLOAT32); + NDArray variance('c', {1,3,4,5}, sd::DataType::FLOAT32); + NDArray gamma ('c', {1,3,4,5}, sd::DataType::FLOAT32); + NDArray beta ('c', {1,3,4,5}, sd::DataType::FLOAT32); + NDArray gradO ('c', {2,3,4,5}, sd::DataType::FLOAT32); NDArray expdLdI('c', {2,3,4,5}, {0.004981, 0.004818, 0.004652, 0.004483, 0.004319, 0.004153, 0.003985, 0.003832, 0.003661, 0.003505, 0.003340, 0.003171, 0.003001, 0.002837, 0.002670, 0.002505, 0.002337, 0.002167, 0.002003, 0.001835, 0.001666, 
0.001499, 0.001327, 0.001162, 0.000996, 0.000830, 0.000664, 0.000498, @@ -2725,14 +2725,14 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test11) { -0.003836, -0.003661, -0.003505, -0.003338, -0.003171, -0.003004, -0.002837, -0.002670, -0.002503, -0.002337, -0.002170, -0.002003, -0.001835, -0.001664, -0.001499, -0.001328, -0.001162, -0.000996, -0.000829, -0.000664, -0.000498, -0.000332, -0.000166, 0.0, 0.000166, 0.000334, 0.000500, 0.000668, 0.000834, 0.001003, 0.001170, 0.001337, 0.001502, 0.001669, 0.001838, 0.002005, 0.002172, 0.002330, 0.002496, 0.002669, - 0.002836, 0.003002, 0.003162, 0.003328, 0.003495, 0.003670, 0.003828, 0.003992, 0.004158, 0.004324, 0.004522, 0.004689, 0.004843}, nd4j::DataType::FLOAT32); + 0.002836, 0.003002, 0.003162, 0.003328, 0.003495, 0.003670, 0.003828, 0.003992, 0.004158, 0.004324, 0.004522, 0.004689, 0.004843}, sd::DataType::FLOAT32); NDArray expdLdG('c', {1,3,4,5}, {8.999503, 8.999502, 8.999502, 8.999503, 8.999502, 8.999503, 8.999503, 8.999499, 8.999501, 8.999498, 8.999498, 8.999498, 8.999498, 8.999498, 8.999498, 8.999498, 8.999498, 8.999498, 8.999498, 8.999499, 8.999501, 8.999500, 8.999503, 8.999503, 8.999503, 8.999504, 8.999503, 8.999503, 8.999504, 8.999503, 8.999504, 8.999504, 8.999499, 8.999500, 8.999497, 8.999498, 8.999496, 8.999496, 8.999496, 8.999498, 8.999498, 8.999496, 8.999496, 8.999496, 8.999501, - 8.999501, 8.999499, 8.999499, 8.999499, 8.999501, 8.999501, 8.999501, 8.999499, 8.999500, 8.999501, 8.999501, 8.999501, 8.999495, 8.999495, 8.999497}, nd4j::DataType::FLOAT32); + 8.999501, 8.999499, 8.999499, 8.999499, 8.999501, 8.999501, 8.999501, 8.999499, 8.999500, 8.999501, 8.999501, 8.999501, 8.999495, 8.999495, 8.999497}, sd::DataType::FLOAT32); NDArray expdLdB('c', {1,3,4,5}, {7.2, 7.5, 7.8, 8.1, 8.4, 8.7, 9.0, 9.3, 9.6, 9.9, 10.2, 10.5, 10.8, 11.1, 11.4, 11.7, 12.0, 12.3, 12.6, 12.9, 13.2, 13.5, 13.8, 14.1, 14.4, 14.7, 15.0, 15.3, 15.6, 15.9, 16.2, 16.5, 16.8, 17.1, 17.4, 17.7, 18.0, 18.3, 18.6, 18.9, 19.2, 19.5, 19.8, 
20.1, 20.4, 20.7, 21.0, 21.3, 21.6, 21.9, 22.2, 22.5, - 22.8, 23.1, 23.4, 23.7, 24.0, 24.3, 24.6, 24.9}, nd4j::DataType::FLOAT32); + 22.8, 23.1, 23.4, 23.7, 24.0, 24.3, 24.6, 24.9}, sd::DataType::FLOAT32); input.linspace(1,0.01); gradO.linspace(-0.9, 0.15); @@ -2742,14 +2742,14 @@ TEST_F(DeclarableOpsTests13, batchnorm_bp_test11) { PointersManager manager(input.getContext(), "DeclarableOpsTests13.batchnorm_bp_test9"); std::vector dimensions = {0}; int* dims = reinterpret_cast(manager.replicatePointer(dimensions.data(), dimensions.size() * sizeof(int))); - input.reduceAlongDimension(nd4j::reduce::Mean, mean, dimensions, true); + input.reduceAlongDimension(sd::reduce::Mean, mean, dimensions, true); NDArray::prepareSpecialUse({&variance}, {&input}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimensions); NativeOpExecutioner::execSummaryStats(input.getContext(), 0,input.getBuffer(), input.getShapeInfo(),input.getSpecialBuffer(), input.getSpecialShapeInfo(),nullptr,variance.getBuffer(), variance.getShapeInfo(),variance.getSpecialBuffer(), variance.getSpecialShapeInfo(), dims, dimensions.size(),packX.platformShapeInfo(), packX.platformOffsets(),false); manager.synchronize(); NDArray::registerSpecialUse({&variance}, {&input}); - nd4j::ops::batchnorm_bp op; + sd::ops::batchnorm_bp op; auto results = op.evaluate({&input, &mean, &variance, &gamma, &beta, &gradO}, {1e-5}, {1,1, 1,2,3}); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 3672a4c20..ab57fcd77 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -21,12 +21,12 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include -using namespace nd4j; +using namespace sd; class 
DeclarableOpsTests14 : public testing::Test { @@ -43,7 +43,7 @@ TEST_F(DeclarableOpsTests14, Test_Validation_Edge_1) { auto exp = NDArrayFactory::create('c', {2, 2}, Environment::getInstance()->defaultFloatDataType()); exp.assign(4.0f); - nd4j::ops::fill op; + sd::ops::fill op; auto result = op.evaluate({&x}, {4.0f}); ASSERT_EQ(Status::OK(), result->status()); @@ -61,7 +61,7 @@ TEST_F(DeclarableOpsTests14, Test_Reshape_CF_1) { auto r = x.reshape('c', {3, 2});; r.streamline('f'); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {3, 2}); ASSERT_EQ(Status::OK(), result->status()); @@ -100,7 +100,7 @@ TEST_F(DeclarableOpsTests14, Multiply_test) { y.assign(1.0); e.assign(1.0); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}); auto f = result->at(0); NDArray r = *f; @@ -117,7 +117,7 @@ TEST_F(DeclarableOpsTests14, Test_EvalReductionShape_1) { auto y = NDArrayFactory::create('c', {1}, {1}); auto e = NDArrayFactory::create('c', {2}, {5, 4}); - nd4j::ops::evaluate_reduction_shape op; + sd::ops::evaluate_reduction_shape op; auto result = op.evaluate({&x, &y}, {}, {}, {false, false}); ASSERT_EQ(Status::OK(), result->status()); @@ -132,7 +132,7 @@ TEST_F(DeclarableOpsTests14, Test_EvalReductionShape_2) { auto y = NDArrayFactory::create('c', {1}, {1}); auto e = NDArrayFactory::create('c', {3}, {5, 1, 4}); - nd4j::ops::evaluate_reduction_shape op; + sd::ops::evaluate_reduction_shape op; auto result = op.evaluate({&x, &y}, {}, {}, {true, false}); ASSERT_EQ(Status::OK(), result->status()); @@ -147,7 +147,7 @@ TEST_F(DeclarableOpsTests14, Test_Reduce_Min_Small_0) { auto z = NDArrayFactory::create('c', {4}); auto e = NDArrayFactory::create('c', {4}, {-999.f, 0.2236f, -2.1340f, 0.0962f}); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; op.execute({&x}, {&z}, {}, {0}, {}); //z.printIndexedBuffer("Z"); @@ -160,7 +160,7 @@ TEST_F(DeclarableOpsTests14, Test_Reduce_Min_Small_1) { auto z = NDArrayFactory::create('c', 
{3}); auto e = NDArrayFactory::create('c', {3}, {-999.f, -0.7301f, -2.1340f}); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; op.execute({&x}, {&z}, {}, {1}, {}); //z.printIndexedBuffer("Z"); @@ -173,7 +173,7 @@ TEST_F(DeclarableOpsTests14, Test_Diag_Zeros_1) { auto z = NDArrayFactory::create('c', {2, 2}, {-119, -119, -119, -119}); auto exp = NDArrayFactory::create('c', {2, 2}, {1, 0, 0, 2}); - nd4j::ops::diag op; + sd::ops::diag op; auto status = op.execute({&x}, {&z}, {}, {}, {}); ASSERT_EQ(Status::OK(), status); @@ -187,7 +187,7 @@ TEST_F(DeclarableOpsTests14, Test_scalar_broadcast_1) { e.assign(1.0); - nd4j::ops::add op; + sd::ops::add op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -204,7 +204,7 @@ TEST_F(DeclarableOpsTests14, Test_scalar_broadcast_2) { e.assign(-1.0f); - nd4j::ops::subtract op; + sd::ops::subtract op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -217,7 +217,7 @@ TEST_F(DeclarableOpsTests14, test_empty_fill_1) { auto x = NDArrayFactory::empty(); auto y = NDArrayFactory::create(1); - nd4j::ops::fill op; + sd::ops::fill op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -245,7 +245,7 @@ TEST_F(DeclarableOpsTests14, test_lstmBlockCell_1) { auto z5 = NDArrayFactory::create('c', {1, 3}); auto z6 = NDArrayFactory::create('c', {1, 3}); - nd4j::ops::lstmBlockCell op; + sd::ops::lstmBlockCell op; auto result = op.execute({&a, &b, &c, &d, &e, &f, &g, &h}, {&z0, &z1, &z2, &z3, &z4, &z5, &z6}, {1.0, -1.0}, {0}, {}); ASSERT_EQ(Status::OK(), result); } @@ -254,13 +254,13 @@ TEST_F(DeclarableOpsTests14, test_empty_stack_1) { auto x = NDArrayFactory::create('c', {0}); auto e = NDArrayFactory::create('c', {1, 0}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); ASSERT_EQ(e, *z); - nd4j::ops::reduce_min sumOp; + sd::ops::reduce_min sumOp; 
auto res2 = sumOp.evaluate({&e}, {1.}, {1}); ASSERT_EQ(res2->status(), Status::OK()); auto out = res2->at(0); @@ -274,7 +274,7 @@ TEST_F(DeclarableOpsTests14, test_empty_stack_2) { auto x = NDArrayFactory::empty(); auto e = NDArrayFactory::create('c', {0}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -288,7 +288,7 @@ TEST_F(DeclarableOpsTests14, test_empty_stack_3) { auto x = NDArrayFactory::empty(); auto e = NDArrayFactory::create('c', {2, 0}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&x, &x}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -302,7 +302,7 @@ TEST_F(DeclarableOpsTests14, test_empty_stack_4) { auto x = NDArrayFactory::create('c', {0}); auto e = NDArrayFactory::create('c', {2, 0}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&x, &x}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -315,7 +315,7 @@ TEST_F(DeclarableOpsTests14, test_empty_stack_4) { TEST_F(DeclarableOpsTests14, test_empty_reduce_min_1) { auto e = NDArrayFactory::create('c', {1, 0}); - nd4j::ops::reduce_min sumOp; + sd::ops::reduce_min sumOp; auto res2 = sumOp.evaluate({&e}, {1.}, {1}); ASSERT_EQ(res2->status(), Status::OK()); auto out = res2->at(0); @@ -327,7 +327,7 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_min_1) { TEST_F(DeclarableOpsTests14, test_empty_reduce_max_1) { auto e = NDArrayFactory::create('c', {1, 0}); - nd4j::ops::reduce_max sumOp; + sd::ops::reduce_max sumOp; auto res2 = sumOp.evaluate({&e}, {1.}, {1}); ASSERT_EQ(res2->status(), Status::OK()); auto out = res2->at(0); @@ -343,7 +343,7 @@ TEST_F(DeclarableOpsTests14, test_empty_reduce_sum_1) { #endif auto e = NDArrayFactory::create('c', {1, 0}); - nd4j::ops::reduce_sum sumOp; + sd::ops::reduce_sum sumOp; auto res2 = sumOp.evaluate({&e}, {1.}, {1}); ASSERT_EQ(res2->status(), Status::OK()); auto out = res2->at(0); @@ -358,7 +358,7 @@ 
TEST_F(DeclarableOpsTests14, test_empty_reduce_mean_1) { #endif auto e = NDArrayFactory::create('c', {1, 0}); - nd4j::ops::reduce_mean sumOp; + sd::ops::reduce_mean sumOp; auto res2 = sumOp.evaluate({&e}, {1.}, {1}); ASSERT_EQ(res2->status(), Status::OK()); auto out = res2->at(0); @@ -378,7 +378,7 @@ TEST_F(DeclarableOpsTests14, Test_StridedSliceZeros_1) { matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -399,7 +399,7 @@ TEST_F(DeclarableOpsTests14, Test_StridedSliceZeros_2) { matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -415,8 +415,8 @@ TEST_F(DeclarableOpsTests14, test_empty_argmax_1) { auto y = NDArrayFactory::create(0); auto e = NDArrayFactory::create('c', {0}); - nd4j::ops::argmax op; - //nd4j::ops::reduce_max op; + sd::ops::argmax op; + //sd::ops::reduce_max op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -432,7 +432,7 @@ TEST_F(DeclarableOpsTests14, test_empty_argmax_2) { auto x = NDArrayFactory::create('c', {1, 0}); auto y = NDArrayFactory::create(1); - nd4j::ops::argmax op; + sd::ops::argmax op; try { auto result = op.execute({&x, &y}, {&y}, {}, {}, {}); ASSERT_TRUE(false); @@ -444,7 +444,7 @@ TEST_F(DeclarableOpsTests14, test_empty_argmax_2) { TEST_F(DeclarableOpsTests14, test_empty_tanh_5) { auto x = NDArrayFactory::create('c', {32, 0}); - nd4j::ops::tanh op; + sd::ops::tanh op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -462,7 +462,7 @@ TEST_F(DeclarableOpsTests14, repeat_1) { NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); NDArray e('c', {4, 3}, {1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}); - nd4j::ops::repeat op; + sd::ops::repeat op; auto result = op.evaluate({&x}, {}, {2, 0}); 
ASSERT_EQ(Status::OK(), result->status()); @@ -480,7 +480,7 @@ TEST_F(DeclarableOpsTests14, repeat_2) { NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); NDArray e('c', {2, 6}, {1, 1, 2, 2, 3, 3,4, 4, 5, 5, 6, 6}); - nd4j::ops::repeat op; + sd::ops::repeat op; auto result = op.evaluate({&x}, {}, {2, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -498,7 +498,7 @@ TEST_F(DeclarableOpsTests14, repeat_3) { NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); NDArray e('c', {2, 6}, {1, 2, 2, 3, 3, 3,4, 5, 5, 6, 6, 6}); - nd4j::ops::repeat op; + sd::ops::repeat op; auto result = op.evaluate({&x}, {}, {1,2,3, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -516,7 +516,7 @@ TEST_F(DeclarableOpsTests14, repeat_4) { NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); NDArray e('c', {7, 3}, {1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 4, 5, 6}); - nd4j::ops::repeat op; + sd::ops::repeat op; auto result = op.evaluate({&x}, {}, {3,4, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -534,7 +534,7 @@ TEST_F(DeclarableOpsTests14, repeat_5) { NDArray x('c', {2, 3, 4}, {1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}); NDArray e('c', {2, 4, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 17, 18, 19, 20, 21, 22, 23, 24}); - nd4j::ops::repeat op; + sd::ops::repeat op; auto result = op.evaluate({&x}, {}, {1,2,1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -548,15 +548,15 @@ TEST_F(DeclarableOpsTests14, repeat_5) { ///////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest) { - auto y = NDArray('c', { 3 }, nd4j::DataType::FLOAT32); - auto x = NDArray('c', { 5, 2, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 3 }, sd::DataType::FLOAT32); + auto x = NDArray('c', { 5, 2, 1 }, sd::DataType::FLOAT32); - auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 
10., 11., 11., 11. }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 10., 11., 11., 11. }, sd::DataType::FLOAT32); y.assign(1.0); x.linspace(1.0); - nd4j::ops::add op; + sd::ops::add op; auto result = op.evaluate({ &x, &y }); ASSERT_EQ(Status::OK(), result->status()); @@ -569,15 +569,15 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest) { ///////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest2) { - auto y = NDArray('c', { 1, 3 }, nd4j::DataType::FLOAT32); - auto x = NDArray('c', { 5, 2, 1 }, nd4j::DataType::FLOAT32); + auto y = NDArray('c', { 1, 3 }, sd::DataType::FLOAT32); + auto x = NDArray('c', { 5, 2, 1 }, sd::DataType::FLOAT32); - auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 10., 11., 11., 11. }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 5, 2, 3 }, { 2., 2., 2., 3., 3., 3., 4., 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9., 10., 10., 10., 11., 11., 11. 
}, sd::DataType::FLOAT32); y.assign(1.0); x.linspace(1.0); - nd4j::ops::add op; + sd::ops::add op; auto result = op.evaluate({ &x, &y }); ASSERT_EQ(Status::OK(), result->status()); @@ -591,11 +591,11 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest2) { /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest3) { - auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 3, 5, 1 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 3, 5, 4 }, { 10., 11., 12., 13., 20., 22., 24., 26., 30., 33., 36., 39., 40., 44., 48., 52., 50., 55., 60., 65., 84., 90., 96., 102., 98., 105., 112., 119., 112., 120., 128., 136., 126., 135., 144., 153., 140., 150., 160., 170., 198., 209., 220., 231., 216., 228., 240., 252., 234., 247., 260., 273., 252., 266., 280., 294., 270., 285., 300., 315. }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 3, 5, 4 }, { 10., 11., 12., 13., 20., 22., 24., 26., 30., 33., 36., 39., 40., 44., 48., 52., 50., 55., 60., 65., 84., 90., 96., 102., 98., 105., 112., 119., 112., 120., 128., 136., 126., 135., 144., 153., 140., 150., 160., 170., 198., 209., 220., 231., 216., 228., 240., 252., 234., 247., 260., 273., 252., 266., 280., 294., 270., 285., 300., 315. 
}, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); @@ -607,11 +607,11 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest3) { /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest4) { - auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 2, 3, 5, 1 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 2, 3, 5, 4 }, { 10., 11., 12., 13.,20., 22., 24., 26.,30., 33., 36., 39.,40., 44., 48., 52.,50., 55., 60., 65.,84., 90., 96., 102.,98., 105., 112., 119.,112., 120., 128., 136.,126., 135., 144., 153.,140., 150., 160., 170.,198., 209., 220., 231.,216., 228., 240., 252.,234., 247., 260., 273.,252., 266., 280., 294.,270., 285., 300., 315.,352., 368., 384., 400.,374., 391., 408., 425.,396., 414., 432., 450.,418., 437., 456., 475.,440., 460., 480., 500.,546., 567., 588., 609.,572., 594., 616., 638.,598., 621., 644., 667.,624., 648., 672., 696.,650., 675., 700., 725.,780., 806., 832., 858.,810., 837., 864., 891.,840., 868., 896., 924.,870., 899., 928., 957.,900., 930., 960., 990. 
}, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 2, 3, 5, 4 }, { 10., 11., 12., 13.,20., 22., 24., 26.,30., 33., 36., 39.,40., 44., 48., 52.,50., 55., 60., 65.,84., 90., 96., 102.,98., 105., 112., 119.,112., 120., 128., 136.,126., 135., 144., 153.,140., 150., 160., 170.,198., 209., 220., 231.,216., 228., 240., 252.,234., 247., 260., 273.,252., 266., 280., 294.,270., 285., 300., 315.,352., 368., 384., 400.,374., 391., 408., 425.,396., 414., 432., 450.,418., 437., 456., 475.,440., 460., 480., 500.,546., 567., 588., 609.,572., 594., 616., 638.,598., 621., 644., 667.,624., 648., 672., 696.,650., 675., 700., 725.,780., 806., 832., 858.,810., 837., 864., 891.,840., 868., 896., 924.,870., 899., 928., 957.,900., 930., 960., 990. }, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); z.assign(0.f); @@ -622,11 +622,11 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest4) { /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest5) { - auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 3, 5, 1 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235, 0.611111, 0.578947, 0.550000, 0.523810, 0.666667, 0.631579, 0.600000, 0.571429, 0.722222, 0.684211, 0.650000, 0.619048, 0.777778, 0.736842, 0.700000, 
0.666667, 0.833333, 0.789474, 0.750000, 0.714286 }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235, 0.611111, 0.578947, 0.550000, 0.523810, 0.666667, 0.631579, 0.600000, 0.571429, 0.722222, 0.684211, 0.650000, 0.619048, 0.777778, 0.736842, 0.700000, 0.666667, 0.833333, 0.789474, 0.750000, 0.714286 }, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); z.assign(0.f); @@ -637,11 +637,11 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest5) { /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest6) { - auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 2, 3, 5, 1 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 2, 3, 1, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 2, 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235,0.611111, 0.578947, 0.550000, 0.523810,0.666667, 0.631579, 0.600000, 0.571429,0.722222, 0.684211, 0.650000, 0.619048,0.777778, 0.736842, 0.700000, 0.666667,0.833333, 0.789474, 
0.750000, 0.714286, 0.727273, 0.695652, 0.666667, 0.64, 0.772727, 0.739130, 0.708333, 0.68, 0.818182, 0.782609, 0.750000, 0.72, 0.863636, 0.826087, 0.791667, 0.76, 0.909091, 0.869565, 0.833333, 0.80, 0.807692, 0.777778, 0.750000, 0.724138, 0.846154, 0.814815, 0.785714, 0.758621, 0.884615, 0.851852, 0.821429, 0.793103, 0.923077, 0.888889, 0.857143, 0.827586, 0.961538, 0.925926, 0.892857, 0.862069, 0.866667, 0.838710, 0.812500, 0.787879, 0.900000, 0.870968, 0.843750, 0.818182, 0.933333, 0.903226, 0.875000, 0.848485, 0.966667, 0.935484, 0.906250, 0.878788, 1.000000, 0.967742, 0.937500, 0.909091 }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 2, 3, 5, 4 }, { 0.1, 0.090909, 0.083333, 0.076923,0.2, 0.181818, 0.166667, 0.153846,0.3, 0.272727, 0.250000, 0.230769,0.4, 0.363636, 0.333333, 0.307692,0.5, 0.454545, 0.416667, 0.384615, 0.428571, 0.400000, 0.375000, 0.352941, 0.500000, 0.466667, 0.437500, 0.411765, 0.571429, 0.533333, 0.500000, 0.470588, 0.642857, 0.600000, 0.562500, 0.529412, 0.714286, 0.666667, 0.625000, 0.588235,0.611111, 0.578947, 0.550000, 0.523810,0.666667, 0.631579, 0.600000, 0.571429,0.722222, 0.684211, 0.650000, 0.619048,0.777778, 0.736842, 0.700000, 0.666667,0.833333, 0.789474, 0.750000, 0.714286, 0.727273, 0.695652, 0.666667, 0.64, 0.772727, 0.739130, 0.708333, 0.68, 0.818182, 0.782609, 0.750000, 0.72, 0.863636, 0.826087, 0.791667, 0.76, 0.909091, 0.869565, 0.833333, 0.80, 0.807692, 0.777778, 0.750000, 0.724138, 0.846154, 0.814815, 0.785714, 0.758621, 0.884615, 0.851852, 0.821429, 0.793103, 0.923077, 0.888889, 0.857143, 0.827586, 0.961538, 0.925926, 0.892857, 0.862069, 0.866667, 0.838710, 0.812500, 0.787879, 0.900000, 0.870968, 0.843750, 0.818182, 0.933333, 0.903226, 0.875000, 0.848485, 0.966667, 0.935484, 0.906250, 0.878788, 1.000000, 0.967742, 0.937500, 0.909091 }, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); @@ -654,11 +654,11 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest6) { 
/////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest7) { - auto x = NDArray('c', { 3, 5, 1 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 3, 1, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 3, 5, 1 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 3, 1, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 3, 5, 4 }, { -9., -10., -11., -12.,-8., -9., -10., -11., -7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., -11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8.000000, -9.000000, -10.00,-6.000000, -7.000000, -8.000000, -9.000,-5.000000, -6.000000, -7.000000, -8.000,-4.000000, -5.000000, -6.000000, -7.000,-3.000000, -4.000000, -5.000000, -6.000 }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 3, 5, 4 }, { -9., -10., -11., -12.,-8., -9., -10., -11., -7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., -11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8.000000, -9.000000, -10.00,-6.000000, -7.000000, -8.000000, -9.000,-5.000000, -6.000000, -7.000000, -8.000,-4.000000, -5.000000, -6.000000, -7.000,-3.000000, -4.000000, -5.000000, -6.000 }, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); z.assign(0.f); @@ -669,11 +669,11 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest7) { /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_SpecialCaseTest8) { - auto x = NDArray('c', { 2, 3, 5, 1 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 2, 3, 1, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 2, 3, 5, 1 }, sd::DataType::FLOAT32); + 
auto y = NDArray('c', { 2, 3, 1, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 2, 3, 5, 4 }, { -9.0, -10., -11., -12.,-8., -9., -10., -11.0,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., -11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4., 0., -1., -2., -3. }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 2, 3, 5, 4 }, { -9.0, -10., -11., -12.,-8., -9., -10., -11.0,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-8., -9., -10., -11.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-7., -8., -9., -10.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-6., -7., -8., -9.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-5., -6., -7., -8.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4.,-4., -5., -6., -7.,-3., -4., -5., -6.,-2., -3., -4., -5.,-1., -2., -3., -4., 0., -1., -2., -3. 
}, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); @@ -693,7 +693,7 @@ TEST_F(DeclarableOpsTests14, matmul_test1) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {}); auto z = results->at(0); @@ -715,7 +715,7 @@ TEST_F(DeclarableOpsTests14, matmul_test2) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {}); auto z = results->at(0); @@ -736,7 +736,7 @@ TEST_F(DeclarableOpsTests14, matmul_test3) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {}); auto z = results->at(0); @@ -758,7 +758,7 @@ TEST_F(DeclarableOpsTests14, matmul_test4) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {}); auto z = results->at(0); @@ -780,7 +780,7 @@ TEST_F(DeclarableOpsTests14, matmul_test5) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1}); auto z = results->at(0); @@ -801,7 +801,7 @@ TEST_F(DeclarableOpsTests14, matmul_test6) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 1}); auto z = results->at(0); @@ -824,7 +824,7 @@ TEST_F(DeclarableOpsTests14, matmul_test7) { x.linspace(1.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {0, 1}); auto z = results->at(0); @@ -849,7 +849,7 @@ TEST_F(DeclarableOpsTests14, matmul_test8) { x.linspace(1.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {0, 1}); auto z = results->at(0); @@ -874,7 +874,7 @@ TEST_F(DeclarableOpsTests14, matmul_test9) { x.linspace(1.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = 
op.evaluate({&x, &y}, {}, {1, 1}); auto z = results->at(0); @@ -895,7 +895,7 @@ TEST_F(DeclarableOpsTests14, matmul_test10) { float _expB[]{135.0f, 310.0f, 485.0f, 150.0f, 350.0f, 550.0f, 165.0f, 390.0f, 615.0f}; Nd4jLong _expS[] {2, 3, 3, 1, 3, 0, 1, 102}; // expected shape - ArrayOptions::setDataType(_expS, nd4j::DataType::FLOAT32); + ArrayOptions::setDataType(_expS, sd::DataType::FLOAT32); NDArray exp(_expB, _expS); auto variableSpace = new VariableSpace(); @@ -906,7 +906,7 @@ TEST_F(DeclarableOpsTests14, matmul_test10) { auto block = new Context(1, variableSpace, false); block->fillInputs({-1, -2}); - nd4j::ops::matmul op; + sd::ops::matmul op; Nd4jStatus status = op.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -928,7 +928,7 @@ TEST_F(DeclarableOpsTests14, matmul_test11) { A.linspace(1); B.linspace(1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&A, &B}, {}, {}); @@ -946,7 +946,7 @@ TEST_F(DeclarableOpsTests14, matmul_test12) { auto y= NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8 , 9, 10, 11, 12}); auto exp= NDArrayFactory::create('f', {4, 4}, {38.0, 44.0, 50.0, 56.0, 83.0, 98.0, 113.0, 128.0, 128.0, 152.0, 176.0, 200.0, 173.0, 206.0, 239.0, 272.0}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -965,7 +965,7 @@ TEST_F(DeclarableOpsTests14, matmul_test13) { auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {1, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -984,7 +984,7 @@ TEST_F(DeclarableOpsTests14, matmul_test14) { auto y= NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - nd4j::ops::matmul op; 
+ sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1003,7 +1003,7 @@ TEST_F(DeclarableOpsTests14, matmul_test15) { auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); auto exp= NDArrayFactory::create('f', {3, 4}, {1.0, 2.0, 3.0, 2.0, 4.0, 6.0, 3.0, 6.0, 9.0, 4.0, 8.0, 12.0}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1022,7 +1022,7 @@ TEST_F(DeclarableOpsTests14, matmul_test16) { auto y= NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); auto exp= NDArrayFactory::create('f', {4, 4}, {1,2, 3, 4,2,4, 6, 8,3,6, 9,12,4,8,12,16}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1041,7 +1041,7 @@ TEST_F(DeclarableOpsTests14, matmul_test17) { auto y = NDArrayFactory::create('c', {2, 1}, {2.0f, 2.0f}); auto exp = NDArrayFactory::create('c', {1, 1}, {8.0f}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -1061,7 +1061,7 @@ TEST_F(DeclarableOpsTests14, matmul_test18) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 1}); auto z = results->at(0); @@ -1081,7 +1081,7 @@ TEST_F(DeclarableOpsTests14, matmul_test19) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 1}); ASSERT_EQ(Status::OK(), results->status()); @@ -1102,7 +1102,7 @@ TEST_F(DeclarableOpsTests14, matmul_test20) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 1}); ASSERT_EQ(Status::OK(), results->status()); @@ -1123,7 +1123,7 @@ TEST_F(DeclarableOpsTests14, matmul_test21) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul 
op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {0, 0, 1}); auto z = results->at(0); @@ -1144,7 +1144,7 @@ TEST_F(DeclarableOpsTests14, matmul_test22) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); auto z = results->at(0); @@ -1165,7 +1165,7 @@ TEST_F(DeclarableOpsTests14, matmul_test23) { x.linspace(1.); y.linspace(0.5, 0.5); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 0, 1}); auto z = results->at(0); @@ -1189,7 +1189,7 @@ TEST_F(DeclarableOpsTests14, matmul_test24) { x.linspace(1.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 1, 1}); auto z = results->at(0); @@ -1210,7 +1210,7 @@ TEST_F(DeclarableOpsTests14, matmul_test25) { x.linspace(1.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 0}); auto z = results->at(0); @@ -1231,7 +1231,7 @@ TEST_F(DeclarableOpsTests14, matmul_test26) { x.linspace(1.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {0, 1}); auto z = results->at(0); @@ -1252,7 +1252,7 @@ TEST_F(DeclarableOpsTests14, matmul_test27) { x.linspace(2.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {}); auto z = results->at(0); @@ -1274,7 +1274,7 @@ TEST_F(DeclarableOpsTests14, matmul_test28) { x.linspace(2.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1,1,1}); auto z = results->at(0); @@ -1296,7 +1296,7 @@ TEST_F(DeclarableOpsTests14, matmul_test29) { x.linspace(2.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {}); auto z = results->at(0); @@ -1316,7 +1316,7 @@ TEST_F(DeclarableOpsTests14, matmul_test30) { 
x.linspace(2.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1}); auto z = results->at(0); @@ -1336,7 +1336,7 @@ TEST_F(DeclarableOpsTests14, matmul_test31) { x.linspace(1.); y.linspace(0.1, 0.1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 1}); auto z = results->at(0); @@ -1353,7 +1353,7 @@ TEST_F(DeclarableOpsTests14, matmul_test32) { auto y = NDArrayFactory::create('c', {1}, {3.}); auto exp = NDArrayFactory::create(6.); - nd4j::ops::matmul op; + sd::ops::matmul op; auto results = op.evaluate({&x, &y}, {}, {1, 1}); auto z = results->at(0); @@ -1372,7 +1372,7 @@ TEST_F(DeclarableOpsTests14, matmul_test33) { x.linspace(1); y.linspace(1); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {1, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1389,7 +1389,7 @@ TEST_F(DeclarableOpsTests14, matmul_test34) { auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); auto exp = NDArrayFactory::create('c', {3}, {30, 70, 110}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&a, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1406,7 +1406,7 @@ TEST_F(DeclarableOpsTests14, matmul_test35) { auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto exp = NDArrayFactory::create('c', {3}, {70, 80, 90}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&a, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1423,7 +1423,7 @@ TEST_F(DeclarableOpsTests14, matmul_test36) { auto b = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto exp = NDArrayFactory::create('c', {1, 3}, {70, 80, 90}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&a, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1437,16 +1437,16 @@ TEST_F(DeclarableOpsTests14, matmul_test36) { 
////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, matmul_test37) { - NDArray a('c', {32, 12, 128, 64}, nd4j::DataType::FLOAT32); - NDArray b('c', {32, 12, 128, 64}, nd4j::DataType::FLOAT32); - NDArray c('c', {32,12,128,128}, nd4j::DataType::FLOAT32); - NDArray cExp('c', {32,12,128,128}, nd4j::DataType::FLOAT32); + NDArray a('c', {32, 12, 128, 64}, sd::DataType::FLOAT32); + NDArray b('c', {32, 12, 128, 64}, sd::DataType::FLOAT32); + NDArray c('c', {32,12,128,128}, sd::DataType::FLOAT32); + NDArray cExp('c', {32,12,128,128}, sd::DataType::FLOAT32); a = 1; b = 1; cExp = 64; //Each entry in output c is sum of 64 (1.0 x 1.0) multiplications - nd4j::ops::matmul op; + sd::ops::matmul op; auto status = op.execute({&a, &b}, {&c}, {}, {0,1}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1459,30 +1459,30 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_3D_1) { // x[4, 12, 128] * y[4, 128] = z[4, 12, 128] - auto x = NDArray('c', { 2, 3, 5 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 2, 5 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 2, 3, 5 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 2, 5 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 2, 3, 5 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 60.000000, 77.000000, 96.000000, 117.000000, 140.000000, 110.000000, 132.000000, 156.000000, 182.000000, 210.000000, 240.000000, 272.000000, 306.000000, 342.000000, 380.000000, 315.000000, 352.000000, 391.000000, 432.000000, 475.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000 }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 2, 3, 5 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 60.000000, 77.000000, 96.000000, 117.000000, 140.000000, 110.000000, 132.000000, 156.000000, 182.000000, 210.000000, 240.000000, 272.000000, 
306.000000, 342.000000, 380.000000, 315.000000, 352.000000, 391.000000, 432.000000, 475.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000 }, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); z.assign(0.f); - x.applyBroadcast(nd4j::broadcast::Multiply, { 0,2 }, y, z); + x.applyBroadcast(sd::broadcast::Multiply, { 0,2 }, y, z); //z.printBuffer(); ASSERT_EQ(e, z); } /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_3D_2) { - auto x = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); - auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); - auto z = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + auto x = NDArray('f', { 2, 3, 5 }, sd::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, sd::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto eC = NDArray('c', { 2, 3, 5 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.600000, 0.636364, 0.666667, 0.692308, 0.714286, 1.100000, 1.090909, 1.083333, 1.076923, 1.071429, 1.066667, 1.062500, 1.058824, 1.055556, 1.052632, 1.400000, 1.375000, 1.352941, 1.333333, 1.315789, 1.733333, 1.687500, 1.647059, 1.611111, 1.578947 }, nd4j::DataType::FLOAT32); + auto eC = NDArray('c', { 2, 3, 5 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.600000, 0.636364, 0.666667, 0.692308, 0.714286, 1.100000, 1.090909, 1.083333, 1.076923, 1.071429, 1.066667, 1.062500, 1.058824, 1.055556, 1.052632, 1.400000, 1.375000, 1.352941, 1.333333, 1.315789, 1.733333, 1.687500, 1.647059, 1.611111, 1.578947 }, sd::DataType::FLOAT32); - auto e = NDArray('f', { 2, 3, 5 }, nd4j::DataType::FLOAT32); + auto e = NDArray('f', { 2, 3, 5 }, sd::DataType::FLOAT32); e.assign(eC); @@ -1490,37 +1490,37 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_3D_2) { y.linspace(10.f); z.assign(0.f); - x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + x.applyBroadcast(sd::broadcast::Divide, { 
0,2 }, y, z); ASSERT_EQ(e, z); } /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_4D_1) { - auto x = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 2, 5, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 2, 5, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 2, 3, 5, 4 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 210.000000, 242.000000, 276.000000, 312.000000, 350.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000, 620.000000, 672.000000, 726.000000, 782.000000, 840.000000, 900.000000, 962.000000, 1026.000000, 1092.000000, 1160.000000, 410.000000, 462.000000, 516.000000, 572.000000, 630.000000, 690.000000, 752.000000, 816.000000, 882.000000, 950.000000, 1020.000000, 1092.000000, 1166.000000, 1242.000000, 1320.000000, 1400.000000, 1482.000000, 1566.000000, 1652.000000, 1740.000000, 1830.000000, 1922.000000, 2016.000000, 2112.000000, 2210.000000, 2310.000000, 2412.000000, 2516.000000, 2622.000000, 2730.000000, 2840.000000, 2952.000000, 3066.000000, 3182.000000, 3300.000000, 3420.000000, 3542.000000, 3666.000000, 3792.000000, 3920.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 4900.000000, 3030.000000, 3162.000000, 3296.000000, 3432.000000, 3570.000000, 3710.000000, 3852.000000, 3996.000000, 4142.000000, 4290.000000, 
4440.000000, 4592.000000, 4746.000000, 4902.000000, 5060.000000, 5220.000000, 5382.000000, 5546.000000, 5712.000000, 5880.000000 }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 2, 3, 5, 4 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 210.000000, 242.000000, 276.000000, 312.000000, 350.000000, 390.000000, 432.000000, 476.000000, 522.000000, 570.000000, 620.000000, 672.000000, 726.000000, 782.000000, 840.000000, 900.000000, 962.000000, 1026.000000, 1092.000000, 1160.000000, 410.000000, 462.000000, 516.000000, 572.000000, 630.000000, 690.000000, 752.000000, 816.000000, 882.000000, 950.000000, 1020.000000, 1092.000000, 1166.000000, 1242.000000, 1320.000000, 1400.000000, 1482.000000, 1566.000000, 1652.000000, 1740.000000, 1830.000000, 1922.000000, 2016.000000, 2112.000000, 2210.000000, 2310.000000, 2412.000000, 2516.000000, 2622.000000, 2730.000000, 2840.000000, 2952.000000, 3066.000000, 3182.000000, 3300.000000, 3420.000000, 3542.000000, 3666.000000, 3792.000000, 3920.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 4900.000000, 3030.000000, 3162.000000, 3296.000000, 3432.000000, 3570.000000, 3710.000000, 3852.000000, 3996.000000, 4142.000000, 4290.000000, 4440.000000, 4592.000000, 4746.000000, 4902.000000, 5060.000000, 5220.000000, 5382.000000, 5546.000000, 5712.000000, 5880.000000 }, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); z.assign(0.f); - x.applyBroadcast(nd4j::broadcast::Multiply, { 0,2,3 }, y, z); + x.applyBroadcast(sd::broadcast::Multiply, { 0,2,3 }, y, z); ASSERT_EQ(e, z); } /////////////////////////////////////////////////////////////////////// 
TEST_F(DeclarableOpsTests14, Test_broadcast_4D_2) { - auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); - auto y = NDArray('f', { 2, 5, 4 }, nd4j::DataType::FLOAT32); - auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5, 4 }, sd::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000,0.181818,0.250000,0.307692,0.357143,0.400000,0.437500,0.470588,0.500000,0.526316,0.550000,0.571429, 0.590909,0.608696,0.625000,0.640000, 0.653846,0.666667,0.678571,0.689655, 2.100000,2.000000,1.916667, 1.846154, 1.785714, 1.733333,1.687500, 1.647059,1.611111, 1.578947,1.550000, 1.523810,1.500000, 1.478261,1.458333, 1.440000,1.423077, 1.407407,1.392857, 1.379310,4.100000, 3.818182,3.583333, 3.384615, 3.214286, 3.066667,2.937500, 2.823529,2.722222, 2.631579,2.550000, 2.476191,2.409091, 2.347826,2.291667, 2.240000,2.192308, 2.148148,2.107143, 2.068965,2.033333, 2.000000,1.968750, 1.939394,1.911765, 1.885714,1.861111, 1.837838,1.815789, 1.794872,1.775000, 1.756098,1.738095, 1.720930,1.704545, 1.688889,1.673913, 1.659575,1.645833,1.632653,2.700000,2.645161,2.593750,2.545455,2.500000,2.457143,2.416667,2.378378,2.342105,2.307692,2.275000,2.243902,2.214286,2.186047,2.159091,2.133333,2.108696,2.085106,2.062500,2.040816,3.366667,3.290323,3.218750,3.151515,3.088235,3.028571,2.972222,2.918919,2.868421,2.820513,2.775000,2.731707,2.690476,2.651163,2.613636,2.577778,2.543478,2.510638,2.479167,2.448980 }, nd4j::DataType::FLOAT32); + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000,0.181818,0.250000,0.307692,0.357143,0.400000,0.437500,0.470588,0.500000,0.526316,0.550000,0.571429, 0.590909,0.608696,0.625000,0.640000, 0.653846,0.666667,0.678571,0.689655, 2.100000,2.000000,1.916667, 1.846154, 1.785714, 1.733333,1.687500, 1.647059,1.611111, 1.578947,1.550000, 
1.523810,1.500000, 1.478261,1.458333, 1.440000,1.423077, 1.407407,1.392857, 1.379310,4.100000, 3.818182,3.583333, 3.384615, 3.214286, 3.066667,2.937500, 2.823529,2.722222, 2.631579,2.550000, 2.476191,2.409091, 2.347826,2.291667, 2.240000,2.192308, 2.148148,2.107143, 2.068965,2.033333, 2.000000,1.968750, 1.939394,1.911765, 1.885714,1.861111, 1.837838,1.815789, 1.794872,1.775000, 1.756098,1.738095, 1.720930,1.704545, 1.688889,1.673913, 1.659575,1.645833,1.632653,2.700000,2.645161,2.593750,2.545455,2.500000,2.457143,2.416667,2.378378,2.342105,2.307692,2.275000,2.243902,2.214286,2.186047,2.159091,2.133333,2.108696,2.085106,2.062500,2.040816,3.366667,3.290323,3.218750,3.151515,3.088235,3.028571,2.972222,2.918919,2.868421,2.820513,2.775000,2.731707,2.690476,2.651163,2.613636,2.577778,2.543478,2.510638,2.479167,2.448980 }, sd::DataType::FLOAT32); - auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto e = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); e.assign(eC); @@ -1528,20 +1528,20 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_4D_2) { y.linspace(10.f); z.assign(0.f); - x.applyBroadcast(nd4j::broadcast::Divide, { 0,2,3 }, y, z); + x.applyBroadcast(sd::broadcast::Divide, { 0,2,3 }, y, z); ASSERT_EQ(e, z); } /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_4D_3) { - auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); - auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); - auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, sd::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 
1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 5.294117, 5.352941, 5.411765, 5.166667, 5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, nd4j::DataType::FLOAT32); + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 
5.294117, 5.352941, 5.411765, 5.166667, 5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, sd::DataType::FLOAT32); - auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto e = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); e.assign(eC); @@ -1549,7 +1549,7 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_4D_3) { y.linspace(10.f); z.assign(0.f); - x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + x.applyBroadcast(sd::broadcast::Divide, { 0,2 }, y, z); ASSERT_EQ(e, z); } @@ -1558,13 +1558,13 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_4D_4) { // x[4, 12, 128, 128] * y[4, 1, 128, 1] = z[4, 12, 128, 128] - auto x = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); - auto y = NDArray('f', { 2, 1, 5, 1 }, nd4j::DataType::FLOAT32); - auto z = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto x = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); + auto y = NDArray('f', { 2, 1, 5, 1 }, sd::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 
4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 5.294117, 5.352941, 5.411765, 5.166667, 5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, nd4j::DataType::FLOAT32); + auto eC = NDArray('c', { 2, 3, 5, 4 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.454545, 0.545455, 0.636364, 0.727273, 0.750000, 0.833333, 0.916667, 1.000000, 1.000000, 1.076923, 1.153846, 1.230769, 1.214286, 1.285714, 1.357143, 1.428571, 2.100000, 2.200000, 2.300000, 2.400000, 2.272727, 2.363636, 2.454545, 2.545455, 2.416667, 2.500000, 2.583333, 2.666667, 2.538461, 2.615385, 2.692308, 2.769231, 2.642857, 2.714286, 2.785714, 2.857143, 4.100000, 4.200000, 4.300000, 4.400000, 4.090909, 4.181818, 4.272727, 4.363636, 4.083333, 4.166667, 4.250000, 4.333333, 4.076923, 4.153846, 4.230769, 4.307693, 4.071429, 4.142857, 4.214286, 4.285714, 4.066667, 4.133333, 4.200000, 4.266667, 4.062500, 4.125000, 4.187500, 4.250000, 4.058824, 4.117647, 4.176471, 4.235294, 4.055555, 4.111111, 4.166667, 4.222222, 4.052631, 4.105263, 4.157895, 4.210526, 5.400000, 5.466667, 5.533333, 5.600000, 5.312500, 5.375000, 5.437500, 5.500000, 5.235294, 5.294117, 5.352941, 5.411765, 5.166667, 5.222222, 5.277778, 5.333333, 5.105263, 5.157895, 5.210526, 5.263158, 6.733333, 6.800000, 6.866667, 6.933333, 6.562500, 6.625000, 6.687500, 6.750000, 6.411765, 6.470588, 6.529412, 6.588235, 6.277778, 6.333333, 6.388889, 6.444445, 6.157895, 6.210526, 6.263158, 6.315790 }, sd::DataType::FLOAT32); - auto e = NDArray('f', { 2, 3, 5, 4 }, nd4j::DataType::FLOAT32); + auto e = NDArray('f', { 2, 3, 5, 4 }, sd::DataType::FLOAT32); e.assign(eC); x.linspace(1.f); @@ -1578,11 
+1578,11 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_4D_4) { /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_5D_1) { // x[4, 12, 128, 128, 128] * y[4, 1, 128, 128, 128] = z[4, 12, 128, 128, 128] - auto x = NDArray('c', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); - auto y = NDArray('c', { 2, 1, 5, 4, 3 }, nd4j::DataType::FLOAT32); - auto z = NDArray('c', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto x = NDArray('c', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); + auto y = NDArray('c', { 2, 1, 5, 4, 3 }, sd::DataType::FLOAT32); + auto z = NDArray('c', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto e = NDArray('c', { 2, 3, 5, 4, 3 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 630.000000, 682.000000, 736.000000, 792.000000, 850.000000, 910.000000, 972.000000, 1036.000000, 1102.000000, 1170.000000, 1240.000000, 1312.000000, 1386.000000, 1462.000000, 1540.000000, 1620.000000, 1702.000000, 1786.000000, 1872.000000, 1960.000000, 2050.000000, 2142.000000, 2236.000000, 2332.000000, 2430.000000, 2530.000000, 2632.000000, 2736.000000, 2842.000000, 2950.000000, 3060.000000, 3172.000000, 3286.000000, 3402.000000, 3520.000000, 3640.000000, 3762.000000, 3886.000000, 4012.000000, 4140.000000, 610.000000, 682.000000, 756.000000, 832.000000, 910.000000, 990.000000, 1072.000000, 1156.000000, 1242.000000, 1330.000000, 1420.000000, 1512.000000, 1606.000000, 1702.000000, 1800.000000, 1900.000000, 2002.000000, 2106.000000, 2212.000000, 2320.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 
4900.000000, 5050.000000, 5202.000000, 5356.000000, 5512.000000, 5670.000000, 5830.000000, 5992.000000, 6156.000000, 6322.000000, 6490.000000, 6660.000000, 6832.000000, 7006.000000, 7182.000000, 7360.000000, 7540.000000, 7722.000000, 7906.000000, 8092.000000, 8280.000000, 1210.000000, 1342.000000, 1476.000000, 1612.000000, 1750.000000, 1890.000000, 2032.000000, 2176.000000, 2322.000000, 2470.000000, 2620.000000, 2772.000000, 2926.000000, 3082.000000, 3240.000000, 3400.000000, 3562.000000, 3726.000000, 3892.000000, 4060.000000, 4230.000000, 4402.000000, 4576.000000, 4752.000000, 4930.000000, 5110.000000, 5292.000000, 5476.000000, 5662.000000, 5850.000000, 6040.000000, 6232.000000, 6426.000000, 6622.000000, 6820.000000, 7020.000000, 7222.000000, 7426.000000, 7632.000000, 7840.000000, 8050.000000, 8262.000000, 8476.000000, 8692.000000, 8910.000000, 9130.000000, 9352.000000, 9576.000000, 9802.000000, 10030.000000, 10260.000000, 10492.000000, 10726.000000, 10962.000000, 11200.000000, 11440.000000, 11682.000000, 11926.000000, 12172.000000, 12420.000000, 12670.000000, 12922.000000, 13176.000000, 13432.000000, 13690.000000, 13950.000000, 14212.000000, 14476.000000, 14742.000000, 15010.000000, 15280.000000, 15552.000000, 15826.000000, 16102.000000, 16380.000000, 16660.000000, 16942.000000, 17226.000000, 17512.000000, 17800.000000, 18090.000000, 18382.000000, 18676.000000, 18972.000000, 19270.000000, 19570.000000, 19872.000000, 20176.000000, 20482.000000, 20790.000000, 21100.000000, 21412.000000, 21726.000000, 22042.000000, 22360.000000, 22680.000000, 23002.000000, 23326.000000, 23652.000000, 23980.000000, 24310.000000, 24642.000000, 24976.000000, 25312.000000, 25650.000000, 25990.000000, 26332.000000, 26676.000000, 27022.000000, 27370.000000, 27720.000000, 28072.000000, 28426.000000, 28782.000000, 29140.000000, 29500.000000, 29862.000000, 30226.000000, 30592.000000, 30960.000000, 16870.000000, 17182.000000, 17496.000000, 17812.000000, 18130.000000, 18450.000000, 
18772.000000, 19096.000000, 19422.000000, 19750.000000, 20080.000000, 20412.000000, 20746.000000, 21082.000000, 21420.000000, 21760.000000, 22102.000000, 22446.000000, 22792.000000, 23140.000000, 23490.000000, 23842.000000, 24196.000000, 24552.000000, 24910.000000, 25270.000000, 25632.000000, 25996.000000, 26362.000000, 26730.000000, 27100.000000, 27472.000000, 27846.000000, 28222.000000, 28600.000000, 28980.000000, 29362.000000, 29746.000000, 30132.000000, 30520.000000, 30910.000000, 31302.000000, 31696.000000, 32092.000000, 32490.000000, 32890.000000, 33292.000000, 33696.000000, 34102.000000, 34510.000000, 34920.000000, 35332.000000, 35746.000000, 36162.000000, 36580.000000, 37000.000000, 37422.000000, 37846.000000, 38272.000000, 38700.000000, 21070.000000, 21442.000000, 21816.000000, 22192.000000, 22570.000000, 22950.000000, 23332.000000, 23716.000000, 24102.000000, 24490.000000, 24880.000000, 25272.000000, 25666.000000, 26062.000000, 26460.000000, 26860.000000, 27262.000000, 27666.000000, 28072.000000, 28480.000000, 28890.000000, 29302.000000, 29716.000000, 30132.000000, 30550.000000, 30970.000000, 31392.000000, 31816.000000, 32242.000000, 32670.000000, 33100.000000, 33532.000000, 33966.000000, 34402.000000, 34840.000000, 35280.000000, 35722.000000, 36166.000000, 36612.000000, 37060.000000, 37510.000000, 37962.000000, 38416.000000, 38872.000000, 39330.000000, 39790.000000, 40252.000000, 40716.000000, 41182.000000, 41650.000000, 42120.000000, 42592.000000, 43066.000000, 43542.000000, 44020.000000, 44500.000000, 44982.000000, 45466.000000, 45952.000000, 46440.000000 }, nd4j::DataType::FLOAT32); + auto e = NDArray('c', { 2, 3, 5, 4, 3 }, { 10.000000, 22.000000, 36.000000, 52.000000, 70.000000, 90.000000, 112.000000, 136.000000, 162.000000, 190.000000, 220.000000, 252.000000, 286.000000, 322.000000, 360.000000, 400.000000, 442.000000, 486.000000, 532.000000, 580.000000, 630.000000, 682.000000, 736.000000, 792.000000, 850.000000, 910.000000, 972.000000, 1036.000000, 
1102.000000, 1170.000000, 1240.000000, 1312.000000, 1386.000000, 1462.000000, 1540.000000, 1620.000000, 1702.000000, 1786.000000, 1872.000000, 1960.000000, 2050.000000, 2142.000000, 2236.000000, 2332.000000, 2430.000000, 2530.000000, 2632.000000, 2736.000000, 2842.000000, 2950.000000, 3060.000000, 3172.000000, 3286.000000, 3402.000000, 3520.000000, 3640.000000, 3762.000000, 3886.000000, 4012.000000, 4140.000000, 610.000000, 682.000000, 756.000000, 832.000000, 910.000000, 990.000000, 1072.000000, 1156.000000, 1242.000000, 1330.000000, 1420.000000, 1512.000000, 1606.000000, 1702.000000, 1800.000000, 1900.000000, 2002.000000, 2106.000000, 2212.000000, 2320.000000, 2430.000000, 2542.000000, 2656.000000, 2772.000000, 2890.000000, 3010.000000, 3132.000000, 3256.000000, 3382.000000, 3510.000000, 3640.000000, 3772.000000, 3906.000000, 4042.000000, 4180.000000, 4320.000000, 4462.000000, 4606.000000, 4752.000000, 4900.000000, 5050.000000, 5202.000000, 5356.000000, 5512.000000, 5670.000000, 5830.000000, 5992.000000, 6156.000000, 6322.000000, 6490.000000, 6660.000000, 6832.000000, 7006.000000, 7182.000000, 7360.000000, 7540.000000, 7722.000000, 7906.000000, 8092.000000, 8280.000000, 1210.000000, 1342.000000, 1476.000000, 1612.000000, 1750.000000, 1890.000000, 2032.000000, 2176.000000, 2322.000000, 2470.000000, 2620.000000, 2772.000000, 2926.000000, 3082.000000, 3240.000000, 3400.000000, 3562.000000, 3726.000000, 3892.000000, 4060.000000, 4230.000000, 4402.000000, 4576.000000, 4752.000000, 4930.000000, 5110.000000, 5292.000000, 5476.000000, 5662.000000, 5850.000000, 6040.000000, 6232.000000, 6426.000000, 6622.000000, 6820.000000, 7020.000000, 7222.000000, 7426.000000, 7632.000000, 7840.000000, 8050.000000, 8262.000000, 8476.000000, 8692.000000, 8910.000000, 9130.000000, 9352.000000, 9576.000000, 9802.000000, 10030.000000, 10260.000000, 10492.000000, 10726.000000, 10962.000000, 11200.000000, 11440.000000, 11682.000000, 11926.000000, 12172.000000, 12420.000000, 12670.000000, 
12922.000000, 13176.000000, 13432.000000, 13690.000000, 13950.000000, 14212.000000, 14476.000000, 14742.000000, 15010.000000, 15280.000000, 15552.000000, 15826.000000, 16102.000000, 16380.000000, 16660.000000, 16942.000000, 17226.000000, 17512.000000, 17800.000000, 18090.000000, 18382.000000, 18676.000000, 18972.000000, 19270.000000, 19570.000000, 19872.000000, 20176.000000, 20482.000000, 20790.000000, 21100.000000, 21412.000000, 21726.000000, 22042.000000, 22360.000000, 22680.000000, 23002.000000, 23326.000000, 23652.000000, 23980.000000, 24310.000000, 24642.000000, 24976.000000, 25312.000000, 25650.000000, 25990.000000, 26332.000000, 26676.000000, 27022.000000, 27370.000000, 27720.000000, 28072.000000, 28426.000000, 28782.000000, 29140.000000, 29500.000000, 29862.000000, 30226.000000, 30592.000000, 30960.000000, 16870.000000, 17182.000000, 17496.000000, 17812.000000, 18130.000000, 18450.000000, 18772.000000, 19096.000000, 19422.000000, 19750.000000, 20080.000000, 20412.000000, 20746.000000, 21082.000000, 21420.000000, 21760.000000, 22102.000000, 22446.000000, 22792.000000, 23140.000000, 23490.000000, 23842.000000, 24196.000000, 24552.000000, 24910.000000, 25270.000000, 25632.000000, 25996.000000, 26362.000000, 26730.000000, 27100.000000, 27472.000000, 27846.000000, 28222.000000, 28600.000000, 28980.000000, 29362.000000, 29746.000000, 30132.000000, 30520.000000, 30910.000000, 31302.000000, 31696.000000, 32092.000000, 32490.000000, 32890.000000, 33292.000000, 33696.000000, 34102.000000, 34510.000000, 34920.000000, 35332.000000, 35746.000000, 36162.000000, 36580.000000, 37000.000000, 37422.000000, 37846.000000, 38272.000000, 38700.000000, 21070.000000, 21442.000000, 21816.000000, 22192.000000, 22570.000000, 22950.000000, 23332.000000, 23716.000000, 24102.000000, 24490.000000, 24880.000000, 25272.000000, 25666.000000, 26062.000000, 26460.000000, 26860.000000, 27262.000000, 27666.000000, 28072.000000, 28480.000000, 28890.000000, 29302.000000, 29716.000000, 
30132.000000, 30550.000000, 30970.000000, 31392.000000, 31816.000000, 32242.000000, 32670.000000, 33100.000000, 33532.000000, 33966.000000, 34402.000000, 34840.000000, 35280.000000, 35722.000000, 36166.000000, 36612.000000, 37060.000000, 37510.000000, 37962.000000, 38416.000000, 38872.000000, 39330.000000, 39790.000000, 40252.000000, 40716.000000, 41182.000000, 41650.000000, 42120.000000, 42592.000000, 43066.000000, 43542.000000, 44020.000000, 44500.000000, 44982.000000, 45466.000000, 45952.000000, 46440.000000 }, sd::DataType::FLOAT32); x.linspace(1.f); y.linspace(10.f); @@ -1595,13 +1595,13 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_5D_1) { /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_5D_2) { - auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); - auto y = NDArray('f', { 2, 5, 4, 3 }, nd4j::DataType::FLOAT32); - auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5, 4, 3 }, sd::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.400000, 0.437500, 0.470588, 0.500000, 0.526316, 0.550000, 0.571429, 0.590909, 0.608696, 0.625000, 0.640000, 0.653846, 0.666667, 0.678571, 0.689655, 0.700000, 0.709677, 0.718750, 0.727273, 0.735294, 0.742857, 0.750000, 0.756757, 0.763158, 0.769231, 0.775000, 0.780488, 0.785714, 0.790698, 0.795455, 0.800000, 0.804348, 0.808511, 0.812500, 0.816327, 0.820000, 0.823529, 0.826923, 0.830189, 0.833333, 0.836364, 0.839286, 0.842105, 0.844828, 0.847458, 0.850000, 0.852459, 0.854839, 0.857143, 0.859375, 0.861538, 0.863636, 0.865672, 0.867647, 0.869565, 6.100000, 5.636364, 5.250000, 4.923077, 4.642857, 4.400000, 4.187500, 4.000000, 3.833333, 3.684211, 3.550000, 3.428571, 3.318182, 3.217391, 
3.125000, 3.040000, 2.961539, 2.888889, 2.821429, 2.758621, 2.700000, 2.645161, 2.593750, 2.545455, 2.500000, 2.457143, 2.416667, 2.378378, 2.342105, 2.307692, 2.275000, 2.243902, 2.214286, 2.186047, 2.159091, 2.133333, 2.108696, 2.085106, 2.062500, 2.040816, 2.020000, 2.000000, 1.980769, 1.962264, 1.944444, 1.927273, 1.910714, 1.894737, 1.879310, 1.864407, 1.850000, 1.836066, 1.822581, 1.809524, 1.796875, 1.784615, 1.772727, 1.761194, 1.750000, 1.739130, 12.100000, 11.090909, 10.250000, 9.538462, 8.928572, 8.400000, 7.937500, 7.529412, 7.166667, 6.842105, 6.550000, 6.285714, 6.045455, 5.826087, 5.625000, 5.440000, 5.269231, 5.111111, 4.964286, 4.827586, 4.700000, 4.580645, 4.468750, 4.363636, 4.264706, 4.171429, 4.083333, 4.000000, 3.921053, 3.846154, 3.775000, 3.707317, 3.642857, 3.581395, 3.522727, 3.466667, 3.413043, 3.361702, 3.312500, 3.265306, 3.220000, 3.176471, 3.134615, 3.094340, 3.055556, 3.018182, 2.982143, 2.947368, 2.913793, 2.881356, 2.850000, 2.819672, 2.790323, 2.761905, 2.734375, 2.707692, 2.681818, 2.656716, 2.632353, 2.608696, 2.585714, 2.563380, 2.541667, 2.520548, 2.500000, 2.480000, 2.460526, 2.441558, 2.423077, 2.405063, 2.387500, 2.370370, 2.353658, 2.337349, 2.321429, 2.305882, 2.290698, 2.275862, 2.261364, 2.247191, 2.233333, 2.219780, 2.206522, 2.193548, 2.180851, 2.168421, 2.156250, 2.144330, 2.132653, 2.121212, 2.110000, 2.099010, 2.088235, 2.077670, 2.067308, 2.057143, 2.047170, 2.037383, 2.027778, 2.018349, 2.009091, 2.000000, 1.991071, 1.982301, 1.973684, 1.965217, 1.956897, 1.948718, 1.940678, 1.932773, 1.925000, 1.917355, 1.909836, 1.902439, 1.895161, 1.888000, 1.880952, 1.874016, 1.867188, 1.860465, 3.442857, 3.408451, 3.375000, 3.342466, 3.310811, 3.280000, 3.250000, 3.220779, 3.192308, 3.164557, 3.137500, 3.111111, 3.085366, 3.060241, 3.035714, 3.011765, 2.988372, 2.965517, 2.943182, 2.921348, 2.900000, 2.879121, 2.858696, 2.838710, 2.819149, 2.800000, 2.781250, 2.762887, 2.744898, 2.727273, 2.710000, 2.693069, 2.676471, 
2.660194, 2.644231, 2.628572, 2.613208, 2.598131, 2.583333, 2.568807, 2.554545, 2.540540, 2.526786, 2.513274, 2.500000, 2.486957, 2.474138, 2.461539, 2.449152, 2.436975, 2.425000, 2.413223, 2.401639, 2.390244, 2.379032, 2.368000, 2.357143, 2.346457, 2.335938, 2.325581, 4.300000, 4.253521, 4.208333, 4.164383, 4.121622, 4.080000, 4.039474, 4.000000, 3.961539, 3.924051, 3.887500, 3.851852, 3.817073, 3.783133, 3.750000, 3.717647, 3.686047, 3.655172, 3.625000, 3.595506, 3.566667, 3.538461, 3.510870, 3.483871, 3.457447, 3.431579, 3.406250, 3.381443, 3.357143, 3.333333, 3.310000, 3.287129, 3.264706, 3.242718, 3.221154, 3.200000, 3.179245, 3.158879, 3.138889, 3.119266, 3.100000, 3.081081, 3.062500, 3.044248, 3.026316, 3.008696, 2.991379, 2.974359, 2.957627, 2.941176, 2.925000, 2.909091, 2.893443, 2.878049, 2.862903, 2.848000, 2.833333, 2.818898, 2.804688, 2.790698 }, nd4j::DataType::FLOAT32); + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.181818, 0.250000, 0.307692, 0.357143, 0.400000, 0.437500, 0.470588, 0.500000, 0.526316, 0.550000, 0.571429, 0.590909, 0.608696, 0.625000, 0.640000, 0.653846, 0.666667, 0.678571, 0.689655, 0.700000, 0.709677, 0.718750, 0.727273, 0.735294, 0.742857, 0.750000, 0.756757, 0.763158, 0.769231, 0.775000, 0.780488, 0.785714, 0.790698, 0.795455, 0.800000, 0.804348, 0.808511, 0.812500, 0.816327, 0.820000, 0.823529, 0.826923, 0.830189, 0.833333, 0.836364, 0.839286, 0.842105, 0.844828, 0.847458, 0.850000, 0.852459, 0.854839, 0.857143, 0.859375, 0.861538, 0.863636, 0.865672, 0.867647, 0.869565, 6.100000, 5.636364, 5.250000, 4.923077, 4.642857, 4.400000, 4.187500, 4.000000, 3.833333, 3.684211, 3.550000, 3.428571, 3.318182, 3.217391, 3.125000, 3.040000, 2.961539, 2.888889, 2.821429, 2.758621, 2.700000, 2.645161, 2.593750, 2.545455, 2.500000, 2.457143, 2.416667, 2.378378, 2.342105, 2.307692, 2.275000, 2.243902, 2.214286, 2.186047, 2.159091, 2.133333, 2.108696, 2.085106, 2.062500, 2.040816, 2.020000, 2.000000, 1.980769, 1.962264, 1.944444, 
1.927273, 1.910714, 1.894737, 1.879310, 1.864407, 1.850000, 1.836066, 1.822581, 1.809524, 1.796875, 1.784615, 1.772727, 1.761194, 1.750000, 1.739130, 12.100000, 11.090909, 10.250000, 9.538462, 8.928572, 8.400000, 7.937500, 7.529412, 7.166667, 6.842105, 6.550000, 6.285714, 6.045455, 5.826087, 5.625000, 5.440000, 5.269231, 5.111111, 4.964286, 4.827586, 4.700000, 4.580645, 4.468750, 4.363636, 4.264706, 4.171429, 4.083333, 4.000000, 3.921053, 3.846154, 3.775000, 3.707317, 3.642857, 3.581395, 3.522727, 3.466667, 3.413043, 3.361702, 3.312500, 3.265306, 3.220000, 3.176471, 3.134615, 3.094340, 3.055556, 3.018182, 2.982143, 2.947368, 2.913793, 2.881356, 2.850000, 2.819672, 2.790323, 2.761905, 2.734375, 2.707692, 2.681818, 2.656716, 2.632353, 2.608696, 2.585714, 2.563380, 2.541667, 2.520548, 2.500000, 2.480000, 2.460526, 2.441558, 2.423077, 2.405063, 2.387500, 2.370370, 2.353658, 2.337349, 2.321429, 2.305882, 2.290698, 2.275862, 2.261364, 2.247191, 2.233333, 2.219780, 2.206522, 2.193548, 2.180851, 2.168421, 2.156250, 2.144330, 2.132653, 2.121212, 2.110000, 2.099010, 2.088235, 2.077670, 2.067308, 2.057143, 2.047170, 2.037383, 2.027778, 2.018349, 2.009091, 2.000000, 1.991071, 1.982301, 1.973684, 1.965217, 1.956897, 1.948718, 1.940678, 1.932773, 1.925000, 1.917355, 1.909836, 1.902439, 1.895161, 1.888000, 1.880952, 1.874016, 1.867188, 1.860465, 3.442857, 3.408451, 3.375000, 3.342466, 3.310811, 3.280000, 3.250000, 3.220779, 3.192308, 3.164557, 3.137500, 3.111111, 3.085366, 3.060241, 3.035714, 3.011765, 2.988372, 2.965517, 2.943182, 2.921348, 2.900000, 2.879121, 2.858696, 2.838710, 2.819149, 2.800000, 2.781250, 2.762887, 2.744898, 2.727273, 2.710000, 2.693069, 2.676471, 2.660194, 2.644231, 2.628572, 2.613208, 2.598131, 2.583333, 2.568807, 2.554545, 2.540540, 2.526786, 2.513274, 2.500000, 2.486957, 2.474138, 2.461539, 2.449152, 2.436975, 2.425000, 2.413223, 2.401639, 2.390244, 2.379032, 2.368000, 2.357143, 2.346457, 2.335938, 2.325581, 4.300000, 4.253521, 4.208333, 4.164383, 
4.121622, 4.080000, 4.039474, 4.000000, 3.961539, 3.924051, 3.887500, 3.851852, 3.817073, 3.783133, 3.750000, 3.717647, 3.686047, 3.655172, 3.625000, 3.595506, 3.566667, 3.538461, 3.510870, 3.483871, 3.457447, 3.431579, 3.406250, 3.381443, 3.357143, 3.333333, 3.310000, 3.287129, 3.264706, 3.242718, 3.221154, 3.200000, 3.179245, 3.158879, 3.138889, 3.119266, 3.100000, 3.081081, 3.062500, 3.044248, 3.026316, 3.008696, 2.991379, 2.974359, 2.957627, 2.941176, 2.925000, 2.909091, 2.893443, 2.878049, 2.862903, 2.848000, 2.833333, 2.818898, 2.804688, 2.790698 }, sd::DataType::FLOAT32); - auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); e.assign(eC); @@ -1609,20 +1609,20 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_5D_2) { y.linspace(10.f); z.assign(0.f); - x.applyBroadcast(nd4j::broadcast::Divide, { 0,2,3,4 }, y, z); + x.applyBroadcast(sd::broadcast::Divide, { 0,2,3,4 }, y, z); ASSERT_EQ(e, z); } /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_5D_3) { - auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); - auto y = NDArray('f', { 2, 5 }, nd4j::DataType::FLOAT32); - auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); + auto y = NDArray('f', { 2, 5 }, sd::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 
3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 
12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, nd4j::DataType::FLOAT32); + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 
3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 
12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, sd::DataType::FLOAT32); - auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); e.assign(eC); @@ -1630,20 +1630,20 @@ TEST_F(DeclarableOpsTests14, Test_broadcast_5D_3) { y.linspace(10.f); z.assign(0.f); - x.applyBroadcast(nd4j::broadcast::Divide, { 0,2 }, y, z); + x.applyBroadcast(sd::broadcast::Divide, { 0,2 }, y, z); ASSERT_EQ(e, z); } /////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests14, Test_broadcast_5D_4) { - auto x = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); - auto y = NDArray('f', { 2, 1, 
5, 1, 1 }, nd4j::DataType::FLOAT32); - auto z = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto x = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); + auto y = NDArray('f', { 2, 1, 5, 1, 1 }, sd::DataType::FLOAT32); + auto z = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); // recieved by main algorithm - auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 
12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 
18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, nd4j::DataType::FLOAT32); + auto eC = NDArray('c', { 2, 3, 5, 4, 3 }, { 0.100000, 0.200000, 0.300000, 0.400000, 0.500000, 0.600000, 0.700000, 0.800000, 0.900000, 1.000000, 1.100000, 1.200000, 1.181818, 1.272727, 1.363636, 1.454545, 1.545455, 1.636364, 1.727273, 1.818182, 1.909091, 2.000000, 2.090909, 2.181818, 2.083333, 2.166667, 2.250000, 2.333333, 2.416667, 2.500000, 2.583333, 2.666667, 2.750000, 2.833333, 2.916667, 3.000000, 2.846154, 2.923077, 3.000000, 3.076923, 3.153846, 3.230769, 3.307692, 3.384615, 3.461539, 3.538461, 3.615385, 3.692308, 3.500000, 3.571429, 3.642857, 3.714286, 3.785714, 3.857143, 3.928571, 4.000000, 4.071429, 4.142857, 4.214286, 4.285714, 6.100000, 6.200000, 6.300000, 6.400000, 6.500000, 6.600000, 6.700000, 6.800000, 6.900000, 7.000000, 7.100000, 7.200000, 6.636364, 6.727273, 6.818182, 6.909091, 7.000000, 7.090909, 7.181818, 7.272727, 7.363636, 7.454545, 7.545455, 7.636364, 7.083333, 7.166667, 7.250000, 7.333333, 7.416667, 7.500000, 7.583333, 7.666667, 7.750000, 7.833333, 7.916667, 8.000000, 7.461538, 7.538462, 7.615385, 7.692307, 7.769231, 7.846154, 7.923077, 8.000000, 8.076923, 8.153846, 8.230769, 8.307693, 7.785714, 7.857143, 7.928571, 8.000000, 8.071428, 8.142858, 8.214286, 8.285714, 8.357142, 8.428572, 8.500000, 8.571428, 12.100000, 12.200000, 12.300000, 12.400000, 12.500000, 12.600000, 12.700000, 12.800000, 12.900000, 13.000000, 13.100000, 13.200000, 12.090909, 12.181818, 12.272727, 12.363636, 12.454545, 12.545455, 12.636364, 12.727273, 12.818182, 12.909091, 13.000000, 13.090909, 12.083333, 12.166667, 12.250000, 12.333333, 12.416667, 12.500000, 12.583333, 12.666667, 12.750000, 12.833333, 12.916667, 13.000000, 12.076923, 12.153846, 12.230769, 12.307693, 12.384615, 12.461538, 12.538462, 12.615385, 
12.692307, 12.769231, 12.846154, 12.923077, 12.071428, 12.142858, 12.214286, 12.285714, 12.357142, 12.428572, 12.500000, 12.571428, 12.642858, 12.714286, 12.785714, 12.857142, 12.066667, 12.133333, 12.200000, 12.266666, 12.333333, 12.400000, 12.466666, 12.533334, 12.600000, 12.666667, 12.733334, 12.800000, 12.062500, 12.125000, 12.187500, 12.250000, 12.312500, 12.375000, 12.437500, 12.500000, 12.562500, 12.625000, 12.687500, 12.750000, 12.058824, 12.117647, 12.176471, 12.235294, 12.294118, 12.352942, 12.411765, 12.470589, 12.529411, 12.588235, 12.647058, 12.705882, 12.055555, 12.111111, 12.166667, 12.222222, 12.277778, 12.333333, 12.388889, 12.444445, 12.500000, 12.555555, 12.611111, 12.666667, 12.052631, 12.105263, 12.157895, 12.210526, 12.263158, 12.315789, 12.368421, 12.421053, 12.473684, 12.526316, 12.578947, 12.631579, 16.066668, 16.133333, 16.200001, 16.266666, 16.333334, 16.400000, 16.466667, 16.533333, 16.600000, 16.666666, 16.733334, 16.799999, 15.812500, 15.875000, 15.937500, 16.000000, 16.062500, 16.125000, 16.187500, 16.250000, 16.312500, 16.375000, 16.437500, 16.500000, 15.588235, 15.647058, 15.705882, 15.764706, 15.823529, 15.882353, 15.941176, 16.000000, 16.058823, 16.117647, 16.176470, 16.235294, 15.388889, 15.444445, 15.500000, 15.555555, 15.611111, 15.666667, 15.722222, 15.777778, 15.833333, 15.888889, 15.944445, 16.000000, 15.210526, 15.263158, 15.315789, 15.368421, 15.421053, 15.473684, 15.526316, 15.578947, 15.631579, 15.684211, 15.736842, 15.789474, 20.066668, 20.133333, 20.200001, 20.266666, 20.333334, 20.400000, 20.466667, 20.533333, 20.600000, 20.666666, 20.733334, 20.799999, 19.562500, 19.625000, 19.687500, 19.750000, 19.812500, 19.875000, 19.937500, 20.000000, 20.062500, 20.125000, 20.187500, 20.250000, 19.117647, 19.176470, 19.235294, 19.294117, 19.352942, 19.411764, 19.470589, 19.529411, 19.588236, 19.647058, 19.705883, 19.764706, 18.722221, 18.777779, 18.833334, 18.888889, 18.944445, 19.000000, 19.055555, 19.111111, 19.166666, 
19.222221, 19.277779, 19.333334, 18.368422, 18.421053, 18.473684, 18.526316, 18.578947, 18.631578, 18.684210, 18.736841, 18.789474, 18.842106, 18.894737, 18.947369 }, sd::DataType::FLOAT32); - auto e = NDArray('f', { 2, 3, 5, 4, 3 }, nd4j::DataType::FLOAT32); + auto e = NDArray('f', { 2, 3, 5, 4, 3 }, sd::DataType::FLOAT32); e.assign(eC); x.linspace(1.f); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp index 199630d4e..4a5ea659a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests15.cpp @@ -21,13 +21,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests15 : public testing::Test { @@ -48,7 +48,7 @@ TEST_F(DeclarableOpsTests15, Test_NormalizeMoments_1) { auto z0 = NDArrayFactory::create('c', {10}); auto z1 = NDArrayFactory::create('c', {10}); - nd4j::ops::normalize_moments op; + sd::ops::normalize_moments op; auto result = op.execute({&w, &x, &y}, std::vector{&z0, &z1}, {1e-4}, {}, {}); ASSERT_EQ(Status::OK(), result); } @@ -58,7 +58,7 @@ TEST_F(DeclarableOpsTests15, Test_Add_1) { auto y = NDArrayFactory::create('c', {5}, {1, 1, 1, 1, 1}); auto e = NDArrayFactory::create('c', {5}, {2, 2, 2, 2, 2}); - nd4j::ops::add op; + sd::ops::add op; auto result = op.execute({&x, &y}, {&x}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); ASSERT_EQ(e, x); @@ -76,7 +76,7 @@ TEST_F(DeclarableOpsTests15, Test_standarize_1) { auto x = NDArrayFactory::create('c', {5}, {1.f, 1.f, 1.f, 1.f, 1.f}); auto e = NDArrayFactory::create('c', {5}, {0.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::standardize op; + sd::ops::standardize op; auto result = op.execute({&x}, {&x}, {}, {0}, {}); ASSERT_EQ(Status::OK(), result); ASSERT_EQ(e, x); @@ -86,7 +86,7 @@ TEST_F(DeclarableOpsTests15, Test_standarize_bp_1) { auto x = NDArrayFactory::create('c', {5}, 
{1.f, 1.f, 1.f, 1.f, 1.f}); auto eps = NDArrayFactory::create('c', {5}, {0.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::standardize_bp op; + sd::ops::standardize_bp op; auto result = op.evaluate({&x, &eps}, {0}); ASSERT_EQ(Status::OK(), result->status()); delete result; @@ -102,7 +102,7 @@ TEST_F(DeclarableOpsTests15, Test_AdjustContrast_1) { x.linspace(1.); - nd4j::ops::adjust_contrast op; + sd::ops::adjust_contrast op; auto result = op.evaluate({&x, &factor}, {}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); @@ -120,7 +120,7 @@ TEST_F(DeclarableOpsTests15, Test_AdjustContrast_2) { 50.5f, 51.5f, 52.5f, 56.5f, 57.5f, 58.5f, 62.5f, 63.5f, 64.5f, 68.5f, 69.5f, 70.5f }); x.linspace(1.); - nd4j::ops::adjust_contrast op; + sd::ops::adjust_contrast op; auto result = op.evaluate({&x}, {2.}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); @@ -138,7 +138,7 @@ TEST_F(DeclarableOpsTests15, Test_AdjustContrast_3) { 50.5f, 51.5f, 52.5f, 56.5f, 57.5f, 58.5f, 62.5f, 63.5f, 64.5f, 68.5f, 69.5f, 70.5f }); x.linspace(1.); - nd4j::ops::adjust_contrast_v2 op; + sd::ops::adjust_contrast_v2 op; auto result = op.evaluate({&x}, {2.}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); @@ -156,7 +156,7 @@ TEST_F(DeclarableOpsTests15, Test_AdjustContrast_4) { 50.5, 51.5, 52.5, 56.5, 57.5, 58.5, 62.5, 63.5, 64.5, 68.5, 69.5, 70.5 }); x.linspace(1.); - nd4j::ops::adjust_contrast_v2 op; + sd::ops::adjust_contrast_v2 op; auto result = op.evaluate({&x}, {2.}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); @@ -171,7 +171,7 @@ TEST_F(DeclarableOpsTests15, Test_AdjustContrast_5) { -3., -2., -1., 0., 5., 6., 7., 8., 13., 14., 15., 16. 
}); x.linspace(1.); - nd4j::ops::adjust_contrast_v2 op; + sd::ops::adjust_contrast_v2 op; auto result = op.evaluate({&x}, {2.}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); @@ -301,7 +301,7 @@ TEST_F(DeclarableOpsTests15, Test_AdjustContrast_6) { 0.71149576f, -0.4799042f, 0.4880958f }); - nd4j::ops::adjust_contrast op; + sd::ops::adjust_contrast op; auto result = op.evaluate({&x}, {2.}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); @@ -406,7 +406,7 @@ TEST_F(DeclarableOpsTests15, Test_AdjustContrast_7) { 0.71149576, -0.4799042, 0.4880958 }); // x.linspace(1.); - nd4j::ops::adjust_contrast_v2 op; + sd::ops::adjust_contrast_v2 op; auto result = op.evaluate({&x}, {2.}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); @@ -422,8 +422,8 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_1) { auto x = NDArrayFactory::create('c', {2, 2, 2}); auto e = NDArrayFactory::create('c', {2, 2}, {2., 512., 8192., 131072.032 }); x.linspace(1.); - nd4j::ops::bitcast op; - auto result = op.evaluate({&x}, {(int) nd4j::DataType::DOUBLE}); + sd::ops::bitcast op; + auto result = op.evaluate({&x}, {(int) sd::DataType::DOUBLE}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); // out->printIndexedBuffer("Casted result"); @@ -436,8 +436,8 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_2) { auto e = NDArrayFactory::create('c', {2, 4, 2}, {0.f, 1.875f, 0.f, 2.f, 0.f, 2.125f, 0.f, 2.25f, 0.f, 2.312f, 0.f, 2.375f, 0.f, 2.438f, 0.f, 2.5f}); x.linspace(1.); - nd4j::ops::bitcast op; - auto result = op.evaluate({&x}, {(int) nd4j::DataType::HALF}); + sd::ops::bitcast op; + auto result = op.evaluate({&x}, {(int) sd::DataType::HALF}); ASSERT_EQ(Status::OK(), result->status()); auto out = result->at(0); ASSERT_TRUE(e.equalsTo(out)); @@ -448,9 +448,9 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_3) { auto x = NDArrayFactory::create('c', {1, 4}); x.linspace(1.); - nd4j::ops::bitcast op; + 
sd::ops::bitcast op; try { - auto result = op.evaluate({&x}, {(int) nd4j::DataType::INT64}); + auto result = op.evaluate({&x}, {(int) sd::DataType::INT64}); ASSERT_NE(Status::OK(), result->status()); delete result; } catch (std::exception& e) { @@ -462,9 +462,9 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_4) { auto x = NDArrayFactory::create('c', {1, 4}); auto e = NDArrayFactory::create('c', {1, 2}, {1234567890LL, 2468013579LL}); x.linspace(1.); - nd4j::ops::bitcast op; + sd::ops::bitcast op; try { - auto result = op.execute({&x}, {&e}, {}, {nd4j::DataType::INT64}, {}); + auto result = op.execute({&x}, {&e}, {}, {sd::DataType::INT64}, {}); ASSERT_NE(Status::OK(), result); } catch(std::exception& e) { nd4j_printf("Error `%s' should be here. It's OK.\n",e.what()); @@ -476,9 +476,9 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_4_1) { auto x = NDArrayFactory::create('c', {1, 2}); auto e = NDArrayFactory::create('c', {1, 2}, {4607182418800017408LL, 4611686018427387904LL}); // as TF 4607182418800017408, 4611686018427387904 x.linspace(1.); - nd4j::ops::bitcast op; + sd::ops::bitcast op; - auto result = op.evaluate({&x}, {}, {nd4j::DataType::INT64}, {}); + auto result = op.evaluate({&x}, {}, {sd::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), result->status()); // e.printIndexedBuffer("Double to int64"); auto res = result->at(0); @@ -496,8 +496,8 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_5) { auto e = NDArrayFactory::create('c', {4}, {4260467851820808160LL, 3900173902914993008LL, 3566895990128523424LL, 3314989625590692528LL}); - nd4j::ops::bitcast op; - auto result = op.evaluate({&x}, {}, {nd4j::DataType::INT64}, {}); + sd::ops::bitcast op; + auto result = op.evaluate({&x}, {}, {sd::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), result->status()); auto res = result->at(0); // res->printIndexedBuffer("BITCAST5"); @@ -514,8 +514,8 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_6) { auto e = NDArrayFactory::create('c', {4}, {4899988963420290048LL, 5188224837230806272LL, 
5332342774136064128LL, 5476460161268730496LL}); - nd4j::ops::bitcast op; - auto result = op.evaluate({&x}, {}, {nd4j::DataType::INT64}, {}); + sd::ops::bitcast op; + auto result = op.evaluate({&x}, {}, {sd::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), result->status()); auto res = result->at(0); // res->printIndexedBuffer("BITCAST6"); @@ -531,8 +531,8 @@ TEST_F(DeclarableOpsTests15, Test_BitCast_7) { auto e = NDArrayFactory::create('c', {4}, { 4928700072476425318LL, 5202580391758873882LL, 5346698272827918477LL, 5483778673873668736LL}); - nd4j::ops::bitcast op; - auto result = op.evaluate({&x}, {}, {nd4j::DataType::INT64}, {}); + sd::ops::bitcast op; + auto result = op.evaluate({&x}, {}, {sd::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), result->status()); auto res = result->at(0); // res->printIndexedBuffer("BITCAST7"); @@ -548,7 +548,7 @@ TEST_F(DeclarableOpsTests15, test_matmul_bp_1) { auto gA = NDArrayFactory::create('c', {1, 3}); auto gB = NDArrayFactory::create('c', {1, 4}); - nd4j::ops::matmul_bp op; + sd::ops::matmul_bp op; auto status = op.execute({&a, &b, &gI}, std::vector{&gA, &gB}, {}, {1, 0, 0}, {}); ASSERT_EQ(Status::OK(), status); } @@ -558,7 +558,7 @@ TEST_F(DeclarableOpsTests15, test_non_decreasing_1) { auto z = NDArrayFactory::create(false); auto e = NDArrayFactory::create(true); - nd4j::ops::is_non_decreasing op; + sd::ops::is_non_decreasing op; Context ctx(1); ctx.setInputArray(0, &x); ctx.setOutputArray(0, &z); @@ -572,7 +572,7 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_1) { auto x = NDArrayFactory::create('c', {3},{1.f, 2.f, 3.f}); auto y = NDArrayFactory::string("shouldn't ever trigger"); - nd4j::ops::check_numerics op; + sd::ops::check_numerics op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -593,7 +593,7 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_2) { auto y = NDArrayFactory::string("should trigger"); auto z = NDArrayFactory::create('c', {3} ); - nd4j::ops::check_numerics op; 
+ sd::ops::check_numerics op; try { auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_TRUE(false); @@ -612,7 +612,7 @@ TEST_F(DeclarableOpsTests15, test_check_numeric_3) { auto y = NDArrayFactory::string("should trigger"); auto z = NDArrayFactory::create('c', {3} ); - nd4j::ops::check_numerics op; + sd::ops::check_numerics op; try { auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_TRUE(false); @@ -626,7 +626,7 @@ TEST_F(DeclarableOpsTests15, Test_layer_norm_1) { auto g = NDArrayFactory::create('c', {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); auto b = NDArrayFactory::create('c', {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); - nd4j::ops::layer_norm op; + sd::ops::layer_norm op; auto result = op.evaluate({&x, &g, &b}, {}, {0}, {false}); ASSERT_EQ(Status::OK(), result->status()); delete result; @@ -638,7 +638,7 @@ TEST_F(DeclarableOpsTests15, Test_layer_norm_bp_1) { auto b = NDArrayFactory::create('c', {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); auto eps = NDArrayFactory::create('c', {1, 5}, {0.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::layer_norm_bp op; + sd::ops::layer_norm_bp op; auto result = op.evaluate({&x, &g, &b, &eps}, {}, {0}, {false}); ASSERT_EQ(Status::OK(), result->status()); delete result; @@ -647,19 +647,19 @@ TEST_F(DeclarableOpsTests15, Test_layer_norm_bp_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, Test_layer_norm_bp_2) { - NDArray x('c', {3, 4, 8, 8}, nd4j::DataType::FLOAT32); - NDArray gain('c', {4}, {-0.1, 0.1, -0.2, 0.2}, nd4j::DataType::FLOAT32); - NDArray bias('c', {4}, {-0.05, 0.05, -1.05, 1.05}, nd4j::DataType::FLOAT32); - NDArray gradO('c', {3, 4, 8, 8}, nd4j::DataType::FLOAT32); + NDArray x('c', {3, 4, 8, 8}, sd::DataType::FLOAT32); + NDArray gain('c', {4}, {-0.1, 0.1, -0.2, 0.2}, sd::DataType::FLOAT32); + NDArray bias('c', {4}, {-0.05, 0.05, -1.05, 1.05}, sd::DataType::FLOAT32); + NDArray gradO('c', {3, 4, 8, 8}, sd::DataType::FLOAT32); - NDArray gradI('c', {3, 4, 8, 8}, nd4j::DataType::FLOAT32); - 
NDArray gradG('c', {4}, nd4j::DataType::FLOAT32); - NDArray gradB('c', {4}, nd4j::DataType::FLOAT32); + NDArray gradI('c', {3, 4, 8, 8}, sd::DataType::FLOAT32); + NDArray gradG('c', {4}, sd::DataType::FLOAT32); + NDArray gradB('c', {4}, sd::DataType::FLOAT32); x.linspace(-20, 0.5); gradO.linspace(-4, 0.05); - nd4j::ops::layer_norm_bp op; + sd::ops::layer_norm_bp op; auto status = op.execute({&x, &gain, &bias, &gradO}, {&gradI, &gradG, &gradB}, {}, {1,2,3}, {true}); ASSERT_EQ(Status::OK(), status); } @@ -671,7 +671,7 @@ TEST_F(DeclarableOpsTests15, test_hashCode_1) { x.linspace(1.); y.linspace(2.); - nd4j::ops::hashcode op; + sd::ops::hashcode op; auto resultA0 = op.evaluate({&x}); auto resultA1 = op.evaluate({&x}); auto resultB0 = op.evaluate({&y}); @@ -693,7 +693,7 @@ TEST_F(DeclarableOpsTests15, test_hashCode_2) { x.linspace(1.); y.linspace(2.); - nd4j::ops::hashcode op; + sd::ops::hashcode op; auto resultA0 = op.evaluate({&x}); auto resultA1 = op.evaluate({&x}); auto resultB0 = op.evaluate({&y}); @@ -714,7 +714,7 @@ TEST_F(DeclarableOpsTests15, test_reshape_to_scalar_1) { auto array = NDArrayFactory::create(119.f); auto e = NDArrayFactory::create('c', {1, 1}, {119.f}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&array}, {}, {1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -730,7 +730,7 @@ TEST_F(DeclarableOpsTests15, test_reshape_to_scalar_2) { auto e = NDArrayFactory::create('c', {1, 1}, {119.f}); auto z = NDArrayFactory::create('c', {1, 1}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.execute({&array}, {&z}, {}, {1, 1}, {}); ASSERT_EQ(Status::OK(), result); ASSERT_EQ(e, z); @@ -741,7 +741,7 @@ TEST_F(DeclarableOpsTests15, test_rank_1) { auto e = NDArrayFactory::create('c', {}, {2}); auto z = NDArrayFactory::create('c', {}); - nd4j::ops::rank op; + sd::ops::rank op; auto result = op.execute({&array}, {&z}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); ASSERT_EQ(e, z); @@ -751,7 +751,7 @@ 
TEST_F(DeclarableOpsTests15, test_rank_2) { auto array = NDArrayFactory::create('c', {4, 64}); auto e = NDArrayFactory::create('c', {}, {2}); - nd4j::ops::rank op; + sd::ops::rank op; auto result = op.evaluate({&array}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -773,7 +773,7 @@ TEST_F(DeclarableOpsTests15, test_lstmBlock_1) { auto x7 = NDArrayFactory::create('c', {1, 3}); auto x8 = NDArrayFactory::create('c', {12}); - nd4j::ops::lstmBlock op; + sd::ops::lstmBlock op; auto result = op.evaluate({&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8}, {2.0, 0.3}, {0, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -799,7 +799,7 @@ TEST_F(DeclarableOpsTests15, test_lstmBlock_2) { auto x7 = NDArrayFactory::create('f', {nIn}); auto x8 = NDArrayFactory::create('f', {4 * nIn}); - nd4j::ops::lstmBlock op; + sd::ops::lstmBlock op; auto result = op.evaluate({&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8}, {1.0, 0.0}, {0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -814,8 +814,8 @@ TEST_F(DeclarableOpsTests15, test_lstmBlock_3) { int bS = 2; int nIn = 4; - NDArray f('f', {bS, nIn, seqLen}, nd4j::DataType::FLOAT32); - NDArray cLast('f', {bS, nIn}, nd4j::DataType::FLOAT32); + NDArray f('f', {bS, nIn, seqLen}, sd::DataType::FLOAT32); + NDArray cLast('f', {bS, nIn}, sd::DataType::FLOAT32); f = 2; cLast = 3; @@ -842,7 +842,7 @@ TEST_F(DeclarableOpsTests15, test_empty_increasing_1) { ctx.setInputArray(0, &x); ctx.setOutputArray(0, &z); - nd4j::ops::is_strictly_increasing op; + sd::ops::is_strictly_increasing op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); @@ -857,7 +857,7 @@ TEST_F(DeclarableOpsTests15, test_empty_decreasing_1) { ctx.setInputArray(0, &x); ctx.setOutputArray(0, &z); - nd4j::ops::is_non_decreasing op; + sd::ops::is_non_decreasing op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); @@ -867,9 +867,9 @@ TEST_F(DeclarableOpsTests15, test_empty_decreasing_1) { 
//////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_grs_1) { // rank 1 - NDArray rgbs('c', { 3 }, { 10, 50, 200 }, nd4j::DataType::INT32); - NDArray expected('c', { 1 }, std::vector{ 55 }, nd4j::DataType::INT32); - nd4j::ops::rgb_to_grs op; + NDArray rgbs('c', { 3 }, { 10, 50, 200 }, sd::DataType::INT32); + NDArray expected('c', { 1 }, std::vector{ 55 }, sd::DataType::INT32); + sd::ops::rgb_to_grs op; auto result = op.evaluate({&rgbs}, {}, {}); auto output = result->at(0); @@ -885,7 +885,7 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_2) { // rank 1 auto rgbs = NDArrayFactory::create('f', { 3 }, { 1, 120, -25 }); auto expected = NDArrayFactory::create('f', { 1 }, { 67 }); - nd4j::ops::rgb_to_grs op; + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -899,9 +899,9 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_grs_3) { // rank 2 - NDArray rgbs('c', { 4, 3 }, { -94, 99, 97, 90, 114, 101, 111, 96, 105, 100, 103, 102 }, nd4j::DataType::INT32); - NDArray expected('c', { 4, 1 }, { 41, 105, 101, 101 }, nd4j::DataType::INT32); - nd4j::ops::rgb_to_grs op; + NDArray rgbs('c', { 4, 3 }, { -94, 99, 97, 90, 114, 101, 111, 96, 105, 100, 103, 102 }, sd::DataType::INT32); + NDArray expected('c', { 4, 1 }, { 41, 105, 101, 101 }, sd::DataType::INT32); + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -915,11 +915,11 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_grs_4) { - NDArray rgbs('c', { 3, 2 }, {14, 99, 207, 10, 114, 201 }, nd4j::DataType::INT32); + NDArray rgbs('c', { 3, 2 }, {14, 99, 207, 10, 114, 201 }, sd::DataType::INT32); rgbs.permutei({1,0}); 
- NDArray expected('c', { 2, 1 }, { 138, 58 }, nd4j::DataType::INT32); - nd4j::ops::rgb_to_grs op; + NDArray expected('c', { 2, 1 }, { 138, 58 }, sd::DataType::INT32); + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -933,9 +933,9 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_4) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_grs_5) { // rank 2 - NDArray rgbs('c', { 3, 4 }, { -94, 99, 97, 90, 114, 101, 111, 96, 105, 100, 103, 102 }, nd4j::DataType::INT32); - NDArray expected('c', { 1, 4 }, { 50, 100, 105, 94 }, nd4j::DataType::INT32); - nd4j::ops::rgb_to_grs op; + NDArray rgbs('c', { 3, 4 }, { -94, 99, 97, 90, 114, 101, 111, 96, 105, 100, 103, 102 }, sd::DataType::INT32); + NDArray expected('c', { 1, 4 }, { 50, 100, 105, 94 }, sd::DataType::INT32); + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {0}); auto output = result->at(0); @@ -952,7 +952,7 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_6) { auto rgbs = NDArrayFactory::create('c', { 5,4,3 }, {1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f}); auto expected = NDArrayFactory::create('c', { 5,4,1 }, {-47.82958221f, 34.46305847f, 21.36137581f, 
-21.91625023f,2.49686432f, -43.59792709f, 9.64180183f, 23.04854202f,40.7946167f, 44.98754883f, -25.19047546f, 20.64586449f,-4.97033119f, 30.0226841f, 30.30688286f, 15.61459541f,43.36166f, 18.22480774f, 13.74833488f, 21.59387016f}); - nd4j::ops::rgb_to_grs op; + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -969,7 +969,7 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_7) { auto rgbs = NDArrayFactory::create('c', { 5,3,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f}); auto expected = NDArrayFactory::create('c', { 5,1,4 }, { 36.626545f, 38.607746f, -40.614971f, 18.233341f, -51.545094f,2.234142f, 20.913160f, 8.783220f, 15.955761f, 55.273506f, 36.838833f, -29.751089f, 8.148357f, 13.676106f, 1.097548f, 68.766457f, 38.690712f, 27.176361f, -14.156269f, 7.157052f }); - nd4j::ops::rgb_to_grs op; + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {1}); auto output = result->at(0); @@ -985,7 +985,7 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_8) { // rank 3 auto rgbs = NDArrayFactory::create('c', { 3,5,4 }, {1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 
7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f}); try { - nd4j::ops::rgb_to_grs op; + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {}); ASSERT_EQ(Status::THROW(), result->status()); delete result; @@ -1000,7 +1000,7 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_9) { auto rgbs = NDArrayFactory::create('f', { 2, 2, 3 }, { 1.7750e+01f,-7.1062e+01f, -1.0019e+02f, -2.3406e+01f,5.2094e+01f,9.5438e+01f, -6.7461e+00f,3.8562e+01f, 6.5078e+00f, 3.3562e+01f,-5.8844e+01f,2.2750e+01f}); auto expected = NDArrayFactory::create('f', { 2,2,1 }, { 36.626545f, 38.607746f, -40.614971f, 18.233341f }); - nd4j::ops::rgb_to_grs op; + sd::ops::rgb_to_grs op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -1014,9 +1014,9 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_grs_9) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_1) { // rank 1 - NDArray rgbs('f', { 3 }, { 10, 50, 200 }, nd4j::DataType::FLOAT32); - NDArray expected('f', { 3 }, { 55.14 , 71.2872001, -39.6005542 }, nd4j::DataType::FLOAT32); - nd4j::ops::rgb_to_yuv op; + NDArray rgbs('f', { 3 }, { 10, 50, 200 }, sd::DataType::FLOAT32); + NDArray expected('f', { 3 }, { 55.14 , 71.2872001, -39.6005542 }, sd::DataType::FLOAT32); + sd::ops::rgb_to_yuv op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -1030,11 +1030,11 @@ 
TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_2) { - NDArray rgbs('c', { 3, 2 }, { 14., 99., 207., 10., 114., 201. }, nd4j::DataType::FLOAT32); + NDArray rgbs('c', { 3, 2 }, { 14., 99., 207., 10., 114., 201. }, sd::DataType::FLOAT32); rgbs.permutei({ 1,0 }); - NDArray expected('c', { 2, 3 }, { 138.691, -12.150713, -109.38929, 58.385, 70.18241, 35.63085 }, nd4j::DataType::FLOAT32); - nd4j::ops::rgb_to_yuv op; + NDArray expected('c', { 2, 3 }, { 138.691, -12.150713, -109.38929, 58.385, 70.18241, 35.63085 }, sd::DataType::FLOAT32); + sd::ops::rgb_to_yuv op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -1049,10 +1049,10 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_3) { // rank 2 - NDArray rgbs('c', { 3, 4 }, { -9.4, 9.9, 9.7, 9.0, 1.14, 1.01, 1.11, 9.6, 1.05, 10.0, 1.03, 10.22 }, nd4j::DataType::FLOAT32); - NDArray expected('c', { 3, 4 }, { -2.021720, 4.692970, 3.669290, 9.491281, 1.511627, 2.611648, -1.298824, 0.358612, -6.472839, 4.568039, 5.290639, -0.430992 }, nd4j::DataType::FLOAT32); + NDArray rgbs('c', { 3, 4 }, { -9.4, 9.9, 9.7, 9.0, 1.14, 1.01, 1.11, 9.6, 1.05, 10.0, 1.03, 10.22 }, sd::DataType::FLOAT32); + NDArray expected('c', { 3, 4 }, { -2.021720, 4.692970, 3.669290, 9.491281, 1.511627, 2.611648, -1.298824, 0.358612, -6.472839, 4.568039, 5.290639, -0.430992 }, sd::DataType::FLOAT32); - nd4j::ops::rgb_to_yuv op; + sd::ops::rgb_to_yuv op; auto result = op.evaluate({ &rgbs }, {}, { 0 }); auto output = result->at(0); ASSERT_EQ(Status::OK(), result->status()); @@ -1065,10 +1065,10 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_4) { // rank 
3 - NDArray rgbs('c', { 5,4,3 }, { 1.7750e+01, 1.4602e+01, 5.4883e+00, 9.5438e+01, 1.0038e+02, 4.0531e+01, -5.8844e+01, 2.9609e+01, -1.1414e+01, 2.1391e+01, 3.9656e+01, 2.1531e+01, -7.1062e+01, -4.5859e+00, 2.9438e+01, -6.7461e+00, 6.7938e+01, -6.1211e+00, 2.2750e+01, -6.1438e+01, 1.5404e-02, -8.5312e+01, 1.1641e+01, 6.2500e+01, -1.0019e+02, 3.9344e+01, -3.1344e+01, 3.8562e+01, 5.9961e+00, 6.2219e+01, -1.0477e+01, 1.7750e+01, 2.9938e+01, 7.5830e-01, -2.7516e+01, 7.2188e+01, -2.3406e+01, 1.1617e+01, 6.5125e+01, 6.5078e+00, 6.7812e+01, 4.6812e+01, 7.7344e+00, 6.8562e+01, 5.6719e+00, 2.3125e+01, 6.7562e+01, 9.3750e+00, 5.2094e+01, -8.6562e+01, 1.2695e+01, 3.3562e+01, 2.9734e+01, 5.2250e+01, 9.5469e+00, -7.4414e+00, -2.0125e+01, 1.8145e+00, 7.8438e+01, -4.8125e+01 }, nd4j::DataType::FLOAT32); - NDArray expected('c', { 5,4,3 }, { 14.5042902, -4.43686799, 2.847406, 92.079556, -25.36761168, 2.94630572, -1.515069, -4.87137291, -50.29369639, 32.128515, -5.21515376, -9.41983935,-20.5835293, 24.61614501, -44.28390394, 37.1647167, -21.30142676, -38.52221293, -29.26009994, 14.40679768, 45.62757638, -11.550021, 36.44083018, -64.71012983,-10.435098, - 10.28950082, - 78.74044941, 22.1427147, 19.72198103, 14.40435988, 10.699559, 9.46744852, - 18.5778351 , -7.6957283, 39.31166179, 7.41657542, 7.245035, 28.48336771, - 26.88963173, 47.0880442, - 0.13584441, - 35.60035823, 43.2050762, - 18.47048906, - 31.11782117, 47.642019, - 18.83162118, - 21.50836396,-33.788558, 22.87507047, 75.34330791, 33.445396, 9.25395257, 0.10229474, -3.8078287, -8.02985955, 11.71587638, 41.0993915, -43.90830496, -34.46396749 }, nd4j::DataType::FLOAT32); + NDArray rgbs('c', { 5,4,3 }, { 1.7750e+01, 1.4602e+01, 5.4883e+00, 9.5438e+01, 1.0038e+02, 4.0531e+01, -5.8844e+01, 2.9609e+01, -1.1414e+01, 2.1391e+01, 3.9656e+01, 2.1531e+01, -7.1062e+01, -4.5859e+00, 2.9438e+01, -6.7461e+00, 6.7938e+01, -6.1211e+00, 2.2750e+01, -6.1438e+01, 1.5404e-02, -8.5312e+01, 1.1641e+01, 6.2500e+01, -1.0019e+02, 3.9344e+01, 
-3.1344e+01, 3.8562e+01, 5.9961e+00, 6.2219e+01, -1.0477e+01, 1.7750e+01, 2.9938e+01, 7.5830e-01, -2.7516e+01, 7.2188e+01, -2.3406e+01, 1.1617e+01, 6.5125e+01, 6.5078e+00, 6.7812e+01, 4.6812e+01, 7.7344e+00, 6.8562e+01, 5.6719e+00, 2.3125e+01, 6.7562e+01, 9.3750e+00, 5.2094e+01, -8.6562e+01, 1.2695e+01, 3.3562e+01, 2.9734e+01, 5.2250e+01, 9.5469e+00, -7.4414e+00, -2.0125e+01, 1.8145e+00, 7.8438e+01, -4.8125e+01 }, sd::DataType::FLOAT32); + NDArray expected('c', { 5,4,3 }, { 14.5042902, -4.43686799, 2.847406, 92.079556, -25.36761168, 2.94630572, -1.515069, -4.87137291, -50.29369639, 32.128515, -5.21515376, -9.41983935,-20.5835293, 24.61614501, -44.28390394, 37.1647167, -21.30142676, -38.52221293, -29.26009994, 14.40679768, 45.62757638, -11.550021, 36.44083018, -64.71012983,-10.435098, - 10.28950082, - 78.74044941, 22.1427147, 19.72198103, 14.40435988, 10.699559, 9.46744852, - 18.5778351 , -7.6957283, 39.31166179, 7.41657542, 7.245035, 28.48336771, - 26.88963173, 47.0880442, - 0.13584441, - 35.60035823, 43.2050762, - 18.47048906, - 31.11782117, 47.642019, - 18.83162118, - 21.50836396,-33.788558, 22.87507047, 75.34330791, 33.445396, 9.25395257, 0.10229474, -3.8078287, -8.02985955, 11.71587638, 41.0993915, -43.90830496, -34.46396749 }, sd::DataType::FLOAT32); - nd4j::ops::rgb_to_yuv op; + sd::ops::rgb_to_yuv op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -1082,10 +1082,10 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_4) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_5) { // rank 3 - NDArray rgbs('c', { 5,3,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 
6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, nd4j::DataType::FLOAT32); - NDArray expected('c', { 5,3,4 }, { 36.628319, 38.600643,-40.624989, 18.231001, - 14.822637, - 2.479566, - 8.965780, 2.223851, -16.561626,-96.205162,-52.255379,-36.527435,-51.546139,2.234915, 20.914114, 8.785358, 32.552223, -3.356598, 9.069552, 1.393482,36.029255, 4.824605,- 9.972263,11.058715, 15.947105, 55.283543, 36.845627, -29.750486,0.887228, 6.534475, -21.794132,34.155693, -89.929497,39.562351, 27.276817,31.359871, 8.149521, 13.673355, 1.104303, 68.774300, 2.236881, 13.216944, - 3.555702,- 3.225931,3.063015, - 36.134724,58.302204, 8.477802, 38.695396,27.181587, - 14.157411,7.157054, 11.714512, 22.148155, 11.580557, - 27.204905,7.120562, 21.992094, 2.406748, - 6.265247, }, nd4j::DataType::FLOAT32); + NDArray rgbs('c', { 5,3,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, 
sd::DataType::FLOAT32); + NDArray expected('c', { 5,3,4 }, { 36.628319, 38.600643,-40.624989, 18.231001, - 14.822637, - 2.479566, - 8.965780, 2.223851, -16.561626,-96.205162,-52.255379,-36.527435,-51.546139,2.234915, 20.914114, 8.785358, 32.552223, -3.356598, 9.069552, 1.393482,36.029255, 4.824605,- 9.972263,11.058715, 15.947105, 55.283543, 36.845627, -29.750486,0.887228, 6.534475, -21.794132,34.155693, -89.929497,39.562351, 27.276817,31.359871, 8.149521, 13.673355, 1.104303, 68.774300, 2.236881, 13.216944, - 3.555702,- 3.225931,3.063015, - 36.134724,58.302204, 8.477802, 38.695396,27.181587, - 14.157411,7.157054, 11.714512, 22.148155, 11.580557, - 27.204905,7.120562, 21.992094, 2.406748, - 6.265247, }, sd::DataType::FLOAT32); - nd4j::ops::rgb_to_yuv op; + sd::ops::rgb_to_yuv op; auto result = op.evaluate({ &rgbs }, {}, { 1 }); auto output = result->at(0); @@ -1098,9 +1098,9 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_5) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_6) { // rank 3 - NDArray rgbs('c', { 3,5,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, nd4j::DataType::FLOAT32); + NDArray rgbs('c', { 3,5,4 }, { 1.7750e+01f, -7.1062e+01f, 
-1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, sd::DataType::FLOAT32); try { - nd4j::ops::rgb_to_yuv op; + sd::ops::rgb_to_yuv op; auto result = op.evaluate({ &rgbs }, {}, {}); ASSERT_EQ(Status::THROW(), result->status()); delete result; @@ -1113,10 +1113,10 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_6) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_7) { // rank 3 - NDArray rgbs('f', { 2, 2, 3 }, { 1.7750e+01f,-7.1062e+01f, -1.0019e+02f, -2.3406e+01f,5.2094e+01f,9.5438e+01f, -6.7461e+00f,3.8562e+01f, 6.5078e+00f, 3.3562e+01f,-5.8844e+01f,2.2750e+01f }, nd4j::DataType::FLOAT32); - NDArray expected('f', { 2,2,3 }, { 36.628319,38.600643, -40.624989,18.231001, -14.822637,-2.479566, -8.965780, 2.223851, -16.561626,- 96.205162,-52.255379, -36.527435 }, nd4j::DataType::FLOAT32); + NDArray rgbs('f', { 2, 2, 3 }, { 1.7750e+01f,-7.1062e+01f, -1.0019e+02f, -2.3406e+01f,5.2094e+01f,9.5438e+01f, -6.7461e+00f,3.8562e+01f, 6.5078e+00f, 3.3562e+01f,-5.8844e+01f,2.2750e+01f }, sd::DataType::FLOAT32); + NDArray expected('f', { 2,2,3 }, { 36.628319,38.600643, -40.624989,18.231001, -14.822637,-2.479566, -8.965780, 2.223851, -16.561626,- 96.205162,-52.255379, -36.527435 }, sd::DataType::FLOAT32); - 
nd4j::ops::rgb_to_yuv op; + sd::ops::rgb_to_yuv op; auto result = op.evaluate({ &rgbs }, {}, {}); auto output = result->at(0); @@ -1130,9 +1130,9 @@ TEST_F(DeclarableOpsTests15, test_rgb_to_yuv_7) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_1) { // rank 1 - NDArray yuv('c', { 3 }, { 55.14 , 71.2872001, -39.6005542 }, nd4j::DataType::FLOAT32); - NDArray expected('c', { 3 }, { 10, 50, 200 }, nd4j::DataType::FLOAT32); - nd4j::ops::yuv_to_rgb op; + NDArray yuv('c', { 3 }, { 55.14 , 71.2872001, -39.6005542 }, sd::DataType::FLOAT32); + NDArray expected('c', { 3 }, { 10, 50, 200 }, sd::DataType::FLOAT32); + sd::ops::yuv_to_rgb op; auto result = op.evaluate({ &yuv }, {}, {}); auto output = result->at(0); @@ -1146,9 +1146,9 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_2) { // rank 1 - NDArray yuv('f', { 3 }, { 55.14, 71.2872001, -39.6005542 }, nd4j::DataType::FLOAT32); - NDArray expected('f', { 3 }, { 10, 50, 200 }, nd4j::DataType::FLOAT32); - nd4j::ops::yuv_to_rgb op; + NDArray yuv('f', { 3 }, { 55.14, 71.2872001, -39.6005542 }, sd::DataType::FLOAT32); + NDArray expected('f', { 3 }, { 10, 50, 200 }, sd::DataType::FLOAT32); + sd::ops::yuv_to_rgb op; auto result = op.evaluate({ &yuv }, {}, {}); auto output = result->at(0); @@ -1162,10 +1162,10 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_3) { // rank 2 - NDArray expected('c', { 3, 4 }, { -9.4, 9.9, 9.7, 9.0, 1.14, 1.01, 1.11, 9.6, 1.05, 10.0, 1.03, 10.22 }, nd4j::DataType::FLOAT32); - NDArray yuv('c', { 3, 4 }, { -2.021720, 4.692970, 3.669290, 9.491281, 1.511627, 2.611648, -1.298824, 0.358612, -6.472839, 4.568039, 5.290639, -0.430992 }, nd4j::DataType::FLOAT32); + NDArray 
expected('c', { 3, 4 }, { -9.4, 9.9, 9.7, 9.0, 1.14, 1.01, 1.11, 9.6, 1.05, 10.0, 1.03, 10.22 }, sd::DataType::FLOAT32); + NDArray yuv('c', { 3, 4 }, { -2.021720, 4.692970, 3.669290, 9.491281, 1.511627, 2.611648, -1.298824, 0.358612, -6.472839, 4.568039, 5.290639, -0.430992 }, sd::DataType::FLOAT32); - nd4j::ops::yuv_to_rgb op; + sd::ops::yuv_to_rgb op; auto result = op.evaluate({ &yuv }, {}, { 0 }); auto output = result->at(0); ASSERT_EQ(Status::OK(), result->status()); @@ -1178,10 +1178,10 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_4) { // rank 3 - NDArray expected('c', { 5,4,3 }, { 1.7750e+01, 1.4602e+01, 5.4883e+00, 9.5438e+01, 1.0038e+02, 4.0531e+01, -5.8844e+01, 2.9609e+01, -1.1414e+01, 2.1391e+01, 3.9656e+01, 2.1531e+01, -7.1062e+01, -4.5859e+00, 2.9438e+01, -6.7461e+00, 6.7938e+01, -6.1211e+00, 2.2750e+01, -6.1438e+01, 1.5404e-02, -8.5312e+01, 1.1641e+01, 6.2500e+01, -1.0019e+02, 3.9344e+01, -3.1344e+01, 3.8562e+01, 5.9961e+00, 6.2219e+01, -1.0477e+01, 1.7750e+01, 2.9938e+01, 7.5830e-01, -2.7516e+01, 7.2188e+01, -2.3406e+01, 1.1617e+01, 6.5125e+01, 6.5078e+00, 6.7812e+01, 4.6812e+01, 7.7344e+00, 6.8562e+01, 5.6719e+00, 2.3125e+01, 6.7562e+01, 9.3750e+00, 5.2094e+01, -8.6562e+01, 1.2695e+01, 3.3562e+01, 2.9734e+01, 5.2250e+01, 9.5469e+00, -7.4414e+00, -2.0125e+01, 1.8145e+00, 7.8438e+01, -4.8125e+01 }, nd4j::DataType::FLOAT32); - NDArray yuv('c', { 5,4,3 }, { 14.5042902, -4.43686799, 2.847406, 92.079556, -25.36761168, 2.94630572, -1.515069, -4.87137291, -50.29369639, 32.128515, -5.21515376, -9.41983935,-20.5835293, 24.61614501, -44.28390394, 37.1647167, -21.30142676, -38.52221293, -29.26009994, 14.40679768, 45.62757638, -11.550021, 36.44083018, -64.71012983,-10.435098, -10.28950082, -78.74044941, 22.1427147, 19.72198103, 14.40435988, 10.699559, 9.46744852, -18.5778351 , -7.6957283, 39.31166179, 7.41657542, 7.245035, 
28.48336771, -26.88963173, 47.0880442, -0.13584441, -35.60035823, 43.2050762, -18.47048906, -31.11782117, 47.642019, -18.83162118, -21.50836396,-33.788558, 22.87507047, 75.34330791, 33.445396, 9.25395257, 0.10229474, -3.8078287, -8.02985955, 11.71587638, 41.0993915, -43.90830496, -34.46396749 }, nd4j::DataType::FLOAT32); + NDArray expected('c', { 5,4,3 }, { 1.7750e+01, 1.4602e+01, 5.4883e+00, 9.5438e+01, 1.0038e+02, 4.0531e+01, -5.8844e+01, 2.9609e+01, -1.1414e+01, 2.1391e+01, 3.9656e+01, 2.1531e+01, -7.1062e+01, -4.5859e+00, 2.9438e+01, -6.7461e+00, 6.7938e+01, -6.1211e+00, 2.2750e+01, -6.1438e+01, 1.5404e-02, -8.5312e+01, 1.1641e+01, 6.2500e+01, -1.0019e+02, 3.9344e+01, -3.1344e+01, 3.8562e+01, 5.9961e+00, 6.2219e+01, -1.0477e+01, 1.7750e+01, 2.9938e+01, 7.5830e-01, -2.7516e+01, 7.2188e+01, -2.3406e+01, 1.1617e+01, 6.5125e+01, 6.5078e+00, 6.7812e+01, 4.6812e+01, 7.7344e+00, 6.8562e+01, 5.6719e+00, 2.3125e+01, 6.7562e+01, 9.3750e+00, 5.2094e+01, -8.6562e+01, 1.2695e+01, 3.3562e+01, 2.9734e+01, 5.2250e+01, 9.5469e+00, -7.4414e+00, -2.0125e+01, 1.8145e+00, 7.8438e+01, -4.8125e+01 }, sd::DataType::FLOAT32); + NDArray yuv('c', { 5,4,3 }, { 14.5042902, -4.43686799, 2.847406, 92.079556, -25.36761168, 2.94630572, -1.515069, -4.87137291, -50.29369639, 32.128515, -5.21515376, -9.41983935,-20.5835293, 24.61614501, -44.28390394, 37.1647167, -21.30142676, -38.52221293, -29.26009994, 14.40679768, 45.62757638, -11.550021, 36.44083018, -64.71012983,-10.435098, -10.28950082, -78.74044941, 22.1427147, 19.72198103, 14.40435988, 10.699559, 9.46744852, -18.5778351 , -7.6957283, 39.31166179, 7.41657542, 7.245035, 28.48336771, -26.88963173, 47.0880442, -0.13584441, -35.60035823, 43.2050762, -18.47048906, -31.11782117, 47.642019, -18.83162118, -21.50836396,-33.788558, 22.87507047, 75.34330791, 33.445396, 9.25395257, 0.10229474, -3.8078287, -8.02985955, 11.71587638, 41.0993915, -43.90830496, -34.46396749 }, sd::DataType::FLOAT32); - nd4j::ops::yuv_to_rgb op; + sd::ops::yuv_to_rgb op; 
auto result = op.evaluate({ &yuv }, {}, {}); auto output = result->at(0); @@ -1195,10 +1195,10 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_4) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_5) { // rank 3 - NDArray expected('c', { 5,3,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, nd4j::DataType::FLOAT32); - NDArray yuv('c', { 5,3,4 }, { 36.628319, 38.600643,-40.624989, 18.231001, -14.822637, -2.479566, -8.965780, 2.223851, -16.561626,-96.205162,-52.255379,-36.527435,-51.546139,2.234915, 20.914114, 8.785358, 32.552223, -3.356598, 9.069552, 1.393482,36.029255, 4.824605,-9.972263,11.058715, 15.947105, 55.283543, 36.845627, -29.750486,0.887228, 6.534475, -21.794132,34.155693, -89.929497,39.562351, 27.276817,31.359871, 8.149521, 13.673355, 1.104303, 68.774300, 2.236881, 13.216944, -3.555702,-3.225931,3.063015, -36.134724,58.302204, 8.477802, 38.695396,27.181587, -14.157411,7.157054, 11.714512, 22.148155, 11.580557, -27.204905,7.120562, 21.992094, 2.406748, -6.265247, }, nd4j::DataType::FLOAT32); + NDArray expected('c', { 5,3,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 
6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, sd::DataType::FLOAT32); + NDArray yuv('c', { 5,3,4 }, { 36.628319, 38.600643,-40.624989, 18.231001, -14.822637, -2.479566, -8.965780, 2.223851, -16.561626,-96.205162,-52.255379,-36.527435,-51.546139,2.234915, 20.914114, 8.785358, 32.552223, -3.356598, 9.069552, 1.393482,36.029255, 4.824605,-9.972263,11.058715, 15.947105, 55.283543, 36.845627, -29.750486,0.887228, 6.534475, -21.794132,34.155693, -89.929497,39.562351, 27.276817,31.359871, 8.149521, 13.673355, 1.104303, 68.774300, 2.236881, 13.216944, -3.555702,-3.225931,3.063015, -36.134724,58.302204, 8.477802, 38.695396,27.181587, -14.157411,7.157054, 11.714512, 22.148155, 11.580557, -27.204905,7.120562, 21.992094, 2.406748, -6.265247, }, sd::DataType::FLOAT32); - nd4j::ops::yuv_to_rgb op; + sd::ops::yuv_to_rgb op; auto result = op.evaluate({ &yuv }, {}, { 1 }); auto output = result->at(0); @@ -1211,9 +1211,9 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_5) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_6) { // rank 3 - NDArray yuv('c', { 3,5,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 
7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, nd4j::DataType::FLOAT32); + NDArray yuv('c', { 3,5,4 }, { 1.7750e+01f, -7.1062e+01f, -1.0019e+02f,-2.3406e+01f, 5.2094e+01f, 9.5438e+01f, -6.7461e+00f, 3.8562e+01f, 6.5078e+00f,3.3562e+01f, -5.8844e+01f, 2.2750e+01f, -1.0477e+01f, 7.7344e+00f, 9.5469e+00f,2.1391e+01f, -8.5312e+01f, 7.5830e-01f,2.3125e+01f, 1.8145e+00f, 1.4602e+01f,-4.5859e+00f, 3.9344e+01f, 1.1617e+01f,-8.6562e+01f, 1.0038e+02f, 6.7938e+01f,5.9961e+00f, 6.7812e+01f, 2.9734e+01f,2.9609e+01f, -6.1438e+01f, 1.7750e+01f,6.8562e+01f, -7.4414e+00f, 3.9656e+01f,1.1641e+01f, -2.7516e+01f, 6.7562e+01f,7.8438e+01f, 5.4883e+00f, 2.9438e+01f,-3.1344e+01f, 6.5125e+01f, 1.2695e+01f,4.0531e+01f, -6.1211e+00f, 6.2219e+01f,4.6812e+01f, 5.2250e+01f, -1.1414e+01f,1.5404e-02f, 2.9938e+01f, 5.6719e+00f,-2.0125e+01f, 2.1531e+01f, 6.2500e+01f,7.2188e+01f, 9.3750e+00f, -4.8125e+01f }, sd::DataType::FLOAT32); try { - nd4j::ops::yuv_to_rgb op; + sd::ops::yuv_to_rgb op; auto result = op.evaluate({ &yuv }, {}, {}); ASSERT_EQ(Status::THROW(), result->status()); delete result; @@ -1226,10 +1226,10 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_6) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_7) { // rank 3 - NDArray expected('f', { 2, 2, 3 }, { 1.7750e+01f,-7.1062e+01f, -1.0019e+02f, -2.3406e+01f,5.2094e+01f,9.5438e+01f, -6.7461e+00f,3.8562e+01f, 6.5078e+00f, 3.3562e+01f,-5.8844e+01f,2.2750e+01f }, 
nd4j::DataType::FLOAT32); - NDArray yuv('f', { 2,2,3 }, { 36.628319, 38.600643, -40.624989, 18.231001, -14.822637, -2.479566, -8.965780, 2.223851, -16.561626, -96.205162, -52.255379, -36.527435 }, nd4j::DataType::FLOAT32); + NDArray expected('f', { 2, 2, 3 }, { 1.7750e+01f,-7.1062e+01f, -1.0019e+02f, -2.3406e+01f,5.2094e+01f,9.5438e+01f, -6.7461e+00f,3.8562e+01f, 6.5078e+00f, 3.3562e+01f,-5.8844e+01f,2.2750e+01f }, sd::DataType::FLOAT32); + NDArray yuv('f', { 2,2,3 }, { 36.628319, 38.600643, -40.624989, 18.231001, -14.822637, -2.479566, -8.965780, 2.223851, -16.561626, -96.205162, -52.255379, -36.527435 }, sd::DataType::FLOAT32); - nd4j::ops::yuv_to_rgb op; + sd::ops::yuv_to_rgb op; auto result = op.evaluate({ &yuv }, {}, {}); auto output = result->at(0); @@ -1245,17 +1245,17 @@ TEST_F(DeclarableOpsTests15, test_yuv_to_rgb_7) { TEST_F(DeclarableOpsTests15, Pow_BP_Test1) { // same shape - NDArray x('c', { 2,2,2 }, { 4,3,2,5,7,8,-9,-12 }, nd4j::DataType::FLOAT32); - NDArray y('c', { 2,2,2 }, { 2,3,-2,4,-1,-4,10,8 }, nd4j::DataType::FLOAT32); + NDArray x('c', { 2,2,2 }, { 4,3,2,5,7,8,-9,-12 }, sd::DataType::FLOAT32); + NDArray y('c', { 2,2,2 }, { 2,3,-2,4,-1,-4,10,8 }, sd::DataType::FLOAT32); - NDArray dLdz('c', { 2,2,2 }, nd4j::DataType::FLOAT32); - NDArray dLdxExp('c', { 2,2,2 }, { 8, 27, -0.25, 500, -0.0204082, -0.000122, -3.87420e+09, -2.86654e+08 }, nd4j::DataType::FLOAT32); - NDArray dLdyExp('c', { 2,2,2 }, { 22.18071, 29.66253, 0.17329, 1005.89874, 0.27799, 0.00051, 0, 0 }, nd4j::DataType::FLOAT32); + NDArray dLdz('c', { 2,2,2 }, sd::DataType::FLOAT32); + NDArray dLdxExp('c', { 2,2,2 }, { 8, 27, -0.25, 500, -0.0204082, -0.000122, -3.87420e+09, -2.86654e+08 }, sd::DataType::FLOAT32); + NDArray dLdyExp('c', { 2,2,2 }, { 22.18071, 29.66253, 0.17329, 1005.89874, 0.27799, 0.00051, 0, 0 }, sd::DataType::FLOAT32); dLdz.assign(1.0); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto results = op.evaluate({ &x, &y, &dLdz }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, 
results->status()); @@ -1273,18 +1273,18 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test1) { TEST_F(DeclarableOpsTests15, Pow_BP_Test2) { - NDArray x('c', { 1,2,3 }, nd4j::DataType::FLOAT32); - NDArray y('c', { 3,2,1 }, nd4j::DataType::FLOAT32); - NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32); + NDArray x('c', { 1,2,3 }, sd::DataType::FLOAT32); + NDArray y('c', { 3,2,1 }, sd::DataType::FLOAT32); + NDArray dLdz('c', { 3,2,3 }, sd::DataType::FLOAT32); - NDArray dLdxExp('c', { 1,2,3 }, { 16.8, 19.2, 21.6, 24., 26.4, 28.8 }, nd4j::DataType::FLOAT32); - NDArray dLdyExp('c', { 3,2,1 }, { 13.30843, 33.27106, 53.2337, 73.19634, 93.15898, 113.12162 }, nd4j::DataType::FLOAT32); + NDArray dLdxExp('c', { 1,2,3 }, { 16.8, 19.2, 21.6, 24., 26.4, 28.8 }, sd::DataType::FLOAT32); + NDArray dLdyExp('c', { 3,2,1 }, { 13.30843, 33.27106, 53.2337, 73.19634, 93.15898, 113.12162 }, sd::DataType::FLOAT32); x.assign(4.0); y.assign(2.0); dLdz.linspace(0.1, 0.1); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto results = op.evaluate({ &x, &y, &dLdz }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1303,18 +1303,18 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test2) { TEST_F(DeclarableOpsTests15, Pow_BP_Test3) { // y - same shape as dLdz - NDArray xY('c', { 1,2,3 }, nd4j::DataType::FLOAT32); - NDArray yY('c', { 3,2,3 }, nd4j::DataType::FLOAT32); + NDArray xY('c', { 1,2,3 }, sd::DataType::FLOAT32); + NDArray yY('c', { 3,2,3 }, sd::DataType::FLOAT32); - NDArray dLdxExpY('c', { 1,2,3 }, { 16.8, 19.2, 21.6, 24. , 26.4, 28.8 }, nd4j::DataType::FLOAT32); - NDArray dLdyExpY('c', { 3,2,3 }, { 2.21807, 4.43614, 6.65421, 8.87228, 11.09035, 13.30843, 15.5265 , 17.74457, 19.96264, 22.18071, 24.39878, 26.61685, 28.83492, 31.05299, 33.27106, 35.48914, 37.70721, 39.92528 }, nd4j::DataType::FLOAT32); - NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32); + NDArray dLdxExpY('c', { 1,2,3 }, { 16.8, 19.2, 21.6, 24. 
, 26.4, 28.8 }, sd::DataType::FLOAT32); + NDArray dLdyExpY('c', { 3,2,3 }, { 2.21807, 4.43614, 6.65421, 8.87228, 11.09035, 13.30843, 15.5265 , 17.74457, 19.96264, 22.18071, 24.39878, 26.61685, 28.83492, 31.05299, 33.27106, 35.48914, 37.70721, 39.92528 }, sd::DataType::FLOAT32); + NDArray dLdz('c', { 3,2,3 }, sd::DataType::FLOAT32); xY.assign(4.0); yY.assign(2.0); dLdz.linspace(0.1, 0.1); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto resultsY = op.evaluate({ &xY, &yY, &dLdz }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, resultsY->status()); @@ -1333,16 +1333,16 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test3) { TEST_F(DeclarableOpsTests15, Pow_BP_Test4) { // x - same shape ad dLdz - NDArray yX('c', { 1,2,3 }, nd4j::DataType::FLOAT32); - NDArray xX('c', { 3,2,3 }, nd4j::DataType::FLOAT32); + NDArray yX('c', { 1,2,3 }, sd::DataType::FLOAT32); + NDArray xX('c', { 3,2,3 }, sd::DataType::FLOAT32); - NDArray dLdxExpX('c', { 3,2,3 }, { 3.2, 6.4, 9.6, 12.8, 16. , 19.2, 22.4, 25.6, 28.8, 32. , 35.2, 38.4, 41.6, 44.8, 48., 51.2, 54.4, 57.6 }, nd4j::DataType::FLOAT32); - NDArray dLdyExpX('c', { 1,2,3 }, { 23.28975, 26.61685, 29.94396, 33.27106, 36.59817, 39.92528 }, nd4j::DataType::FLOAT32); + NDArray dLdxExpX('c', { 3,2,3 }, { 3.2, 6.4, 9.6, 12.8, 16. , 19.2, 22.4, 25.6, 28.8, 32. 
, 35.2, 38.4, 41.6, 44.8, 48., 51.2, 54.4, 57.6 }, sd::DataType::FLOAT32); + NDArray dLdyExpX('c', { 1,2,3 }, { 23.28975, 26.61685, 29.94396, 33.27106, 36.59817, 39.92528 }, sd::DataType::FLOAT32); - NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32); + NDArray dLdz('c', { 3,2,3 }, sd::DataType::FLOAT32); dLdz.linspace(0.1, 0.1); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; xX.assign(2.0); yX.assign(4.0); @@ -1365,11 +1365,11 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test4) { TEST_F(DeclarableOpsTests15, Pow_BP_Test5) { // both single array - NDArray xConst('c', { 1 }, nd4j::DataType::FLOAT32); - NDArray yConst('c', { 1 }, nd4j::DataType::FLOAT32); - NDArray dLdz('c', { 1 }, nd4j::DataType::FLOAT32); - NDArray dLdxExp('c', { 1 }, nd4j::DataType::FLOAT32); - NDArray dLdyExp('c', { 1 }, nd4j::DataType::FLOAT32); + NDArray xConst('c', { 1 }, sd::DataType::FLOAT32); + NDArray yConst('c', { 1 }, sd::DataType::FLOAT32); + NDArray dLdz('c', { 1 }, sd::DataType::FLOAT32); + NDArray dLdxExp('c', { 1 }, sd::DataType::FLOAT32); + NDArray dLdyExp('c', { 1 }, sd::DataType::FLOAT32); xConst.assign(3.0); yConst.assign(4.0); @@ -1378,7 +1378,7 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test5) { dLdxExp.assign(4.0 * pow(3, 3)); dLdyExp.assign(pow(3, 4) * log(3)); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto results = op.evaluate({ &xConst, &yConst, &dLdz }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1397,18 +1397,18 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test5) { TEST_F(DeclarableOpsTests15, Pow_BP_Test6) { // x single array - NDArray xConst('c', { 1 }, nd4j::DataType::FLOAT32); - NDArray y('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32); - NDArray dLdzC('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32); + NDArray xConst('c', { 1 }, sd::DataType::FLOAT32); + NDArray y('c', { 2, 2, 2 }, sd::DataType::FLOAT32); + NDArray dLdzC('c', { 2, 2, 2 }, sd::DataType::FLOAT32); xConst.assign(2.0); y.assign(4.0); dLdzC.linspace(0.1, 0.1); - NDArray dLdxExpXC('c', { 1 }, std::vector{ 
115.2 }, nd4j::DataType::FLOAT32); - NDArray dLdyExpXC('c', { 2, 2, 2 }, { 1.10904, 2.21807, 3.32711, 4.43614, 5.54518, 6.65421, 7.76325, 8.87228 }, nd4j::DataType::FLOAT32); + NDArray dLdxExpXC('c', { 1 }, std::vector{ 115.2 }, sd::DataType::FLOAT32); + NDArray dLdyExpXC('c', { 2, 2, 2 }, { 1.10904, 2.21807, 3.32711, 4.43614, 5.54518, 6.65421, 7.76325, 8.87228 }, sd::DataType::FLOAT32); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto resultsXC = op.evaluate({ &xConst, &y, &dLdzC }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, resultsXC->status()); @@ -1427,17 +1427,17 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test7) { // Y - scalar auto Y = NDArrayFactory::create(2.f); - NDArray x('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32); - NDArray dLdzC('c', { 2, 2, 2 }, nd4j::DataType::FLOAT32); + NDArray x('c', { 2, 2, 2 }, sd::DataType::FLOAT32); + NDArray dLdzC('c', { 2, 2, 2 }, sd::DataType::FLOAT32); dLdzC.linspace(0.1, 0.1); x = 4.f; - NDArray dLdxExpYs('c', { 2, 2, 2 }, { 0.8, 1.6, 2.4, 3.2, 4., 4.8, 5.6, 6.4 }, nd4j::DataType::FLOAT32); + NDArray dLdxExpYs('c', { 2, 2, 2 }, { 0.8, 1.6, 2.4, 3.2, 4., 4.8, 5.6, 6.4 }, sd::DataType::FLOAT32); auto dLdyExpYs = NDArrayFactory::create(79.85056f); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto resultsYs = op.evaluate({ &x, &Y, &dLdzC }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, resultsYs->status()); @@ -1463,7 +1463,7 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test8) { NDArray dLdyExp = NDArrayFactory::create(pow(4.f, 2.f) * log(4.f) * 0.1f); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto results = op.evaluate({ &X, &Y, &dLdz }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1481,14 +1481,14 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test8) { TEST_F(DeclarableOpsTests15, Pow_BP_Test9) { - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; // diff shapes - NDArray x('c', { 3,2,1 }, nd4j::DataType::FLOAT32); - NDArray y('c', { 1,2,3 }, nd4j::DataType::FLOAT32); - NDArray dLdz('c', { 3,2,3 }, nd4j::DataType::FLOAT32); + NDArray x('c', { 
3,2,1 }, sd::DataType::FLOAT32); + NDArray y('c', { 1,2,3 }, sd::DataType::FLOAT32); + NDArray dLdz('c', { 3,2,3 }, sd::DataType::FLOAT32); - NDArray dLdxExp('c', { 3,2,1 }, { 4.8, 12., 19.2, 26.4, 33.6, 40.8 }, nd4j::DataType::FLOAT32); - NDArray dLdyExp('c', { 1,2,3 }, { 46.57949, 53.2337 , 59.88792, 66.54213, 73.19634, 79.85056 }, nd4j::DataType::FLOAT32); + NDArray dLdxExp('c', { 3,2,1 }, { 4.8, 12., 19.2, 26.4, 33.6, 40.8 }, sd::DataType::FLOAT32); + NDArray dLdyExp('c', { 1,2,3 }, { 46.57949, 53.2337 , 59.88792, 66.54213, 73.19634, 79.85056 }, sd::DataType::FLOAT32); x.assign(4.0); y.assign(2.0); @@ -1511,18 +1511,18 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test9) { TEST_F(DeclarableOpsTests15, Pow_BP_Test10) { // diff shapes broadcastable - NDArray yB('c', { 1,2,3,1 }, nd4j::DataType::FLOAT32); - NDArray xB('c', { 2,3,1 }, nd4j::DataType::FLOAT32); + NDArray yB('c', { 1,2,3,1 }, sd::DataType::FLOAT32); + NDArray xB('c', { 2,3,1 }, sd::DataType::FLOAT32); - NDArray dLdyExpB('c', { 1,2,3,1 }, { 2.21807, 4.43614, 6.65421, 8.87228, 11.09035, 13.30843 }, nd4j::DataType::FLOAT32); - NDArray dLdxExpB('c', { 2,3,1 }, { 0.8, 1.6, 2.4, 3.2, 4., 4.8 }, nd4j::DataType::FLOAT32); - NDArray dLdzB('c', { 1,2,3,1 }, nd4j::DataType::FLOAT32); + NDArray dLdyExpB('c', { 1,2,3,1 }, { 2.21807, 4.43614, 6.65421, 8.87228, 11.09035, 13.30843 }, sd::DataType::FLOAT32); + NDArray dLdxExpB('c', { 2,3,1 }, { 0.8, 1.6, 2.4, 3.2, 4., 4.8 }, sd::DataType::FLOAT32); + NDArray dLdzB('c', { 1,2,3,1 }, sd::DataType::FLOAT32); dLdzB.linspace(0.1, 0.1); xB.assign(4.0); yB.assign(2.0); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto resultsB = op.evaluate({ &xB, &yB, &dLdzB }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, resultsB->status()); @@ -1545,15 +1545,15 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test11) { return; #endif - NDArray xB('c', { 3,2,1 }, { .4, 3, 5, .8, -9, -12 }, nd4j::DataType::FLOAT32); - NDArray yB('c', { 1,2,3 }, { 3, -2, .4, -4, 10, .8 }, nd4j::DataType::FLOAT32); + NDArray xB('c', 
{ 3,2,1 }, { .4, 3, 5, .8, -9, -12 }, sd::DataType::FLOAT32); + NDArray yB('c', { 1,2,3 }, { 3, -2, .4, -4, 10, .8 }, sd::DataType::FLOAT32); - NDArray dLdxExpB('c', { 3,2,1 }, { -5.994056, 39366.191406, 7.508829, -2.223537, -std::numeric_limits::quiet_NaN(), -std::numeric_limits::quiet_NaN() }, nd4j::DataType::FLOAT32); - NDArray dLdyExpB('c', { 1,2,3 }, { 20.11211, -1.119612, -std::numeric_limits::quiet_NaN(), -0.1076, 12974.389648, -std::numeric_limits::quiet_NaN() }, nd4j::DataType::FLOAT32); + NDArray dLdxExpB('c', { 3,2,1 }, { -5.994056, 39366.191406, 7.508829, -2.223537, -std::numeric_limits::quiet_NaN(), -std::numeric_limits::quiet_NaN() }, sd::DataType::FLOAT32); + NDArray dLdyExpB('c', { 1,2,3 }, { 20.11211, -1.119612, -std::numeric_limits::quiet_NaN(), -0.1076, 12974.389648, -std::numeric_limits::quiet_NaN() }, sd::DataType::FLOAT32); - NDArray dLdzB('c', { 3,2,3 }, { .1,.2,.3, .1,.2,.3, .1,.4,.1, .2,.1,.1, .3,.1,.5, .1, .7, .1 }, nd4j::DataType::FLOAT32); + NDArray dLdzB('c', { 3,2,3 }, { .1,.2,.3, .1,.2,.3, .1,.4,.1, .2,.1,.1, .3,.1,.5, .1, .7, .1 }, sd::DataType::FLOAT32); - nd4j::ops::Pow_bp op; + sd::ops::Pow_bp op; auto resultsB = op.evaluate({ &xB, &yB, &dLdzB }, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, resultsB->status()); @@ -1562,13 +1562,13 @@ TEST_F(DeclarableOpsTests15, Pow_BP_Test11) { ASSERT_TRUE(dLdxExpB.isSameShape(dLdxB)); for (int i = 0; i < dLdxB->lengthOf(); ++i) { - if (!nd4j::math::nd4j_isnan(dLdxB->e(i)) && !nd4j::math::nd4j_isnan(dLdxExpB.e(i))) + if (!sd::math::nd4j_isnan(dLdxB->e(i)) && !sd::math::nd4j_isnan(dLdxExpB.e(i))) ASSERT_NEAR(dLdxB->e(i), dLdxExpB.e(i), 0.00001); } ASSERT_TRUE(dLdyExpB.isSameShape(dLdyB)); for (int i = 0; i < dLdyB->lengthOf(); ++i) { - if (!nd4j::math::nd4j_isnan(dLdyB->e(i)) && !nd4j::math::nd4j_isnan(dLdyExpB.e(i))) + if (!sd::math::nd4j_isnan(dLdyB->e(i)) && !sd::math::nd4j_isnan(dLdyExpB.e(i))) ASSERT_NEAR(dLdyB->e(i), dLdyExpB.e(i), 0.00001); } @@ -1577,14 +1577,14 @@ TEST_F(DeclarableOpsTests15, 
Pow_BP_Test11) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP1) { - NDArray A('c', { 1, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 1, 2, 4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.1 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 1, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6 }, sd::DataType::FLOAT32); + NDArray B('c', { 1, 2, 4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.1 }, sd::DataType::FLOAT32); - NDArray dLdA('c', { 1, 2, 3 }, { 3.3, 8.5, 13.36, 3.7, 9.54, 15. }, nd4j::DataType::FLOAT32); - NDArray dLdB('c', { 1, 2, 4 }, { 3.38, 4.04, 4.7, 5.13, 3.83, 4.58, 5.33, 5.82 }, nd4j::DataType::FLOAT32); + NDArray dLdA('c', { 1, 2, 3 }, { 3.3, 8.5, 13.36, 3.7, 9.54, 15. 
}, sd::DataType::FLOAT32); + NDArray dLdB('c', { 1, 2, 4 }, { 3.38, 4.04, 4.7, 5.13, 3.83, 4.58, 5.33, 5.82 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,0,1, 2,0,1 }, {}); @@ -1604,11 +1604,11 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP2) { - NDArray A('c', { 1, 2, 3 }, { 2,2,2, 2,2,2 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 1, 2, 3 }, { 3,3,3,3, 3,3 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 1 }, { 1 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 1, 2, 3 }, { 2,2,2, 2,2,2 }, sd::DataType::FLOAT32); + NDArray B('c', { 1, 2, 3 }, { 3,3,3,3, 3,3 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 1 }, { 1 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); @@ -1627,14 +1627,14 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP2) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP3) { - NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 4, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, sd::DataType::FLOAT32); + NDArray B('c', { 4, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, 
sd::DataType::FLOAT32); - NDArray dA('c', { 3, 2, 2 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. }, nd4j::DataType::FLOAT32); - NDArray dB('c', { 4, 2, 2 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, nd4j::DataType::FLOAT32); + NDArray dA('c', { 3, 2, 2 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. }, sd::DataType::FLOAT32); + NDArray dB('c', { 4, 2, 2 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); @@ -1654,14 +1654,14 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP3) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP4) { - NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 2 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, sd::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 2 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, sd::DataType::FLOAT32); - NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); - NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84 , 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, sd::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84 , 3.768, 215.6, 28.2 }, 
sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); @@ -1681,14 +1681,14 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP4) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP5) { - NDArray A('c', { 3, 4, 1, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 2, 4, 1, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 3, 4, 1, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, sd::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, sd::DataType::FLOAT32); - NDArray dLdA('c', { 3, 4, 1, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); - NDArray dLdB('c', { 2, 4, 1, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + NDArray dLdA('c', { 3, 4, 1, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, sd::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); @@ -1708,12 +1708,12 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP5) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP6) { - NDArray A('c', { 2, 2, 2 }, { 2,2, 2,2, 2,2, 2,2 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 2, 2, 2 }, { 3,3, 3,3, 3,3, 3,3 }, nd4j::DataType::FLOAT32); + NDArray A('c', 
{ 2, 2, 2 }, { 2,2, 2,2, 2,2, 2,2 }, sd::DataType::FLOAT32); + NDArray B('c', { 2, 2, 2 }, { 3,3, 3,3, 3,3, 3,3 }, sd::DataType::FLOAT32); auto dLdC = NDArrayFactory::create(1.f); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 3,0,1,2, 3,0,1,2 }, {}); ASSERT_EQ(ND4J_STATUS_OK, resultsBP->status()); @@ -1732,14 +1732,14 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP6) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP7) { - NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 3, 4, 1 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, sd::DataType::FLOAT32); + NDArray B('c', { 2, 4, 1 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 2, 1 }, { 1.1, 1.2, 1.3, 1.4, 1.5, 1.6 }, sd::DataType::FLOAT32); - NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, nd4j::DataType::FLOAT32); - NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, nd4j::DataType::FLOAT32); + NDArray dLdA('c', { 3, 4, 1 }, { 7.16, 15.74, 22.15, 21.98, 8.42, 18.58, 25.85, 25.96, 9.68, 21.42, 29.55, 29.94 }, sd::DataType::FLOAT32); + NDArray dLdB('c', { 2, 4, 1 }, { 30.49, 3.456, 201.9, 26.1, 32.84, 3.768, 215.6, 28.2 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); @@ -1758,14 +1758,14 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP7) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, 
TestTensorMmul_BP8) { - NDArray A('c', { 1, 1, 4, 3 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 1, 1, 4, 2 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 2 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 1, 1, 4, 3 }, { 0.4, 3, 5, 9, 23, 0.12, 8, 9, 0.1, 0, 124, 3 }, sd::DataType::FLOAT32); + NDArray B('c', { 1, 1, 4, 2 }, { 4, 13, .5, 19, 2.3, 1.2, 18, .9 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 2 }, { 1.1,1.2,1.3,1.4,1.5,1.6 }, sd::DataType::FLOAT32); - NDArray dLdA('c', { 1, 1, 4, 3 }, { 20., 23.4, 26.8, 23.35, 27.25, 31.15, 3.97, 4.67, 5.37, 20.88, 24.66, 28.44 }, nd4j::DataType::FLOAT32); - NDArray dLdB('c', { 1, 1, 4, 2 }, { 11.84, 12.68, 39.98, 43.192, 20.65, 22.36, 165.7, 178.4 }, nd4j::DataType::FLOAT32); + NDArray dLdA('c', { 1, 1, 4, 3 }, { 20., 23.4, 26.8, 23.35, 27.25, 31.15, 3.97, 4.67, 5.37, 20.88, 24.66, 28.44 }, sd::DataType::FLOAT32); + NDArray dLdB('c', { 1, 1, 4, 2 }, { 11.84, 12.68, 39.98, 43.192, 20.65, 22.36, 165.7, 178.4 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 3,0,1,2, 3,0,1,2 }, {}); @@ -1785,14 +1785,14 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP8) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP9) { - NDArray A('c', { 3, 2, 2, 1 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 4, 2, 2 ,1 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 1, 4, 1 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 3, 2, 2, 1 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, sd::DataType::FLOAT32); + NDArray B('c', { 4, 2, 2 ,1 
}, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 1, 4, 1 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, sd::DataType::FLOAT32); - NDArray dA('c', { 3, 2, 2, 1 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. }, nd4j::DataType::FLOAT32); - NDArray dB('c', { 4, 2, 2, 1 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, nd4j::DataType::FLOAT32); + NDArray dA('c', { 3, 2, 2, 1 }, { 3.9, 4., 4.1, 4.2, 9.82, 10.08, 10.34, 10.6, 15.74, 16.16, 16.58, 17. }, sd::DataType::FLOAT32); + NDArray dB('c', { 4, 2, 2, 1 }, { 4.07, 4.22, 4.37, 4.52, 4.82, 5., 5.18, 5.36, 5.57, 5.78, 5.99, 6.2, 6.32, 6.56, 6.8, 7.04 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); @@ -1812,15 +1812,15 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP9) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP10) { - NDArray A('c', { 1, 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 1, 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 1, 3, 1, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 1, 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, sd::DataType::FLOAT32); + NDArray B('c', { 1, 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 1, 3, 1, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, sd::DataType::FLOAT32); - NDArray dA('c', { 1, 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 
18.74 }, nd4j::DataType::FLOAT32); - NDArray dB('c', { 1, 2, 2, 4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, nd4j::DataType::FLOAT32); + NDArray dA('c', { 1, 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 18.74 }, sd::DataType::FLOAT32); + NDArray dB('c', { 1, 2, 2, 4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }, {}); @@ -1840,15 +1840,15 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP10) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP11) { - NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::FLOAT32); - NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, sd::DataType::FLOAT32); + NDArray B('c', { 2, 2 ,4 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, sd::DataType::FLOAT32); + NDArray dLdC('c', { 3, 4 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2 }, sd::DataType::FLOAT32); - NDArray dA('c', { 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 18.74 }, nd4j::DataType::FLOAT32); - NDArray dB('c', { 2, 2, 4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, nd4j::DataType::FLOAT32); + NDArray dA('c', { 2, 2, 3 }, { 3.3, 8.5, 13.7, 3.7, 9.54, 15.38, 4.1, 10.58, 17.06, 4.5, 11.62, 18.74 }, sd::DataType::FLOAT32); + NDArray dB('c', { 2, 2, 
4 }, { 3.38, 4.04, 4.7, 5.36, 3.83, 4.58, 5.33, 6.08, 4.28, 5.12, 5.96, 6.8, 4.73, 5.66, 6.59, 7.52 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 2,0,1, 2,0,1 }, {}); @@ -1868,16 +1868,16 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP11) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP12) { - NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::FLOAT32); - NDArray B('c', { 2, 2 ,3 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, nd4j::DataType::FLOAT32); + NDArray A('c', { 2, 2, 3 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, sd::DataType::FLOAT32); + NDArray B('c', { 2, 2 ,3 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, sd::DataType::FLOAT32); NDArray dLdC('c', { 2, 3, 2, 3 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2., 2.1, 2.2, 2.3, 2.4, - 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::FLOAT32); + 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, sd::DataType::FLOAT32); - NDArray dA('c', { 2, 2, 3 }, { 7.66, 20.26, 32.86, 8.29, 21.97, 35.65, 45.46, 58.06, 70.66, 49.33, 63.01, 76.69 }, nd4j::DataType::FLOAT32); - NDArray dB('c', { 2, 2, 3 }, { 25.86, 27.36, 28.86, 28.74, 30.42, 32.1, 30.36, 31.86, 33.36, 33.78, 35.46, 37.14 }, nd4j::DataType::FLOAT32); + NDArray dA('c', { 2, 2, 3 }, { 7.66, 20.26, 32.86, 8.29, 21.97, 35.65, 45.46, 58.06, 70.66, 49.33, 63.01, 76.69 }, sd::DataType::FLOAT32); + NDArray dB('c', { 2, 2, 3 }, { 25.86, 27.36, 28.86, 28.74, 30.42, 32.1, 30.36, 31.86, 33.36, 33.78, 35.46, 37.14 }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); @@ -1897,16 +1897,16 @@ 
TEST_F(DeclarableOpsTests15, TestTensorMmul_BP12) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP13) { - NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, nd4j::DataType::DOUBLE); - NDArray B('c', { 3, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, nd4j::DataType::DOUBLE); + NDArray A('c', { 3, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2 }, sd::DataType::DOUBLE); + NDArray B('c', { 3, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2 }, sd::DataType::DOUBLE); NDArray dLdC('c', { 3, 2, 3, 2 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2., 2.1, 2.2, 2.3, 2.4, - 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, nd4j::DataType::DOUBLE); + 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, sd::DataType::DOUBLE); - NDArray dA('c', { 3, 2, 2 }, { 7.79, 20.57, 8.21, 21.71, 33.35, 46.13, 35.21, 48.71, 58.91, 71.69, 62.21, 75.71 }, nd4j::DataType::DOUBLE); - NDArray dB('c', { 3, 2, 2 }, { 26.49, 28.02, 28.41, 30.06, 29.55, 31.08, 31.71, 33.36, 32.61, 34.14, 35.01, 36.66 }, nd4j::DataType::DOUBLE); + NDArray dA('c', { 3, 2, 2 }, { 7.79, 20.57, 8.21, 21.71, 33.35, 46.13, 35.21, 48.71, 58.91, 71.69, 62.21, 75.71 }, sd::DataType::DOUBLE); + NDArray dB('c', { 3, 2, 2 }, { 26.49, 28.02, 28.41, 30.06, 29.55, 31.08, 31.71, 33.36, 32.61, 34.14, 35.01, 36.66 }, sd::DataType::DOUBLE); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); @@ -1926,20 +1926,20 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP13) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP14) { - NDArray A('c', { 2, 2, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, 
nd4j::DataType::DOUBLE); + NDArray A('c', { 2, 2, 2, 2 }, { 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3., 3.1, 3.2, 3.3, 3.4, 3.5, 3.6 }, sd::DataType::DOUBLE); - NDArray B('c', { 2, 2, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, nd4j::DataType::DOUBLE); + NDArray B('c', { 2, 2, 2, 2 }, { 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4., 4.1, 4.2, 4.3, 4.4, 4.5, 4.6 }, sd::DataType::DOUBLE); NDArray dLdC('c', { 2, 2, 2, 2, 2, 2 }, { .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, - 1.3, 1.4, 1.5, 1.6 }, nd4j::DataType::DOUBLE); + 1.3, 1.4, 1.5, 1.6 }, sd::DataType::DOUBLE); - NDArray dA('c', { 2, 2, 2, 2 }, { 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24, 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24 }, nd4j::DataType::DOUBLE); - NDArray dB('c', { 2, 2, 2, 2 }, { 10.76, 12.88, 15., 17.12, 12.36, 14.8, 17.24, 19.68, 19.24, 21.36, 23.48, 25.6, 22.12, 24.56, 27., 29.44 }, nd4j::DataType::DOUBLE); + NDArray dA('c', { 2, 2, 2, 2 }, { 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24, 13.88, 37.24, 13.88, 37.24, 15.32, 41.24, 15.32, 41.24 }, sd::DataType::DOUBLE); + NDArray dB('c', { 2, 2, 2, 2 }, { 10.76, 12.88, 15., 17.12, 12.36, 14.8, 17.24, 19.68, 19.24, 21.36, 23.48, 25.6, 22.12, 24.56, 27., 29.44 }, sd::DataType::DOUBLE); - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul_bp op_bp; auto resultsBP = op_bp.evaluate({ &A, &B, &dLdC }, {}, { 1,1, 1,1 }, {}); @@ -1959,15 +1959,15 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP14) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP15) { - NDArray A('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. 
}, nd4j::DataType::FLOAT32); - NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::FLOAT32); + NDArray A('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, sd::DataType::FLOAT32); + NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, sd::DataType::FLOAT32); - NDArray dLdC('f', { 2, 2 }, { 23.0, 24.44, 2.0, 26. }, nd4j::DataType::FLOAT32); + NDArray dLdC('f', { 2, 2 }, { 23.0, 24.44, 2.0, 26. }, sd::DataType::FLOAT32); - NDArray dA('c', { 2, 2, 3 }, { 27., 127., 227., 77., 177., 277., 76.44, 278.20001, 479.96002, 177.32, 379.08001, 580.839966 }, nd4j::DataType::FLOAT32); - NDArray dB('f', { 2, 2, 3 }, { 194.08, 184., 336.4, 268., 241.52, 212., 383.839996, 296., 288.96002, 240., 431.27999, 324. }, nd4j::DataType::FLOAT32); + NDArray dA('c', { 2, 2, 3 }, { 27., 127., 227., 77., 177., 277., 76.44, 278.20001, 479.96002, 177.32, 379.08001, 580.839966 }, sd::DataType::FLOAT32); + NDArray dB('f', { 2, 2, 3 }, { 194.08, 184., 336.4, 268., 241.52, 212., 383.839996, 296., 288.96002, 240., 431.27999, 324. }, sd::DataType::FLOAT32); - nd4j::ops::tensormmul_bp op; + sd::ops::tensormmul_bp op; auto results = op.evaluate({ &A, &B, &dLdC }, {}, { 2,1,2,2,1,2 }); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1986,16 +1986,16 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP15) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP16) { - NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); - NDArray B('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, sd::DataType::DOUBLE); + NDArray B('c', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. 
}, sd::DataType::DOUBLE); - NDArray dLdC('c', { 2, 2 }, nd4j::DataType::DOUBLE); + NDArray dLdC('c', { 2, 2 }, sd::DataType::DOUBLE); const OpArgsHolder argsHolderFF({ &A, &B }, {}, { 2,1,2, 2,1,2 }); const OpArgsHolder argsHolderBP({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }); - nd4j::ops::tensormmul op; - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul op; + sd::ops::tensormmul_bp op_bp; const bool isGradCorrect = GradCheck::checkGrad(op, op_bp, argsHolderFF, argsHolderBP, {1,0}); ASSERT_TRUE(isGradCorrect); @@ -2003,16 +2003,16 @@ TEST_F(DeclarableOpsTests15, TestTensorMmul_BP16) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests15, TestTensorMmul_BP17) { - NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); - NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, nd4j::DataType::DOUBLE); + NDArray A('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. }, sd::DataType::DOUBLE); + NDArray B('f', { 2, 2, 3 }, { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12. 
}, sd::DataType::DOUBLE); - NDArray dLdC('c', { 2, 2 }, nd4j::DataType::DOUBLE); + NDArray dLdC('c', { 2, 2 }, sd::DataType::DOUBLE); const OpArgsHolder argsHolderFF({ &A, &B }, {}, { 2,1,2, 2,1,2 }); const OpArgsHolder argsHolderBP({ &A, &B, &dLdC }, {}, { 2,1,2, 2,1,2 }); - nd4j::ops::tensormmul op; - nd4j::ops::tensormmul_bp op_bp; + sd::ops::tensormmul op; + sd::ops::tensormmul_bp op_bp; const bool isGradCorrect = GradCheck::checkGrad(op, op_bp, argsHolderFF, argsHolderBP, { 1,0 }); ASSERT_TRUE(isGradCorrect); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index a85772cec..800c9cbf8 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -21,13 +21,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests16 : public testing::Test { @@ -45,7 +45,7 @@ TEST_F(DeclarableOpsTests16, scatter_upd_1) { auto w = NDArrayFactory::create(3.0f); auto e = NDArrayFactory::create('c', { 3 }, { 3.f, 1.f, 1.f }); - nd4j::ops::scatter_upd op; + sd::ops::scatter_upd op; auto result = op.evaluate({ &x, &y, &w }); ASSERT_EQ(Status::OK(), result->status()); @@ -58,14 +58,14 @@ TEST_F(DeclarableOpsTests16, scatter_upd_1) { TEST_F(DeclarableOpsTests16, scatter_upd_2) { - NDArray x('c', { 10, 3 }, nd4j::DataType::FLOAT32); - NDArray indices('c', { 2 }, { 2,5 }, nd4j::DataType::INT32); - NDArray updates('c', { 2, 3 }, { 100,101,102, 200,201,202 }, nd4j::DataType::FLOAT32); - NDArray e('c', { 10, 3 }, { 1,2,3, 4,5,6, 100,101,102, 10,11,12, 13,14,15, 200,201,202, 19,20,21, 22,23,24, 25,26,27, 28,29,30 }, nd4j::DataType::FLOAT32); + NDArray x('c', { 10, 3 }, sd::DataType::FLOAT32); + NDArray indices('c', { 2 }, { 2,5 }, sd::DataType::INT32); + NDArray updates('c', { 2, 3 }, { 100,101,102, 200,201,202 }, sd::DataType::FLOAT32); + 
NDArray e('c', { 10, 3 }, { 1,2,3, 4,5,6, 100,101,102, 10,11,12, 13,14,15, 200,201,202, 19,20,21, 22,23,24, 25,26,27, 28,29,30 }, sd::DataType::FLOAT32); x.linspace(1); - nd4j::ops::scatter_upd op; + sd::ops::scatter_upd op; auto result = op.evaluate({ &x, &indices, &updates }); ASSERT_EQ(Status::OK(), result->status()); @@ -78,12 +78,12 @@ TEST_F(DeclarableOpsTests16, scatter_upd_2) { TEST_F(DeclarableOpsTests16, scatter_upd_3) { - NDArray x('c', { 10, 3 }, nd4j::DataType::FLOAT32); - NDArray indices('c', { 2 }, { 20,5 }, nd4j::DataType::INT32); - NDArray updates('c', { 2, 3 }, { 100,101,102, 200,201,202 }, nd4j::DataType::FLOAT32); - NDArray output('c', { 10, 3 }, nd4j::DataType::FLOAT32); + NDArray x('c', { 10, 3 }, sd::DataType::FLOAT32); + NDArray indices('c', { 2 }, { 20,5 }, sd::DataType::INT32); + NDArray updates('c', { 2, 3 }, { 100,101,102, 200,201,202 }, sd::DataType::FLOAT32); + NDArray output('c', { 10, 3 }, sd::DataType::FLOAT32); - nd4j::ops::scatter_upd op; + sd::ops::scatter_upd op; ASSERT_ANY_THROW(op.execute({ &x, &indices, &updates }, { &output }, {}, {}, { true, true })); } @@ -92,7 +92,7 @@ TEST_F(DeclarableOpsTests16, test_size_dtype_1) { auto z = NDArrayFactory::create(0.0f); auto e = NDArrayFactory::create(3.0f); - nd4j::ops::size op; + sd::ops::size op; auto status = op.execute({ &x }, { &z }, {}, {}, {}); ASSERT_EQ(Status::OK(), status); @@ -102,7 +102,7 @@ TEST_F(DeclarableOpsTests16, test_size_dtype_1) { TEST_F(DeclarableOpsTests16, test_empty_noop_1) { auto z = NDArrayFactory::empty(); - nd4j::ops::noop op; + sd::ops::noop op; auto status = op.execute({}, { &z }, {}, {}, {}); ASSERT_EQ(Status::OK(), status); } @@ -113,7 +113,7 @@ TEST_F(DeclarableOpsTests16, test_empty_noop_2) { Context ctx(1); ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j::ops::noop op; + sd::ops::noop op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); @@ -123,7 +123,7 @@ 
TEST_F(DeclarableOpsTests16, test_svd_1) { auto x = NDArrayFactory::create('c', { 3, 3 }, { 0.7787856f, 0.80119777f, 0.72437465f, 0.23089433f, 0.72714126f, 0.18039072f,0.50563407f, 0.89252293f, 0.5461209f }); auto z = NDArrayFactory::create('c', { 3 }); - nd4j::ops::svd op; + sd::ops::svd op; auto status = op.execute({ &x }, { &z }, {}, { 0, 0, 16 }, {}); ASSERT_EQ(Status::OK(), status); @@ -134,7 +134,7 @@ TEST_F(DeclarableOpsTests16, test_hamming_distance_1) { auto y = NDArrayFactory::create({ 8723, 8723, 8723 }); auto e = NDArrayFactory::create(18); - nd4j::ops::bits_hamming_distance op; + sd::ops::bits_hamming_distance op; auto result = op.evaluate({ &x, &y }); ASSERT_EQ(Status::OK(), result->status()); @@ -156,7 +156,7 @@ TEST_F(DeclarableOpsTests16, test_knn_mindistance_1) { low.linspace(1.0); high.linspace(1.0); - nd4j::ops::knn_mindistance op; + sd::ops::knn_mindistance op; auto result = op.execute({ &input, &low, &high }, { &output }, {}, {}, {}); ASSERT_EQ(Status::OK(), result); } @@ -165,7 +165,7 @@ TEST_F(DeclarableOpsTests16, test_empty_cast_1) { auto x = NDArrayFactory::create('c', { 1, 0, 2 }); auto e = NDArrayFactory::create('c', { 1, 0, 2 }); - nd4j::ops::cast op; + sd::ops::cast op; auto result = op.evaluate({&x}, {10}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(e, *result->at(0)); @@ -174,7 +174,7 @@ TEST_F(DeclarableOpsTests16, test_empty_cast_1) { } TEST_F(DeclarableOpsTests16, test_range_1) { - nd4j::ops::range op; + sd::ops::range op; auto z = NDArrayFactory::create('c', { 200 }); Context ctx(1); @@ -186,7 +186,7 @@ TEST_F(DeclarableOpsTests16, test_range_1) { } TEST_F(DeclarableOpsTests16, test_range_2) { - nd4j::ops::range op; + sd::ops::range op; auto z = NDArrayFactory::create('c', { 200 }); double tArgs[] = { -1.0, 1.0, 0.01 }; @@ -226,7 +226,7 @@ TEST_F(DeclarableOpsTests16, test_reverse_1) { listE.at(e)->assign(rowReversed); } - nd4j::ops::reverse op; + sd::ops::reverse op; Nd4jLong axis = 1; auto status = op.execute({ 
&array }, { &reversed }, {}, { axis }, {}); ASSERT_EQ(Status::OK(), status); @@ -283,7 +283,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_hsv_1) { ctx.setInputArray(0, &rgbs); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_hsv op; + sd::ops::rgb_to_hsv op; auto status = op.execute(&ctx); #if 0 //visual check @@ -337,7 +337,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_hsv_2) { ctx.setInputArray(0, &rgbs); ctx.setOutputArray(0, &actual); ctx.setIArguments({ 1 }); - nd4j::ops::rgb_to_hsv op; + sd::ops::rgb_to_hsv op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -364,7 +364,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_hsv_3) { ctx.setInputArray(0, &rgbs); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_hsv op; + sd::ops::rgb_to_hsv op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -391,7 +391,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_hsv_4) { ctx.setInputArray(0, &rgbs); ctx.setOutputArray(0, &actual); ctx.setIArguments({ 0 }); - nd4j::ops::rgb_to_hsv op; + sd::ops::rgb_to_hsv op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -413,7 +413,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_hsv_5) { ctx.setInputArray(0, &rgbs); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_hsv op; + sd::ops::rgb_to_hsv op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -449,7 +449,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_hsv_6) { Context ctx(1); ctx.setInputArray(0, &subArrRgbs); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_hsv op; + sd::ops::rgb_to_hsv op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -495,7 +495,7 @@ TEST_F(DeclarableOpsTests16, test_hsv_to_rgb_1) { ctx.setInputArray(0, &hsvs); ctx.setOutputArray(0, &actual); - nd4j::ops::hsv_to_rgb op; + sd::ops::hsv_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -538,7 +538,7 @@ TEST_F(DeclarableOpsTests16, test_hsv_to_rgb_2) { ctx.setInputArray(0, &hsvs); 
ctx.setOutputArray(0, &actual); ctx.setIArguments({ 1 }); - nd4j::ops::hsv_to_rgb op; + sd::ops::hsv_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -563,7 +563,7 @@ TEST_F(DeclarableOpsTests16, test_hsv_to_rgb_3) { ctx.setInputArray(0, &hsvs); ctx.setOutputArray(0, &actual); - nd4j::ops::hsv_to_rgb op; + sd::ops::hsv_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -589,7 +589,7 @@ TEST_F(DeclarableOpsTests16, test_hsv_to_rgb_4) { ctx.setInputArray(0, &hsvs); ctx.setOutputArray(0, &actual); ctx.setIArguments({ 0 }); - nd4j::ops::hsv_to_rgb op; + sd::ops::hsv_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -612,7 +612,7 @@ TEST_F(DeclarableOpsTests16, test_hsv_to_rgb_5) { ctx.setInputArray(0, &hsvs); ctx.setOutputArray(0, &actual); - nd4j::ops::hsv_to_rgb op; + sd::ops::hsv_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -648,7 +648,7 @@ TEST_F(DeclarableOpsTests16, test_hsv_to_rgb_6) { Context ctx(1); ctx.setInputArray(0, &subArrHsvs); ctx.setOutputArray(0, &actual); - nd4j::ops::hsv_to_rgb op; + sd::ops::hsv_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -709,7 +709,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_yiq_1) { ctx.setInputArray(0, &rgb); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_yiq op; + sd::ops::rgb_to_yiq op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -759,7 +759,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_yiq_2) { ctx.setInputArray(0, &rgb); ctx.setOutputArray(0, &actual); ctx.setIArguments({ 1 }); - nd4j::ops::rgb_to_yiq op; + sd::ops::rgb_to_yiq op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -790,7 +790,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_yiq_3) { ctx.setInputArray(0, &rgb); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_yiq op; + sd::ops::rgb_to_yiq op; auto status = op.execute(&ctx); 
ASSERT_EQ(ND4J_STATUS_OK, status); @@ -821,7 +821,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_yiq_4) { ctx.setInputArray(0, &rgb); ctx.setOutputArray(0, &actual); ctx.setIArguments({ 0 }); - nd4j::ops::rgb_to_yiq op; + sd::ops::rgb_to_yiq op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -845,7 +845,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_yiq_5) { ctx.setInputArray(0, &rgbs); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_yiq op; + sd::ops::rgb_to_yiq op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -883,7 +883,7 @@ TEST_F(DeclarableOpsTests16, test_rgb_to_yiq_6) { Context ctx(1); ctx.setInputArray(0, &subArrRgbs); ctx.setOutputArray(0, &actual); - nd4j::ops::rgb_to_yiq op; + sd::ops::rgb_to_yiq op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -929,7 +929,7 @@ TEST_F(DeclarableOpsTests16, test_yiq_to_rgb_1) { ctx.setInputArray(0, &yiqs); ctx.setOutputArray(0, &actual); - nd4j::ops::yiq_to_rgb op; + sd::ops::yiq_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -975,7 +975,7 @@ TEST_F(DeclarableOpsTests16, test_yiq_to_rgb_2) { ctx.setInputArray(0, &yiqs); ctx.setOutputArray(0, &actual); ctx.setIArguments({ 1 }); - nd4j::ops::yiq_to_rgb op; + sd::ops::yiq_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1002,7 +1002,7 @@ TEST_F(DeclarableOpsTests16, test_yiq_to_rgb_3) { ctx.setInputArray(0, &yiqs); ctx.setOutputArray(0, &actual); - nd4j::ops::yiq_to_rgb op; + sd::ops::yiq_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1029,7 +1029,7 @@ TEST_F(DeclarableOpsTests16, test_yiq_to_rgb_4) { ctx.setInputArray(0, &yiqs); ctx.setOutputArray(0, &actual); ctx.setIArguments({ 0 }); - nd4j::ops::yiq_to_rgb op; + sd::ops::yiq_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1053,7 +1053,7 @@ TEST_F(DeclarableOpsTests16, test_yiq_to_rgb_5) { ctx.setInputArray(0, 
&yiqs); ctx.setOutputArray(0, &actual); - nd4j::ops::yiq_to_rgb op; + sd::ops::yiq_to_rgb op; auto status = op.execute(&ctx); #if 0 actual.printBuffer("actual"); @@ -1091,7 +1091,7 @@ TEST_F(DeclarableOpsTests16, test_yiq_to_rgb_6) { Context ctx(1); ctx.setInputArray(0, &subArrYiqs); ctx.setOutputArray(0, &actual); - nd4j::ops::yiq_to_rgb op; + sd::ops::yiq_to_rgb op; auto status = op.execute(&ctx); ASSERT_EQ(ND4J_STATUS_OK, status); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests17.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests17.cpp index 497475262..d0fda960c 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests17.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests17.cpp @@ -21,13 +21,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests17 : public testing::Test { @@ -47,7 +47,7 @@ TEST_F(DeclarableOpsTests17, test_sparse_to_dense_1) { auto exp = NDArrayFactory::create('c', {3, 3}, {1.f,0.f,0.f, 0.f,2.f,0.f, 0.f,0.f,3.f}); - nd4j::ops::compat_sparse_to_dense op; + sd::ops::compat_sparse_to_dense op; auto result = op.evaluate({&ranges, &shape, &values, &def}); ASSERT_EQ(Status::OK(), result->status()); @@ -62,7 +62,7 @@ TEST_F(DeclarableOpsTests17, test_sparse_to_dense_2) { auto exp = NDArrayFactory::string( {3, 3}, {"alpha","d","d", "d","beta","d", "d","d","gamma"}); - nd4j::ops::compat_sparse_to_dense op; + sd::ops::compat_sparse_to_dense op; auto result = op.evaluate({&ranges, &shape, &values, &def}); ASSERT_EQ(Status::OK(), result->status()); @@ -76,7 +76,7 @@ TEST_F(DeclarableOpsTests17, test_compat_string_split_1) { auto exp0 = NDArrayFactory::create({0,0, 0,1, 1,0}); auto exp1 = NDArrayFactory::string( {3}, {"first", "string", "second"}); - nd4j::ops::compat_string_split op; + sd::ops::compat_string_split op; auto result = op.evaluate({&x, &delimiter}); ASSERT_EQ(Status::OK(), result->status()); 
ASSERT_EQ(2, result->size()); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp index 2c7737a31..895ec98b3 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests18.cpp @@ -21,13 +21,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests18 : public testing::Test { @@ -44,8 +44,8 @@ TEST_F(DeclarableOpsTests18, test_bitcast_1) { auto z = NDArrayFactory::create(0); auto e = NDArrayFactory::create(4597464930322771456L); - nd4j::ops::bitcast op; - auto status = op.execute({&x}, {&z}, {}, {(Nd4jLong) nd4j::DataType::INT64}, {}); + sd::ops::bitcast op; + auto status = op.execute({&x}, {&z}, {}, {(Nd4jLong) sd::DataType::INT64}, {}); ASSERT_EQ(Status::OK(), status); ASSERT_EQ(e, z); @@ -56,7 +56,7 @@ TEST_F(DeclarableOpsTests18, test_tanh_1) { auto z = x.ulike(); auto e = NDArrayFactory::create('c', {8}, {0.226028f, -0.226028f, 0.336376f, -0.336376f, 0.564900f, -0.564900f, 1.f, -1.f}); - nd4j::ops::tanh op; + sd::ops::tanh op; op.execute({&x}, {&z}); ASSERT_EQ(e, z); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp index b0a547a7d..09bc17cde 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests19.cpp @@ -21,13 +21,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests19 : public testing::Test { @@ -60,7 +60,7 @@ TEST_F(DeclarableOpsTests19, test_conv1d_bp_1) { auto u = NDArrayFactory::create('c', {3, 2, 3}); auto v = NDArrayFactory::create('c', {2, 3, 6}); - nd4j::ops::conv1d_bp op; + sd::ops::conv1d_bp op; auto result = op.evaluate({&t, &u, &v}, {3, 2, 0, 1, 
2,0}); ASSERT_EQ(Status::OK(), result->status()); @@ -73,7 +73,7 @@ TEST_F(DeclarableOpsTests19, test_squeeze_1) { auto e = NDArrayFactory::create('c', {3, 4}); int axis = 2; - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto status = op.execute({&x}, {&e}, {axis}); ASSERT_EQ(Status::OK(), status); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp index 029a392f7..58e9bf450 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp @@ -17,12 +17,12 @@ #include "testlayers.h" #include #include -#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DeclarableOpsTests2 : public testing::Test { public: @@ -35,11 +35,11 @@ public: //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests2, gather_1) { - NDArray input('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, nd4j::DataType::FLOAT32); - NDArray indices('c', {1,6}, {0,1, 2,2, 1,2}, nd4j::DataType::INT32); - NDArray expected('c', {2,1,6,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 9,10,11,12, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, 21,22,23,24, 17,18,19,20, 21,22,23,24}, nd4j::DataType::FLOAT32); + NDArray input('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, sd::DataType::FLOAT32); + NDArray indices('c', {1,6}, {0,1, 2,2, 1,2}, sd::DataType::INT32); + NDArray expected('c', {2,1,6,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 9,10,11,12, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, 21,22,23,24, 17,18,19,20, 21,22,23,24}, sd::DataType::FLOAT32); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {1}); @@ -60,7 +60,7 @@ TEST_F(DeclarableOpsTests2, gather_2) { //auto indices ('c', {1,6}, {0,1, 
2,2, 1,2}); NDArray expected('c', {2,6,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 9,10,11,12, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, 21,22,23,24, 17,18,19,20, 21,22,23,24}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input}, {}, {1, 0,1, 2,2, 1,2}, {true}); @@ -79,10 +79,10 @@ TEST_F(DeclarableOpsTests2, gather_2) { TEST_F(DeclarableOpsTests2, gather_3) { NDArray input ('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}); - NDArray indices ('c', {1,1}, std::vector{2}, nd4j::DataType::INT32); + NDArray indices ('c', {1,1}, std::vector{2}, sd::DataType::INT32); NDArray expected('c', {2,1,1,4}, {9,10,11,12,21,22,23,24}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {}, {1}); @@ -102,7 +102,7 @@ TEST_F(DeclarableOpsTests2, gather_4) { //auto indices ('c', {1,1}, {2}); NDArray expected('c', {2,4}, {9,10,11,12,21,22,23,24}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input}, {}, {1, 2}); @@ -120,10 +120,10 @@ TEST_F(DeclarableOpsTests2, gather_4) { TEST_F(DeclarableOpsTests2, gather_5) { NDArray input ('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}); - NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, nd4j::DataType::INT32); + NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, sd::DataType::INT32); NDArray expected('c', {2,2,3,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 9,10,11,12, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16,17,18,19,20,21,22,23,24, 21,22,23,24,17,18,19,20,21,22,23,24}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {}, {1}, {true}); @@ -142,10 +142,10 @@ TEST_F(DeclarableOpsTests2, gather_5) { TEST_F(DeclarableOpsTests2, gather_6) { NDArray input ('c', {3,3,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16,17,18,19,20,21,22,23,24, 25,26,27,28,29,30,31,32,33,34,35,36}); - NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, 
nd4j::DataType::INT32); + NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, sd::DataType::INT32); NDArray expected('c', {2,3,3,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16,17,18,19,20,21,22,23,24, 25,26,27,28,29,30,31,32,33,34,35,36, 25,26,27,28,29,30,31,32,33,34,35,36, 13,14,15,16,17,18,19,20,21,22,23,24, 25,26,27,28,29,30,31,32,33,34,35,36}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {}, {0}); @@ -164,10 +164,10 @@ TEST_F(DeclarableOpsTests2, gather_6) { TEST_F(DeclarableOpsTests2, gather_7) { NDArray input ('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}); - NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, nd4j::DataType::INT64); + NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, sd::DataType::INT64); NDArray expected('c', {2,3,2,3}, {1, 2, 3, 3, 2, 3, 5, 6, 7, 7, 6, 7, 9,10,11,11,10,11, 13,14,15,15,14,15, 17,18,19,19,18,19, 21,22,23,23,22,23}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {}, {2}); @@ -185,11 +185,11 @@ TEST_F(DeclarableOpsTests2, gather_7) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests2, gather_8) { - NDArray input('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, nd4j::DataType::FLOAT32); - NDArray indices('c', {1}, std::vector{2}, nd4j::DataType::INT32); - NDArray expected('c', {1,5}, {11, 12, 13, 14, 15.}, nd4j::DataType::FLOAT32); + NDArray input('c', {3,5}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, sd::DataType::FLOAT32); + NDArray indices('c', {1}, std::vector{2}, sd::DataType::INT32); + NDArray expected('c', {1,5}, {11, 12, 13, 14, 15.}, sd::DataType::FLOAT32); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -205,10 +205,10 @@ TEST_F(DeclarableOpsTests2, gather_8) { //////////////////////////////////////////////////////////////////// 
TEST_F(DeclarableOpsTests2, gather_9) { - NDArray x('c', {2, 4, 3, 2}, nd4j::DataType::FLOAT32); - NDArray indices('c', {2}, std::vector{1, 0}, nd4j::DataType::INT32); + NDArray x('c', {2, 4, 3, 2}, sd::DataType::FLOAT32); + NDArray indices('c', {2}, std::vector{1, 0}, sd::DataType::INT32); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&x, &indices}, {}, {-2}); ASSERT_EQ(Status::OK(), result->status()); @@ -222,7 +222,7 @@ TEST_F(DeclarableOpsTests2, gather_10) { NDArray x('c', {2, 2}, {1, 2, 3, 4}); NDArray e('c', {2, 2}, {3, 4, 1, 2}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&x}, {}, {0, 1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -238,10 +238,10 @@ TEST_F(DeclarableOpsTests2, gather_10) { TEST_F(DeclarableOpsTests2, gather_11) { NDArray x('c', {2, 2}, {1, 2, 3, 4}); - NDArray indices('c', {2}, std::vector{1, 0}, nd4j::DataType::INT64); + NDArray indices('c', {2}, std::vector{1, 0}, sd::DataType::INT64); NDArray e('c', {2, 2}, {3, 4, 1, 2}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&x, &indices}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -257,10 +257,10 @@ TEST_F(DeclarableOpsTests2, gather_11) { TEST_F(DeclarableOpsTests2, gather_12) { NDArray input('c', {4}, {2.f, 3.f, 4.f, 5.f}); - NDArray indices('c', {2}, {0, 2}, nd4j::DataType::INT32); + NDArray indices('c', {2}, {0, 2}, sd::DataType::INT32); NDArray exp('c', {2}, {2.f, 4.f}); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -275,8 +275,8 @@ TEST_F(DeclarableOpsTests2, gather_12) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests2, gather_13) { - NDArray input ('c', {2,3,4,5}, nd4j::DataType::DOUBLE); - NDArray indices ('c', {2,3,4}, {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, nd4j::DataType::INT32); + NDArray input ('c', 
{2,3,4,5}, sd::DataType::DOUBLE); + NDArray indices ('c', {2,3,4}, {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, sd::DataType::INT32); NDArray expected('c', {2,3, 2,3,4, 5}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, @@ -292,7 +292,7 @@ TEST_F(DeclarableOpsTests2, gather_13) { input.linspace(0); - nd4j::ops::gather op; + sd::ops::gather op; auto result = op.evaluate({&input, &indices}, {}, {2}, {true}); @@ -310,10 +310,10 @@ TEST_F(DeclarableOpsTests2, gather_13) { TEST_F(DeclarableOpsTests2, gather_14) { NDArray input ('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}); - NDArray indices ('c', {2,3}, {0, 10, 2, 20, 1,2}, nd4j::DataType::INT32); + NDArray indices ('c', {2,3}, {0, 10, 2, 20, 1,2}, sd::DataType::INT32); NDArray output('c', {2,2,3,4}); - nd4j::ops::gather op; + sd::ops::gather op; ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {1}, {true})); } @@ -321,21 +321,21 @@ TEST_F(DeclarableOpsTests2, gather_14) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests2, gather_15) { - NDArray input ('c', {2,3,4,5}, nd4j::DataType::DOUBLE); - NDArray indices ('c', {2,3,4}, {0, 10, 2, 3, 0, 1, 20, 3, 0, 1, 2, 3,0, 1, 2, 3, 0, 1, 2, 30, 0, 1, 2, 3}, nd4j::DataType::INT32); + NDArray input ('c', {2,3,4,5}, sd::DataType::DOUBLE); + NDArray indices 
('c', {2,3,4}, {0, 10, 2, 3, 0, 1, 20, 3, 0, 1, 2, 3,0, 1, 2, 3, 0, 1, 2, 30, 0, 1, 2, 3}, sd::DataType::INT32); NDArray output('c', {2,3, 2,3,4, 5}); - nd4j::ops::gather op; + sd::ops::gather op; ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {2}, {true})); } TEST_F(DeclarableOpsTests2, BroadcastGradientArgs_1) { - NDArray input ('c', {3,3,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16,17,18,19,20,21,22,23,24, 25,26,27,28,29,30,31,32,33,34,35,36}, nd4j::DataType::INT32); - NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, nd4j::DataType::INT32); + NDArray input ('c', {3,3,4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16,17,18,19,20,21,22,23,24, 25,26,27,28,29,30,31,32,33,34,35,36}, sd::DataType::INT32); + NDArray indices ('c', {2,3}, {0, 1, 2, 2, 1,2}, sd::DataType::INT32); - nd4j::ops::broadcastgradientargs op; + sd::ops::broadcastgradientargs op; auto result = op.evaluate({&input, &indices}, {}, {}); @@ -374,7 +374,7 @@ TEST_F(DeclarableOpsTests2, NLP_Cbow_Test_1) { auto randomValue = NDArrayFactory::create(2L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::cbow op; + sd::ops::cbow op; auto result = op.evaluate({&target, &ngStarter, &context, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &numWords, &locked, &inferenceVector}, {}, {}, {true}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -402,7 +402,7 @@ TEST_F(DeclarableOpsTests2, Test_Squeeze_1) { x.linspace(1); auto exp = x.reshape('c', {2, 3, 4}); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -421,7 +421,7 @@ TEST_F(DeclarableOpsTests2, Test_Squeeze_2) { x.linspace(1); auto exp = new NDArray(x.dup()); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -440,7 +440,7 @@ TEST_F(DeclarableOpsTests2, Test_FloorMod_1) { auto y = 
NDArrayFactory::create('c', {1, 3}, {-3.0f, 2.0f, -2.0f}); auto exp = NDArrayFactory::create('c', {1, 3}, {-1.f, 0.f, -1.f}); - nd4j::ops::floormod op; + sd::ops::floormod op; auto result = op.evaluate({&x, &y}, {}, {}); @@ -458,7 +458,7 @@ TEST_F(DeclarableOpsTests2, Test_FloorDiv_1) { auto y = NDArrayFactory::create('c', {1, 3}, {-2.0f, 2.0f, -2.0f}); auto exp = NDArrayFactory::create('c', {1, 3}, {-2.f, 3.f, 1.f}); - nd4j::ops::floordiv op; + sd::ops::floordiv op; auto result = op.evaluate({&x, &y}, {}, {}); @@ -480,7 +480,7 @@ TEST_F(DeclarableOpsTests2, Test_FloorDiv_2) { auto exp1 = NDArrayFactory::create('c', {1, 3}, {0.f, 0.f, 0.f}); auto exp2 = NDArrayFactory::create('c', {1, 3}, {0.f, 0.f, 0.f}); - nd4j::ops::floordiv_bp op; + sd::ops::floordiv_bp op; auto result = op.evaluate({&x, &y, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -500,7 +500,7 @@ TEST_F(DeclarableOpsTests2, Test_CRelu_1) { auto x = NDArrayFactory::create('c', {2, 2}, {1.0f, 2.0f, 3.0f, 4.0f}); auto exp = NDArrayFactory::create('c', {2, 4}, {1.0f, 2.0f, 0.f, 0.f, 3.0f, 4.0f, 0.f, 0.f}); - nd4j::ops::crelu op; + sd::ops::crelu op; auto result = op.evaluate({&x}, {}, {}); @@ -519,7 +519,7 @@ TEST_F(DeclarableOpsTests2, Test_CRelu_BP_2) { auto eps = NDArrayFactory::create('c', {2, 4}, {1.0f, 2.0f, 4.f, 3.f, 3.0f, 4.0f, 2.f, 1.f}); auto exp = NDArrayFactory::create('c', {2, 2}, {1.f, 2.f, -2.f, 4.f}); - nd4j::ops::crelu_bp op; + sd::ops::crelu_bp op; auto result = op.evaluate({&x, &eps}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_EQ(1, result->size()); @@ -539,7 +539,7 @@ TEST_F(DeclarableOpsTests2, Test_Concat_BP_1) { auto expEX = NDArrayFactory::create('c', {2, 2}, {1.f, 2.f, 3.f, 4.f}); auto expEY = NDArrayFactory::create('c', {2, 2}, {0.f, 1.f, 0.f, 1.f}); - nd4j::ops::concat_bp op; + sd::ops::concat_bp op; auto result = op.evaluate({&x, &y, &eps}, {}, {-1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); ASSERT_EQ(2, result->size()); @@ -571,7 +571,7 @@ 
TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_1) { weights.assign(0.5f); expected.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -599,7 +599,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_2) { weights.assign(0.5f); expected.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -627,7 +627,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_3) { weights.assign(0.5f); expected.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -654,7 +654,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_4) { weights.assign(0.5f); expected.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -681,7 +681,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_5) { weights.assign(0.5f); expected.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -708,7 +708,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_6) { weights.assign(0.f); expected.assign(0.f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -733,7 +733,7 @@ TEST_F(DeclarableOpsTests2, 
absolute_difference_loss_test_7) { predictions.linspace(2); weights.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -758,7 +758,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_8) { predictions.linspace(2); weights.assign(0.f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -783,7 +783,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_9) { predictions.linspace(2); weights.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -808,7 +808,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_10) { predictions.linspace(2); weights.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -833,7 +833,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_11) { predictions.linspace(2); weights.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -858,7 +858,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_12) { predictions.linspace(2); weights.assign(0.f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -883,7 +883,7 @@ TEST_F(DeclarableOpsTests2, 
absolute_difference_loss_test_13) { predictions.linspace(2); weights.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -910,7 +910,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_14) { weights.p(1, 0.f); weights.p(2, 0.f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -935,7 +935,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_15) { predictions.linspace(3); weights.assign(0.5f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -964,7 +964,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_16) { predictions.p(2, 0.f); predictions.p(3, 0.f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -997,7 +997,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_17) { labels.p(2, 0.f); labels.p(3, 0.f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1030,7 +1030,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_18) { labels.p(2, 0.f); labels.p(3, 0.f); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1056,7 +1056,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_19) { 
predictions.linspace(3); weights.assign(0.5); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1081,7 +1081,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_20) { predictions.linspace(3); weights.assign(0.5); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1106,7 +1106,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_21) { predictions.linspace(3); weights.assign(0.5); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1131,7 +1131,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_22) { predictions.linspace(3); weights.assign(0.); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1168,7 +1168,7 @@ TEST_F(DeclarableOpsTests2, absolute_difference_loss_test_23) { weights.p(40+2, 0.); weights.p(40+3, 0.); - nd4j::ops::absolute_difference_loss op; + sd::ops::absolute_difference_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1194,7 +1194,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test1) { predictions.linspace(2); weights.assign(0.5); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1220,7 +1220,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test2) { weights.assign(0.5); 
predictions.assign(0.5); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1247,7 +1247,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test3) { weights.assign(0.5); predictions.assign(0.5); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1272,7 +1272,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test4) { weights.assign(0.5); predictions.assign(0.5); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1297,7 +1297,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test5) { weights.assign(0.5); predictions.assign(0.5); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1322,7 +1322,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test6) { weights.assign(0.5); predictions.assign(0.5); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1,1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1347,7 +1347,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test7) { weights.assign(0.5); predictions.assign(0.5); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1372,7 +1372,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test8) { weights.assign(0.5f); predictions.assign(0.5f); - nd4j::ops::cosine_distance_loss op; + 
sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1397,7 +1397,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test9) { weights.assign(0.5f); predictions.assign(0.5f); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1424,7 +1424,7 @@ TEST_F(DeclarableOpsTests2, cosine_distance_loss_test10) { weights.p(0, 0.f); weights.p(1, 0.f); - nd4j::ops::cosine_distance_loss op; + sd::ops::cosine_distance_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2,2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1449,7 +1449,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test1) { logits.linspace(1); weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1474,7 +1474,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test2) { logits.linspace(1); weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1499,7 +1499,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test3) { logits.linspace(1); weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1524,7 +1524,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test4) { weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1548,7 +1548,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test5) { weights.assign(0.5); - nd4j::ops::hinge_loss op; 
+ sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1572,7 +1572,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test6) { weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1596,7 +1596,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test7) { weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1620,7 +1620,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test8) { weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1644,7 +1644,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test9) { weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1668,7 +1668,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test10) { weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1692,7 +1692,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test11) { weights.assign(0.5); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1720,7 +1720,7 @@ TEST_F(DeclarableOpsTests2, hinge_loss_test12) { weights.p(3, 0.); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1744,7 +1744,7 @@ TEST_F(DeclarableOpsTests2, 
hinge_loss_test13) { weights.assign(0.); - nd4j::ops::hinge_loss op; + sd::ops::hinge_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1769,7 +1769,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test1) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1794,7 +1794,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test2) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1819,7 +1819,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test3) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1843,7 +1843,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test4) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1867,7 +1867,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test5) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1891,7 +1891,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test6) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1915,7 +1915,7 @@ 
TEST_F(DeclarableOpsTests2, huber_loss_test7) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1943,7 +1943,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test8) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1967,7 +1967,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test9) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1991,7 +1991,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test10) { predictions.linspace(1); weights.assign(0.5); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2019,7 +2019,7 @@ TEST_F(DeclarableOpsTests2, huber_loss_test11) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::huber_loss op; + sd::ops::huber_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {0.1}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2044,7 +2044,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test1) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2069,7 +2069,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test2) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2087,14 +2087,14 
@@ TEST_F(DeclarableOpsTests2, log_loss_test3) { auto labels = NDArrayFactory::create('c', {2,3,4}); auto predictions = NDArrayFactory::create('c', {2,3,4}); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); auto expected = NDArrayFactory::create('c', {2,3,4}, {1.60943663, 2.48403668, 3.05256081, 3.40363169, 3.57730675, 3.59525585, 3.46986699, 3.20791793, 2.81228209, 2.28273821, 1.61630058, 0.80721998, -0.15329313, -1.27764463, -2.5828433 , -4.09208679, -5.83734226, -7.8636713 ,-10.23689461,-13.05822182,-16.49509811,-20.85659218,-26.82411766,-36.52717209}); predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2118,7 +2118,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test4) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2142,7 +2142,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test5) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2160,13 +2160,13 @@ TEST_F(DeclarableOpsTests2, log_loss_test6) { auto labels = NDArrayFactory::create('c', {2,3,4}); auto predictions = NDArrayFactory::create('c', {2,3,4}); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2190,7 +2190,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test7) { 
labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2214,7 +2214,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test8) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2232,13 +2232,13 @@ TEST_F(DeclarableOpsTests2, log_loss_test9) { auto labels = NDArrayFactory::create('c', {2,3,4}); auto predictions = NDArrayFactory::create('c', {2,3,4}); - NDArray weights(nd4j::DataType::DOUBLE); + NDArray weights(sd::DataType::DOUBLE); predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2266,7 +2266,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test10) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2290,7 +2290,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test11) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2314,7 +2314,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test12) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::log_loss op; + sd::ops::log_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2342,7 +2342,7 @@ TEST_F(DeclarableOpsTests2, log_loss_test13) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::log_loss op; + sd::ops::log_loss op; 
auto results = op.evaluate({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2362,7 +2362,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test1) { auto weights = NDArrayFactory::create('c', {1,1}, {1}); auto expected = NDArrayFactory::create('c', {1,1}, {1.}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2382,7 +2382,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test2) { auto weights = NDArrayFactory::create('c', {1,1}, {1}); auto expected = NDArrayFactory::create('c', {10,1}, {1.9665822560405073, 3.806679563402927, 6.185624212589066, 20.237895345263905, 16.739700814450472, 13.655430201400929, 6.473256392322658, 3.9337379694106325, 22.509455553531062, 1.4741234749089487}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2402,7 +2402,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test3) { auto weights = NDArrayFactory::create('c', {10,1}, {0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0}); auto expected = NDArrayFactory::create('c', {10,1}, {0.0, 0.0, 21.748459867092496, 6.090581568657439, 7.51315897553838, 5.999534225166869, 22.58050883748054, 6.8600435676788605, 107.5976928688877, 191.56864939172544}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2421,7 +2421,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test4) { auto predictions = NDArrayFactory::create('c', {10,4}, {-1.7053977111021588, 1.7704125629388408, -0.0876171627499475, 0.9428762101237441, 0.9080108618240852, -0.478732892339118, -0.8189639230649537, 1.3359668242925342, 
-0.07499867017894829, 0.6169780756804321, -1.1891117691972148, -0.319354110980483, -1.4287263424900434, -0.3556443786879834, 0.6389682186473912, 0.3161742985911756, 0.9047447733840537, -1.9974117226910393, 2.1067775658502326, 0.17035521714679938, -1.1393894489992826, 1.4570837278971687, 0.6312249731754015, -0.42793125692777634, -1.0685964336386844, -0.3590636581851568, -0.19147354841437528, -0.10128937266756889, -0.5714869078294972, 0.2682604831358205, 0.6608524575561853, 0.35658907103040305, -0.7053263272861181, -0.6318441042427088, 2.131292677079184, -0.3624048087249232, 1.6008209804575328, 0.1245980660014825, 1.0685424462364297, -0.5672594432046791}); auto weights = NDArrayFactory::create('c', {1,1}, {1}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2440,7 +2440,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test5) { auto predictions = NDArrayFactory::create('c', {10,4}, {-3.398114657004427, 0.40587455906092945, 1.587706448479039, 0.27394335709083156, 1.0463122023764637, -0.6552570653663903, -0.26929204111727345, -2.710461824817806, 0.9141296064806023, -0.7632270851454939, -0.4077235519855459, 0.5555107559107472, -0.6776140976423888, 1.2422270521180823, 0.2372445100636733, 0.08522757123963924, -2.708523129389936, 0.09738215252575103, -0.8797837670498875, 0.8714091607391934, -0.628958978867591, 0.49380147969660415, -0.6663578349373824, 0.14570184758600965, -0.4710388511314244, 0.7708214742640788, 0.06836525442683238, -1.2786368797129386, -0.5077556003990912, 0.45383439418987664, 1.1686877788409553, -0.3078567969393852, -2.2375730522738198, 1.0108200459611192, 0.21955367964983963, 1.2268011099696847, 0.48061693077695455, -0.5306373077054981, 1.5005367299570744, -2.1005486985463966}); auto weights = NDArrayFactory::create('c', {1,1}, {1}); - nd4j::ops::mean_pairwssqerr_loss op; + 
sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2459,7 +2459,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test6) { auto predictions = NDArrayFactory::create('c', {10,4}, {-0.8253096544930751, 0.81324545672996, 1.2530858908292535, 0.6881658781201572, 0.11626814971230247, 0.810096847233213, -0.41726775033902014, -0.07246036077805246, -0.3491325803119671, -0.7381717490678714, -1.258884944199858, 2.6195012275145992, 0.3241066697239042, -1.3306435333372646, -0.3413119919683999, 0.13167356361127197, -0.3992424507051653, 0.14454163796541403, -2.4931643208872316, 1.8740911656038526, -2.3404306490682956, -0.8036392545918644, -1.9726177395274997, -0.20128619801149433, -1.0680828820641624, -0.6228179015361869, 1.0785520122486962, -0.26148573195062036, -0.9154287856620913, 0.6612224269248097, -0.21735407368781667, 0.5584864652543093, 1.0208212201167435, -0.7560947201084579, -0.9092906572495081, 0.47525819203475833, 1.2215678456801444, -0.39319465979983964, 1.9435677135606038, 1.4540100039010526}); auto weights = NDArrayFactory::create('c', {1,1}, {1}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2478,7 +2478,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test7) { auto predictions = NDArrayFactory::create('c', {10,4}, {-0.7445687252538243, 0.2293875300325241, -1.0231630280206505, -0.18532545069458992, -0.07797403344353356, -0.9132035669873787, 0.9352296415512886, -1.7406458535354787, 0.8578334648119594, -0.6186274065269556, 0.4874824473654153, -0.9285817343788997, 0.1654680500853023, -0.6371334533926012, 1.3115245864160707, -2.072558735678832, 0.660795731844733, -0.34942292767044864, 0.05787182311194333, -0.12939210444705632, -0.6457028552461069, -0.6048992126598505, -0.17179604529778109, 
1.292989642826032, -0.28867767615688045, 0.7635565516046265, -1.5464151753137487, -1.273368390129285, -1.074046012825826, -0.3534580692302915, 0.5757285568118223, 1.823271242883469, 0.31618576929075215, 0.5422847605415213, -0.7836698021860683, -0.6292022623165172, 2.1114596721927508, 0.4634986528550097, 0.08922001427846013, 1.5767749644913223}); auto weights = NDArrayFactory::create('c', {10,1}, {0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2497,7 +2497,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test8) { auto predictions = NDArrayFactory::create('c', {10,4}, {-1.1088399463364795, 0.09302972835006071, 0.033839927431215555, -0.39567507675572494, 0.8269497207597863, 1.111162272517752, 0.4930937252630912, -1.4561668998323452, 0.9417715392862969, -1.0553855492735509, 0.05848285303876081, 0.8852337518047972, -0.7472824481835305, 0.404906922583895, -0.2198309547562547, 1.9536515925189717, 0.8165036568007779, -0.19524282774410398, -0.09111693087754393, 1.1604245932512238, -0.6243762858131077, 1.4297003275591034, -0.17220079411538428, -2.3139504326793032, 0.3839796486999712, 2.0287791964679234, 0.1534441713632995, -0.6062103319229825, -0.4965880982906036, -0.373907747810053, -1.6566345746154432, 0.17534987728494222, -1.6713458890334796, 1.254628987947714, 1.914596591838086, -1.0816010467183583, 0.25033738231939673, -1.605752685708275, 1.1029112741353981, 0.3237822320282494}); auto weights = NDArrayFactory::create('c', {10,1}, {0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2516,7 +2516,7 @@ TEST_F(DeclarableOpsTests2, mean_pairwssqerr_loss_test9) { auto 
predictions = NDArrayFactory::create('c', {10,4}, {-1.6788168943811437, 1.1823653279081687, -0.3580541857004183, -0.4449970504370699, -1.3031645333940127, 0.5755013195969282, -0.7997343141774744, -0.8806735270004084, 0.9705277499376251, -1.6360067944580943, 0.12579369136710156, 1.0525902242414313, -1.625751312422252, -0.03900152587147075, 0.4112500942756277, 0.6589999986358094, 0.6144107111689617, 2.8561269030217264, 1.5299963640392247, -0.314093051147705, 1.6523278218751989, -0.5504653447714114, 0.53395260877978, 0.409795577698306, 0.4466825218051794, 1.2382059301630401, 0.4834869732526594, -0.635409128905636, -1.9343816841697272, -0.4192523056060229, -1.0662979055059818, 0.4270901960618144, -0.7391311480757151, -0.8268168961897452, -1.0855715553457785, -9.410401291588706E-4, -0.7721838774717349, 0.4784019579457375, -0.6979798841469268, -0.319729737118584}); auto weights = NDArrayFactory::create('c', {10,1}, {0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0}); - nd4j::ops::mean_pairwssqerr_loss op; + sd::ops::mean_pairwssqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2540,7 +2540,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test1) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2565,7 +2565,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test2) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2590,7 +2590,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test3) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, 
&labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2619,7 +2619,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test4) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2643,7 +2643,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test5) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2667,7 +2667,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test6) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2691,7 +2691,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test7) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2719,7 +2719,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test8) { weights.p(2, 0.); weights.p(3, 0.); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2743,7 +2743,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test9) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2767,7 +2767,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test10) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + 
sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2791,7 +2791,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test11) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2818,7 +2818,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test12) { weights.p(1, 0.); weights.p(2, 0.); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2842,7 +2842,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test13) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2866,7 +2866,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test14) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2890,7 +2890,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test15) { labels.linspace(1); weights.assign(0.5); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2917,7 +2917,7 @@ TEST_F(DeclarableOpsTests2, mean_sqerr_loss_test16) { weights.p(1, 0.); weights.p(2, 0.); - nd4j::ops::mean_sqerr_loss op; + sd::ops::mean_sqerr_loss op; auto results = op.evaluate({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2941,7 +2941,7 @@ TEST_F(DeclarableOpsTests2, 
sigm_cross_entropy_loss_test1) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2965,7 +2965,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test2) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2989,7 +2989,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test3) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3013,7 +3013,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test4) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3036,7 +3036,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test5) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3059,7 +3059,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test6) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3082,7 +3082,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test7) { 
logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3105,7 +3105,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test8) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3132,7 +3132,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test9) { weights.p(1, 0.); weights.p(2, 0.); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3155,7 +3155,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test10) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3178,7 +3178,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test11) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3201,7 +3201,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test12) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3227,7 +3227,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test13) { weights.p(1, 0.); weights.p(2, 0.); - 
nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3250,7 +3250,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test14) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3273,7 +3273,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test15) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3296,7 +3296,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test16) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3322,7 +3322,7 @@ TEST_F(DeclarableOpsTests2, sigm_cross_entropy_loss_test17) { weights.p(1, 0.); weights.p(2, 0.); - nd4j::ops::sigm_cross_entropy_loss op; + sd::ops::sigm_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3346,7 +3346,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test1) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3369,7 +3369,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test2) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss 
op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {0}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3393,7 +3393,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test3) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3417,7 +3417,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test4) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3441,7 +3441,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test5) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3464,7 +3464,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test6) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3487,7 +3487,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test7) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3510,7 +3510,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test8) { logits.linspace(0.1, 0.1); weights.assign(0.5); - 
nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3533,7 +3533,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test9) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3556,7 +3556,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test10) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3579,7 +3579,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test11) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {3}, {}, {}, false); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3602,7 +3602,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test12) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {3}, {}, {}, false); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3626,7 +3626,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test13) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3652,7 +3652,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test14) { 
logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3676,7 +3676,7 @@ TEST_F(DeclarableOpsTests2, softmax_cross_entropy_loss_test15) { logits.linspace(0.1, 0.1); weights.assign(0.5); - nd4j::ops::softmax_cross_entropy_loss op; + sd::ops::softmax_cross_entropy_loss op; auto results = op.evaluate({&logits, &weights, &labels}, {5.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3718,7 +3718,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test1) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.99926789,0.99926789,0.99926789,0.99926789,0.99926789,0.99926789,0.99926789,0.99926789}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{3.99987108,3.99987108,3.99987108,3.99987108,3.99987108,3.99987108,3.99987108,3.99987108}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0., 0., 1.}, {0, 0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3763,7 +3763,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test2) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.95867589,0.95867589,0.95867589,0.95867589,0.95867589,0.95867589,0.95867589,0.95867589}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{1.93001527,1.93001527,1.93001527,1.93001527, 1.93001527,1.93001527,1.93001527,1.93001527}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0., 0., -10.5}, {0, 0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3808,7 +3808,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test3) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.37992568,0.37992568,0.37992568,0.37992568,0.37992568,0.37992568,0.37992568,0.37992568}); auto expCt = NDArrayFactory::create('c', 
{batchSize, numUnits},{0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0.4, 0., 1.5}, {0, 0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3853,7 +3853,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test4) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.37992568,0.37992568,0.37992568,0.37992568,0.37992568,0.37992568,0.37992568,0.37992568}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0.4, 0.3, 1.5}, {0, 0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3898,7 +3898,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test5) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.3,0.3,0.3,0.3,0.3,0.3}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0.4, 0.3, 1.5}, {0, 1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3943,7 +3943,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test6) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {1.99832496,1.99832496,1.99832496,1.99832496,1.99832496,1.99832496}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{3.99972188,3.99972188,3.99972188,3.99972188,3.99972188,3.99972188,3.99972188,3.99972188}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0., 0., 1.5}, {0, 1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -3988,7 +3988,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test7) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.75977136,0.75977136,0.75977136,0.75977136,0.75977136,0.75977136}); auto expCt 
= NDArrayFactory::create('c', {batchSize, numUnits},{0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0.4, 0., 1.5}, {0, 1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -4034,7 +4034,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test8) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.99930672,0.99930672,0.99930672,0.99930672, 0.99930672,0.99930672,0.99930672,0.99930672}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{3.99996277,3.99996277,3.99996277,3.99996277,3.99996277,3.99996277,3.99996277,3.99996277}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0., 0., 10.5}, {1, 0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -4079,7 +4079,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test9) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {0.99501777,0.99501777,0.99501777,0.99501777,0.99501777,0.99501777,0.99501777,0.99501777}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{3.,3.,3.,3.,3.,3.,3.,3.}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {3., 0., 10.5}, {1, 0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -4124,7 +4124,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test10) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {1.99861344,1.99861344,1.99861344,1.99861344,1.99861344,1.99861344}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{3.99996277, 3.99996277, 3.99996277, 3.99996277,3.99996277, 3.99996277, 3.99996277, 3.99996277}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {0., 0., 10.5}, {1, 1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -4169,7 +4169,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test11) { auto 
expHt = NDArrayFactory::create('c', {batchSize, numProj}, {1.99003554,1.99003554,1.99003554,1.99003554,1.99003554,1.99003554}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{3.,3.,3.,3.,3.,3.,3.,3.}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {3., 0., 10.5}, {1, 1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -4214,7 +4214,7 @@ TEST_F(DeclarableOpsTests2, lstmCell_test12) { auto expHt = NDArrayFactory::create('c', {batchSize, numProj}, {1.,1.,1.,1.,1.,1.}); auto expCt = NDArrayFactory::create('c', {batchSize, numUnits},{3.,3.,3.,3.,3.,3.,3.,3.}); - nd4j::ops::lstmCell op; + sd::ops::lstmCell op; auto results = op.evaluate({&xt, &ht_1, &ct_1, &Wx, &Wh, &Wc, &Wp, &b}, {3., 1.,-5.}, {1, 1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp index e7e95afcb..c9e86df72 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests3.cpp @@ -17,14 +17,14 @@ #include "testlayers.h" #include #include -#include +#include #include -#include -#include +#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DeclarableOpsTests3 : public testing::Test { public: @@ -42,7 +42,7 @@ TEST_F(DeclarableOpsTests3, Test_Tile_1) { auto exp = x.tile(reps); - nd4j::ops::tile op; + sd::ops::tile op; auto result = op.evaluate({&x, &rep_vector}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -60,7 +60,7 @@ TEST_F(DeclarableOpsTests3, Test_Tile_2) { auto exp = x.tile(reps); - nd4j::ops::tile op; + sd::ops::tile op; auto result = op.evaluate({&x}, {}, {2, 2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -76,7 +76,7 @@ TEST_F(DeclarableOpsTests3, Test_Permute_1) { auto permute= NDArrayFactory::create('c', {1, 3}, {0, 2, 1}); auto exp= 
NDArrayFactory::create('c', {2, 4, 3}); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x, &permute}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -91,7 +91,7 @@ TEST_F(DeclarableOpsTests3, Test_Permute_2) { auto x= NDArrayFactory::create('c', {2, 3, 4}); auto exp= NDArrayFactory::create('c', {4, 3, 2}); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -109,7 +109,7 @@ TEST_F(DeclarableOpsTests3, Test_Unique_1) { auto expI= NDArrayFactory::create('c', {5}, {0, 1, 0, 1, 2}); // auto expI= NDArrayFactory::create('c', {3}, {0, 1, 4}); - nd4j::ops::unique op; + sd::ops::unique op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -135,7 +135,7 @@ TEST_F(DeclarableOpsTests3, Test_Unique_2) { auto expI= NDArrayFactory::create('c', {5}, {0, 1, 0, 1, 2}); auto expC= NDArrayFactory::create('c', {3}, {2, 2, 1}); - nd4j::ops::unique_with_counts op; + sd::ops::unique_with_counts op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -168,7 +168,7 @@ TEST_F(DeclarableOpsTests3, Test_Rint_1) { auto x= NDArrayFactory::create('c', {1, 7}, {-1.7f, -1.5f, -0.2f, 0.2f, 1.5f, 1.7f, 2.0f}); auto exp= NDArrayFactory::create('c', {1, 7}, {-2.f, -2.f, -0.f, 0.f, 2.f, 2.f, 2.f}); - nd4j::ops::rint op; + sd::ops::rint op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -186,7 +186,7 @@ TEST_F(DeclarableOpsTests3, Test_Norm_1) { std::vector empty; std::vector dims({1}); - nd4j::ops::norm op; + sd::ops::norm op; auto result0 = op.evaluate({&x}, {0.}, {}); @@ -228,7 +228,7 @@ TEST_F(DeclarableOpsTests3, Test_Norm_2) { std::vector empty; std::vector dims({1}); - nd4j::ops::norm op; + sd::ops::norm op; auto result0 = op.evaluate({&x}, {0}, {}); @@ -263,7 +263,7 @@ TEST_F(DeclarableOpsTests3, Test_ClipByAvgNorm_1) { auto x = NDArrayFactory::create('c', {2, 3}, {-3.0, 
0.0, 0.0, 4.0, 0.0, 0.0}); auto exp = NDArrayFactory::create('c', {2, 3}, {-2.88, 0.0, 0.0, 3.84, 0.0, 0.0}); - nd4j::ops::clipbyavgnorm op; + sd::ops::clipbyavgnorm op; auto result = op.evaluate({&x}, {0.8}, {}); auto z = result->at(0); @@ -278,7 +278,7 @@ TEST_F(DeclarableOpsTests3, Test_ClipByAvgNorm_2) { auto x= NDArrayFactory::create('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f}); auto exp= NDArrayFactory::create('c', {2, 3}, {-3.f, 0.0f, 0.0f, 4.f, 0.0f, 0.0f}); - nd4j::ops::clipbyavgnorm op; + sd::ops::clipbyavgnorm op; auto result = op.evaluate({&x}, {0.9}, {}); auto z = result->at(0); @@ -294,7 +294,7 @@ TEST_F(DeclarableOpsTests3, Test_ClipByNorm_1) { auto x= NDArrayFactory::create('c', {2, 3}, {-3.0, 0.0, 0.0, 4.0, 0.0, 0.0}); auto exp= NDArrayFactory::create('c', {2, 3}, {-2.4, 0.0, 0.0, 3.2, 0.0, 0.0}); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {4.0}, {}); auto z = result->at(0); @@ -309,7 +309,7 @@ TEST_F(DeclarableOpsTests3, Test_ClipByNorm_2) { auto x= NDArrayFactory::create('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f}); auto exp= NDArrayFactory::create('c', {2, 3}, {-3.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f}); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {6.0}, {}); auto z = result->at(0); @@ -339,7 +339,7 @@ TEST_F(DeclarableOpsTests3, Test_ClipByNorm_3) { x *= scale; xNorm1 = x.reduceAlongDimension(reduce::Norm2, {1}, true); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {1.0}, {1}); auto z = result->at(0); @@ -359,7 +359,7 @@ TEST_F(DeclarableOpsTests3, Test_ListDiff_1) { auto exp0= NDArrayFactory::create('c', {3}, {2.f, 4.f, 6.f}); auto exp1= NDArrayFactory::create('c', {3}, {1, 3, 5}); - nd4j::ops::listdiff op; + sd::ops::listdiff op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -385,7 +385,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_1) { auto step = 
NDArrayFactory::create(-0.33f); auto exp= NDArrayFactory::create('c', {17}, { 0.3f, -0.03f, -0.36f, -0.69f, -1.02f, -1.35f, -1.68f, -2.01f, -2.34f, -2.67f, -3.f, -3.33f, -3.66f, -3.99f, -4.32f, -4.65f, -4.98f}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&start, &stop, &step}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -405,7 +405,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_2) { auto step= NDArrayFactory::create('c', {1, 1}, {-1.f}); auto exp= NDArrayFactory::create('c', {2}, {2.f, 1.f}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&start, &stop, &step}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -424,7 +424,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_3) { auto step= NDArrayFactory::create('c', {1, 1}, {1.f}); auto exp= NDArrayFactory::create('c', {2}, {0.f, 1.f}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&start, &stop, &step}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -444,8 +444,8 @@ TEST_F(DeclarableOpsTests3, Test_Range_10) { auto step= NDArrayFactory::create('c', {1, 1}, {1.f}); auto exp= NDArrayFactory::create('c', {2}, {0.f, 1.f}); - nd4j::ops::range op; - auto result = op.evaluate({&start, &stop, &step}, {nd4j::DataType::DOUBLE}); + sd::ops::range op; + auto result = op.evaluate({&start, &stop, &step}, {sd::DataType::DOUBLE}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -461,7 +461,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_10) { TEST_F(DeclarableOpsTests3, Test_Range_4) { auto exp= NDArrayFactory::create('c', {13}, {-10.f, -8.334f, -6.668f, -5.002f, -3.336f, -1.67f, -0.004f, 1.662f, 3.328f, 4.994f, 6.66f, 8.326f, 9.992f}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {-10., 10., 1.666}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -478,7 +478,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_4) { TEST_F(DeclarableOpsTests3, Test_Range_5) { auto exp= NDArrayFactory::create('c', {2}, {2.f, 1.f}); - nd4j::ops::range op; + 
sd::ops::range op; auto result = op.evaluate({}, {2, 0, -1}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -494,7 +494,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_5) { TEST_F(DeclarableOpsTests3, Test_Range_6) { auto exp= NDArrayFactory::create('c', {2}, {0.f, 1.f}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {0, 2, 1}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -510,7 +510,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_6) { TEST_F(DeclarableOpsTests3, Test_Range_7) { auto exp= NDArrayFactory::create('c', {10}, {10.f, 8.334f, 6.668f, 5.002f, 3.336f, 1.67f, 0.004f, -1.662f, -3.328f, -4.994f}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {10,-5,-1.666}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -528,7 +528,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_7) { TEST_F(DeclarableOpsTests3, Test_Range_8) { auto exp= NDArrayFactory::create('c', {2}, {2, 1}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {}, {2, 0, -1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -544,7 +544,7 @@ TEST_F(DeclarableOpsTests3, Test_Range_8) { TEST_F(DeclarableOpsTests3, Test_Range_9) { auto exp= NDArrayFactory::create('c', {2}, {0, 1}); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({}, {}, {0, 2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -565,7 +565,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_1) { auto exp = MmulHelper::mmul(&x, &y); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {111, 111, 3, 3, 3, 3, 3, 3, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -593,7 +593,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_2) { auto exp = MmulHelper::mmul(&x, &y); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 3, 3, 3, 3, 3, 3, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ 
-621,7 +621,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_3) { auto exp = MmulHelper::mmul(&x, &y); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 111, 3, 3, 3, 3, 3, 3, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -649,7 +649,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_4) { auto exp = MmulHelper::mmul(&x, &y); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {111, 111, 5, 4, 3, 5, 3, 5, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -677,7 +677,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_5) { auto exp = MmulHelper::mmul(&x, &y); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 5, 4, 3, 3, 4, 5, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -706,7 +706,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_6) { auto exp = MmulHelper::mmul(&x, &y); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {111, 111, 2, 3, 5, 2, 5, 2, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -736,7 +736,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_7) { // exp->printShapeInfo("exp shape"); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -762,7 +762,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_Validation_1) { auto x = NDArrayFactory::create('c', {2, 5}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f}); auto y = NDArrayFactory::create('c', {5, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; try { auto result = op.evaluate({&a, &b, &x, &x, &x, &y, &y, &y}, {}, {112, 112, 2, 3, 5, 
5, 3, 2, 3}); delete result; @@ -780,7 +780,7 @@ TEST_F(DeclarableOpsTests3, Test_Batched_Gemm_Validation_2) { auto z = NDArrayFactory::create('c', {2, 3}); - nd4j::ops::batched_gemm op; + sd::ops::batched_gemm op; try { auto result = op.execute({&a, &b, &x, &x, &x, &y, &y, &y}, {&z}, {}, {112, 112, 2, 3, 5, 5, 3, 2, 3}, {}); ASSERT_TRUE(false); @@ -794,7 +794,7 @@ TEST_F(DeclarableOpsTests3, Test_ReverseDivide_1) { auto y= NDArrayFactory::create('c', {1, 3}, {4, 6, 8}); auto exp= NDArrayFactory::create('c', {1, 3}, {2, 3, 4}); - nd4j::ops::reversedivide op; + sd::ops::reversedivide op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -825,7 +825,7 @@ TEST_F(DeclarableOpsTests3, sruCell_test1) { auto expHt= NDArrayFactory::create('c', {batchSize, inSize}, {0.96674103f, 0.96674103f, 0.96674103f, 0.96674103f, 0.96674103f, 0.96674103f, 0.96674103f, 0.96674103f, 0.96674103f, 0.96674103f}); auto expCt= NDArrayFactory::create('c', {batchSize, inSize}, {2.01958286f, 2.01958286f, 2.01958286f, 2.01958286f, 2.01958286f, 2.01958286f, 2.01958286f, 2.01958286f, 2.01958286f, 2.01958286f}); - nd4j::ops::sruCell op; + sd::ops::sruCell op; auto results = op.evaluate({&xt, &ct_1, &w, &b}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -861,7 +861,7 @@ TEST_F(DeclarableOpsTests3, sruCell_test2) { auto expHt= NDArrayFactory::create('c', {batchSize, inSize}, {0.97542038f, 0.97542038f, 0.97542038f, 0.97542038f, 0.97542038f, 0.97542038f, 0.97542038f, 0.97542038f, 0.97542038f, 0.97542038f}); auto expCt= NDArrayFactory::create('c', {batchSize, inSize}, {2.09121276f, 2.09121276f, 2.09121276f, 2.09121276f, 2.09121276f, 2.09121276f, 2.09121276f, 2.09121276f, 2.09121276f, 2.09121276f}); - nd4j::ops::sruCell op; + sd::ops::sruCell op; auto results = op.evaluate({&xt, &ct_1, &w, &b}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -896,7 +896,7 @@ TEST_F(DeclarableOpsTests3, sruCell_test3) { auto expHt= NDArrayFactory::create('c', {batchSize, inSize}, 
{0.76159416f, 0.76159416f, 0.76159416f, 0.76159416f, 0.76159416f, 0.76159416f, 0.76159416f, 0.76159416f, 0.76159416f, 0.76159416f}); auto expCt= NDArrayFactory::create('c', {batchSize, inSize}, {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}); - nd4j::ops::sruCell op; + sd::ops::sruCell op; auto results = op.evaluate({&xt, &ct_1, &w, &b}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -935,7 +935,7 @@ TEST_F(DeclarableOpsTests3, gruCell_test1) { auto expHt = NDArrayFactory::create('c', {batchSize, numUnits}, {1.99993872f, 1.99993872f, 1.99993872f, 1.99993872f, 1.99993872f, 1.99993872f, 1.99993872f, 1.99993872f}); - nd4j::ops::gruCell op; + sd::ops::gruCell op; auto results = op.evaluate({&xt, &ht_1, &Wru, &Wc, &bru, &bc}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -971,7 +971,7 @@ TEST_F(DeclarableOpsTests3, gruCell_test2) { auto expHt= NDArrayFactory::create('c', {batchSize, numUnits}, {0.00669224f, 0.00669224f, 0.00669224f, 0.00669224f, 0.00669224f, 0.00669224f, 0.00669224f, 0.00669224f}); - nd4j::ops::gruCell op; + sd::ops::gruCell op; auto results = op.evaluate({&xt, &ht_1, &Wru, &Wc, &bru, &bc}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1007,7 +1007,7 @@ TEST_F(DeclarableOpsTests3, gruCell_test3) { auto expHt= NDArrayFactory::create('c', {batchSize, numUnits}, {0.1149149f, 0.1149149f, 0.1149149f, 0.1149149f, 0.1149149f, 0.1149149f, 0.1149149f, 0.1149149f}); - nd4j::ops::gruCell op; + sd::ops::gruCell op; auto results = op.evaluate({&xt, &ht_1, &Wru, &Wc, &bru, &bc}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1026,7 +1026,7 @@ TEST_F(DeclarableOpsTests3, invertPermutation_test1) { auto input= NDArrayFactory::create('c', {1, 8}, {5,2,7,4,6,3,1,0}); auto expected= NDArrayFactory::create('c', {1, 8}, {7, 6, 1, 5, 3, 0, 4, 2}); - nd4j::ops::invert_permutation op; + sd::ops::invert_permutation op; auto results = op.evaluate({&input}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1045,7 +1045,7 @@ TEST_F(DeclarableOpsTests3, 
invertPermutation_test2) { auto input= NDArrayFactory::create('c', {1, 8}, {5,2,7,4,6,3,1,0}); auto expected= NDArrayFactory::create('c', {1, 8}, {7, 6, 1, 5, 3, 0, 4, 2}); - nd4j::ops::invert_permutation op; + sd::ops::invert_permutation op; auto results = op.evaluate({&input}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1064,7 +1064,7 @@ TEST_F(DeclarableOpsTests3, invertPermutation_test3) { auto input= NDArrayFactory::create('c', {1, 8}, {1,2,0,4,6,3,5,7}); auto expected= NDArrayFactory::create('c', {1, 8}, {2, 0, 1, 5, 3, 6, 4, 7}); - nd4j::ops::invert_permutation op; + sd::ops::invert_permutation op; auto results = op.evaluate({&input}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1085,7 +1085,7 @@ TEST_F(DeclarableOpsTests3, diag_test1) { auto expected= NDArrayFactory::create('c', {3,2,3,2}, {1,0,0,0,0,0, 0,2,0,0,0,0, 0,0,3,0,0,0, 0,0,0,4,0,0, 0,0,0,0,5,0, 0,0,0,0,0,6}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({&input}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1106,7 +1106,7 @@ TEST_F(DeclarableOpsTests3, diag_test2) { auto expected= NDArrayFactory::create('c', {2,3,2,3}, {1,0,0,0,0,0, 0,2,0,0,0,0, 0,0,3,0,0,0, 0,0,0,4,0,0, 0,0,0,0,5,0, 0,0,0,0,0,6}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({&input}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1127,7 +1127,7 @@ TEST_F(DeclarableOpsTests3, diag_test_vector) { auto input = NDArrayFactory::linspace(1,4,4); auto expected= NDArrayFactory::create('c', {4,4}, {1,0,0,0, 0,2,0,0, 0,0,3,0,0,0,0,4}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({input}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1151,7 +1151,7 @@ TEST_F(DeclarableOpsTests3, diag_test_col_vector) { input->reshapei({4,1}); auto expected= NDArrayFactory::create('c', {4,4}, {1,0,0,0, 0,2,0,0, 0,0,3,0,0,0,0,4}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, 
results->status()); @@ -1172,7 +1172,7 @@ TEST_F(DeclarableOpsTests3, diag_test3) { auto expected= NDArrayFactory::create('c', {3,3}, {1,0,0, 0,2,0, 0,0,3}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1193,7 +1193,7 @@ TEST_F(DeclarableOpsTests3, diag_test4) { auto expected= NDArrayFactory::create('c', {3,3}, {1,0,0, 0,2,0, 0,0,3}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1214,7 +1214,7 @@ TEST_F(DeclarableOpsTests3, diag_test5) { auto expected= NDArrayFactory::create('c', {1,1}, {2}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1235,7 +1235,7 @@ TEST_F(DeclarableOpsTests3, diag_test6) { auto expected= NDArrayFactory::create('c', {2,2,2,2,2,2}, {1,0,0,0, 0,0,0,0, 0,2,0,0, 0,0,0,0, 0,0,3,0, 0,0,0,0, 0,0,0,4, 0,0,0,0, 0,0,0,0, 5,0,0,0, 0,0,0,0, 0,6,0,0, 0,0,0,0, 0,0,7,0, 0,0,0,0, 0,0,0,8}); - nd4j::ops::diag op; + sd::ops::diag op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1258,7 +1258,7 @@ TEST_F(DeclarableOpsTests3, matrixSetDiag_test1) { auto expected= NDArrayFactory::create('c', {4,3,2}, {1,0,0,1,0,0, 1,0,0,1,0,0, 1,0,0,1,0,0, 1,0,0,1,0,0}); - nd4j::ops::matrix_set_diag op; + sd::ops::matrix_set_diag op; auto results = op.evaluate({&input, &diagonal}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1281,7 +1281,7 @@ TEST_F(DeclarableOpsTests3, matrixSetDiag_test2) { auto expected= NDArrayFactory::create('c', {1,1,2}, {1.f, 0.f}); - nd4j::ops::matrix_set_diag op; + sd::ops::matrix_set_diag op; auto results = op.evaluate({&input, &diagonal}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1304,7 +1304,7 @@ TEST_F(DeclarableOpsTests3, matrixSetDiag_test3) { auto expected= NDArrayFactory::create('c', {2,1,4}, 
{1,0,0,0,1,0,0,0}); - nd4j::ops::matrix_set_diag op; + sd::ops::matrix_set_diag op; auto results = op.evaluate({&input, &diagonal}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1327,7 +1327,7 @@ TEST_F(DeclarableOpsTests3, matrixSetDiag_test4) { auto expected= NDArrayFactory::create('c', {2,1,4,1}, {1,0,0,0,1,0,0,0}); - nd4j::ops::matrix_set_diag op; + sd::ops::matrix_set_diag op; auto results = op.evaluate({&input, &diagonal}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1348,7 +1348,7 @@ TEST_F(DeclarableOpsTests3, diagPart_test1) { auto expected= NDArrayFactory::create('c', {2}, {1,4}); - nd4j::ops::diag_part op; + sd::ops::diag_part op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1370,7 +1370,7 @@ TEST_F(DeclarableOpsTests3, diagPart_test2) { auto expected= NDArrayFactory::create('c', {2,2}, {1,6,11,16}); - nd4j::ops::diag_part op; + sd::ops::diag_part op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1391,7 +1391,7 @@ TEST_F(DeclarableOpsTests3, diagPart_test3) { auto expected= NDArrayFactory::create('c', {2,2,2}, {1,10,19,28,37,46,55,64}); - nd4j::ops::diag_part op; + sd::ops::diag_part op; auto results = op.evaluate({&input}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1417,7 +1417,7 @@ TEST_F(DeclarableOpsTests3, betainc_test1) { auto expected = NDArrayFactory::create('c', {3,3}, {0.40638509f, 0.33668978f, 0.28271242f, 0.23973916f, 0.20483276f, 0.17604725f, 0.15203027f, 0.13180567f, 0.114647f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1443,7 +1443,7 @@ TEST_F(DeclarableOpsTests3, betainc_test2) { auto expected= NDArrayFactory::create('c', {3,3}, {0.40638509f, 0.33668978f, 0.28271242f, 0.23973916f, 0.20483276f, 0.17604725f, 0.15203027f, 0.13180567f, 0.114647f}); - nd4j::ops::betainc op; + sd::ops::betainc op; 
auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1469,7 +1469,7 @@ TEST_F(DeclarableOpsTests3, betainc_test3) { auto expected= NDArrayFactory::create('c', {3,3}, {0.40638509f, 0.33668978f, 0.28271242f, 0.23973916f, 0.20483276f, 0.17604725f, 0.15203027f, 0.13180567f, 0.114647f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1495,7 +1495,7 @@ TEST_F(DeclarableOpsTests3, betainc_test4) { auto expected= NDArrayFactory::create('c', {3,3}, {1.00000000e-01f, 2.80000000e-02f, 8.56000000e-03f, 2.72800000e-03f, 8.90920000e-04f, 2.95706080e-04f, 9.92854864e-05f, 3.36248880e-05f, 1.14644360e-05f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1521,7 +1521,7 @@ TEST_F(DeclarableOpsTests3, betainc_test5) { auto expected= NDArrayFactory::create('c', {3,3}, {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1547,7 +1547,7 @@ TEST_F(DeclarableOpsTests3, betainc_test6) { auto expected= NDArrayFactory::create('c', {3,3}, {3.92988233e-06f, 1.35306497e-06f, 4.67576826e-07f, 1.62083416e-07f, 5.63356971e-08f, 1.96261318e-08f, 6.85120307e-09f, 2.39594668e-09f, 8.39227685e-10f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1573,7 +1573,7 @@ TEST_F(DeclarableOpsTests3, betainc_test7) { auto expected= NDArrayFactory::create('c', {3,3}, {0.99999607f, 0.99999865f, 0.99999953f, 0.99999984f, 0.99999994f, 0.99999998f, 0.99999999f, 1.f, 1.f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1599,7 
+1599,7 @@ TEST_F(DeclarableOpsTests3, betainc_test8) { auto expected= NDArrayFactory::create('c', {3,3}, {1.f, 1.f, 1.f,1.f,1.f,1.f,1.f,1.f,1.f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1625,7 +1625,7 @@ TEST_F(DeclarableOpsTests3, betainc_test9) { auto expected= NDArrayFactory::create('c', {3,3}, {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1651,7 +1651,7 @@ TEST_F(DeclarableOpsTests3, betainc_test10) { auto expected= NDArrayFactory::create('c', {3,3}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1667,12 +1667,12 @@ TEST_F(DeclarableOpsTests3, betainc_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests3, betainc_test11) { - NDArray a('c', {4}, {0.7788f, 0.8012f, 0.7244f, 0.2309f}, nd4j::DataType::FLOAT32); - NDArray b('c', {4}, {0.7717f, 0.9281f, 0.9846f, 0.4838f}, nd4j::DataType::FLOAT32); - NDArray x('c', {4}, {0.9441f, 0.5957f, 0.8669f, 0.3502f}, nd4j::DataType::FLOAT32); + NDArray a('c', {4}, {0.7788f, 0.8012f, 0.7244f, 0.2309f}, sd::DataType::FLOAT32); + NDArray b('c', {4}, {0.7717f, 0.9281f, 0.9846f, 0.4838f}, sd::DataType::FLOAT32); + NDArray x('c', {4}, {0.9441f, 0.5957f, 0.8669f, 0.3502f}, sd::DataType::FLOAT32); - NDArray expected('c', {4}, {0.912156, 0.634460, 0.898314, 0.624538}, nd4j::DataType::FLOAT32); - nd4j::ops::betainc op; + NDArray expected('c', {4}, {0.912156, 0.634460, 0.898314, 0.624538}, sd::DataType::FLOAT32); + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1688,13 +1688,13 @@ 
TEST_F(DeclarableOpsTests3, betainc_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests3, betainc_test12) { - NDArray a('c', {4}, {8.0091f, 8.2108f, 7.5194f, 3.0780f}, nd4j::DataType::FLOAT32); - NDArray b('c', {4}, {7.9456f, 9.3527f, 9.8610f, 5.3541f}, nd4j::DataType::FLOAT32); - NDArray x('c', {4}, {0.9441f, 0.5957f, 0.8669f, 0.3502f}, nd4j::DataType::FLOAT32); + NDArray a('c', {4}, {8.0091f, 8.2108f, 7.5194f, 3.0780f}, sd::DataType::FLOAT32); + NDArray b('c', {4}, {7.9456f, 9.3527f, 9.8610f, 5.3541f}, sd::DataType::FLOAT32); + NDArray x('c', {4}, {0.9441f, 0.5957f, 0.8669f, 0.3502f}, sd::DataType::FLOAT32); - NDArray expected('c', {4}, {0.9999995 , 0.8594694 , 0.999988 , 0.49124345}, nd4j::DataType::FLOAT32); + NDArray expected('c', {4}, {0.9999995 , 0.8594694 , 0.999988 , 0.49124345}, sd::DataType::FLOAT32); - nd4j::ops::betainc op; + sd::ops::betainc op; auto results = op.evaluate({&a, &b, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1718,7 +1718,7 @@ TEST_F(DeclarableOpsTests3, zeta_test1) { auto expected= NDArrayFactory::create('c', {3,3}, {1.64493407f, 0.64493407f, 0.39493407f, 0.28382296f, 0.22132296f, 0.18132296f, 0.15354518f, 0.13313701f, 0.11751201f}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1742,7 +1742,7 @@ TEST_F(DeclarableOpsTests3, zeta_test2) { auto expected= NDArrayFactory::create('c', {3,3}, {0.10516634f, 0.09516634f, 0.08690187f, 0.07995743f, 0.07404027f, 0.06893823f, 0.06449378f, 0.06058753f, 0.05712733f}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1766,7 +1766,7 @@ TEST_F(DeclarableOpsTests3, zeta_test3) { auto expected= NDArrayFactory::create('c', {3,3}, {0.01005017f, 0.00995017f, 0.00985214f, 0.00975602f, 0.00966176f, 0.0095693f, 0.0094786f, 0.0093896f, 0.00930226f}); - 
nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1791,7 +1791,7 @@ TEST_F(DeclarableOpsTests3, zeta_test4) { auto expected= NDArrayFactory::create('c', {3,3}, {0.01005017f, 0.00995017f, 0.00985214f, 0.00975602f, 0.00966176f, 0.0095693f, 0.0094786f, 0.0093896f, 0.00930226f}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1815,7 +1815,7 @@ TEST_F(DeclarableOpsTests3, zeta_test5) { auto expected= NDArrayFactory::create('c', {3,3}, {10.58444846f, 9.58444846f, 9.11793197f, 8.81927915f, 8.60164151f, 8.43137352f, 8.29204706f, 8.17445116f, 8.07291961f}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1839,7 +1839,7 @@ TEST_F(DeclarableOpsTests3, zeta_test6) { auto expected= NDArrayFactory::create('c', {3,3}, {100.57794334f, 99.57794334f, 99.08139709f, 98.75170576f, 98.50514758f, 98.30834069f, 98.1446337f, 98.00452955f, 97.88210202f}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1863,7 +1863,7 @@ TEST_F(DeclarableOpsTests3, zeta_test7) { auto expected= NDArrayFactory::create('c', {3,3}, {1.00099458e+00f, 9.94575128e-04f, 1.80126278e-05f, 1.07754001e-06f, 1.23865693e-07f, 2.14656932e-08f, 4.92752156e-09f, 1.38738839e-09f, 4.56065812e-10f}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1887,7 +1887,7 @@ TEST_F(DeclarableOpsTests3, zeta_test8) { auto expected= NDArrayFactory::create('c', {3,4}, {23.014574, 12.184081, 8.275731, 6.1532226, 4.776538, 3.7945523, 3.0541048, 2.4765317, 2.0163891, 205.27448, 21.090889, 19.477398}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.evaluate({&x, &q}, {}, {}); 
ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1912,7 +1912,7 @@ TEST_F(DeclarableOpsTests3, zeta_test9) { auto expected= NDArrayFactory::create('c', {3,4}, {23.014574, 12.184081, 8.275731, 6.1532226, 4.776538, 3.7945523, 3.0541048, 2.4765317, 2.0163891, 205.27448, 21.090889, 19.477398}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.execute({&x, &q}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results); @@ -1937,7 +1937,7 @@ TEST_F(DeclarableOpsTests3, zeta_test10) { auto expected= NDArrayFactory::create('c', {3,4}, {23.014574, 12.184081, 8.275731, 6.1532226, 4.776538, 3.7945523, 3.0541048, 2.4765317, 2.0163891, 205.27448, 21.090889, 19.477398}); - nd4j::ops::zeta op; + sd::ops::zeta op; auto results = op.execute({&x, &q}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results); @@ -1959,7 +1959,7 @@ TEST_F(DeclarableOpsTests3, Test_SplitV_Validation_1) { auto z0 = NDArrayFactory::create('c', {5, 7}); auto z1 = NDArrayFactory::create('c', {3, 7}); - nd4j::ops::split_v op; + sd::ops::split_v op; auto status = op.execute({&x, &indices, &axis}, std::vector{&z0, &z1}, {}, {}, {}); ASSERT_EQ(Status::OK(), status); } @@ -1975,7 +1975,7 @@ TEST_F(DeclarableOpsTests3, polygamma_test1) { auto expected= NDArrayFactory::create('c', {3,3}, {4.934802, -16.828796, 97.409088, -771.474243, 7691.113770, -92203.460938, 1290440.250000, -20644900.000000, 3.71595e+08}); - nd4j::ops::polygamma op; + sd::ops::polygamma op; auto results = op.evaluate({&n, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2002,7 +2002,7 @@ TEST_F(DeclarableOpsTests3, polygamma_test2) { //ASSERT_FALSE(true); - nd4j::ops::polygamma op; + sd::ops::polygamma op; auto results = op.evaluate({&n, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2025,7 +2025,7 @@ TEST_F(DeclarableOpsTests3, polygamma_test3) { x.linspace(10.); auto expected= NDArrayFactory::create('c', {3,3}, {1.05166336e-01,-9.04983497e-03, 1.31009323e-03,-2.44459433e-04, 
5.31593880e-05,-1.28049888e-05, 3.31755364e-06,-9.07408791e-07, 2.58758130e-07}); - nd4j::ops::polygamma op; + sd::ops::polygamma op; auto results = op.evaluate({&n, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2040,13 +2040,13 @@ TEST_F(DeclarableOpsTests3, polygamma_test3) { TEST_F(DeclarableOpsTests3, polygamma_test4) { - NDArray n('c', {3,4}, {/*0.7788*/0, 0,1,2,3,4,5,6,7,8,9,10}, nd4j::DataType::DOUBLE); - NDArray x('c', {3,4}, {0.7717,0.9281,0.9846,0.4838,0.6433,0.6041,0.6501,0.7612,0.7605,0.3948,0.9493,0.8600}, nd4j::DataType::DOUBLE); + NDArray n('c', {3,4}, {/*0.7788*/0, 0,1,2,3,4,5,6,7,8,9,10}, sd::DataType::DOUBLE); + NDArray x('c', {3,4}, {0.7717,0.9281,0.9846,0.4838,0.6433,0.6041,0.6501,0.7612,0.7605,0.3948,0.9493,0.8600}, sd::DataType::DOUBLE); NDArray expected('c', {3,4}, {/*std::numeric_limits::quiet_NaN()*/-1.031918, -7.021327e-01, 1.682743e+00, -1.851378e+01,3.604167e+01, -3.008293e+02, - 1.596005e+03, -4.876665e+03,4.510025e+04, -1.730340e+08, 6.110257e+05, -1.907087e+07}, nd4j::DataType::DOUBLE); + 1.596005e+03, -4.876665e+03,4.510025e+04, -1.730340e+08, 6.110257e+05, -1.907087e+07}, sd::DataType::DOUBLE); - nd4j::ops::polygamma op; + sd::ops::polygamma op; auto results = op.evaluate({&n, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2061,12 +2061,12 @@ TEST_F(DeclarableOpsTests3, polygamma_test4) { TEST_F(DeclarableOpsTests3, digamma_1) { - NDArray x('c', {18}, {-25, -24.99999, -21.5, -21.2, -5.5, -4.1, -2.1, -0.5, -0.3, 0., 0.2, 1, 1.5, 2.2, 5.2, 19., 21, 22.2}, nd4j::DataType::DOUBLE); + NDArray x('c', {18}, {-25, -24.99999, -21.5, -21.2, -5.5, -4.1, -2.1, -0.5, -0.3, 0., 0.2, 1, 1.5, 2.2, 5.2, 19., 21, 22.2}, sd::DataType::DOUBLE); NDArray expected('c', {18}, {std::numeric_limits::infinity(), -99996.761229, 3.091129, 7.401432, 1.792911,11.196838,10.630354, 0.03649, 2.11331, - std::numeric_limits::infinity(),-5.28904,-0.577216, 0.03649, 0.544293, 1.549434,2.917892, 3.020524, 3.077401}, 
nd4j::DataType::DOUBLE); + std::numeric_limits::infinity(),-5.28904,-0.577216, 0.03649, 0.544293, 1.549434,2.917892, 3.020524, 3.077401}, sd::DataType::DOUBLE); - nd4j::ops::digamma op; + sd::ops::digamma op; auto results = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2087,7 +2087,7 @@ TEST_F(DeclarableOpsTests3, svd_test1) { auto expU= NDArrayFactory::create('c', {6,6}, {0.14692,-0.11132,-0.69568, 0.59282,-0.14881, 0.32935,-0.38751, 0.60378,-0.04927,-0.01397,-0.69456,-0.01581, 0.19293,-0.12795,-0.18682,-0.69065,-0.20597, 0.62617, 0.66806, 0.4314 ,-0.33849,-0.22166, 0.04099,-0.44967, 0.11121,-0.64065,-0.02138,-0.07378,-0.60568,-0.45216,-0.5765 ,-0.1007 ,-0.60305,-0.34175, 0.29068,-0.3042}); auto expV= NDArrayFactory::create('c', {6,6}, {-0.24577,-0.24512, 0.00401,-0.04585,-0.62058, 0.70162, 0.27937, 0.75961, 0.43885,-0.06857,-0.3839 , 0.01669,-0.35944,-0.09629, 0.44593, 0.78602,-0.09103,-0.19125, 0.53973, 0.07613,-0.10721, 0.49559, 0.35687, 0.56431,-0.6226 , 0.39742, 0.12785,-0.15716, 0.52372, 0.37297, 0.23113,-0.43578, 0.76204,-0.32414, 0.23996, 0.11543}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {1, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2102,15 +2102,15 @@ TEST_F(DeclarableOpsTests3, svd_test1) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5); } delete results; @@ -2124,7 +2124,7 @@ TEST_F(DeclarableOpsTests3, 
svd_test2) { auto expU= NDArrayFactory::create('c', {7,7}, {-0.13417,-0.12443, -0.68854, 0.5196 , 0.21706, 0.03974, 0.41683, 0.347 , 0.62666, -0.04964, -0.01912, 0.66932, 0.1457 , -0.12183,-0.17329,-0.14666, -0.19639, -0.55355, 0.0614 , 0.75729, 0.1619 ,-0.64703, 0.37056, -0.37398, -0.32922, -0.0186 , -0.35656, -0.26134,-0.08027,-0.64405, -0.0127 , -0.06934, 0.59287, -0.14956, -0.44712, 0.55906,-0.06235, -0.58017, -0.12911, -0.359 , -0.00393, -0.44877, 0.30645,-0.11953, -0.09083, -0.54163, 0.14283, -0.50417, 0.56178}); auto expV= NDArrayFactory::create('c', {6,6}, {0.2508 ,-0.2265 , 0.01689, 0.04486, 0.53132, 0.77537,-0.32281, 0.74559, 0.41845, -0.13821, 0.37642, 0.06315, 0.33139,-0.05528, 0.47186, 0.73171, 0.18905, -0.3055 ,-0.57263, 0.06276,-0.09542, 0.59396, -0.36152, 0.419 , 0.59193, 0.4361 , 0.13557, -0.03632, -0.5755 , 0.32944,-0.21165,-0.44227, 0.75794, -0.29895, -0.27993, 0.13187}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {1, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2139,15 +2139,15 @@ TEST_F(DeclarableOpsTests3, svd_test2) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5); } delete results; @@ -2161,7 +2161,7 @@ TEST_F(DeclarableOpsTests3, svd_test3) { auto expU= NDArrayFactory::create('c', {7,6}, {-0.13417, -0.12443, -0.68854, 0.5196 , 0.21706, 0.03974, 0.347 , 0.62666, -0.04964, -0.01912, 0.66932, 0.1457 ,-0.17329, -0.14666, -0.19639, -0.55355, 
0.0614 , 0.75729,-0.64703, 0.37056, -0.37398, -0.32922, -0.0186 , -0.35656,-0.08027, -0.64405, -0.0127 , -0.06934, 0.59287, -0.14956, 0.55906, -0.06235, -0.58017, -0.12911, -0.359 , -0.00393, 0.30645, -0.11953, -0.09083, -0.54163, 0.14283, -0.50417}); auto expV= NDArrayFactory::create('c', {6,6}, {0.2508 ,-0.2265 , 0.01689, 0.04486, 0.53132, 0.77537,-0.32281, 0.74559, 0.41845, -0.13821, 0.37642, 0.06315, 0.33139,-0.05528, 0.47186, 0.73171, 0.18905, -0.3055 ,-0.57263, 0.06276,-0.09542, 0.59396, -0.36152, 0.419 , 0.59193, 0.4361 , 0.13557, -0.03632, -0.5755 , 0.32944,-0.21165,-0.44227, 0.75794, -0.29895, -0.27993, 0.13187}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {0, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2176,15 +2176,15 @@ TEST_F(DeclarableOpsTests3, svd_test3) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5f); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5f); } delete results; @@ -2198,7 +2198,7 @@ TEST_F(DeclarableOpsTests3, svd_test4) { auto expU= NDArrayFactory::create('c', {6,6}, {-0.16541, 0.21276, 0.51284, 0.20472, 0.74797, 0.25102,-0.49879, 0.12076, 0.37629, -0.7211 , -0.24585, 0.12086,-0.36569,-0.70218, -0.08012, 0.21274, -0.07314, 0.56231,-0.44508, 0.4329 , 0.1356 , 0.60909, -0.47398, -0.02164, 0.61238,-0.05674, 0.59489, 0.06588, -0.3874 , 0.33685,-0.13044,-0.50644, 0.46552, 0.13236, -0.00474, -0.70161}); auto expV= NDArrayFactory::create('c', {7,7}, {-0.35914, 0.68966, -0.30077, 
-0.15238, -0.48179, 0.14716, -0.16709, 0.21989, -0.34343, 0.11086, -0.78381, -0.37902, 0.24224, -0.06862, 0.32179, 0.12812, -0.25812, 0.0691 , -0.12891, 0.26979, 0.84807,-0.50833, 0.13793, 0.06658, -0.53001, 0.52572, -0.16194, 0.36692, 0.48118, 0.15876, -0.65132, -0.24602, 0.3963 , -0.16651, -0.27155,-0.31605, -0.46947, -0.50195, 0.0378 , -0.34937, -0.53062, 0.15069, 0.35957, 0.35408, 0.38732, -0.12154, -0.22827, -0.7151 , 0.13065}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {1, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2213,15 +2213,15 @@ TEST_F(DeclarableOpsTests3, svd_test4) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5f); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5f); } delete results; @@ -2235,7 +2235,7 @@ TEST_F(DeclarableOpsTests3, svd_test5) { auto expU= NDArrayFactory::create('c', {6,6}, {-0.16541, 0.21276, 0.51284, 0.20472, 0.74797, 0.25102,-0.49879, 0.12076, 0.37629, -0.7211 , -0.24585, 0.12086,-0.36569,-0.70218, -0.08012, 0.21274, -0.07314, 0.56231,-0.44508, 0.4329 , 0.1356 , 0.60909, -0.47398, -0.02164, 0.61238,-0.05674, 0.59489, 0.06588, -0.3874 , 0.33685,-0.13044,-0.50644, 0.46552, 0.13236, -0.00474, -0.70161}); auto expV= NDArrayFactory::create('c', {7,6}, {-0.35914, 0.68966, -0.30077, -0.15238, -0.48179, 0.14716, 0.21989, -0.34343, 0.11086, -0.78381, -0.37902, 0.24224, 0.32179, 0.12812, -0.25812, 0.0691 , -0.12891, 0.26979,-0.50833, 0.13793, 0.06658, -0.53001, 0.52572, -0.16194, 
0.48118, 0.15876, -0.65132, -0.24602, 0.3963 , -0.16651,-0.31605, -0.46947, -0.50195, 0.0378 , -0.34937, -0.53062, 0.35957, 0.35408, 0.38732, -0.12154, -0.22827, -0.7151}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {0, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2250,15 +2250,15 @@ TEST_F(DeclarableOpsTests3, svd_test5) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5f); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5f); } delete results; @@ -2290,7 +2290,7 @@ TEST_F(DeclarableOpsTests3, svd_test6) { 0.45726, -0.33952, -0.32659, -0.18938, -0.73015,0.13486, 0.73816, -0.41646, 0.47458, -0.1956 ,0.5536 , -0.137 , 0.64688, 0.50536, 0.03017, -0.51827, -0.31837, -0.16732, 0.71378, -0.30425,-0.39314, 0.15266, 0.63693, -0.30945, -0.5663 ,-0.51981, 0.03325, 0.37603, 0.05147, 0.76462,-0.01282, 0.92491, -0.08042, 0.36977, -0.03428}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {1, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2305,15 +2305,15 @@ TEST_F(DeclarableOpsTests3, svd_test6) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), 
sd::math::nd4j_abs(u->e(i)), 1e-5f); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5f); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5f); } delete results; @@ -2328,7 +2328,7 @@ TEST_F(DeclarableOpsTests3, svd_test7) { auto expS= NDArrayFactory::create('c', {2,2,5}, {40.95395, 31.46869, 24.79993, 12.33768, 1.80031,38.18412, 31.52287, 23.52755, 11.79484, 1.90195, 39.34498, 32.54861, 17.52492, 7.03003, 2.2399,44.72126, 32.3164 , 16.60139, 6.88783, 0.78122}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {0, 0, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2464,7 +2464,7 @@ TEST_F(DeclarableOpsTests3, svd_test7) { // -0.286167, -0.101052, -0.181529, -0.419406, -0.032204, -0.732282, 0.106833, -0.288881, 0.171516, -0.096242, // -0.331834, -0.493188, 0.393195, 0.358365, 0.049125, 0.123457, 0.438169, -0.105015, 0.092386, -0.130413, -0.476991}); -// nd4j::ops::svd op; +// sd::ops::svd op; // auto results = op.execute({&x}, {}, {1, 1, 7}); // ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2479,15 +2479,15 @@ TEST_F(DeclarableOpsTests3, svd_test7) { // ASSERT_TRUE(expS.equalsTo(s)); - // if(nd4j::Environment::getInstance()->isCPU()) { + // if(sd::Environment::getInstance()->isCPU()) { // ASSERT_TRUE(expU.equalsTo(u)); // ASSERT_TRUE(expV.equalsTo(v)); // } // else { // for(uint i = 0; i < expU.lengthOf(); ++i) - // ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5); + // ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5); // for(uint i = 0; i < expV.lengthOf(); ++i) - // ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5); + // ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5); // } // delete results; @@ -2527,7 +2527,7 @@ TEST_F(DeclarableOpsTests3, svd_test9) { -4.94030000e-01, 1.55540000e-01, 
-3.46720000e-01, -7.58460000e-01,5.20000000e-04, 1.90420000e-01,2.55960000e-01, 3.17040000e-01, -3.47800000e-02, -3.01860000e-01,-3.57600000e-02, -8.60450000e-01, 1.31650000e-01, 7.57150000e-01, -4.89030000e-01, 3.47710000e-01,-4.39400000e-02, 2.17750000e-01,-6.57270000e-01, 2.91000000e-01, 4.17280000e-01, 2.52880000e-01,-4.63400000e-01, -1.74620000e-01}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {1, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2542,15 +2542,15 @@ TEST_F(DeclarableOpsTests3, svd_test9) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5); } delete results; @@ -2586,7 +2586,7 @@ TEST_F(DeclarableOpsTests3, svd_test10) { -4.94030000e-01, 1.55540000e-01, -3.46720000e-01, -7.58460000e-01,5.20000000e-04,2.55960000e-01, 3.17040000e-01, -3.47800000e-02, -3.01860000e-01,-3.57600000e-02,1.31650000e-01, 7.57150000e-01, -4.89030000e-01, 3.47710000e-01, -4.39400000e-02,-6.57270000e-01, 2.91000000e-01, 4.17280000e-01, 2.52880000e-01,-4.63400000e-01}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {0, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2601,15 +2601,15 @@ TEST_F(DeclarableOpsTests3, svd_test10) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 
0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5); } delete results; @@ -2631,7 +2631,7 @@ TEST_F(DeclarableOpsTests3, svd_test11) { -0.26072, -0.51887, 0.18182, 0.96306, -0.19863, 0.85948, 0.2707 , -0.4336 , 0.26688, 0.48582, 0.83232, -0.43596, 0.83108, -0.34531}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {0, 1, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2646,15 +2646,15 @@ TEST_F(DeclarableOpsTests3, svd_test11) { ASSERT_TRUE(expS.equalsTo(s)); - if(nd4j::Environment::getInstance()->isCPU()) { + if(sd::Environment::getInstance()->isCPU()) { ASSERT_TRUE(expU.equalsTo(u)); ASSERT_TRUE(expV.equalsTo(v)); } else { for(uint i = 0; i < expU.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expU.e(i)), nd4j::math::nd4j_abs(u->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expU.e(i)), sd::math::nd4j_abs(u->e(i)), 1e-5); for(uint i = 0; i < expV.lengthOf(); ++i) - ASSERT_NEAR(nd4j::math::nd4j_abs(expV.e(i)), nd4j::math::nd4j_abs(v->e(i)), 1e-5); + ASSERT_NEAR(sd::math::nd4j_abs(expV.e(i)), sd::math::nd4j_abs(v->e(i)), 1e-5); } delete results; @@ -2666,7 +2666,7 @@ TEST_F(DeclarableOpsTests3, svd_test12) { NDArray x('c', {4,3}, {1.7787856,0.80119777,0.72437465,0.23089433,1.7271413,0.18039072,0.50563407,0.89252293,1.5461209,0.92336726,0.085571885,0.79378015}); NDArray expS('c', {3}, {3.024703, 1.459483, 1.026371}); - nd4j::ops::svd op; + sd::ops::svd op; auto results = op.evaluate({&x}, {}, {1, 0, 16}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2685,7 +2685,7 @@ TEST_F(DeclarableOpsTests3, elu_test1) { auto x = NDArrayFactory::create('c', {3,3}, {0.1, .2, .3, 
-.4,-.5,-.6, .7, .8, .9}); auto exp = NDArrayFactory::create('c', {3,3}, {.1, .2, .3, 0.5*-0.32968, 0.5*-0.393469, 0.5*-0.451188, .7, .8, .9}); - nd4j::ops::elu op; + sd::ops::elu op; auto results = op.evaluate({&x}, {0.5}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2704,7 +2704,7 @@ TEST_F(DeclarableOpsTests3, elu_bp_test1) { eps.assign(2.); auto exp = NDArrayFactory::create('c', {3, 3}, {2, 2, 2, 0.5*1.34064, 0.5*1.213061, 0.5*1.097623, 2, 2, 2}); - nd4j::ops::elu_bp op; + sd::ops::elu_bp op; auto results = op.evaluate({ &x, &eps }, {0.5}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2721,7 +2721,7 @@ TEST_F(DeclarableOpsTests3, lrelu_test1) { auto x = NDArrayFactory::create('c', {3,3}, {1, 2, 3, -4,-5,-6, 7, 8, 9}); auto exp = NDArrayFactory::create('c', {3,3}, {1, 2, 3, -0.8, -1., -1.2, 7, 8, 9}); - nd4j::ops::lrelu op; + sd::ops::lrelu op; auto results = op.evaluate({&x}, {0.2}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2738,7 +2738,7 @@ TEST_F(DeclarableOpsTests3, lrelu_bp_test1) { auto eps = NDArrayFactory::create('c', {3,3}, {2,2,2,2,2,2,2, 2,2}); auto exp = NDArrayFactory::create('c', {3,3}, {2, 2, 2, 0.4, 0.4, 0.4, 2, 2, 2}); - nd4j::ops::lrelu_bp op; + sd::ops::lrelu_bp op; auto results = op.evaluate({&x, &eps}, {0.2}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2755,7 +2755,7 @@ TEST_F(DeclarableOpsTests3, selu_test1) { auto x = NDArrayFactory::create('c', {3,3}, {1, 2, 3, -4,-5,-6, 7, 8, 9}); auto exp = NDArrayFactory::create('c', {3,3}, {1.050701, 2.101402, 3.152103, -1.725899, -1.746253, -1.753742, 7.354907, 8.405608, 9.456309}); - nd4j::ops::selu op; + sd::ops::selu op; auto results = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2773,7 +2773,7 @@ TEST_F(DeclarableOpsTests3, selu_test2) { auto eps = NDArrayFactory::create('c', {3,3}, {2,2,2,2,2,2,2, 2,2}); auto exp = NDArrayFactory::create('c', {3,3}, {2.101401, 2.101402, 2.101402, 0.064401, 0.023692, 0.008716, 2.101402, 
2.101402, 2.101402}); - nd4j::ops::selu_bp op; + sd::ops::selu_bp op; auto results = op.evaluate({&x, &eps}, {0.2}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2793,7 +2793,7 @@ TEST_F(DeclarableOpsTests3, EQScalarTests_1) { auto x = NDArrayFactory::create(1.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::eq_scalar op; + sd::ops::eq_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_TRUE(res); @@ -2805,7 +2805,7 @@ TEST_F(DeclarableOpsTests3, EQScalarTests_2) { auto x = NDArrayFactory::create(2.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::eq_scalar op; + sd::ops::eq_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_FALSE(res); } @@ -2816,7 +2816,7 @@ TEST_F(DeclarableOpsTests3, GTScalarTests_1) { auto x = NDArrayFactory::create(1.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::gt_scalar op; + sd::ops::gt_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_FALSE(res); } @@ -2827,7 +2827,7 @@ TEST_F(DeclarableOpsTests3, GTScalarTests_2) { auto x = NDArrayFactory::create(2.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::gt_scalar op; + sd::ops::gt_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_TRUE(res); } @@ -2838,7 +2838,7 @@ TEST_F(DeclarableOpsTests3, GTEScalarTests_1) { auto x = NDArrayFactory::create(1.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::gte_scalar op; + sd::ops::gte_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_TRUE(res); } @@ -2849,7 +2849,7 @@ TEST_F(DeclarableOpsTests3, GTEScalarTests_2) { auto x = NDArrayFactory::create(2.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::gte_scalar op; + sd::ops::gte_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_TRUE(res); } @@ -2860,7 +2860,7 @@ TEST_F(DeclarableOpsTests3, GTEScalarTests_3) { auto x = NDArrayFactory::create(1.0f); auto scalar = NDArrayFactory::create(2.0f); - nd4j::ops::gte_scalar op; + sd::ops::gte_scalar op; auto res = op.verify({&x, &scalar}); 
ASSERT_FALSE(res); } @@ -2871,7 +2871,7 @@ TEST_F(DeclarableOpsTests3, LTEScalarTests_1) { auto x = NDArrayFactory::create(1.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::lte_scalar op; + sd::ops::lte_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_TRUE(res); } @@ -2882,7 +2882,7 @@ TEST_F(DeclarableOpsTests3, LTEScalarTests_2) { auto x = NDArrayFactory::create(2.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::lte_scalar op; + sd::ops::lte_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_FALSE(res); } @@ -2893,7 +2893,7 @@ TEST_F(DeclarableOpsTests3, LTEScalarTests_3) { auto x = NDArrayFactory::create(1.0f); auto scalar = NDArrayFactory::create(2.0f); - nd4j::ops::lte_scalar op; + sd::ops::lte_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_TRUE(res); } @@ -2904,7 +2904,7 @@ TEST_F(DeclarableOpsTests3, NEQScalarTests_1) { auto x = NDArrayFactory::create(1.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::neq_scalar op; + sd::ops::neq_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_FALSE(res); @@ -2916,7 +2916,7 @@ TEST_F(DeclarableOpsTests3, NEQScalarTests_2) { auto x = NDArrayFactory::create(2.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::neq_scalar op; + sd::ops::neq_scalar op; auto res = op.verify({&x, &scalar}); ASSERT_TRUE(res); } @@ -2927,8 +2927,8 @@ TEST_F(DeclarableOpsTests3, NOOPTests_1) { auto x = NDArrayFactory::create(2.0f); auto scalar = NDArrayFactory::create(1.0f); - nd4j::ops::noop op; + sd::ops::noop op; auto res = op.evaluate({&x, &scalar}, {}, {}); - ASSERT_TRUE(res->status() == nd4j::Status::OK()); + ASSERT_TRUE(res->status() == sd::Status::OK()); delete res; } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp index 1fb700779..7f65c3cbf 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp @@ -21,12 +21,12 @@ 
#include "testlayers.h" #include #include -#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DeclarableOpsTests4 : public testing::Test { public: @@ -35,8 +35,8 @@ public: printf("\n"); fflush(stdout); - nd4j::ops::adjust_hue op0; - nd4j::ops::adjust_saturation op1; + sd::ops::adjust_hue op0; + sd::ops::adjust_saturation op1; } }; @@ -48,8 +48,8 @@ public: printf("\n"); fflush(stdout); - nd4j::ops::adjust_hue op0; - nd4j::ops::adjust_saturation op1; + sd::ops::adjust_hue op0; + sd::ops::adjust_saturation op1; } }; @@ -63,7 +63,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_1) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -84,7 +84,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_2) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 0, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -105,7 +105,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_3) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 1, 0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -125,7 +125,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_4) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 0, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -146,7 +146,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_5) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -167,7 +167,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_6) { x.linspace(1); - nd4j::ops::avgpool2d op; + 
sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 1, 1, 1, 1, 0, 1, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -188,7 +188,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_7) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 2, 2, 0, 0, 1, 1, 1, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -208,7 +208,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_8) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -229,7 +229,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_9) { x.linspace(1); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -261,7 +261,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, avgpool2d_10) { 4.63436460f, -4.69907761f, 1.25187206f, 11.46173859f, -2.21917558f, 1.28007793f, 6.92173195f, 2.11268163f, -3.47389889f, 5.08722782f, -3.03950930f, -4.17154264f, 11.30568314f, 0.80361372f, 2.53214502f, 7.18707085f, -4.49114513f, 2.85449266f, 10.14906883f, -0.31974933f, -0.84472644f, -0.52459574f, 0.12921631f, -1.81390119f, 2.76170087f, 1.03982210f, 2.91744232f, -0.29048753f, 5.87453508f, -1.53684759f, 1.85800636f, -0.91404629f, 1.28954852f, 5.11354685f, -2.47475505f, -1.33179152f, 2.58552408f, 1.37316465f, -3.32339454f, 1.54122913f, 3.24953628f, -0.29758382f, 2.82391763f, -1.51142192f, -1.22699404f, 6.75745535f, 0.65452754f, -3.29385471f, 2.06008053f, 2.53172946f, -4.23532820f, -1.53909743f, -0.07010663f, -1.42173731f, 7.29031610f, -0.18448229f, 4.59496164f, 6.73027277f, 0.73441899f, 0.14426160f, 4.14915276f, -2.97010231f, 6.05851364f, 4.95218086f, -2.39145470f, 2.40494704f, 2.10288811f, 0.53503096f, 1.44511235f, 6.66344261f, -3.05803776f, 7.21418667f, 3.30303526f, -0.24163735f, 3.47409391f, 
3.64520788f, 2.15189481f, -3.11243272f, 3.62310791f, 0.37379482f, 0.40865007f, -0.83132005f, -4.78246069f, 2.07030797f, 6.51765442f, 3.16178989f, 5.06180477f, 3.78434467f, -0.96689719f, 0.35965276f, 5.89967585f, 1.40294051f, 1.11952639f, 10.59778214f, 0.26739889f, -1.61297631f, 6.24801159f, -0.93914318f, -0.57812452f, 9.92604542f, -0.73025000f, -3.38530874f, 2.45646000f, -2.47949195f, 0.51638460f, 10.65636063f, 1.97816694f, -3.00407791f, 2.66914415f, -0.81951088f, -0.23316640f, 2.40737987f, -2.70007610f, 1.51531935f, 4.08860207f, -0.27552786f, -1.31721711f, 7.11568260f, -3.33498216f, -4.02545023f, 7.22675610f, -0.81690705f, -2.52689576f, 1.04016697f, -0.79291463f, -0.34875512f, 10.00498390f, -4.24167728f, 1.46162593f, 11.82569408f, -1.70359993f, -0.30161047f, 16.44085884f, -0.82253462f, -0.09435523f, 6.13080597f, -0.20259480f, 0.68308711f, 6.15663004f, -6.61776876f, 0.33295766f, 2.55449438f, -0.17819691f, -1.14892209f, 5.56776142f, 1.99279118f, 1.33035934f, 4.45823956f, 3.34916544f, -2.59905386f, 6.16164446f, -2.03881931f, -2.45273542f, 12.46793365f, -2.22743297f, 2.83738565f, 8.48628139f, -1.39347959f, -1.30867767f, 11.08041477f, -4.00363779f, 2.09183025f, 11.30395889f, -2.20504737f, 1.37426853f, 8.98735619f, 1.04676604f, -0.72757077f, 8.28050232f, -6.70741081f, -0.65798020f, 5.68592072f, -0.60760021f, 0.35854483f, 6.26852131f, 1.94100165f, 1.32112014f, 0.80987954f, -1.74617672f, -0.25434083f, 7.16045523f, 1.58884013f, -2.64847064f, 13.14820385f, 1.21393633f, -2.47258949f, 9.41650105f, -0.79384226f, 2.48954105f, 10.95629311f, 0.47723705f, 4.02126694f, 8.02593136f, -2.20726371f, -1.18794477f, 1.50836647f, 0.93118095f, -1.73513174f, 8.85493565f, -2.99670315f, -0.79055870f, 2.39473820f, 2.05046916f, -2.38055134f, 11.82299423f, 0.15609655f, 0.68744308f, 5.66401434f, -0.69281673f, 2.09855556f, 7.74626589f, -0.34283102f, 1.00542057f, 9.95838642f, 0.80161905f, 2.33455157f, 9.80057335f, -0.93561798f, 2.56991577f, 8.29711342f, 0.94213426f, 0.44209945f, 11.70259857f, 
0.92710167f, 2.60957146f, 0.24971688f, -0.86529571f, 3.78628922f, 6.80884457f, -0.68178189f, 2.21103406f, 3.18895817f, 0.60283208f, -2.92716241f, 6.72060776f, -1.06625068f, 2.56543374f, 9.97404480f, 3.58080721f, -0.94936347f, 10.16736984f, -1.38464379f, 1.18191063f, 6.66179037f, -3.56115270f, 0.32329530f, 10.90870762f, 2.20638227f, 0.19653285f, 7.34650040f, -3.63859272f, -1.03027737f, 5.98829985f, -3.66606474f, -3.89746714f, 8.63469028f, 1.22569811f, 1.63240814f, 3.74385309f, 0.58243257f, -0.56981975f, 3.69260955f, 1.00979900f, -1.44030499f, 8.57058144f, -1.10648811f, 1.20474911f, 5.43133020f, -2.14822555f, -0.07928789f, 11.25825310f, 0.19645604f, -5.49546146f, 10.41917038f, -0.68178523f, -2.99639869f, 6.50054455f, 0.46488351f, -5.42328453f, 9.09500027f, -2.82107449f, 0.05601966f, 15.34610748f, -0.06820253f, 3.86699796f, 10.73316956f, -3.04795432f, -0.14702171f, 5.64813185f, 1.44028485f, -2.47596145f, 0.07280898f, -3.03187990f, -1.35183525f, 9.35835648f, 2.72966957f, 1.88199532f, 10.36187744f, -0.22834805f, -3.26738238f, 6.92025137f, -2.34061313f, 4.77379704f, 5.28559113f, -2.96323752f, -1.76186585f, 5.94436455f, 0.38647744f, -5.73869514f, 6.76849556f, 1.40892124f, -1.19068217f, 5.37919092f, -6.65328646f, 3.62782669f, 12.34744644f, 2.44762444f, -4.19242620f, 6.14906216f, 0.08121119f, 0.61355996f, 2.69666457f, -1.88962626f, -0.55314136f, 1.84937525f, 1.56048691f, 1.17460012f, 3.75674725f, 1.06198275f, -5.74625874f, 5.41645575f, -1.28946674f, -1.51689398f, 4.32400894f, -0.05222082f, -4.83948946f, 1.80747867f, 1.63144708f, -2.73887825f, 1.63975775f, -2.02163982f, -0.16210437f, 2.93518686f, 1.14427686f, -2.83246303f, 4.79283667f, 2.69697428f, -3.12678456f, -1.19225168f, -2.37022972f, -3.09429741f, 1.94225383f, -1.13747168f, -2.55048585f, 5.40242243f, 1.12777328f, 3.43713188f, 3.62658787f, -2.16878843f, 0.30164462f, 2.97407579f, -0.07275413f, -1.31149673f, 4.70066261f, -2.01323795f, 4.85255766f, 4.59128904f, 1.68084168f, 1.60336494f, 6.58138466f, -1.04759812f, 
2.69906545f, 3.55769277f, -0.74327278f, 2.65819693f, 5.39528131f, 2.11248922f, -1.06446671f, 5.24546766f, -2.43146014f, 4.58907509f, 0.06521678f, -2.24503994f, 2.45722699f, 6.94863081f, 0.35258654f, 2.83396196f, 9.92525196f, -1.12225175f, -0.34365177f, 7.19116688f, -4.39813757f, 0.46517885f, 13.22028065f, -2.57483673f, -6.37226963f, 7.58046293f, -2.74600363f, 0.42231262f, 8.04881668f, 0.17289802f, -0.53447008f, 16.55157471f, -5.63614368f, 0.39288223f, 3.37079263f, 1.26484549f, -0.12820500f, 8.46440125f, -4.39304399f, 2.97676420f, 0.65650189f, 0.83158541f, -1.11556435f, 6.32885838f, -0.36087769f, 2.80724382f, 9.90292645f, 1.15936041f, 0.20947981f, 6.91249275f, -2.67404819f, 2.93782163f, 6.65656614f, -2.30828357f, 2.98214006f, 6.80611229f, -4.93821478f, -7.66555262f, 7.59763002f, -0.54159302f, 3.87403512f, 12.42607784f, 2.59284401f, -0.23375344f, 8.95293331f, -0.71807784f, 0.61873478f, 8.66713524f, 1.24289191f, -2.37835455f, 2.08071637f, -0.88315344f, -3.41891551f, 6.85245323f, 1.73007369f, 1.02169311f, 7.69170332f, -2.85411978f, 2.69790673f, 8.12906551f, -1.19351399f, -2.26442742f, 12.26104450f, -0.75579089f, -1.73274946f, 10.68729019f, 2.20655656f, -0.90522075f, 12.42165184f, -1.67929137f, 2.44851565f, 9.31565762f, -0.06645700f, 1.52762020f, 6.18427515f, -1.68882596f, 3.70261097f, 3.02252960f, -3.44125366f, -1.31575799f, 2.84617424f, -0.96849400f, -4.52356243f, 9.95027161f, 0.19966406f, -0.78874779f, 8.18595028f, -4.08300209f, 1.75126517f, 0.96418417f, -4.04913044f, -0.95200396f, 12.03637886f, -0.03041124f, 0.41642749f, 8.88267422f, -3.24985337f, -2.24919462f, 7.32566118f, 0.16964148f, -2.74123430f, 7.05264473f, -3.30191112f, 0.17163286f, 4.81851053f, -1.64463484f, -0.85933101f, 7.29276276f, 2.34066939f, -2.14860010f, 3.46148157f, -0.01782012f, 1.51504040f, 4.79304934f, 1.85281146f, -1.70663762f, 6.93470192f, -4.15440845f, -1.25983095f, 10.52491760f, 0.42930329f, -1.85146868f, 11.70042324f, -0.41704914f, 3.83796859f, 9.21148491f, -2.79719448f, 0.79470479f, 
6.26926661f, -5.85230207f, 3.95105338f, 7.84790897f, -1.38680744f, -1.78099084f, 11.95235348f, -2.99841452f, -1.34507811f, 6.15714645f, -1.07552516f, -2.81228638f, 1.66234732f, -4.55166149f, -1.92601109f, 8.64634514f, -0.48158705f, 3.31595659f, 7.67371941f, 2.56964207f, 0.12107098f, 4.56467867f, -0.93541539f, 1.39432955f, 11.99714088f, 1.05353570f, -2.13099813f, 3.67617917f, 3.45895386f, 1.37365830f, 8.74344158f, -4.17585802f, 1.43908918f, 6.28764772f, 3.97346330f, -0.69144285f, 9.07983303f, -0.41635889f, -0.14965028f, 8.85469818f, 1.11306190f, 2.59440994f, 5.38982344f, -1.07948279f, 1.37252975f, 10.26984596f, -0.09318046f, 2.73104119f, 12.45902252f, -1.55446684f, -2.76124811f, 12.19395065f, -0.51846564f, 1.02764034f, 11.42673588f, -0.95940983f, -0.04781032f, 8.78379822f, -4.88957930f, 0.32534006f, 11.97696400f, -3.35108662f, 1.95104563f, 4.46915388f, -2.32061648f, 3.45230985f, 8.29983711f, 2.81034684f, -2.35529327f, 6.07801294f, -0.98105043f, -0.05359888f, 2.52291036f, -0.01986909f, -2.35321999f, 10.51954269f, 2.11145401f, 3.53506470f, 7.29093266f, 0.03721160f, -1.13496494f, 7.43886709f, -5.84201956f, 2.50796294f, 12.14647675f, 2.77490377f, -2.18896222f, 6.05641937f, 5.32617044f, 1.04221284f, 10.79106712f, -2.95749092f, -2.75414610f, 11.30037117f, -3.40654182f, -2.24673963f, 7.49126101f, 0.70811015f, -6.18003702f, 13.83951187f, -1.01204085f, 1.36298490f, -1.04451632f, 2.42435336f, -0.02346706f, -0.85528886f, 1.04731262f, 0.22192979f, 4.15708160f, 0.34933877f, 0.04814529f, 2.24107265f, 0.49676740f, -1.47752666f, 0.45040059f, -0.70471478f, -1.19759345f, 0.21711677f, 0.88461423f, -2.76830935f, 5.52066898f, 1.97664857f, -1.75381601f, 3.45877838f, 1.52617192f, -1.61350942f, 0.85337949f, 1.97610760f, -3.40310287f, 3.40319014f, -3.38691044f, -0.71319139f, 1.65463758f, -0.60680127f, -1.80700517f, 8.02592373f, 2.59627104f, 2.65895891f, 5.93043184f, -4.48425817f, 3.92670918f, 4.19496679f, -2.28286791f, 6.41634607f, 5.72330523f, 1.16269672f, -0.28753027f, 2.46342492f, 
0.36693189f, 0.26712441f, 6.37652683f, -2.50139046f, 2.43923736f, 5.56310415f, 0.98065847f, 1.04267502f, 4.16403675f, -0.04966142f, 4.40897894f, 3.72905660f, -3.46129870f, 3.59962773f, 1.34830284f, -1.76661730f, 0.47943926f, 5.29946661f, -1.12711561f, 1.26970029f, 15.17655945f, -1.50971997f, 5.81345224f, 8.48562050f, -4.36049604f, 2.48144460f, 8.23780441f, -3.46030426f, -0.84656560f, 5.94946814f, 1.12747943f, -2.65683913f, 8.69085693f, 1.31309867f, -2.79958344f, 8.76840591f, -1.56444156f, 1.62710834f, 2.41177034f, -0.72804940f, 5.70619011f, 4.67169666f, -0.86167198f, -1.83803177f, 2.96346045f, 2.82692933f, -2.81557131f, 7.11113358f, -1.90071094f, 2.54244423f, 11.19284058f, -0.06298946f, -1.71517313f, 12.98388577f, 0.84510714f, 3.00816894f, 2.57200313f, 0.03899818f, -1.49330592f, 9.60099125f, -3.59513044f, -1.30045319f, 7.09241819f, -0.65233821f, -2.33627677f, 8.81366920f, 0.84154201f, 1.03312039f, 9.85289097f, 0.19351870f, 1.78496623f, 7.34631205f, -2.16530800f, -0.65016162f, 2.46842360f, 0.24016285f, -1.24308395f, 4.78175163f, -0.97682536f, 2.20942235f, 6.68382788f, 3.76786447f, -1.44454038f, 6.26453733f, -3.23575711f, -2.30137897f, 9.53092670f, -5.55222607f, 3.25999236f, 9.37559509f, 1.86339056f, -0.23551451f, 10.23400211f, 3.93031883f, -0.52629089f, 7.85724449f, -2.91549587f, 4.46612740f, 5.66530371f, -2.70820427f, 4.81359577f, 10.31247330f, 1.92230141f, 2.53931546f, 0.74986327f, 1.70303428f, 0.48063779f, 5.31099129f, -0.78976244f, 3.75864220f, 4.23051405f, 2.34042454f, -7.98193836f, 9.83987141f, -1.46722627f, 3.54497814f, 10.36455154f, -4.51249075f, 0.77715248f, 7.78694630f, -4.59989023f, -2.49585629f, 9.90296268f, 1.38535416f, 1.17441154f, 10.10452843f, -0.98628229f, 0.60194463f, 9.12639141f, -3.90754628f, 2.88526392f, 7.24123430f, -0.15283313f, -0.75728363f, -1.15116858f, -2.53791571f, 0.77229571f, 6.44114161f, 0.02646767f, 4.95463037f, 7.21066380f, 1.79384065f, 0.73250306f, 8.04447937f, 0.32576546f, -0.79447043f, 10.12717724f, 2.33392906f, 1.30716443f, 
12.36073112f, -0.36694977f, -1.20438910f, 7.03105593f, 0.59557682f, 0.69267452f, 10.18113136f, 2.49944925f, -0.42229167f, 8.83143330f, -1.18805945f, -2.87509322f, 4.53596449f, 4.09732771f, -3.39088297f, -1.02536607f, 0.82119560f, -3.47302604f, 9.29991817f, 0.21001509f, 4.97036457f, 9.50018406f, 1.04420102f, 1.96560478f, 10.74769592f, -6.22709799f, 3.11690164f, 5.06759691f, -1.23724771f, -3.05831861f, 8.12925529f, -1.93435478f, -1.10151744f, 9.32263088f, -0.04249470f, -5.98547363f, 10.49398136f, 0.26400441f, -0.78915191f, 13.28219604f, 2.99276900f, 0.74853164f, 2.49364305f, -3.43529654f, 4.05278301f, 2.13498688f, -2.35444307f, -0.79900265f, 4.66968822f, -0.31095147f, 3.60674143f, 12.37222099f, -0.07855003f, -3.30292702f, 12.15215874f, 0.60886210f, 2.87075138f, 7.75271845f, 0.38044083f, 3.34402204f, 6.40583277f, -0.87888050f, 0.67438459f, 6.91080809f, 1.98332930f, -0.08303714f, 8.08630371f, -0.16772588f, -2.74058914f, 7.17253590f, -2.69122696f, 1.48173678f, 8.99470139f, -1.43302310f, -0.88651133f, 2.66944790f, -0.29186964f, 2.00838661f, 5.09587479f, -0.76676071f, -2.88322186f, 8.31110573f, -0.14550979f, -1.37726915f, 10.28355122f, -1.60575438f, -0.04118848f, 9.97510815f, 0.14440438f, -3.24632120f, 9.00034523f, 4.14319563f, -1.31023729f, 7.16950464f, -0.70428526f, 2.01559544f, 7.26155043f, 2.40816474f, 2.09847403f, 7.31264496f, -0.75401551f, 2.13392544f, 7.03648758f, 1.04036045f, -1.15636516f, 1.09634531f, -0.06340861f, -0.58107805f, -0.65623116f, 1.18972754f, -0.80717683f, 1.40118241f, -0.61932516f, -3.60596156f, 1.59904599f, -2.23774099f, -1.13721037f, 3.89620137f, -0.09115922f, -7.51356888f, 2.36975193f, -1.42520905f, -2.34173775f, 3.33830214f, -2.74016523f, -3.04115510f, 6.00119495f, -1.36084354f, -2.45065260f, 4.56992292f, -3.02825928f, -3.74182844f, 5.11069250f, -0.91531068f, -2.31385994f, 1.83399653f, 3.39370203f, -3.60886002f}); auto exp = NDArrayFactory::create('c', {4, 4, 4, 3}, {7.97172260f, 0.06878620f, 2.27749538f, 7.29276514f, -0.14074677f, 0.65480286f, 
5.70313978f, -0.06546132f, 0.35443667f, 3.70382833f, -0.84020567f, 0.63826996f, 8.60301399f, -0.38236514f, 1.55177069f, 7.37542057f, -0.99374938f, -0.29971302f, 8.84352493f, -0.67121059f, 0.43132120f, 4.78175592f, -1.25070143f, -1.91523600f, 6.03855371f, -0.00292124f, -1.11214364f, 7.90158176f, -0.57949901f, -0.96735370f, 7.81192017f, -0.53255427f, -0.48009714f, 3.16953635f, 0.08353355f, -1.54299748f, 3.74821687f, 1.69396687f, 0.72724354f, 5.42915201f, -1.13686812f, -0.71793109f, 5.78376389f, -0.72239977f, -0.60055625f, 2.53636408f, 0.56777251f, -2.07892323f, 6.08064651f, 0.68620735f, 2.54017019f, 5.65828180f, -0.68255502f, 1.47283304f, 6.10842514f, -0.39655915f, 0.28380761f, 1.96707797f, -1.98206317f, 0.94027776f, 4.71811438f, 0.32104525f, -0.92409706f, 8.34588146f, -1.05581069f, -0.55217457f, 9.58440876f, -0.96549922f, 0.45820439f, 5.65453672f, -2.50953507f, -0.71441835f, 8.03059578f, -0.21281289f, 0.92125505f, 9.26900673f, -0.35963219f, -0.70039093f, 8.59924412f, -1.22358346f, 0.81318003f, 3.85920119f, -0.01305223f, -1.09234154f, 6.33158875f, 1.28094780f, -1.48926139f, 4.94969177f, -0.77126902f, -1.97033751f, 5.64381838f, -0.16285487f, -1.31277227f, 2.39893222f, -1.32902908f, -1.39609122f, 6.47572327f, -0.45267010f, 1.55727172f, 6.70965624f, -1.68735468f, -0.05672536f, 7.25092363f, -0.64613032f, 0.67050058f, 3.60789680f, -2.05948973f, 2.22687531f, 8.15202713f, -0.70148355f, 1.28314006f, 8.14842319f, -1.88807654f, -1.04808438f, 8.45500565f, -0.76425624f, 0.94542569f, 4.56179953f, -0.28786001f, -2.04502511f, 8.46278095f, -0.31019822f, 0.07339200f, 9.34214592f, -0.61948007f, 0.52481830f, 8.32515621f, -1.52418160f, 0.49678251f, 5.11082315f, -1.09908783f, -0.52969611f, 5.27806664f, 0.88632923f, 0.66754371f, 4.75839233f, 0.48928693f, -0.68036932f, 6.56925392f, -0.02949905f, -2.99189186f, 4.46320581f, -0.64534980f, -0.29516968f, 8.60809517f, -1.13120568f, 3.41720533f, 5.84243155f, -1.24109328f, 0.89566326f, 5.99578333f, -0.42496428f, 2.07076764f, 3.17812920f, 
-0.81566459f, -0.14363396f, 6.55184317f, 0.39633346f, -0.43852386f, 8.70214558f, -2.24613595f, 0.30708700f, 8.73882294f, -0.53545928f, 1.54409575f, 4.49452257f, -0.16509305f, 0.19028664f, 8.24897003f, 0.44750381f, 2.15448594f, 8.97640514f, -0.77728152f, 0.57272542f, 9.03467560f, 0.47173575f, -1.10807717f, 3.30056310f, -0.43268481f, -0.41470885f, 3.53798294f, -0.08546703f, -2.16840744f, 6.18733406f, -0.17871059f, -2.59837723f, 5.94218683f, -1.02990067f, -0.49760687f, 3.76938033f, 0.86383581f, -1.91504073f}); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&input}, {3,3, 3,3, 0,0, 1,1,1, 0,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -285,7 +285,7 @@ TEST_F(DeclarableOpsTests4, avgpool2d_11) { auto x = NDArrayFactory::create('c', {1, inOutH, inOutW, inOutC}); x.linspace(1.0); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto result = op.evaluate({&x}, {3,3, 1,1, 0,0, 1,1, 1, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -308,11 +308,11 @@ TEST_F(DeclarableOpsTests4, avgpool2d_11) { int hTo = hFrom + k; int wTo = wFrom + k; - hFrom = nd4j::math::nd4j_max(0, hFrom); - wFrom = nd4j::math::nd4j_max(0, wFrom); + hFrom = sd::math::nd4j_max(0, hFrom); + wFrom = sd::math::nd4j_max(0, wFrom); - hTo = nd4j::math::nd4j_min(inOutH, hTo); - wTo = nd4j::math::nd4j_min(inOutW, wTo); + hTo = sd::math::nd4j_min(inOutH, hTo); + wTo = sd::math::nd4j_min(inOutW, wTo); int idxOut[4]; int idxIn[4]; @@ -361,7 +361,7 @@ TEST_F(DeclarableOpsTests4, avgpool2d_12) { 1082.5, 1083.5, 1084.5,1090. , 1091. , 1092. ,1099. , 1100. , 1101. ,1106.5, 1107.5, 1108.5,1157.5, 1158.5, 1159.5,1165. , 1166. , 1167. ,1174. , 1175. , 1176. 
,1181.5, 1182.5, 1183.5}); input.linspace(1.); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto results = op.evaluate({&input}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 0, dataFormat}); auto output = results->at(0); @@ -407,7 +407,7 @@ TEST_F(DeclarableOpsTests4, avgpool2d_13) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dW,dH, 0, 0, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::avgpool2d pooling; + sd::ops::avgpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -450,7 +450,7 @@ TEST_F(DeclarableOpsTests4, avgpool2d_14) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dW,dH, 0, 0, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::avgpool2d pooling; + sd::ops::avgpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -476,8 +476,8 @@ TEST_F(DeclarableOpsTests4, Avgpool2d_test15) { const int pW = 0; const int dH = 1; const int dW = 1; - const int oH = (int) nd4j::math::nd4j_ceil(iH * 1.f / sH); - const int oW = (int) nd4j::math::nd4j_ceil(iW * 1.f / sW); + const int oH = (int) sd::math::nd4j_ceil(iH * 1.f / sH); + const int oW = (int) sd::math::nd4j_ceil(iW * 1.f / sW); auto x = NDArrayFactory::create_('c', {bS,iD,iH,iW}); @@ -493,7 +493,7 @@ TEST_F(DeclarableOpsTests4, Avgpool2d_test15) { std::vector* argI = block->getIArguments(); *argI = {kH,kW, sH,sW, pH,pW, dW,dH, 1, 0, 0}; // 0,1 - kernel Height/Width; 2,3 - stride Height/Width; 4,5 - pad Height/Width; 6,7 - dilation Height/Width; 8 - same mode; - nd4j::ops::avgpool2d pooling; + sd::ops::avgpool2d pooling; Nd4jStatus status = pooling.execute(block); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -513,13 +513,13 @@ TEST_F(DeclarableOpsTests4, avgpool2d_16) { int paddingMode = 1; // 
1-SAME, 0-VALID int dataFormat = 1; // 1-NHWC, 0-NDHW - NDArray input('c', {bS, iH, iW, iC}, nd4j::DataType::FLOAT32); - NDArray output('f', {bS, oH, oW, iC}, nd4j::DataType::FLOAT32); - NDArray expected('c', {bS, oH, oW, iC}, {6.f, 7.f, 10.f, 11.f, 22.f, 23.f, 26.f, 27.f, 38.f, 39.f, 42.f, 43.f, 54.f, 55.f, 58.f, 59.f}, nd4j::DataType::FLOAT32); + NDArray input('c', {bS, iH, iW, iC}, sd::DataType::FLOAT32); + NDArray output('f', {bS, oH, oW, iC}, sd::DataType::FLOAT32); + NDArray expected('c', {bS, oH, oW, iC}, {6.f, 7.f, 10.f, 11.f, 22.f, 23.f, 26.f, 27.f, 38.f, 39.f, 42.f, 43.f, 54.f, 55.f, 58.f, 59.f}, sd::DataType::FLOAT32); input.linspace(1.); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; auto status = op.execute({&input}, {&output}, {}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, 0, dataFormat}, {}); ASSERT_EQ(Status::OK(), status); @@ -536,7 +536,7 @@ TEST_F(DeclarableOpsTests4, biasadd_1) { auto bias = NDArrayFactory::create('c', {2}, {1, 2}); auto exp = NDArrayFactory::create('c', {2, 3, 3, 2}, {1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f}); - nd4j::ops::biasadd op; + sd::ops::biasadd op; auto result = op.evaluate({&x, &bias}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -554,7 +554,7 @@ TEST_F(DeclarableOpsTests4, biasadd_2) { auto bias = NDArrayFactory::create('c', {2}, {1, 2}); auto exp = NDArrayFactory::create('c', {2, 2, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}); - nd4j::ops::biasadd op; + sd::ops::biasadd op; auto result = op.evaluate({&x, &bias}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -572,7 +572,7 @@ TEST_F(DeclarableOpsTests4, biasadd_3) { auto row = NDArrayFactory::create('c', {3}, {1, 2, 3}); auto exp = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 1, 2, 3}); - nd4j::ops::biasadd op; + 
sd::ops::biasadd op; auto result = op.evaluate({&x, &row}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -587,15 +587,15 @@ TEST_F(DeclarableOpsTests4, biasadd_3) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests4, biasadd_bp_1) { - NDArray x('c', {2,2,2,3}, {1.,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, nd4j::DataType::FLOAT32); - NDArray gradO('c', {2,2,2,3}, nd4j::DataType::FLOAT32); - NDArray bias('c', {3}, {-1., -2, -3}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,2,2,3}, {1.,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, sd::DataType::FLOAT32); + NDArray gradO('c', {2,2,2,3}, sd::DataType::FLOAT32); + NDArray bias('c', {3}, {-1., -2, -3}, sd::DataType::FLOAT32); - NDArray expGradB('c', {3}, {9.2, 10. , 10.8}, nd4j::DataType::FLOAT32); + NDArray expGradB('c', {3}, {9.2, 10. , 10.8}, sd::DataType::FLOAT32); gradO.linspace(0.1, 0.1); - nd4j::ops::biasadd_bp op; + sd::ops::biasadd_bp op; auto result = op.evaluate({&x, &bias, &gradO}, {}, {}, {false}); // NHWC ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -615,15 +615,15 @@ TEST_F(DeclarableOpsTests4, biasadd_bp_1) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests4, biasadd_bp_2) { - NDArray x('c', {2,3,2,2}, {1.,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, nd4j::DataType::FLOAT32); - NDArray gradO('c', {2,3,2,2}, nd4j::DataType::FLOAT32); - NDArray bias('c', {3}, {-1., -2, -3}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,2,2}, {1.,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, sd::DataType::FLOAT32); + NDArray gradO('c', {2,3,2,2}, sd::DataType::FLOAT32); + NDArray bias('c', {3}, {-1., -2, -3}, sd::DataType::FLOAT32); - NDArray expGradB('c', {3}, {6.8, 10., 13.2}, nd4j::DataType::FLOAT32); + NDArray expGradB('c', {3}, {6.8, 10., 13.2}, sd::DataType::FLOAT32); gradO.linspace(0.1, 0.1); - nd4j::ops::biasadd_bp op; + 
sd::ops::biasadd_bp op; auto result = op.evaluate({&x, &bias, &gradO}, {}, {}, {true}); // NCHW ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -649,7 +649,7 @@ TEST_F(DeclarableOpsTests4, biasadd_4) { auto z = NDArrayFactory::create('c', {2, 3}); auto exp = NDArrayFactory::create('c', {2, 3}, {1.f, 2.f, 3.f, 1.f, 2.f, 3.f}); - nd4j::ops::biasadd op; + sd::ops::biasadd op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {true}); ASSERT_EQ(Status::OK(), status); @@ -662,7 +662,7 @@ TEST_F(DeclarableOpsTests4, Test_Fill_1) { auto exp = NDArrayFactory::create('c', {3, 2, 4}); exp.assign(2.0f); - nd4j::ops::fill op; + sd::ops::fill op; auto result = op.evaluate({&x, &v}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -683,7 +683,7 @@ TEST_F(DeclarableOpsTests4, Test_FirasSparce_1) { x.p(52, 0); x.p(60, 1); x.p(61, 0); - nd4j::ops::firas_sparse op; + sd::ops::firas_sparse op; auto result = op.evaluate({&x}, {0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -703,7 +703,7 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_1) { x.linspace(1); exp.linspace(1); - nd4j::ops::flatten op; + sd::ops::flatten op; auto result = op.evaluate({&x}, {}, {'c'}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -724,7 +724,7 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_2) { x.linspace(1); y.linspace(82); exp.linspace(1); - nd4j::ops::flatten op; + sd::ops::flatten op; auto result = op.evaluate({&x, &y}, {}, {'c'}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -738,13 +738,13 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_2) { } TEST_F(DeclarableOpsTests4, Test_FlattenTests_3) { - NDArray x('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT32); - NDArray y('f', {2,2}, nd4j::DataType::INT32); - NDArray exp('c', {8}, {1, 2, 3, 4, 1, 2, 3, 4}, nd4j::DataType::INT32); + NDArray x('c', {2,2}, {1, 2, 3, 4}, sd::DataType::INT32); + NDArray y('f', {2,2}, sd::DataType::INT32); + NDArray exp('c', {8}, {1, 2, 3, 4, 1, 2, 3, 4}, sd::DataType::INT32); y.assign(x); - nd4j::ops::flatten op; 
+ sd::ops::flatten op; auto result = op.evaluate({&x, &y}, {}, {'c'}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -756,13 +756,13 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_3) { } TEST_F(DeclarableOpsTests4, Test_FlattenTests_4) { - NDArray x('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT32); - NDArray y('f', {2,2}, nd4j::DataType::INT32); - NDArray exp('c', {8}, {1, 3, 2, 4, 1, 3, 2, 4}, nd4j::DataType::INT32); + NDArray x('c', {2,2}, {1, 2, 3, 4}, sd::DataType::INT32); + NDArray y('f', {2,2}, sd::DataType::INT32); + NDArray exp('c', {8}, {1, 3, 2, 4, 1, 3, 2, 4}, sd::DataType::INT32); y.assign(x); - nd4j::ops::flatten op; + sd::ops::flatten op; auto result = op.evaluate({&x, &y}, {}, {'f'}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -778,7 +778,7 @@ TEST_F(DeclarableOpsTests4, Test_FloorTests_1) { auto exp = NDArrayFactory::create('c', {3,3}); exp.linspace(1); - nd4j::ops::Floor op; + sd::ops::Floor op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -798,7 +798,7 @@ TEST_F(DeclarableOpsTests4, Test_Reshape_Again) { x.linspace(1); exp.linspace(1); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {-99, 4, 3}); auto z = result->at(0); @@ -826,7 +826,7 @@ TEST_F(DeclarableOpsTests4, Test_Split_1) { sub2.assign(2.0); - nd4j::ops::split_v op; + sd::ops::split_v op; auto result = op.evaluate({&x, &sizes}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -869,7 +869,7 @@ TEST_F(DeclarableOpsTests4, Test_Split_2) { sub3.assign(3.0f); - nd4j::ops::split op; + sd::ops::split op; auto result = op.evaluate({&axis, &x}, {}, {4}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -908,7 +908,7 @@ TEST_F(DeclarableOpsTests4, Test_Split_3) { sub1.assign(1.0f); sub2.assign(2.0f); - nd4j::ops::split op; + sd::ops::split op; auto result = op.evaluate({&axis, &x}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -933,7 +933,7 @@ TEST_F(DeclarableOpsTests4, Test_Stack_4) { auto v = 
NDArrayFactory::create('c', {2, 3, 5}); auto exp = NDArrayFactory::create('c', {3, 2, 3, 5}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&t, &u, &v}, {}, {-4}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -949,7 +949,7 @@ TEST_F(DeclarableOpsTests4, Test_Squeeze_args_1) { auto x = NDArrayFactory::create('c', {2, 1, 1, 1, 2}, {1, 2, 3, 4}); auto exp = NDArrayFactory::create('c', {2, 1, 2}, {1, 2, 3, 4}); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x}, {}, {1, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -966,7 +966,7 @@ TEST_F(DeclarableOpsTests4, Test_Squeeze_args_2) { auto y = NDArrayFactory::create('c', {2}, {1.f, 3.f}); auto exp = NDArrayFactory::create('c', {2, 1, 2}, {1, 2, 3, 4}); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -983,7 +983,7 @@ TEST_F(DeclarableOpsTests4, Test_Squeeze_args_3) { auto x = NDArrayFactory::create('c', {2, 1, 1, 1, 2}, {1, 2, 3, 4}); auto exp = NDArrayFactory::create('c', {2, 1, 2}, {1, 2, 3, 4}); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x}, {}, {-2, -3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -998,7 +998,7 @@ TEST_F(DeclarableOpsTests4, Test_Squeeze_args_3) { TEST_F(DeclarableOpsTests4, Test_1D_1) { auto x = NDArrayFactory::create('c', {2, 3}); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&x}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1015,7 +1015,7 @@ TEST_F(DeclarableOpsTests4, Test_SpaceToDepth_1) { auto x = NDArrayFactory::create('c', {1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto exp = NDArrayFactory::create('c', {1, 1, 1, 12}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - nd4j::ops::space_to_depth op; + sd::ops::space_to_depth op; auto result = op.evaluate({&x}, {}, {2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1031,7 +1031,7 @@ 
TEST_F(DeclarableOpsTests4, Test_SpaceToDepth_2) { auto x = NDArrayFactory::create('c', {1, 3, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto exp = NDArrayFactory::create('c', {1, 12, 1, 1}, {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12}); - nd4j::ops::space_to_depth op; + sd::ops::space_to_depth op; auto result = op.evaluate({&x}, {}, {2, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1048,7 +1048,7 @@ TEST_F(DeclarableOpsTests4, Test_DepthToSpace_1) { auto x = NDArrayFactory::create('c', {1, 1, 1, 12}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto exp = NDArrayFactory::create('c', {1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - nd4j::ops::depth_to_space op; + sd::ops::depth_to_space op; auto result = op.evaluate({&x}, {}, {2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1065,7 +1065,7 @@ TEST_F(DeclarableOpsTests4, Test_DepthToSpace_2) { auto x = NDArrayFactory::create('c', {1, 12, 1, 1}, {1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12}); auto exp = NDArrayFactory::create('c', {1, 3, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - nd4j::ops::depth_to_space op; + sd::ops::depth_to_space op; auto result = op.evaluate({&x}, {}, {2, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1081,7 +1081,7 @@ TEST_F(DeclarableOpsTests4, Test_DepthToSpace_3) { auto x = NDArrayFactory::create('c', {4, 4, 16, 16}); auto exp = NDArrayFactory::create('c', {4, 16, 64, 1}); - nd4j::ops::depth_to_space op; + sd::ops::depth_to_space op; auto result = op.evaluate({&x}, {}, {4, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -1098,7 +1098,7 @@ TEST_F(DeclarableOpsTests4, Test_Cross_1) { auto b = NDArrayFactory::create('c', {3}, {6, 7, 8}); auto exp = NDArrayFactory::create('c', {3}, {-5, 10, -5}); - nd4j::ops::cross op; + sd::ops::cross op; auto result = op.evaluate({&a, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1116,7 +1116,7 @@ TEST_F(DeclarableOpsTests4, Test_Cross_2) { auto b = NDArrayFactory::create('c', {2, 3}, {6, 7, 8, 6, 7, 8}); auto exp 
= NDArrayFactory::create('c', {2, 3}, {-5, 10, -5, -5, 10, -5}); - nd4j::ops::cross op; + sd::ops::cross op; auto result = op.evaluate({&a, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1134,7 +1134,7 @@ TEST_F(DeclarableOpsTests4, Test_Cross_3) { auto b = NDArrayFactory::create('c', {3, 3}, {2, 3, 4, 7, 6, 5, 6, 3, 2}); auto exp = NDArrayFactory::create('c', {3, 3}, { -1, 2, -1, -11, 22, -11, -11, 40, -27}); - nd4j::ops::cross op; + sd::ops::cross op; auto result = op.evaluate({&a, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1151,7 +1151,7 @@ TEST_F(DeclarableOpsTests4, Test_Add_119) { auto b = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); auto exp = NDArrayFactory::create('c', {1, 4}, {2, 4, 6, 8}); - nd4j::ops::add op; + sd::ops::add op; auto result = op.evaluate({&a, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1171,7 +1171,7 @@ TEST_F(DeclarableOpsTests4, Test_Reshape_Negative_1) { auto shape = NDArrayFactory::create('c', {2}, {-1, 2}); auto exp = NDArrayFactory::create('c', {4, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x, &shape}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1189,7 +1189,7 @@ TEST_F(DeclarableOpsTests4, Test_TileToShape_1) { 4.f, 5.f, 6.f,4.f, 5.f, 6.f,4.f, 5.f, 6.f,4.f, 5.f, 6.f}); x.linspace(1.f); - nd4j::ops::tile_to_shape op; + sd::ops::tile_to_shape op; auto result = op.evaluate({&x},{}, {2, 4, 3}); ASSERT_EQ(Status::OK(), result->status()); @@ -1208,7 +1208,7 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_1) { auto exp = NDArrayFactory::create('c', {1,3,4,5}); exp.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x}, {}, {0,0,0,1,0, -999,0,0,0, -999,3,4,5, -999,1,1,1}); ASSERT_EQ(Status::OK(), result->status()); @@ -1230,7 +1230,7 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_2) { auto exp = NDArrayFactory::create('c', {1,3,4,5}); exp.linspace(1); - 
nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &begin, &end, &stride}, {}, {0,0,0,1,0}); ASSERT_EQ(Status::OK(), result->status()); @@ -1255,7 +1255,7 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_3) { //auto exp = NDArrayFactory::create('c', {1,3,4,5}); //exp.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &begin, &end, &stride}, {}, {1,0,0,0,0}); ASSERT_EQ(Status::OK(), result->status()); @@ -1274,7 +1274,7 @@ TEST_F(DeclarableOpsTests4, Test_StridedSlice_Alex_4) { auto exp = NDArrayFactory::create('c', {1}, {1}); //exp.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &begin, &end, &stride}, {}, {1,0,1,0,2}); ASSERT_EQ(Status::OK(), result->status()); @@ -1298,7 +1298,7 @@ TEST_F(DeclarableOpsTests4, parallel_stack_test1) { auto expected = NDArrayFactory::create('c', {3,2,2,2}); expected.linspace(1); - nd4j::ops::parallel_stack op; + sd::ops::parallel_stack op; auto results = op.evaluate({&x1, &x2, &x3}); auto output = results->at(0); @@ -1318,7 +1318,7 @@ TEST_F(DeclarableOpsTests4, parallel_stack_test2) { auto expected = NDArrayFactory::create('c', {3,1,2}, {1,2,3,4,5,6}); - nd4j::ops::parallel_stack op; + sd::ops::parallel_stack op; auto results = op.evaluate({&x1, &x2, &x3}); auto output = results->at(0); @@ -1339,7 +1339,7 @@ TEST_F(DeclarableOpsTests4, parallel_stack_test3) { auto expected = NDArrayFactory::create('c', {3,2,1}, {1,2,3,4,5,6}); - nd4j::ops::parallel_stack op; + sd::ops::parallel_stack op; auto results = op.evaluate({&x1, &x2, &x3}); auto output = results->at(0); @@ -1359,7 +1359,7 @@ TEST_F(DeclarableOpsTests4, parallel_stack_test4) { auto expected = NDArrayFactory::create('c', {3,2}, {1,2,3,4,5,6}); - nd4j::ops::parallel_stack op; + sd::ops::parallel_stack op; auto results = op.evaluate({&x1, &x2, &x3}); auto output = results->at(0); @@ -1379,7 +1379,7 @@ TEST_F(DeclarableOpsTests4, 
parallel_stack_test5) { auto expected = NDArrayFactory::create('c', {3,1}, {1,3,5}); - nd4j::ops::parallel_stack op; + sd::ops::parallel_stack op; auto results = op.evaluate({&x1, &x2, &x3}); auto output = results->at(0); @@ -1399,7 +1399,7 @@ TEST_F(DeclarableOpsTests4, parallel_stack_test6) { auto expected = NDArrayFactory::create('c', {3}, {1,3,5}); - nd4j::ops::parallel_stack op; + sd::ops::parallel_stack op; auto results = op.evaluate({&x1, &x2, &x3}); auto output = results->at(0); @@ -1416,7 +1416,7 @@ TEST_F(DeclarableOpsTests4, parallel_stack_test7) { auto x1 = NDArrayFactory::create(1.); auto expected = NDArrayFactory::create('c', {1}, {1.}); - nd4j::ops::parallel_stack op; + sd::ops::parallel_stack op; auto results = op.evaluate({&x1}); auto output = results->at(0); @@ -1437,7 +1437,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test1) { auto exp1 = NDArrayFactory::create('c', {2,3,4}, {10, 10, 10, 10, 20, 20, 20, 20, 30, 30, 30, 30, 10, 10, 10, 10, 20, 20, 20, 20, 30, 30, 30, 30}); auto exp2 = NDArrayFactory::create('c', {2,3,4}, {100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0, &in1, &in2}, {}, {0}); auto out0 = results->at(0); auto out1 = results->at(1); @@ -1466,7 +1466,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test2) { auto exp1 = NDArrayFactory::create('c', {3,2,4}, {10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 30, 30, 30, 30, 30, 30, 30, 30}); auto exp2 = NDArrayFactory::create('c', {3,2,4}, {100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0, &in1, &in2}); auto out0 = results->at(0); auto out1 = results->at(1); @@ -1493,7 +1493,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test3) { auto exp1 = NDArrayFactory::create('c', {3,2,4}, 
{10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 30, 30, 30, 30, 30, 30, 30, 30}); auto exp2 = NDArrayFactory::create('c', {3,2,4}, {100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0, &in1, &in2}); auto out0 = results->at(0); auto out1 = results->at(1); @@ -1520,7 +1520,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test4) { auto exp1 = NDArrayFactory::create('c', {2,3,4}, {10, 10, 10, 10, 20, 20, 20, 20, 30, 30, 30, 30, 10, 10, 10, 10, 20, 20, 20, 20, 30, 30, 30, 30}); auto exp2 = NDArrayFactory::create('c', {2,3,4}, {100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400, 100, 200, 300, 400}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0, &in1, &in2}, {}, {0}); auto out0 = results->at(0); auto out1 = results->at(1); @@ -1547,7 +1547,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test5) { auto exp1 = NDArrayFactory::create('c', {1,1,1}, {2}); auto exp2 = NDArrayFactory::create('c', {1,1,1}, {3}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0, &in1, &in2}, {}, {0}); auto out0 = results->at(0); auto out1 = results->at(1); @@ -1574,7 +1574,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test6) { auto exp1 = NDArrayFactory::create('c', {4,1,1}, {5,5,5,5}); auto exp2 = NDArrayFactory::create('c', {4,1,1}, {6,6,6,6}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0, &in1, &in2}, {}, {0}); auto out0 = results->at(0); auto out1 = results->at(1); @@ -1601,7 +1601,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test7) { auto exp1 = NDArrayFactory::create('c', {1,4,1}, {5,5,5,5}); auto exp2 = NDArrayFactory::create('c', {1,4,1}, {6,6,6,6}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0, &in1, &in2}, {}, {1}); auto out0 = results->at(0); auto 
out1 = results->at(1); @@ -1624,7 +1624,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test8) { auto in0 = NDArrayFactory::create(5); auto exp0 = NDArrayFactory::create('c', {1}, {5}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0}, {}, {0}); auto out0 = results->at(0); @@ -1641,7 +1641,7 @@ TEST_F(DeclarableOpsTests4, meshgrid_test9) { auto in0 = NDArrayFactory::create(5); auto exp0 = NDArrayFactory::create('c', {1}, {5}); - nd4j::ops::meshgrid op; + sd::ops::meshgrid op; auto results = op.evaluate({&in0}, {}, {1}); auto out0 = results->at(0); @@ -1668,7 +1668,7 @@ TEST_F(DeclarableOpsTests4, WeightedCrossEntropyWithLogits_1) { //Weights [0.7] //Result {-159.50006, -191.1, -16.009075, -210., -24.001238, -15.03887} - nd4j::ops::weighted_cross_entropy_with_logits op; + sd::ops::weighted_cross_entropy_with_logits op; auto results = op.evaluate({&targets, &input, &weight}); auto output = results->at(0); @@ -1690,7 +1690,7 @@ TEST_F(DeclarableOpsTests4, WeightedCrossEntropyWithLogits_2) { auto weights = NDArrayFactory::create({0.5f, 0.7f, 1.0f}) ; auto expected = NDArrayFactory::create('c', {2, 3}, {-159.5001f, -191.1f, -15.98185f, -210.f, -24.001238f, -14.951412f}); - nd4j::ops::weighted_cross_entropy_with_logits op; + sd::ops::weighted_cross_entropy_with_logits op; auto results = op.evaluate({&targets, &input, &weights}); auto output = results->at(0); @@ -1737,7 +1737,7 @@ TEST_F(DeclarableOpsTests4, lstm_test1) { auto expClast = NDArrayFactory::create('c', {1, batchSize, numProj}, {1.1589154,1.1589154,1.1589154,1.1892855,1.1892855,1.1892855,1.219861 ,1.219861 ,1.219861}); - nd4j::ops::lstm op; + sd::ops::lstm op; auto results = op.evaluate({&x, &h0, &c0, &Wx, &Wh, &Wc, &Wp, &b}, {0., 0., 0.}, {0, 0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1761,7 +1761,7 @@ TEST_F(DeclarableOpsTests4, relu6_test1) { auto input = NDArrayFactory::create('c', {2,4}, {-13.,10,-5,0,2,7,6,12}); auto expected = NDArrayFactory::create('c', {2,4}, 
{0., 6., 0., 0.,2., 6., 6., 6.}); - nd4j::ops::relu6 op; + sd::ops::relu6 op; auto results = op.evaluate({&input}, {0.}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1783,7 +1783,7 @@ TEST_F(DeclarableOpsTests4, relu6_bp_test1) { auto expected = NDArrayFactory::create('c', {2,4}, {0., 0., 0., 0., 5., 0., 0., 8.}); - nd4j::ops::relu6_bp op; + sd::ops::relu6_bp op; auto results = op.evaluate({&input, &gradO}, {0.}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1812,7 +1812,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, LrnTest_1) { 0.7581754f, 0.58321184f, 0.86747235f, 0.4048204f} ); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {5}); auto out = results->at(0); @@ -1840,7 +1840,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, LrnTest_2) { 0.72760683f, 0.4850712f, 0.5848977f, 0.67488194f, 0.7581754f, 0.58321184f, 0.86747235f, 0.4048204f}); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -1879,7 +1879,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, LrnTest_3) { 0.9520745f, 0.21039814f, 0.06311944f, 0.3268602f } ); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -1918,7 +1918,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, LrnTest_4) { 0.94679165f, 0.21039814f, 0.06311944f, 0.10519907f} ); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {5}); auto out = results->at(0); @@ -1963,7 +1963,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, LrnTest_5) { auto exp = NDArrayFactory::create('c', {2, 2, 2, 4}); - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&x, &eps}, {1.0, 1.0, 0.5}, {5}, {}, {}, false); auto out = results->at(0); @@ -1985,7 +1985,7 @@ TEST_F(DeclarableOpsTests4, tri_test1) { auto expected = NDArrayFactory::create('c', {rows, cols}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 0.f, 0.f, 0.f, 1.f, 1.f, 1.f, 0.f, 0.f}); - 
nd4j::ops::tri op; + sd::ops::tri op; auto results = op.evaluate({}, {}, {rows, cols}); auto output = results->at(0); @@ -2008,7 +2008,7 @@ TEST_F(DeclarableOpsTests4, tri_test2) { auto expected = NDArrayFactory::create('c', {rows, cols}, {1.f, 1.f, 1.f, 0.f, 0.f, 1.f, 1.f, 1.f, 1.f, 0.f, 1.f, 1.f, 1.f, 1.f, 1.f}); - nd4j::ops::tri op; + sd::ops::tri op; auto results = op.evaluate({}, {}, {rows, cols, diag}); auto output = results->at(0); @@ -2029,7 +2029,7 @@ TEST_F(DeclarableOpsTests4, tri_test3) { auto expected = NDArrayFactory::create('c', {rows, cols}, {0.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 0.f, 0.f, 0.f}); - nd4j::ops::tri op; + sd::ops::tri op; auto results = op.evaluate({}, {}, {rows, cols, diag}); auto output = results->at(0); @@ -2050,7 +2050,7 @@ TEST_F(DeclarableOpsTests4, tri_test4) { auto expected = NDArrayFactory::create('c', {rows, cols}, {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::tri op; + sd::ops::tri op; auto results = op.evaluate({}, {}, {rows, cols, diag}); auto output = results->at(0); @@ -2069,7 +2069,7 @@ TEST_F(DeclarableOpsTests4, tri_test5) { auto expected = NDArrayFactory::create('c', {rows, rows}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 0.f, 0.f, 0.f, 1.f, 1.f, 1.f, 0.f, 0.f, 1.f, 1.f, 1.f, 1.f, 0.f, 1.f, 1.f, 1.f, 1.f, 1.f}); - nd4j::ops::tri op; + sd::ops::tri op; auto results = op.evaluate({}, {}, {rows}); auto output = results->at(0); @@ -2090,7 +2090,7 @@ TEST_F(DeclarableOpsTests4, tri_test6) { auto expected = NDArrayFactory::create('c', {rows, cols}, {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::tri op; + sd::ops::tri op; auto results = op.evaluate({}, {}, {rows, cols, diag}); auto output = results->at(0); @@ -2111,7 +2111,7 @@ TEST_F(DeclarableOpsTests4, tri_test7) { auto expected = NDArrayFactory::create('c', {rows, cols}, {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}); - nd4j::ops::tri 
op; + sd::ops::tri op; auto results = op.evaluate({}, {}, {rows, cols, diag}); auto output = results->at(0); @@ -2129,7 +2129,7 @@ TEST_F(DeclarableOpsTests4, triu_test1) { auto input = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto expected = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 0, 5, 6, 0, 0, 9, 0, 0, 0}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); @@ -2148,7 +2148,7 @@ TEST_F(DeclarableOpsTests4, triu_test2) { auto input = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto expected = NDArrayFactory::create('c', {4, 3}, {1, 2, 3,4, 5, 6,0, 8, 9,0, 0, 12}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {-1}); auto output = results->at(0); @@ -2166,7 +2166,7 @@ TEST_F(DeclarableOpsTests4, triu_test3) { auto input = NDArrayFactory::create('c', {2, 3, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto expected = NDArrayFactory::create('c', {2, 3, 2}, {1, 2,3, 4,0, 6,7, 8,9,10,0,12}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {-1}); auto output = results->at(0); @@ -2184,7 +2184,7 @@ TEST_F(DeclarableOpsTests4, triu_test4) { auto input = NDArrayFactory::create('c', {2, 3, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto expected = NDArrayFactory::create('c', {2, 3, 2}, {1, 2,0, 4,0, 0,7, 8,0, 10,0, 0}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); @@ -2202,7 +2202,7 @@ TEST_F(DeclarableOpsTests4, triu_test5) { auto input = NDArrayFactory::create('c', {2, 3, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto expected = NDArrayFactory::create('c', {2, 3, 2}, {0, 2,0, 0,0, 0,0, 8,0, 0,0, 0}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {1}); auto output = results->at(0); @@ -2220,7 +2220,7 @@ TEST_F(DeclarableOpsTests4, triu_test6) { 
auto input = NDArrayFactory::create('c', {2, 3, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto expected = NDArrayFactory::create('c', {2, 3, 2}, {0, 0,0, 0,0, 0,0, 0,0, 0,0, 0}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {10}); auto output = results->at(0); @@ -2238,7 +2238,7 @@ TEST_F(DeclarableOpsTests4, triu_test7) { auto input = NDArrayFactory::create('c', {2, 3, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto expected = NDArrayFactory::create('c', {2, 3, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {-10}); auto output = results->at(0); @@ -2256,7 +2256,7 @@ TEST_F(DeclarableOpsTests4, triu_test8) { auto input = NDArrayFactory::create('c', {6}, {1, 2, 3, 4, 5, 6}); auto expected = NDArrayFactory::create('c', {6, 6}, {1, 2, 3, 4, 5, 6,0, 2, 3, 4, 5, 6,0, 0, 3, 4, 5, 6,0, 0, 0, 4, 5, 6,0, 0, 0, 0, 5, 6,0, 0, 0, 0, 0, 6}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); @@ -2274,7 +2274,7 @@ TEST_F(DeclarableOpsTests4, triu_test9) { auto input = NDArrayFactory::create('c', {6}, {1, 2, 3, 4, 5, 6}); auto expected = NDArrayFactory::create('c', {6, 6}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 2, 3, 4, 5, 6, 0, 0, 3, 4, 5, 6}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {-3}); auto output = results->at(0); @@ -2292,7 +2292,7 @@ TEST_F(DeclarableOpsTests4, triu_test10) { auto input = NDArrayFactory::create('c', {6}, {1, 2, 3, 4, 5, 6}); auto expected = NDArrayFactory::create('c', {6, 6}, {0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {3}); auto output = results->at(0); @@ -2310,7 +2310,7 @@ TEST_F(DeclarableOpsTests4, triu_test11) { auto input = 
NDArrayFactory::create('c', {6}, {1, 2, 3, 4, 5, 6}); auto expected = NDArrayFactory::create('c', {6, 6}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}); - nd4j::ops::triu op; + sd::ops::triu op; auto results = op.evaluate({&input}, {}, {-58}); auto output = results->at(0); @@ -2332,7 +2332,7 @@ TEST_F(DeclarableOpsTests4, triu_bp_test1) { auto expected = NDArrayFactory::create('c', {2, 3, 2}, {0.,0.5,0.,0. ,0.,0. ,0.,0.5,0.,0. ,0.,0.}); - nd4j::ops::triu_bp op; + sd::ops::triu_bp op; auto results = op.evaluate({&input, &gradO}, {}, {1}); auto gradI = results->at(0); @@ -2353,7 +2353,7 @@ TEST_F(DeclarableOpsTests4, triu_bp_test2) { auto expected = NDArrayFactory::create('c', {2, 3, 2}, {0.5,0.5,0. ,0.5,0. ,0. ,0.5,0.5,0. ,0.5,0. ,0.}); - nd4j::ops::triu_bp op; + sd::ops::triu_bp op; auto results = op.evaluate({&input, &gradO}, {}, {}); auto gradI = results->at(0); @@ -2374,7 +2374,7 @@ TEST_F(DeclarableOpsTests4, triu_bp_test3) { auto expected = NDArrayFactory::create('c', {6,6}, {0.5, 0.5, 0.5, 0.5, 0.5, 0.5,0.5, 0.5, 0.5, 0.5, 0.5, 0.5,0.5, 0.5, 0.5, 0.5, 0.5, 0.5,0. , 0.5, 0.5, 0.5, 0.5, 0.5,0. , 0. , 0.5, 0.5, 0.5, 0.5,0. , 0. , 0. 
, 0.5, 0.5, 0.5}); - nd4j::ops::triu_bp op; + sd::ops::triu_bp op; auto results = op.evaluate({&input, &gradO}, {}, {-2}); auto gradI = results->at(0); @@ -2395,7 +2395,7 @@ TEST_F(DeclarableOpsTests4, triu_bp_test4) { auto expected = NDArrayFactory::create('c', {2,3}, {0., 0., 0., 0., 0., 0.}); - nd4j::ops::triu_bp op; + sd::ops::triu_bp op; auto results = op.evaluate({&input, &gradO}, {}, {10}); auto gradI = results->at(0); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp index 62868f67f..37e6238af 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp @@ -22,12 +22,12 @@ #include "testlayers.h" #include #include -#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DeclarableOpsTests5 : public testing::Test { public: @@ -45,7 +45,7 @@ TEST_F(DeclarableOpsTests5, Test_PermuteEquality_1) { x.linspace(1); x.reshapei('c', {3, 4, 5}); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}, {}, {0, 2, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -66,7 +66,7 @@ TEST_F(DeclarableOpsTests5, Test_PermuteEquality_0) { // x.printShapeInfo("{0, 1, 2} shape"); // x.printBuffer("{0, 1, 2} data"); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}, {}, {0, 1, 2}); ASSERT_EQ(Status::OK(), result->status()); @@ -88,7 +88,7 @@ TEST_F(DeclarableOpsTests5, Test_PermuteEquality_2) { // x.printShapeInfo("{1, 0, 2} shape"); // x.printBuffer("{1, 0, 2} data"); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}, {}, {1, 0, 2}); ASSERT_EQ(Status::OK(), result->status()); @@ -109,7 +109,7 @@ TEST_F(DeclarableOpsTests5, Test_PermuteEquality_3) { // x.printShapeInfo("{1, 2, 0} shape"); // x.printBuffer("{1, 2, 0} data"); - nd4j::ops::permute op; + sd::ops::permute op; auto 
result = op.evaluate({&x}, {}, {1, 2, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -130,7 +130,7 @@ TEST_F(DeclarableOpsTests5, Test_PermuteEquality_4) { // x.printShapeInfo("{2, 0, 1} shape"); // x.printBuffer("{2, 0, 1} data"); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}, {}, {2, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -151,7 +151,7 @@ TEST_F(DeclarableOpsTests5, Test_PermuteEquality_5) { // x.printShapeInfo("{2, 1, 0} shape"); // x.printBuffer("{2, 1, 0} data"); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}, {}, {2, 1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -171,7 +171,7 @@ TEST_F(DeclarableOpsTests5, Test_TTS_bp_1) { eps.linspace(1.f); - nd4j::ops::tile_to_shape_bp op; + sd::ops::tile_to_shape_bp op; auto result = op.evaluate({&x, &eps}, {}, {2, 4, 3}); ASSERT_EQ(Status::OK(), result->status()); @@ -193,14 +193,14 @@ TEST_F(DeclarableOpsTests5, Test_Rdiv_bp_1) { auto eps = NDArrayFactory::create('c', {3, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - nd4j::ops::reversedivide op_ff; + sd::ops::reversedivide op_ff; auto result_ff = op_ff.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result_ff->status()); auto z_ff = result_ff->at(0); ASSERT_TRUE(eps.isSameShape(z_ff)); - nd4j::ops::reversedivide_bp op_bp; + sd::ops::reversedivide_bp op_bp; auto result_bp = op_bp.evaluate({&x, &y, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result_bp->status()); @@ -216,7 +216,7 @@ TEST_F(DeclarableOpsTests5, Test_Boolean_diff_1) { auto x = NDArrayFactory::create('c', {1, 1}, {1.0f}); auto y = NDArrayFactory::create(2.0f); - nd4j::ops::less op; + sd::ops::less op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(result->at(0)->t(0), true); @@ -227,12 +227,12 @@ TEST_F(DeclarableOpsTests5, Test_SetSeed_1) { auto x = NDArrayFactory::create('c', {1, 1}, {120}); auto y = NDArrayFactory::create(5); - nd4j::ops::set_seed op; + sd::ops::set_seed 
op; auto result = op.evaluate({&x, &y}, {}, {120, 5}); ASSERT_EQ(Status::OK(), result->status()); // result->at(0)->printIndexedBuffer("RES SEED"); - nd4j::ops::get_seed getOp; + sd::ops::get_seed getOp; auto getRes = getOp.evaluate({}); ASSERT_EQ(Status::OK(), getRes->status()); // getRes->at(0)->printIndexedBuffer("Output RES GET SEED"); @@ -243,11 +243,11 @@ TEST_F(DeclarableOpsTests5, Test_SetSeed_1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests5, scatterMul_test1) { auto matrix = NDArrayFactory::create('c', {2, 2}, {1.f, 2.f, 3.f, 4.f}); - NDArray idc('c', {1}, std::vector({0LL}), nd4j::DataType::INT64); + NDArray idc('c', {1}, std::vector({0LL}), sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2}, {10.f, 1.f}); auto exp = NDArrayFactory::create('c', {2, 2}, {10.f, 2.f, 3.f, 4.f}); - nd4j::ops::scatter_mul op; + sd::ops::scatter_mul op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -261,11 +261,11 @@ TEST_F(DeclarableOpsTests5, scatterMul_test1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests5, scatterDiv_test1) { auto matrix = NDArrayFactory::create('c', {2, 2}, {1.f, 2.f, 3.f, 4.f}); - NDArray idc('c', {1}, std::vector({0LL}), nd4j::DataType::INT64); + NDArray idc('c', {1}, std::vector({0LL}), sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2}, {10.f, 1.f}); auto exp = NDArrayFactory::create('c', {2, 2}, {0.10f, 2.f, 3.f, 4.f}); - nd4j::ops::scatter_div op; + sd::ops::scatter_div op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -279,11 +279,11 @@ TEST_F(DeclarableOpsTests5, scatterDiv_test1) { //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests5, scatterSub_test1) { auto matrix = NDArrayFactory::create('c', {2, 2}, {1.f, 2.f, 3.f, 4.f}); - 
NDArray idc('c', {1}, std::vector({0LL}), nd4j::DataType::INT64); + NDArray idc('c', {1}, std::vector({0LL}), sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2}, {10.f, 1.f}); auto exp = NDArrayFactory::create('c', {2, 2}, {-9.f, 1.f, 3.f, 4.f}); - nd4j::ops::scatter_sub op; + sd::ops::scatter_sub op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -299,7 +299,7 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test1) { auto matrix = NDArrayFactory::create('c', {2, 2}, {1.f, 2.f, 3.f, 4.f}); auto exp = NDArrayFactory::create('c', {2, 2}, {0.7f, 0.9f, 1.f, 1.f}); - nd4j::ops::hardsigmoid op; + sd::ops::hardsigmoid op; auto result = op.evaluate({&matrix}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -315,7 +315,7 @@ TEST_F(DeclarableOpsTests5, hardsigmoid_test2) { auto eps = NDArrayFactory::create('c', {2, 2}, {1.f, 2.f, 3.f, 4.f}); auto exp = NDArrayFactory::create('c', {2, 2}, {0.2f, 0.4f, 0.f, 0.f}); - nd4j::ops::hardsigmoid_bp op; + sd::ops::hardsigmoid_bp op; auto result = op.evaluate({&matrix, &eps}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -330,7 +330,7 @@ TEST_F(DeclarableOpsTests5, hardtanh_test1) { auto matrix = NDArrayFactory::create('c', {3, 3}, {-4, -3, -2, -1, 0, 1, 2, 3, 4}); auto exp = NDArrayFactory::create('c', {3, 3}, {-1, -1, -1, -1, 0, 1, 1, 1, 1}); - nd4j::ops::hardtanh op; + sd::ops::hardtanh op; auto result = op.evaluate({&matrix}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -346,7 +346,7 @@ TEST_F(DeclarableOpsTests5, hardtanh_test2) { auto eps = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto exp = NDArrayFactory::create('c', {3, 3}, {0, 0, 0, 4, 5, 6, 0, 0, 0}); - nd4j::ops::hardtanh_bp op; + sd::ops::hardtanh_bp op; auto result = op.evaluate({&matrix, &eps}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -362,7 +362,7 @@ TEST_F(DeclarableOpsTests5, histogram_test1) { auto 
matrix = NDArrayFactory::create('c', {3, 3}, {-4, -3, -2, -1, 0, 1, 2, 3, 4}); auto exp = NDArrayFactory::create('c', {3}, {3, 3, 3}); - nd4j::ops::histogram op; + sd::ops::histogram op; auto result = op.evaluate({&matrix}, {}, {3}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -377,7 +377,7 @@ TEST_F(DeclarableOpsTests5, histogram_test2) { auto matrix = NDArrayFactory::create('c', {3}, {1, 2, 1}); auto exp = NDArrayFactory::create('c', {4}, {2, 0, 0, 1}); - nd4j::ops::histogram op; + sd::ops::histogram op; auto result = op.evaluate({&matrix}, {}, {4}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -392,7 +392,7 @@ TEST_F(DeclarableOpsTests5, Identity_test1) { auto matrix = NDArrayFactory::create('c', {3, 3}, {-4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f}); // auto exp = NDArrayFactory::create('c', {3, 3}, {3, 3, 3}); - nd4j::ops::identity op; + sd::ops::identity op; auto result = op.evaluate({&matrix}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -407,7 +407,7 @@ TEST_F(DeclarableOpsTests5, Identity_test2) { auto matrix = NDArrayFactory::create('c', {3, 3}, {-4, -3, -2, -1, 0, 1, 2, 3, 4}); auto eps = NDArrayFactory::create('c', {3, 3}, {1,2,3,4,5,6,7,8,9}); // auto exp = NDArrayFactory::create('c', {3,3}); - nd4j::ops::identity_bp op; + sd::ops::identity_bp op; auto result = op.evaluate({&matrix, &eps}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -422,8 +422,8 @@ TEST_F(DeclarableOpsTests5, Log1p_test1) { auto y = NDArrayFactory::create('c', {3,3}, {5,4,3,2,1,2,3,4,5}); // auto eps = NDArrayFactory::create('c', {3, 3}, {1,2,3,4,5,6,7,8,9}); // auto exp = NDArrayFactory::create('c', {3,3}); - nd4j::ops::Log1p op; - y.applyTransform(nd4j::transform::Log, y); + sd::ops::Log1p op; + y.applyTransform(sd::transform::Log, y); auto result = op.evaluate({&matrix}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -439,7 +439,7 @@ TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_1) { auto exp = 
NDArrayFactory::create('c', {4, 1, 1, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto paddings = NDArrayFactory::create('c', {2, 2}, {0, 0, 0, 0}); - nd4j::ops::space_to_batch op; + sd::ops::space_to_batch op; auto result = op.evaluate({&x, &paddings}, {}, {2}); ASSERT_EQ(Status::OK(), result->status()); @@ -457,7 +457,7 @@ TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_2) { auto exp = NDArrayFactory::create('c', {4, 1, 1, 1}, {1, 2, 3, 4}); auto paddings = NDArrayFactory::create('c', {2, 2}, {0, 0, 0, 0}); - nd4j::ops::space_to_batch op; + sd::ops::space_to_batch op; auto result = op.evaluate({&x, &paddings}, {}, {2}); ASSERT_EQ(Status::OK(), result->status()); @@ -476,7 +476,7 @@ TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_3) { auto paddings = NDArrayFactory::create('c', {2, 2}, {0, 0, 2, 0}); auto exp = NDArrayFactory::create('c', {8, 1, 3, 1}, {0, 1, 3, 0, 9, 11,0, 2, 4, 0, 10, 12,0, 5, 7, 0, 13, 15,0, 6, 8, 0, 14, 16}); - nd4j::ops::space_to_batch op; + sd::ops::space_to_batch op; auto result = op.evaluate({&x, &paddings}, {}, {2}); ASSERT_EQ(Status::OK(), result->status()); @@ -493,7 +493,7 @@ TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_3) { TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_4) { const int blockSize = 2; - NDArray x('c', {3, 3*blockSize - 1 - 2, 4*blockSize - 2 - 3, 2}, {147, 148, 219, 220, 149, 150, 11, 12, 83, 84, 13, 14, 155, 156, 227, 228, 157, 158, 171, 172, 243, 244, 173, 174, 35, 36, 107, 108, 37, 38, 179, 180, 251, 252, 181, 182, 195, 196, 267, 268, 197, 198, 59, 60, 131, 132, 61, 62, 203, 204, 275, 276, 205, 206}, nd4j::DataType::FLOAT32); + NDArray x('c', {3, 3*blockSize - 1 - 2, 4*blockSize - 2 - 3, 2}, {147, 148, 219, 220, 149, 150, 11, 12, 83, 84, 13, 14, 155, 156, 227, 228, 157, 158, 171, 172, 243, 244, 173, 174, 35, 36, 107, 108, 37, 38, 179, 180, 251, 252, 181, 182, 195, 196, 267, 268, 197, 198, 59, 60, 131, 132, 61, 62, 203, 204, 275, 276, 205, 206}, sd::DataType::FLOAT32); NDArray paddings = NDArrayFactory::create('c', {2, 
2}, {1, 2, 2, 3}); NDArray exp('c', {3*blockSize*blockSize, 3, 4, 2}, {0,0, 0,0, 0,0, 0,0, 0,0, 11,12, 13,14, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, @@ -503,9 +503,9 @@ TEST_F(DeclarableOpsTests5, Test_SpaceToBatch_4) { 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 171, 172, 173, 174, 0,0, 0,0, 179, 180, 181, 182, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 195, 196, 197, 198, 0,0, 0,0, 203, 204, 205, 206, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 219, 220, 0,0, 0,0, 0,0, 227, 228, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 243, 244, 0,0, 0,0, 0,0, 251, 252, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 267, 268, 0,0, 0,0, 0,0, 275, - 276, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0}, nd4j::DataType::FLOAT32); + 276, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0}, sd::DataType::FLOAT32); - nd4j::ops::space_to_batch op; + sd::ops::space_to_batch op; auto result = op.evaluate({&x, &paddings}, {}, {blockSize}); ASSERT_EQ(Status::OK(), result->status()); @@ -523,7 +523,7 @@ TEST_F(DeclarableOpsTests5, Test_BatchToSpace_1) { auto exp = NDArrayFactory::create('c', {1, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto crops = NDArrayFactory::create('c', {2, 2}, {0, 0, 0, 0}); - nd4j::ops::batch_to_space op; + sd::ops::batch_to_space op; auto result = op.evaluate({&x, &crops}, {}, {2}); ASSERT_EQ(Status::OK(), result->status()); @@ -541,7 +541,7 @@ TEST_F(DeclarableOpsTests5, Test_BatchToSpace_2) { auto exp = NDArrayFactory::create('c', {1, 2, 2, 1}, {1, 2, 3, 4}); auto crops = NDArrayFactory::create('c', {2, 2}, {0, 0, 0, 0}); - nd4j::ops::batch_to_space op; + sd::ops::batch_to_space op; auto result = op.evaluate({&x, &crops}, {}, {2}); ASSERT_EQ(Status::OK(), result->status()); @@ -562,7 +562,7 @@ TEST_F(DeclarableOpsTests5, Test_BatchToSpace_3) { auto exp = NDArrayFactory::create('c', {2, 2, 4, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); auto crops = NDArrayFactory::create('c', {2, 2}, {0, 0, 2, 0}); - nd4j::ops::batch_to_space op; + sd::ops::batch_to_space op; auto result = op.evaluate({&x, &crops}, {}, {2}); ASSERT_EQ(Status::OK(), 
result->status()); @@ -578,13 +578,13 @@ TEST_F(DeclarableOpsTests5, Test_BatchToSpace_3) { TEST_F(DeclarableOpsTests5, Test_BatchToSpace_4) { const int blockSize = 2; - NDArray x('c', {3*blockSize*blockSize, 3, 4, 2}, nd4j::DataType::FLOAT32); + NDArray x('c', {3*blockSize*blockSize, 3, 4, 2}, sd::DataType::FLOAT32); x.linspace(1, 1); NDArray crops = NDArrayFactory::create('c', {2, 2}, {1, 2, 2, 3}); - NDArray exp('c', {3, 3*blockSize - 1 - 2, 4*blockSize - 2 - 3, 2}, {147, 148, 219, 220, 149, 150, 11, 12, 83, 84, 13, 14, 155, 156, 227, 228, 157, 158, 171, 172, 243, 244, 173, 174, 35, 36, 107, 108, 37, 38, 179, 180, 251, 252, 181, 182, 195, 196, 267, 268, 197, 198, 59, 60, 131, 132, 61, 62, 203, 204, 275, 276, 205, 206}, nd4j::DataType::FLOAT32); + NDArray exp('c', {3, 3*blockSize - 1 - 2, 4*blockSize - 2 - 3, 2}, {147, 148, 219, 220, 149, 150, 11, 12, 83, 84, 13, 14, 155, 156, 227, 228, 157, 158, 171, 172, 243, 244, 173, 174, 35, 36, 107, 108, 37, 38, 179, 180, 251, 252, 181, 182, 195, 196, 267, 268, 197, 198, 59, 60, 131, 132, 61, 62, 203, 204, 275, 276, 205, 206}, sd::DataType::FLOAT32); - nd4j::ops::batch_to_space op; + sd::ops::batch_to_space op; auto result = op.evaluate({&x, &crops}, {}, {blockSize}); ASSERT_EQ(Status::OK(), result->status()); @@ -601,7 +601,7 @@ TEST_F(DeclarableOpsTests5, eye_test1) { auto expected = NDArrayFactory::create('c', {3, 3}, {1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f}); - nd4j::ops::eye op; + sd::ops::eye op; auto results = op.evaluate({}, {}, {-99, 3}); auto output = results->at(0); // output->printIndexedBuffer(); @@ -618,7 +618,7 @@ TEST_F(DeclarableOpsTests5, eye_test2) { auto expected = NDArrayFactory::create('c', {3, 4}, {1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 0.f}); - nd4j::ops::eye op; + sd::ops::eye op; auto results = op.evaluate({}, {}, {-99, 3, 4}); auto output = results->at(0); @@ -634,7 +634,7 @@ TEST_F(DeclarableOpsTests5, eye_test3) { auto expected = NDArrayFactory::create('c', {2, 3, 4}, {1, 0, 
0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0}); - nd4j::ops::eye op; + sd::ops::eye op; auto results = op.evaluate({}, {9 /*int*/}, {-99, 3, 4, 2}); auto output = results->at(0); // output->printIndexedBuffer("Output eye"); @@ -651,7 +651,7 @@ TEST_F(DeclarableOpsTests5, eye_test4) { auto expected = NDArrayFactory::create('c', {2, 2, 3, 4}, {1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.}); - nd4j::ops::eye op; + sd::ops::eye op; auto results = op.evaluate({}, {6/*double*/}, {-99, 3, 4, 2, 2}); auto output = results->at(0); @@ -665,7 +665,7 @@ TEST_F(DeclarableOpsTests5, eye_test4) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests5, eye_test5) { - nd4j::ops::eye op; + sd::ops::eye op; auto result = op.evaluate({},{},{3, 2}); auto z = result->at(0); @@ -683,7 +683,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test1) { auto expected = NDArrayFactory::create('c', {2,2,3,2}, {19, 20, 21, 22, 23, 24, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 13, 14, 15, 16, 17, 18}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto results = op.evaluate({&input, &indices}, {}, {}); auto output = results->at(0); @@ -703,7 +703,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test2) { auto expected = NDArrayFactory::create('c', {2,2,2}, {23, 24, 11, 12, 3, 4, 3, 4}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto results = op.evaluate({&input, &indices}, {}, {}, {true}); auto output = results->at(0); @@ -722,7 +722,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test3) { auto indices = NDArrayFactory::create('c', {3}, {3,2,1}); auto expected = NDArrayFactory::create(24.); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto results = op.evaluate({&input, &indices}, {}, {}); auto output = results->at(0); @@ -741,7 +741,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test4) { 
auto indices = NDArrayFactory::create('c', {2,3}, {3,2,1,0,2,1}); auto expected = NDArrayFactory::create('c',{2}, {24., 6}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto results = op.evaluate({&input, &indices}, {}, {}); auto output = results->at(0); @@ -759,7 +759,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test5) { auto indices = NDArrayFactory::create('c', {5,1}, {3,2,0,1,1}); auto expected = NDArrayFactory::create('c',{5}, {4.,3,1,2,2}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto results = op.evaluate({&input, &indices}, {}, {}); auto output = results->at(0); @@ -778,7 +778,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test6) { auto indices = NDArrayFactory::create('c', shape, {2}); auto expected = NDArrayFactory::create(3.); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto results = op.evaluate({&input, &indices}, {}, {}); auto output = results->at(0); @@ -797,7 +797,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test7) { auto indices = NDArrayFactory::create('c', {3,3,2}, {0,2,1, 0,1,0, 1,3,1, 0,2,1, 0,1,0, 1,3,1}); auto expected = NDArrayFactory::create('c', {3,3}, {3,5,5,8,5,10,2,2,14}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto results = op.evaluate({&input, &indices}, {}, {}, {true}); auto output = results->at(0); @@ -814,7 +814,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test8) { auto y = NDArrayFactory::create('c', {2, 2}, {0, 0, 1, 1}); auto e = NDArrayFactory::create('c', {2}, {1., 4.}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -831,7 +831,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test9) { auto exp = NDArrayFactory::create('c', {3,2}, {11.f, 12.f, 5.f, 6.f, 31.f, 32.f}); x.linspace(1); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; auto result = op.evaluate({&x, &indices}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -854,7 +854,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test10) { auto output = 
NDArrayFactory::create('c', {2,2,2}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {}, {true})); } @@ -866,7 +866,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test11) { auto indices = NDArrayFactory::create('c', {3,3,2}, {0,2,1, 0,10,0, 1,30,1, 0,20,1, 0,1,0, 1,30,1}); auto output = NDArrayFactory::create('c', {3,3}); - nd4j::ops::gather_nd op; + sd::ops::gather_nd op; ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {}, {true})); } @@ -879,7 +879,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test1) { auto seqLengths = NDArrayFactory::create('c', {4}, {4,4,4,4}); auto exp = NDArrayFactory::create('c', {3, 4, 5}, {4, 3, 2, 1, 5, 9, 8, 7, 6, 10, 14, 13, 12, 11, 15, 19, 18, 17, 16, 20, 24, 23, 22, 21, 25, 29, 28, 27, 26, 30, 34, 33, 32, 31, 35, 39, 38, 37, 36, 40, 44, 43, 42, 41, 45, 49, 48, 47, 46, 50, 54, 53, 52, 51, 55, 59, 58, 57, 56, 60}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {2, 1}); ASSERT_EQ(Status::OK(), results->status()); @@ -899,7 +899,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test2) { auto seqLengths = NDArrayFactory::create('c', {4}, {0,1,2,3}); auto exp = NDArrayFactory::create('c', {3, 4, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14, 15, 18, 17, 16, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 31, 33, 34, 35, 38, 37, 36, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 51, 53, 54, 55, 58, 57, 56, 59, 60}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {2, 1}); auto output = results->at(0); @@ -918,7 +918,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test3) { auto seqLengths = NDArrayFactory::create('c', {3}, {2,3,4}); auto exp = NDArrayFactory::create('c', {3, 4, 5}, {2, 1, 3, 4, 5, 7, 6, 8, 9, 10, 12, 11, 13, 14, 15, 17, 16, 18, 19, 20, 23, 22, 21, 24, 25, 28, 27, 26, 29, 30, 33, 32, 
31, 34, 35, 38, 37, 36, 39, 40, 44, 43, 42, 41, 45, 49, 48, 47, 46, 50, 54, 53, 52, 51, 55, 59, 58, 57, 56, 60}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {2, 0}); auto output = results->at(0); @@ -937,7 +937,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test4) { auto seqLengths = NDArrayFactory::create('c', {5}, {1, 2, 1, 2, 3}); auto exp = NDArrayFactory::create('c', {3, 4, 5}, {1, 22, 3, 24, 45, 6, 27, 8, 29, 50, 11, 32, 13, 34, 55, 16, 37, 18, 39, 60, 21, 2, 23, 4, 25, 26, 7, 28, 9, 30, 31, 12, 33, 14, 35, 36, 17, 38, 19, 40, 41, 42, 43, 44, 5, 46, 47, 48, 49, 10, 51, 52, 53, 54, 15, 56, 57, 58, 59, 20}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {0, 2}); auto output = results->at(0); @@ -956,7 +956,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test5) { auto seqLengths = NDArrayFactory::create('c', {5}, {1, 2, 4, 2, 3}); auto exp = NDArrayFactory::create('c', {3, 4, 5}, {1, 7, 18, 9, 15, 6, 2, 13, 4, 10, 11, 12, 8, 14, 5, 16, 17, 3, 19, 20, 21, 27, 38, 29, 35, 26, 22, 33, 24, 30, 31, 32, 28, 34, 25, 36, 37, 23, 39, 40, 41, 47, 58, 49, 55, 46, 42, 53, 44, 50, 51, 52, 48, 54, 45, 56, 57, 43, 59, 60}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {1, 2}); auto output = results->at(0); @@ -975,7 +975,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test6) { auto seqLengths = NDArrayFactory::create('c', {4}, {1, 2, 3, 2}); auto exp = NDArrayFactory::create('c', {3, 4, 5}, {1, 2, 3, 4, 5, 26, 27, 28, 29, 30, 51, 52, 53, 54, 55, 36, 37, 38, 39, 40, 21, 22, 23, 24, 25, 6, 7, 8, 9, 10, 31, 32, 33, 34, 35, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 11, 12, 13, 14, 15, 56, 57, 58, 59, 60}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {0, 1}); 
auto output = results->at(0); @@ -995,7 +995,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test7) { auto seqLengths = NDArrayFactory::create('c', {1}, data); auto exp = NDArrayFactory::create('c', {1, 5}, {3, 2, 1, 4, 5}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {1, 0}); auto output = results->at(0); @@ -1015,7 +1015,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test8) { auto seqLengths = NDArrayFactory::create('c', {5}, data); auto exp = NDArrayFactory::create('c', {1, 5}, {1, 2, 3, 4, 5}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {0, 1}); auto output = results->at(0); @@ -1035,7 +1035,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test9) { auto seqLengths = NDArrayFactory::create('c', {5}, data); auto exp = NDArrayFactory::create('c', {5, 1}, {1, 2, 3, 4, 5}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {1, 0}); auto output = results->at(0); @@ -1055,7 +1055,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test10) { auto seqLengths = NDArrayFactory::create('c', {1}, data); auto exp = NDArrayFactory::create('c', {5, 1}, {3, 2, 1, 4, 5}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {0, 1}); auto output = results->at(0); @@ -1075,7 +1075,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test11) { auto seqLengths = NDArrayFactory::create('c', {5}, data); auto exp = NDArrayFactory::create('c', {1, 1, 5, 1}, {1, 2, 3, 4, 5}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {1, 2}); auto output = results->at(0); @@ -1095,7 +1095,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test12) { auto seqLengths = NDArrayFactory::create('c', {1}, data); auto exp = NDArrayFactory::create('c', 
{1, 1, 5, 1}, {3, 2, 1, 4, 5}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {2, 0}); auto output = results->at(0); @@ -1115,7 +1115,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test13) { auto seqLengths = NDArrayFactory::create('c', {1}, data); auto exp = NDArrayFactory::create('c', {1, 1, 5, 1}, {1, 2, 3, 4, 5}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &seqLengths}, {}, {3, 0}); auto output = results->at(0); @@ -1132,7 +1132,7 @@ TEST_F(DeclarableOpsTests5, reverse_sequense_test14) { auto lengths = NDArrayFactory::create('c', {8}, {7, 2, 3, 5, 2, 1, 6, 4}); auto e = NDArrayFactory::create('c', {8, 8, 3, 2}, {0.54193264, 0.05176904, 0.82555761, 0.71106697, 0.04416722, 0.07653656, 0.06478678, 0.68985848, 0.55216783, 0.55382648, 0.34652863, 0.17261296, 0.61523204, 0.64298760, 0.26848351, 0.75015615, 0.28683049, 0.70937606, 0.38700677, 0.68832738, 0.37292716, 0.94616004, 0.77735792, 0.60803430, 0.54555996, 0.23407607, 0.11372584, 0.49965927, 0.15210842, 0.53268608, 0.67197708, 0.80659380, 0.98274191, 0.63566073, 0.21592326, 0.54902743, 0.09753360, 0.76124972, 0.24693797, 0.13813169, 0.33144656, 0.08299957, 0.01034390, 0.99430482, 0.59944390, 0.17973880, 0.36437840, 0.86383673, 0.93630291, 0.67277404, 0.93899264, 0.52422773, 0.44892176, 0.03127759, 0.45025550, 0.97136977, 0.13565978, 0.71567448, 0.92094825, 0.93536442, 0.85910449, 0.18252879, 0.72830945, 0.96736828, 0.89831575, 0.83437150, 0.59050780, 0.36145925, 0.16483070, 0.44021176, 0.76018652, 0.44227383, 0.13052339, 0.18204235, 0.99743733, 0.26885190, 0.87726522, 0.16396056, 0.94943412, 0.40016700, 0.65267938, 0.71073267, 0.40094733, 0.91182634, 0.05391789, 0.49520416, 0.24963864, 0.34847086, 0.74088617, 0.36115701, 0.63074210, 0.97423085, 0.42216846, 0.06326975, 0.07858702, 0.20586622, 0.34755773, 0.63166554, 0.18849320, 0.34828456, 0.98477707, 0.75163124, 0.33309570, 
0.67563176, 0.98343578, 0.95919930, 0.66994391, 0.89296165, 0.28752144, 0.38146961, 0.83518735, 0.08207577, 0.82083487, 0.81665728, 0.83306004, 0.14203056, 0.01497920, 0.85727447, 0.71194544, 0.85654019, 0.86160433, 0.79580411, 0.47710411, 0.09318029, 0.31369071, 0.64122249, 0.58399725, 0.26706597, 0.05655339, 0.91025211, 0.30330468, 0.33142930, 0.05668627, 0.02936449, 0.12613087, 0.09960114, 0.16218074, 0.15088139, 0.31239040, 0.55980062, 0.34804391, 0.34941538, 0.61370555, 0.07022964, 0.27274571, 0.83306066, 0.75830824, 0.25963478, 0.87137718, 0.24418835, 0.59371493, 0.74479056, 0.84699625, 0.51210368, 0.12489571, 0.23371067, 0.18361641, 0.48636240, 0.06052657, 0.04241913, 0.66710351, 0.07007925, 0.60553664, 0.07536713, 0.55971796, 0.38764845, 0.20737843, 0.37989120, 0.59757058, 0.31189846, 0.25215345, 0.52546591, 0.55744218, 0.59485650, 0.05032742, 0.52076188, 0.47762345, 0.89829370, 0.34417708, 0.84705151, 0.08203183, 0.10632956, 0.78431292, 0.86441722, 0.36487598, 0.09833603, 0.85863594, 0.11010505, 0.11659283, 0.42500288, 0.02747301, 0.12359903, 0.19736489, 0.44461885, 0.33341706, 0.22519571, 0.31528710, 0.14802902, 0.01753431, 0.41160932, 0.47245979, 0.08268172, 0.21580773, 0.75770279, 0.64171939, 0.52643769, 0.19261234, 0.98032835, 0.15401656, 0.85274458, 0.66408502, 0.23212704, 0.74630026, 0.05713613, 0.49025892, 0.48418810, 0.59541513, 0.09243053, 0.93919152, 0.95357019, 0.52377729, 0.65963871, 0.47934951, 0.49919534, 0.34369898, 0.78211256, 0.13908708, 0.95754117, 0.84107746, 0.09126213, 0.42979124, 0.10295325, 0.34631257, 0.69448345, 0.41720536, 0.15282440, 0.74329854, 0.45775009, 0.12786280, 0.39830299, 0.20386769, 0.59703523, 0.94077086, 0.42255597, 0.80453309, 0.79757204, 0.28653229, 0.60175909, 0.55859623, 0.34318230, 0.63002770, 0.36533324, 0.89689906, 0.73236186, 0.61491989, 0.83787947, 0.67939463, 0.72016694, 0.77499849, 0.72428343, 0.34571059, 0.23143007, 0.20099338, 0.85583142, 0.73174191, 0.54284092, 0.20264181, 0.53037061, 0.30493131, 
0.82279766, 0.58542432, 0.72632070, 0.18394258, 0.00608118, 0.23808232, 0.17007573, 0.75245459, 0.84990616, 0.38827634, 0.33809538, 0.01080317, 0.27250145, 0.81769542, 0.15323253, 0.71668395, 0.99427044, 0.11355576, 0.50511923, 0.22952055, 0.78271870, 0.12833592, 0.88639055, 0.76398188, 0.49533508, 0.47939640, 0.73564612, 0.41465671, 0.10995635, 0.20271728, 0.00521771, 0.67265260, 0.11917707, 0.76574855, 0.43842117, 0.28530411, 0.79648090, 0.79433656, 0.12074559, 0.02325163, 0.10117917, 0.83559239, 0.67213900, 0.25247084, 0.47968157, 0.88649124, 0.33588961, 0.92338319, 0.18808573, 0.60248266, 0.36610154, 0.99123140, 0.10519719, 0.18754650, 0.43232584, 0.85447872, 0.15937568, 0.92947480, 0.62705964, 0.85960084, 0.13435660, 0.81845809, 0.60715133, 0.83030708, 0.83071910, 0.38883408, 0.92033237, 0.59820890, 0.75527947, 0.67683355, 0.21847023, 0.29395619, 0.50477953, 0.98977921, 0.96225332, 0.90143562, 0.19559914, 0.08978307, 0.09687492, 0.07381865, 0.22801110, 0.26669388, 0.99691302, 0.12113623, 0.34373057, 0.46066239, 0.48806761, 0.50688779, 0.00654483, 0.32076493, 0.42367646, 0.07112842, 0.54090558, 0.68230725, 0.49713828, 0.41958965, 0.68013847, 0.47691765, 0.63269259, 0.94304095, 0.54587271, 0.72447569, 0.28913523, 0.75766936, 0.52965692, 0.96854824, 0.15589071, 0.84128672, 0.16337522, 0.05771034, 0.21556356, 0.12094140, 0.29721207, 0.00811008, 0.66184926}); - nd4j::ops::reverse_sequence op; + sd::ops::reverse_sequence op; auto results = op.evaluate({&input, &lengths}, {}, {1, 0}); ASSERT_EQ(Status::OK(), results->status()); @@ -1149,7 +1149,7 @@ TEST_F(DeclarableOpsTests5, Test_TopK_0) { auto expV = NDArrayFactory::create('c', {2, 1}, {11.0, 14.0}); auto expI = NDArrayFactory::create('c', {2, 1}, {4, 3}); - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {1, 0}); // without sorting ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1188,7 +1188,7 @@ TEST_F(DeclarableOpsTests5, Test_TopK_1) { auto expV = NDArrayFactory::create('c', {2, 
1}, {11.0f, 14.0f}); auto expI = NDArrayFactory::create('c', {2, 1}, {1, 0}); - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {1, 0}); // without sorting ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1241,7 +1241,7 @@ TEST_F(DeclarableOpsTests5, Test_TopK_2) { auto expI = NDArrayFactory::create('c', {2, 3, 1 }, {2, 1, 0, 1, 2, 0}); - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1291,7 +1291,7 @@ TEST_F(DeclarableOpsTests5, Test_TopK_3) { auto expI = NDArrayFactory::create('c', {2, 3, 2 }, {2, 0, 1, 3, 0, 3, 1, 3, 2, 1, 0, 2}); - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1341,7 +1341,7 @@ TEST_F(DeclarableOpsTests5, Test_TopK_3_unsorted) { auto expI = NDArrayFactory::create('c', {2, 3, 2 }, {0, 2, 1, 3, 0, 3, 1, 3, 1, 2, 0, 2}); - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {2}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1365,7 +1365,7 @@ TEST_F(DeclarableOpsTests5, Test_TopK_4) { auto expV = NDArrayFactory::create('c', {2, 2}, {11.0f, 3.0f, 14.0f, 6.0f}); auto expI = NDArrayFactory::create('c', {2, 2}, {1, 2, 0, 2}); - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1389,7 +1389,7 @@ TEST_F(DeclarableOpsTests5, Test_TopK_5) { auto expV = NDArrayFactory::create('f', {2, 2}, {11.1, 14.2, 3.1, 6.2}); auto expI = NDArrayFactory::create('f', {2, 2}, {2, 1, 1, 2}); - nd4j::ops::top_k op; + sd::ops::top_k op; auto result = op.evaluate({&x}, {}, {2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1427,7 +1427,7 @@ TEST_F(DeclarableOpsTests5, Test_Moments_1) { float inf = 1.e-5f; - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); 
@@ -1458,7 +1458,7 @@ TEST_F(DeclarableOpsTests5, Test_Moments_2) { NDArray expV('c', {4}, {11.833333, 7.6666665, 10.416667, 7.6666665}); NDArray expD('c', {4}, {28.472221, 12.888889, 23.951387, 11.555554}); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {}, {0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1492,7 +1492,7 @@ TEST_F(DeclarableOpsTests5, Test_Moments_3) { 6.25f, 4.f, 27.5625f, 1.f, 6.25f, 9.f, 0.0625f, 16.f}); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1519,7 +1519,7 @@ TEST_F(DeclarableOpsTests5, Test_Moments_4) { auto expV = NDArrayFactory::create('c', {3, 4}, { 8.5f, 6.f , 8.75f, 6.f, 8.5f, 11.f, 8.75f, 6.f, 18.5f, 6.f, 13.75f, 11.f}); auto expD = NDArrayFactory::create('c', {3, 4}, { 6.25f, 9.f, 27.5625f, 1.f, 6.25f, 4.f, 27.5625f, 1.f, 6.25f, 9.f, 0.0625f, 16.f}); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1550,7 +1550,7 @@ TEST_F(DeclarableOpsTests5, trace_test1) { input.linspace(1); auto exp = NDArrayFactory::create('c', {3}, {40, 120, 200}); NDArray matrix('c', {3, 3}, {1., 2., 3., 4., 5., 6., 7., 8., 9.}); - nd4j::ops::trace op; + sd::ops::trace op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); double traceM = matrix.getTrace(); @@ -1571,7 +1571,7 @@ TEST_F(DeclarableOpsTests5, trace_test2) { input.linspace(1); auto exp = NDArrayFactory::create(40.); - nd4j::ops::trace op; + sd::ops::trace op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); @@ -1589,7 +1589,7 @@ TEST_F(DeclarableOpsTests5, trace_test3) { input.linspace(1); auto exp = NDArrayFactory::create(1.); - nd4j::ops::trace op; + sd::ops::trace op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); @@ -1607,7 +1607,7 @@ TEST_F(DeclarableOpsTests5, trace_test4) { 
input.linspace(1); auto exp = NDArrayFactory::create(1.); - nd4j::ops::trace op; + sd::ops::trace op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); @@ -1625,7 +1625,7 @@ TEST_F(DeclarableOpsTests5, trace_test5) { input.linspace(1); auto exp = NDArrayFactory::create('c', {3, 4}, {75, 225, 375, 525, 675, 825, 975, 1125, 1275, 1425, 1575, 1725}); - nd4j::ops::trace op; + sd::ops::trace op; auto results = op.evaluate({&input}); auto output = results->at(0); @@ -1642,7 +1642,7 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test1) { auto input = NDArrayFactory::create('c', {2, 2, 2}); input.linspace(1); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; auto results = op.evaluate({&input}); auto output = results->at(0); @@ -1665,7 +1665,7 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test2) { auto input = NDArrayFactory::create('c', {1, 3, 2}); input.linspace(1); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; auto results = op.evaluate({&input}); auto output = results->at(0); @@ -1682,7 +1682,7 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test3) { auto input = NDArrayFactory::create('c', {3, 2, 1}); input.linspace(1); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; auto results = op.evaluate({&input}); auto output = results->at(0); @@ -1703,7 +1703,7 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test04) { auto input = NDArrayFactory::create('c', {4}); input.linspace(1); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; //NDArray* output; auto results = op.evaluate({&input}, {}, {}, {}, {}, true); ASSERT_EQ(Status::OK(), results->status()); @@ -1725,7 +1725,7 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test4) { auto input = NDArrayFactory::create('c', {4}); input.linspace(1); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; //NDArray* output; auto results = op.evaluate({&input}); ASSERT_EQ(Status::OK(), results->status()); @@ -1748,7 +1748,7 @@ TEST_F(DeclarableOpsTests5, 
random_shuffle_test5) { auto input = NDArrayFactory::create('c', {4,1}); input.linspace(1); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; auto results = op.evaluate({&input}); auto output = results->at(0); @@ -1771,7 +1771,7 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test6) { auto input = NDArrayFactory::create('c', {4,1,1}); input.linspace(1); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; auto results = op.evaluate({&input}); auto output = results->at(0); @@ -1795,7 +1795,7 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test7) { input.linspace(1); auto exp = NDArrayFactory::create('c', {1,4}, {1, 2, 3, 4}); - nd4j::ops::random_shuffle op; + sd::ops::random_shuffle op; auto results = op.evaluate({&input}); auto output = results->at(0); @@ -1825,7 +1825,7 @@ TEST_F(DeclarableOpsTests5, EmbeddingLookup_1) { // y.printShapeInfo("y shape"); // y.printIndexedBuffer("y buffer"); - nd4j::ops::embedding_lookup op; + sd::ops::embedding_lookup op; auto result = op.evaluate({&x, &y}, {}, {0}); auto output = result->at(0); // x.printShapeInfo("Input"); @@ -1861,7 +1861,7 @@ TEST_F(DeclarableOpsTests5, EmbeddingLookup_2) { // y.printShapeInfo("y shape"); // y.printIndexedBuffer("y buffer"); - nd4j::ops::embedding_lookup op; + sd::ops::embedding_lookup op; auto result = op.evaluate({&x, &y}, {}, {0}); auto output = result->at(0); // x.printShapeInfo("Input"); @@ -1902,7 +1902,7 @@ TEST_F(DeclarableOpsTests5, EmbeddingLookup_3) { // res = tf.nn.embedding_lookup((p1, p2, p3, p4, p5, p6, p7), ids, 'mod') - nd4j::ops::embedding_lookup op; + sd::ops::embedding_lookup op; auto result = op.evaluate({&p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8, &y}, {}, {1}); auto output = result->at(0); // x.printShapeInfo("Input"); @@ -1945,7 +1945,7 @@ TEST_F(DeclarableOpsTests5, DynamicPartition_01) { NDArrayFactory::create('c', {1}, {2}), NDArrayFactory::create('c', {1}, {1})}); - nd4j::ops::dynamic_partition op; + sd::ops::dynamic_partition op; auto result = 
op.evaluate({&x, &y}, {}, {numPartition}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1984,7 +1984,7 @@ TEST_F(DeclarableOpsTests5, DynamicPartition_1) { NDArrayFactory::create('c', {8}, {18, 28, 19, 29, 20, 30, 21, 31}), NDArrayFactory::create('c', {10}, {13, 23, 14, 24, 15, 25, 16, 26, 17, 27})}); - nd4j::ops::dynamic_partition op; + sd::ops::dynamic_partition op; auto result = op.evaluate({&x, &y}, {}, {numPartition}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2013,7 +2013,7 @@ TEST_F(DeclarableOpsTests5, DynamicPartition_2) { NDArrayFactory::create('c', {3}, {-1., 4.3, 7.4}), NDArrayFactory::create('c', {1}, {0.0})}); - nd4j::ops::dynamic_partition op; + sd::ops::dynamic_partition op; int numPartition = 4; auto result = op.evaluate({&x, &y}, {}, {numPartition}); @@ -2041,7 +2041,7 @@ TEST_F(DeclarableOpsTests5, DynamicPartition_3) { NDArrayFactory::create({4.3f, 7.4f}), NDArrayFactory::create('c', {1}, {0.0f})}); - nd4j::ops::dynamic_partition op; + sd::ops::dynamic_partition op; int numPartition = 4; auto result = op.evaluate({&x, &y}, {}, {numPartition}); @@ -2077,7 +2077,7 @@ TEST_F(DeclarableOpsTests5, DynamicStitch_empty_1) { auto d1 = NDArrayFactory::empty(); auto d2 = NDArrayFactory::create('c', {2, 5}, {0.94414854,0.5956861,0.8668989,0.3502196,0.5100082,0.061725974,0.6621324,0.034165382,0.32576954,0.51917326}); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&i0, &i1, &i2, &d0, &d1, &d2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2093,7 +2093,7 @@ TEST_F(DeclarableOpsTests5, DynamicStitch_empty_2) { auto d1 = NDArrayFactory::create('c', {0, 5}); auto d2 = NDArrayFactory::create('c', {2, 5}, {0.94414854,0.5956861,0.8668989,0.3502196,0.5100082,0.061725974,0.6621324,0.034165382,0.32576954,0.51917326}); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&i0, &i1, &i2, &d0, &d1, &d2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2112,7 +2112,7 
@@ TEST_F(DeclarableOpsTests5, DynamicStitch_1) { auto exp = NDArrayFactory::create({7.4f, 0.1f, -1.f, 5.2f, -1.f, 4.3f}); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&x1, &x2, &y1, &y2}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2137,7 +2137,7 @@ TEST_F(DeclarableOpsTests5, DynamicStitch_2) { auto exp = NDArrayFactory::create({5.2f, -1.f, 4.3f, -1.f, 7.4f, 0.1f}); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&x1, &x2, &y1, &y2}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2169,7 +2169,7 @@ TEST_F(DeclarableOpsTests5, fusedBatchNorm_test1) { auto expBatchVar = NDArrayFactory::create('c', {4}, {208.00001526, 208.00001526, 208.00001526, 208.00001526}); - nd4j::ops::fused_batch_norm op; + sd::ops::fused_batch_norm op; auto results = op.evaluate({&x, &scale, &offset}, {}, {0,1}); auto y = results->at(0); auto batchMean = results->at(1); @@ -2198,7 +2198,7 @@ TEST_F(DeclarableOpsTests5, fusedBatchNorm_test2) { auto expBatchMean = NDArrayFactory::create('c', {4}, {23., 24., 25., 26.}); auto expBatchVar = NDArrayFactory::create('c', {4}, {208.00001526, 208.00001526, 208.00001526, 208.00001526}); - nd4j::ops::fused_batch_norm op; + sd::ops::fused_batch_norm op; auto results = op.evaluate({&x, &scale, &offset}, {0.05}, {0,1}); auto y = results->at(0); auto batchMean = results->at(1); @@ -2227,7 +2227,7 @@ TEST_F(DeclarableOpsTests5, fusedBatchNorm_test3) { auto expBatchMean = NDArrayFactory::create('c', {4}, {23., 24., 25., 26.}); auto expBatchVar = NDArrayFactory::create('c', {4}, {208.00001526, 208.00001526, 208.00001526, 208.00001526}); - nd4j::ops::fused_batch_norm op; + sd::ops::fused_batch_norm op; auto results = op.evaluate({&x, &scale, &offset}, {}, {1,1}); auto y = results->at(0); auto batchMean = results->at(1); @@ -2262,7 +2262,7 @@ TEST_F(DeclarableOpsTests5, fusedBatchNorm_test4) { auto expBatchVar = NDArrayFactory::create('c', shape, {0., 0., 
0., 0.}); - nd4j::ops::fused_batch_norm op; + sd::ops::fused_batch_norm op; auto results = op.evaluate({&x, &scale, &offset}, {}, {0,1}); auto y = results->at(0); auto batchMean = results->at(1); @@ -2297,7 +2297,7 @@ TEST_F(DeclarableOpsTests5, fusedBatchNorm_test5) { auto expBatchVar = NDArrayFactory::create('c', shape, {0., 0., 0., 0.}); - nd4j::ops::fused_batch_norm op; + sd::ops::fused_batch_norm op; auto results = op.evaluate({&x, &scale, &offset}, {0.05}, {0,1}); auto y = results->at(0); auto batchMean = results->at(1); @@ -2318,7 +2318,7 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test1) { auto predictions = NDArrayFactory::create('c', {1, 3}, {2, 2, 4}); auto expected = NDArrayFactory::create('c', {5, 5}, {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}); - nd4j::ops::confusion_matrix op; + sd::ops::confusion_matrix op; auto results = op.evaluate({&labels, &predictions}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2337,7 +2337,7 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test2) { auto predictions = NDArrayFactory::create('c', {1, 2}, {0, 2}); auto expected = NDArrayFactory::create('c', {3, 3}, {0, 0, 0, 1, 0, 0, 0, 0, 1}); - nd4j::ops::confusion_matrix op; + sd::ops::confusion_matrix op; auto results = op.evaluate({&labels, &predictions}, {}, {3}); ASSERT_EQ(Status::OK(), results->status()); @@ -2357,7 +2357,7 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test3) { auto weights = NDArrayFactory::create('c', {1, 2}, {100, 200}); auto expected = NDArrayFactory::create('c', {3, 3}, {0, 0, 0, 100, 0, 0, 0, 0, 200}); - nd4j::ops::confusion_matrix op; + sd::ops::confusion_matrix op; auto results = op.evaluate({&labels, &predictions, &weights}, {}, {3}); auto output = results->at(0); @@ -2376,8 +2376,8 @@ TEST_F(DeclarableOpsTests5, confusion_matrix_test4) { auto weights = NDArrayFactory::create('c', {1, 2}, {100, 200}); auto expected = NDArrayFactory::create('c', {3, 3}, {0, 0, 0, 100, 0, 0, 0, 0, 200}); - 
nd4j::ops::confusion_matrix op; - auto results = op.evaluate({&labels, &predictions, &weights}, {}, {3, nd4j::DataType::DOUBLE}); + sd::ops::confusion_matrix op; + auto results = op.evaluate({&labels, &predictions, &weights}, {}, {3, sd::DataType::DOUBLE}); auto output = results->at(0); ASSERT_EQ(Status::OK(), results->status()); @@ -2395,7 +2395,7 @@ TEST_F(DeclarableOpsTests5, ZeroFraction_1) { 13, 14, 15, 16, 17, 18, 19, 0, 21, 22, 23, 24}); - nd4j::ops::zero_fraction op; + sd::ops::zero_fraction op; auto res = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), res->status()); @@ -2410,7 +2410,7 @@ TEST_F(DeclarableOpsTests5, ZeroFraction_2) { auto x = NDArrayFactory::create('c', {2, 2, 2}, {5.5, 0., 0.3, 5.5, 8.6, 0., 0., 0.4}); - nd4j::ops::zero_fraction op; + sd::ops::zero_fraction op; auto res = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), res->status()); @@ -2425,7 +2425,7 @@ TEST_F(DeclarableOpsTests5, ZeroFraction_3) { auto x = NDArrayFactory::create('f', {2, 2, 2}, {5.5, 0., 0.3, 5.5, 8.6, 0., 0., 0.4}); - nd4j::ops::zero_fraction op; + sd::ops::zero_fraction op; auto res = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), res->status()); @@ -2444,7 +2444,7 @@ TEST_F(DeclarableOpsTests5, XWPlusB_1) { auto exp = NDArrayFactory::create('c', {2,2}, {173.f, 264.f, 310.f, 279.f}); - nd4j::ops::xw_plus_b op; + sd::ops::xw_plus_b op; auto result = op.evaluate({&x, &y, &b}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2462,7 +2462,7 @@ TEST_F(DeclarableOpsTests5, StopGradient_1) { auto x = NDArrayFactory::create('c', {2,3}, { 1.f, 11.f, 3.f, 14.f, 5.f, 6.f}); - nd4j::ops::stop_gradient op; + sd::ops::stop_gradient op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2485,7 +2485,7 @@ TEST_F(DeclarableOpsTests5, StopGradient_2) { auto x = NDArrayFactory::create('f', {2,3}, { 1.f, 11.f, 3.f, 14.f, 5.f, 6.f}); - nd4j::ops::stop_gradient op; + sd::ops::stop_gradient op; auto result = op.evaluate({&x}); 
ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -2509,7 +2509,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test1) { auto input = NDArrayFactory::create('c', {3, 3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, -5,5 ,-6,6, -7,7, -8,8, -9,9, -10,10, -11,11, -12,12, -13,13, 14}); auto expOutput = NDArrayFactory::create('c', {3, 3, 3}, {-2.16985e+00,-1.69846e-01,-3.16985e+00, -1.31507e+00,-6.31507e+00,-3.15072e-01, -8.00046e+00,-4.58767e-04,-9.00046e+00, -1.31327e+00,-1.23133e+01,-3.13266e-01, -1.40000e+01,-1.13743e-06,-1.50000e+01, -1.31326e+00,-1.83133e+01,-3.13262e-01, -2.00000e+01,-2.81941e-09,-2.10000e+01, -1.31326e+00,-2.43133e+01,-3.13262e-01, -2.73133e+01,-1.31326e+00,-3.13262e-01}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}); auto z = results->at(0); @@ -2526,7 +2526,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test2) { auto input = NDArrayFactory::create('c', {3, 3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, -5,5 ,-6,6, -7,7, -8,8, -9,9, -10,10, -11,11, -12,12, -13,13, 14}); auto expOutput = NDArrayFactory::create('c', {3, 3, 3}, {-3.05095e+00,-3.04946e+00,-5.00705e+00, -5.09458e-02,-7.04946e+00,-7.04851e-03, -6.05095e+00,-4.94556e-02,-8.00705e+00, -3.04859e+00,-1.30000e+01,-3.04859e+00, -1.50486e+01,-2.37286e-06,-1.70486e+01, -4.85876e-02,-1.60000e+01,-4.85874e-02, -2.10000e+01,-3.04859e+00,-2.51269e+01, -7.96007e-10,-2.50486e+01,-2.12693e+00, -2.40000e+01,-4.85874e-02,-1.26928e-01}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}, {}, {1}); auto z = results->at(0); @@ -2543,7 +2543,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test3) { auto input = NDArrayFactory::create('c', {3, 3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, -5,5 ,-6,6, -7,7, -8,8, -9,9, -10,10, -11,11, -12,12, -13,13, 14}); auto expOutput = NDArrayFactory::create('c', {3, 3, 3}, {-2.16985e+00,-1.69846e-01,-3.16985e+00, -1.31507e+00,-6.31507e+00,-3.15072e-01, -8.00046e+00,-4.58767e-04,-9.00046e+00, 
-1.31327e+00,-1.23133e+01,-3.13266e-01, -1.40000e+01,-1.13743e-06,-1.50000e+01, -1.31326e+00,-1.83133e+01,-3.13262e-01, -2.00000e+01,-2.81941e-09,-2.10000e+01, -1.31326e+00,-2.43133e+01,-3.13262e-01, -2.73133e+01,-1.31326e+00,-3.13262e-01}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}, {}, {2}); auto z = results->at(0); @@ -2561,7 +2561,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test5) { auto input = NDArrayFactory::create('c', {3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, 5}); auto expOutput = NDArrayFactory::create('c', {3, 3}, {-2.16985, -0.16985, -3.16985, -1.31507, -6.31507, -0.31507, -9.31335, -1.31335, -0.31335}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}); auto z = results->at(0); @@ -2578,7 +2578,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test6) { auto input = NDArrayFactory::create('c', {3, 3}, {-1, 1, -2, 2, -3, 3, -4, 4, 5}); auto expOutput = NDArrayFactory::create('c', {3, 3}, {-3.05095,-3.04946,-7.12773, -0.05095,-7.04946,-2.12773, -6.05095,-0.04946,-0.12773}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}, {}, {0}); auto z = results->at(0); @@ -2595,7 +2595,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test7) { auto input = NDArrayFactory::create('c', {1, 5}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {1, 5}, {-4.42414, -2.42414, -5.42414, -1.42414, -0.42414}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}); auto z = results->at(0); @@ -2612,7 +2612,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test8) { auto input = NDArrayFactory::create('c', {1, 5}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {1, 5}, {0, 0, 0, 0, 0}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}, {}, {0}); auto z = results->at(0); @@ -2629,7 +2629,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test9) { auto input = 
NDArrayFactory::create('c', {5, 1}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {5, 1}, {0, 0, 0, 0, 0}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}); auto z = results->at(0); @@ -2646,7 +2646,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test10) { auto input = NDArrayFactory::create('c', {5, 1}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {5, 1}, {-4.42414, -2.42414, -5.42414, -1.42414, -0.42414}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}, {}, {0}); auto z = results->at(0); @@ -2663,7 +2663,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test11) { auto input = NDArrayFactory::create('c', {5}, {-1, 1, -2, 2, 3}); auto expOutput = NDArrayFactory::create('c', {5}, {-4.42414, -2.42414, -5.42414, -1.42414, -0.42414}); - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}); auto z = results->at(0); @@ -2682,7 +2682,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_test12) { for (int i = 0; i < 10; ++i) { - nd4j::ops::log_softmax op; + sd::ops::log_softmax op; auto results = op.evaluate({&input}); auto z = results->at(0); @@ -2701,7 +2701,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_bp_test1) { auto epsilon = NDArrayFactory::create('c', {2, 2}, {0.1, 0.2, 0.3, 0.4}); auto exp = NDArrayFactory::create('c', {2, 2}, {-0.07311,0.02689, -0.07311,0.02689}); - nd4j::ops::log_softmax_bp op; + sd::ops::log_softmax_bp op; auto results = op.evaluate({&input, &epsilon}); auto output = results->at(0); @@ -2719,7 +2719,7 @@ TEST_F(DeclarableOpsTests5, log_softmax_bp_test2) { auto epsilon = NDArrayFactory::create('c', {2, 2}, {0.1, 0.2, 0.3, 0.4}); auto exp = NDArrayFactory::create('c', {2, 2}, {-0.17616, -0.17616, 0.02384, 0.02384}); - nd4j::ops::log_softmax_bp op; + sd::ops::log_softmax_bp op; auto results = op.evaluate({&input, &epsilon}, {}, {0}); auto output = results->at(0); @@ -2737,7 +2737,7 @@ 
TEST_F(DeclarableOpsTests5, ELU_1) { auto exp = NDArrayFactory::create('c', {2, 2, 2}, { -0.63212055, 2. , 1.5, -0.753403, 1., 2., 2., 1.}); auto res = NDArrayFactory::create('c', {2, 2, 2}); - input.applyScalar(nd4j::scalar::ELU, 1.f, res); + input.applyScalar(sd::scalar::ELU, 1.f, res); ASSERT_TRUE(res.equalsTo(&exp)); } @@ -2748,7 +2748,7 @@ TEST_F(DeclarableOpsTests5, L2_Loss_1) { auto input = NDArrayFactory::create('c', {2, 2, 2}, { -1., 2. , 1.5, -1.4, 1., 2., 2., 1.}); double exp(9.605); - nd4j::ops::l2_loss op; + sd::ops::l2_loss op; auto results = op.evaluate({&input}, {}, {}); auto output = results->at(0); @@ -2764,7 +2764,7 @@ TEST_F(DeclarableOpsTests5, L2_Loss_2) { auto x = NDArrayFactory::create(0.7787855863571167); auto e = NDArrayFactory::create(0.303254); - nd4j::ops::l2_loss op; + sd::ops::l2_loss op; auto results = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2780,7 +2780,7 @@ TEST_F(DeclarableOpsTests5, L2_Loss_3) { auto e = NDArrayFactory::create(0.303254); auto z = NDArrayFactory::create(0.0); - nd4j::ops::l2_loss op; + sd::ops::l2_loss op; auto status = op.execute({&x}, {&z} , {}, {}, {}); ASSERT_EQ(Status::OK(), status); @@ -2796,7 +2796,7 @@ TEST_F(DeclarableOpsTests5, LogPoissonLoss_1) { auto exp = NDArrayFactory::create('c', {2, 2, 2}, {1.3678794, 5.389056, 2.981689, 1.6465969, 1.7182817, 5.389056, 5.389056, 1.7182817}); - nd4j::ops::log_poisson_loss op; + sd::ops::log_poisson_loss op; auto results = op.evaluate({&input, &weights, &targets}, {}, {0}); auto output = results->at(0); @@ -2817,7 +2817,7 @@ TEST_F(DeclarableOpsTests5, LogPoissonLoss_2) { auto exp = NDArrayFactory::create('c', {2, 2, 2}, {3.0196857, 4.0408626, 2.1334953, 3.6984034, 1.3700882, 4.0408626, 4.0408626, 1.3700882}); - nd4j::ops::log_poisson_loss op; + sd::ops::log_poisson_loss op; auto results = op.evaluate({&input, &weights, &targets}, {}, {0, 1}); auto output = results->at(0); @@ -2863,7 +2863,7 @@ TEST_F(DeclarableOpsTests5, 
NormalizeMoments_1) { -19.75, -30.75, -37., 1.25, -51., -10.75, -33.8125, -3.75}); - nd4j::ops::normalize_moments op; + sd::ops::normalize_moments op; auto results = op.evaluate({&counts, &means, &deviance}, {0.0}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2914,7 +2914,7 @@ TEST_F(DeclarableOpsTests5, NormalizeMoments_2) { 0.9097222, 0.7430556, 0.6388887, 1.0763888, 0.38888884, 1.0208334, 0.6927084, 1.076389}); - nd4j::ops::normalize_moments op; + sd::ops::normalize_moments op; auto results = op.evaluate({&counts, &means, &deviance}, {0.0}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2965,7 +2965,7 @@ TEST_F(DeclarableOpsTests5, NormalizeMoments_3) { 0.9097222, 0.7430556, 0.6388887, 1.0763888, 0.38888884, 1.0208334, 0.6927084, 1.076389}); - nd4j::ops::normalize_moments op; + sd::ops::normalize_moments op; auto results = op.evaluate({&counts, &means, &deviance}, {shift}, {}); ASSERT_EQ(Status::OK(), results->status()); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp index 5a919d132..6eee59058 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp @@ -22,12 +22,12 @@ #include "testlayers.h" #include #include -#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DeclarableOpsTests6 : public testing::Test { public: @@ -49,7 +49,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_1) { matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -70,7 +70,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_2) { matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 1}); 
ASSERT_EQ(Status::OK(), result->status()); @@ -91,7 +91,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_3) { //matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -113,7 +113,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_4) { //matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -131,7 +131,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) { auto b = NDArrayFactory::create_('c', {1}, {1}); auto e = NDArrayFactory::create_('c', {1}, {z}); auto s = NDArrayFactory::create_('c', {1}, {1}); - nd4j::ops::ones_as opOnes; + sd::ops::ones_as opOnes; //auto exp = NDArrayFactory::create('c', {2}, {1.0f, 2.0f}); auto onesRes = opOnes.evaluate({&matrix}); //matrix.linspace(1); @@ -157,7 +157,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_04) { block->getIArguments()->push_back(0); block->getIArguments()->push_back(0); auto inputShapes = new ShapeList({ones->getShapeInfo(), b->getShapeInfo(), e->getShapeInfo(), s->getShapeInfo()}); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.calculateOutputShape(inputShapes, *block); //execute({ones, &b, &e, &s}, {}, {0, 1, 0, 0, 0}); ASSERT_EQ(result->size(), 1); ASSERT_TRUE(shape::isEmpty(result->at(0))); @@ -180,7 +180,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_5) { //matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -200,7 +200,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_6) { //matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = 
op.evaluate({&matrix, &b, &e, &s}, {}, {0, 0, 0, 0, 2}); ASSERT_EQ(Status::OK(), result->status()); @@ -221,7 +221,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_Once_Again_7) { //matrix.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &b, &e, &s}, {}, {1, 0, 0, 0, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -243,7 +243,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_1) { matrix.linspace(1); grad.linspace(1); - nd4j::ops::strided_slice_bp op; + sd::ops::strided_slice_bp op; auto result = op.evaluate({&matrix, &grad}, {}, {1, 0, 1, 0, 2, 0, 0, 0, 1, 1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -265,7 +265,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_2) { matrix.linspace(1); //grad.linspace(1); - nd4j::ops::strided_slice_bp op; + sd::ops::strided_slice_bp op; auto result = op.evaluate({&matrix, &grad}, {}, {1, 0, 1, 0, 2, 0, 0, 0, 1, 1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -287,7 +287,7 @@ TEST_F(DeclarableOpsTests6, Test_StridedSlice_BP_3) { matrix.linspace(1); grad.linspace(1); - nd4j::ops::strided_slice_bp op; + sd::ops::strided_slice_bp op; auto result = op.evaluate({&matrix, &grad}, {}, {1, 0, 1, 0, 0, 0, 0, 0, 256, 1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -301,7 +301,7 @@ TEST_F(DeclarableOpsTests6, Test_Simple_Scalar_1) { auto x = NDArrayFactory::create('c', {1, 1}, {2.0f}); auto exp = NDArrayFactory::create('c', {1, 1}, {4.0f}); - nd4j::ops::test_scalar op; + sd::ops::test_scalar op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -320,7 +320,7 @@ TEST_F(DeclarableOpsTests6, Test_Order_1) { x.linspace(1); exp.linspace(1); - nd4j::ops::order op; + sd::ops::order op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -335,7 +335,7 @@ TEST_F(DeclarableOpsTests6, cumSum_1) { auto x = NDArrayFactory::create('c', {1, 4}, {1.f, 2.f, 3.f, 4.f}); auto exp = 
NDArrayFactory::create('c', {1, 4}, {1.f, 3.f, 6.f, 10.f}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -351,7 +351,7 @@ TEST_F(DeclarableOpsTests6, cumSum_2) { auto x= NDArrayFactory::create('c', {2, 4}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); auto exp= NDArrayFactory::create('c', {2, 4}, {1.f, 3.f, 6.f, 10.f, 1.f, 3.f, 6.f, 10.f}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -368,7 +368,7 @@ TEST_F(DeclarableOpsTests6, cumSum_3) { auto x= NDArrayFactory::create('c', {2, 4}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); auto exp= NDArrayFactory::create('c', {2, 4}, {1.f, 2.f, 3.f, 4.f, 2.f, 4.f, 6.f, 8.f}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 0, 0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -384,7 +384,7 @@ TEST_F(DeclarableOpsTests6, cumSum_4) { auto x = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto exp = NDArrayFactory::create('c', {3, 3}, {12., 15., 18., 11., 13., 15., 7., 8., 9.}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 1, 0}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -400,7 +400,7 @@ TEST_F(DeclarableOpsTests6, cumSum_5) { auto x = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto exp = NDArrayFactory::create('c', {3, 3}, {6.f, 5.f, 3.f, 15.f, 11.f, 6.f, 24.f, 17.f, 9.f,}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 1, 1}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -415,7 +415,7 @@ TEST_F(DeclarableOpsTests6, cumSum_6) { auto x = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto exp = NDArrayFactory::create('c', {3, 3}, {11.f, 13.f, 15.f, 7.f, 8.f, 9.f, 0.f, 0.f, 0.f}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = 
op.evaluate({&x}, {}, {1, 1, 0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -430,7 +430,7 @@ TEST_F(DeclarableOpsTests6, cumSum_7) { auto x = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto exp = NDArrayFactory::create('c', {3, 3}, {5.f, 3.f, 0.f, 11.f, 6.f, 0.f, 17.f, 9.f, 0.f}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {1, 1, 1}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -446,7 +446,7 @@ TEST_F(DeclarableOpsTests6, cumSum_8) { auto axis = NDArrayFactory::create('c', {1}, {1}); auto exp = NDArrayFactory::create('c', {3, 3}, {5.f, 3.f, 0.f, 11.f, 6.f, 0.f, 17.f, 9.f, 0.f}); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x, &axis}, {}, {1, 1}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -474,7 +474,7 @@ TEST_F(DeclarableOpsTests6, cumSum_9) { //************************************// exclusive = 0; reverse = 0; - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&inputC, &axis}, {}, {exclusive, reverse}, {}); ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); @@ -515,7 +515,7 @@ TEST_F(DeclarableOpsTests6, cumSum_10) { auto x = NDArrayFactory::create('c', {4, 16, 16, 1}); auto y = NDArrayFactory::create(-3); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x, &y}, {}, {1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -525,12 +525,12 @@ TEST_F(DeclarableOpsTests6, cumSum_10) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_11) { - NDArray x('c', {3, 3, 3}, nd4j::DataType::DOUBLE); + NDArray x('c', {3, 3, 3}, sd::DataType::DOUBLE); auto exp = NDArrayFactory::create('c', {3,3,3}, {12., 15., 18.,11., 13., 15.,7., 8., 9., 39., 42., 45.,29., 31., 33.,16., 17., 18., 66., 69., 72.,47., 49., 51.,25., 26., 27.}); x.linspace(1); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 1, 1}); 
ASSERT_EQ(Status::OK(), result->status()); @@ -544,12 +544,12 @@ TEST_F(DeclarableOpsTests6, cumSum_11) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_12) { - NDArray x('c', {3, 3, 3}, nd4j::DataType::DOUBLE); + NDArray x('c', {3, 3, 3}, sd::DataType::DOUBLE); auto exp = NDArrayFactory::create('c', {3,3,3}, {1., 2., 3.,5., 7., 9.,12., 15., 18., 10., 11., 12.,23., 25., 27.,39., 42., 45., 19., 20., 21.,41., 43., 45., 66., 69., 72.}); x.linspace(1); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -563,12 +563,12 @@ TEST_F(DeclarableOpsTests6, cumSum_12) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_13) { - NDArray x('c', {3, 3, 3}, nd4j::DataType::DOUBLE); + NDArray x('c', {3, 3, 3}, sd::DataType::DOUBLE); auto exp = NDArrayFactory::create('c', {3,3,3}, {11., 13., 15.,7., 8., 9.,0., 0., 0., 29., 31., 33.,16., 17., 18.,0., 0., 0., 47., 49., 51.,25., 26., 27.,0., 0., 0.}); x.linspace(1); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {1, 1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -582,12 +582,12 @@ TEST_F(DeclarableOpsTests6, cumSum_13) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_14) { - NDArray x('c', {3, 3, 3}, nd4j::DataType::DOUBLE); + NDArray x('c', {3, 3, 3}, sd::DataType::DOUBLE); auto exp = NDArrayFactory::create('c', {3,3,3}, {29., 31., 33.,35., 37., 39.,41., 43., 45., 19., 20., 21.,22., 23., 24.,25., 26., 27., 0., 0., 0.,0., 0., 0.,0., 0., 0.}); x.linspace(1); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {1, 1, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -601,12 +601,12 @@ TEST_F(DeclarableOpsTests6, cumSum_14) { 
//////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_15) { - NDArray x('c', {3, 3, 3}, nd4j::DataType::DOUBLE); + NDArray x('c', {3, 3, 3}, sd::DataType::DOUBLE); auto exp = NDArrayFactory::create('c', {3,3,3}, {6., 5., 3.,15., 11., 6.,24., 17., 9., 33., 23., 12.,42., 29., 15.,51., 35., 18., 60., 41., 21.,69., 47., 24.,78., 53., 27.}); x.linspace(1); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 1, 2}); ASSERT_EQ(Status::OK(), result->status()); @@ -620,9 +620,9 @@ TEST_F(DeclarableOpsTests6, cumSum_15) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_16) { - NDArray x('f', {3, 4}, nd4j::DataType::FLOAT32); + NDArray x('f', {3, 4}, sd::DataType::FLOAT32); - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -639,13 +639,13 @@ TEST_F(DeclarableOpsTests6, cumSum_16) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_17) { - NDArray x('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 1500}, sd::DataType::FLOAT32); NDArray x0 = x(0, {0}); NDArray x1 = x(1, {0}); x0.linspace(1); x1.linspace(1); - NDArray exp('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 1500}, sd::DataType::FLOAT32); NDArray exp0 = exp(0, {0}); NDArray exp1 = exp(1, {0}); @@ -658,7 +658,7 @@ TEST_F(DeclarableOpsTests6, cumSum_17) { exp1.p(i, prev + i + 1); } - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -672,13 +672,13 @@ TEST_F(DeclarableOpsTests6, cumSum_17) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_18) { - NDArray x('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 
1500}, sd::DataType::FLOAT32); NDArray x0 = x(0, {0}); NDArray x1 = x(1, {0}); x0.linspace(1); x1.linspace(1); - NDArray exp('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 1500}, sd::DataType::FLOAT32); NDArray exp0 = exp(0, {0}); NDArray exp1 = exp(1, {0}); @@ -691,7 +691,7 @@ TEST_F(DeclarableOpsTests6, cumSum_18) { exp1.p(i, prev + i); } - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {1, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -705,13 +705,13 @@ TEST_F(DeclarableOpsTests6, cumSum_18) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_19) { - NDArray x('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 1500}, sd::DataType::FLOAT32); NDArray x0 = x(0, {0}); NDArray x1 = x(1, {0}); x0.linspace(1); x1.linspace(1); - NDArray exp('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 1500}, sd::DataType::FLOAT32); NDArray exp0 = exp(0, {0}); NDArray exp1 = exp(1, {0}); @@ -724,7 +724,7 @@ TEST_F(DeclarableOpsTests6, cumSum_19) { exp1.p(i, prev + i + 1); } - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {0, 1, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -739,13 +739,13 @@ TEST_F(DeclarableOpsTests6, cumSum_19) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, cumSum_20) { - NDArray x('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 1500}, sd::DataType::FLOAT32); NDArray x0 = x(0, {0}); NDArray x1 = x(1, {0}); x0.linspace(1); x1.linspace(1); - NDArray exp('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 1500}, sd::DataType::FLOAT32); NDArray exp0 = exp(0, {0}); NDArray exp1 = exp(1, {0}); @@ -758,7 +758,7 @@ TEST_F(DeclarableOpsTests6, cumSum_20) { exp1.p(i, prev + i + 2); } - nd4j::ops::cumsum op; + sd::ops::cumsum op; auto result = op.evaluate({&x}, {}, {1, 1, 1}); 
ASSERT_EQ(Status::OK(), result->status()); @@ -776,7 +776,7 @@ TEST_F(DeclarableOpsTests6, TestMergeMaxIndex_1) { auto y = NDArrayFactory::create('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f}); auto z = NDArrayFactory::create('c', {2, 2, 2}, {1.f, 20.f, 3.f, 40.f, 5.f, 60.f, 7.f, 80.f}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 1, 2, 1, 2, 1, 2}); - nd4j::ops::mergemaxindex op; + sd::ops::mergemaxindex op; auto ress = op.evaluate({&x, &y, &z}, {}, {}, {}); @@ -795,9 +795,9 @@ TEST_F(DeclarableOpsTests6, TestMergeMaxIndex_2) { auto y = NDArrayFactory::create('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f}); auto z = NDArrayFactory::create('c', {2, 2, 2}, {1.f, 20.f, 3.f, 40.f, 5.f, 60.f, 7.f, 80.f}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 1, 2, 1, 2, 1, 2}); - nd4j::ops::mergemaxindex op; + sd::ops::mergemaxindex op; - auto ress = op.evaluate({&x, &y, &z}, {}, {nd4j::DataType::INT64}); + auto ress = op.evaluate({&x, &y, &z}, {}, {sd::DataType::INT64}); ASSERT_EQ(ND4J_STATUS_OK, ress->status()); // ress->at(0)->printIndexedBuffer("MergeMaxIndex2 Result is "); @@ -812,7 +812,7 @@ TEST_F(DeclarableOpsTests6, TestDropout_1) { auto x = NDArrayFactory::create('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); auto shape = NDArrayFactory::create({2, 2}); - nd4j::ops::dropout op; + sd::ops::dropout op; auto ress = op.evaluate({&x, &shape}, {0.2f}, {113}); @@ -828,7 +828,7 @@ TEST_F(DeclarableOpsTests6, TestMod_1) { auto x = NDArrayFactory::create('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); auto y = NDArrayFactory::create('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {1, 0, 3, 0, 5, 0, 7, 0}); - nd4j::ops::mod op; + sd::ops::mod op; auto ress = op.evaluate({&x, &y}); @@ -846,7 +846,7 @@ TEST_F(DeclarableOpsTests6, TestMod_BP_1) { auto y = NDArrayFactory::create('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f}); 
auto eps = NDArrayFactory::create('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f}); auto exp = NDArrayFactory::create('c', {2, 2, 2}); - nd4j::ops::mod_bp op; + sd::ops::mod_bp op; auto ress = op.evaluate({&x, &y, &eps}); @@ -865,7 +865,7 @@ TEST_F(DeclarableOpsTests6, TestRank_1) { auto y = NDArrayFactory::create('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f}); auto eps = NDArrayFactory::create('c', {2, 2, 2}, {10.f, 2.f, 30.f, 4.f, 50.f, 6.f, 70.f, 8.f}); auto exp = NDArrayFactory::create(3); - nd4j::ops::rank op; + sd::ops::rank op; auto ress = op.evaluate({&x}); @@ -879,7 +879,7 @@ TEST_F(DeclarableOpsTests6, TestDropout_2) { // auto x1 = NDArrayFactory::create('c', {10, 10}); auto x = NDArrayFactory::create('c', {3, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f}); - nd4j::ops::dropout op; + sd::ops::dropout op; auto ress = op.evaluate({&x}, {0.4f}, {113}); @@ -894,7 +894,7 @@ TEST_F(DeclarableOpsTests6, TestDropout_3) { auto x = NDArrayFactory::create('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); auto shape = NDArrayFactory::create({1, 2}); - nd4j::ops::dropout op; + sd::ops::dropout op; auto ress = op.evaluate({&x, &shape}, {0.4f}, {113}); @@ -911,7 +911,7 @@ TEST_F(DeclarableOpsTests6, MaxPoolWithArgmax_1) { auto expI = NDArrayFactory::create('c', {2, 2, 2, 4}, {0, 1, 2, 3,4, 5, 6, 7,8, 9, 10, 11,12, 13, 14, 15, 0, 1, 2, 3,4, 5, 6, 7,8, 9, 10, 11,12, 13, 14, 15}); - nd4j::ops::max_pool_with_argmax op; + sd::ops::max_pool_with_argmax op; auto ress = op.evaluate({&x}, {}, {1,1,1,1,1,1,1,1,1}); @@ -940,7 +940,7 @@ TEST_F(DeclarableOpsTests6, SufficientStatistics_1) { auto axis = NDArrayFactory::create({0, 1, 2}); - nd4j::ops::sufficient_statistics op; + sd::ops::sufficient_statistics op; auto ress = op.evaluate({&x, &axis}); @@ -972,7 +972,7 @@ TEST_F(DeclarableOpsTests6, SufficientStatistics_2) { auto axis = NDArrayFactory::create({0, 1}); - nd4j::ops::sufficient_statistics op; + sd::ops::sufficient_statistics op; 
auto ress = op.evaluate({&x, &axis}); @@ -992,9 +992,9 @@ TEST_F(DeclarableOpsTests6, BinCount_1) { ); // ------------------------------------ - NDArray exp('c', {3}, {1, 3, 4}, nd4j::DataType::INT32); + NDArray exp('c', {3}, {1, 3, 4}, sd::DataType::INT32); - nd4j::ops::bincount op; + sd::ops::bincount op; auto res = op.evaluate({&x}); @@ -1019,7 +1019,7 @@ TEST_F(DeclarableOpsTests6, BinCount_2) { auto exp = NDArrayFactory::create({3., 4., 13.}); - nd4j::ops::bincount op; + sd::ops::bincount op; auto res = op.evaluate({&x, &weights}); @@ -1044,7 +1044,7 @@ TEST_F(DeclarableOpsTests6, BinCount_3) { auto exp = NDArrayFactory::create({3., 4.}); - nd4j::ops::bincount op; + sd::ops::bincount op; auto res = op.evaluate({&x, &weights}, {}, {0, 2}); @@ -1069,7 +1069,7 @@ TEST_F(DeclarableOpsTests6, BinCount_4) { auto exp = NDArrayFactory::create({3., 4., 13., 0.0}); - nd4j::ops::bincount op; + sd::ops::bincount op; auto res = op.evaluate({&x, &weights}, {}, {4, 4}); @@ -1095,7 +1095,7 @@ TEST_F(DeclarableOpsTests6, BinCount_5) { auto exp = NDArrayFactory::create({3., 4., 13., 0.0}); - nd4j::ops::bincount op; + sd::ops::bincount op; auto res = op.evaluate({&x, &weights, &minV, &maxV}); ASSERT_EQ(ND4J_STATUS_OK, res->status()); @@ -1114,7 +1114,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_1) { auto exp = NDArrayFactory::create({2, 2, 2}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto res = op.evaluate({&x, &y}); @@ -1133,7 +1133,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_2) { auto exp = NDArrayFactory::create({2, 2, 2}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto res = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, res->status()); @@ -1151,7 +1151,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_3) { auto exp = NDArrayFactory::create({2, 2, 2}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto res = op.evaluate({&x, &y}, {}, {}, 
{}); @@ -1170,7 +1170,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_SGO_4) { auto exp = NDArrayFactory::create({2, 4}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto res = op.evaluate({&x, &y}); @@ -1190,7 +1190,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_SGO_6) { auto exp = NDArrayFactory::create({2, 2, 4}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto res = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, res->status()); @@ -1208,7 +1208,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_SGO_7) { auto exp = NDArrayFactory::create({2, 4, 3}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto res = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, res->status()); @@ -1228,7 +1228,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_SGO_8) { auto exp = NDArrayFactory::create('c', {1}, {4}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1246,7 +1246,7 @@ TEST_F(DeclarableOpsTests6, BroadcastDynamicShape_SGO_9) { auto exp = NDArrayFactory::create('c', {2}, {2,2}); - nd4j::ops::broadcast_dynamic_shape op; + sd::ops::broadcast_dynamic_shape op; auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -1273,7 +1273,7 @@ TEST_F(DeclarableOpsTests6, ClipByGlobalNorm_1) { // 8.660254 // auto expNorm(8.660254); - nd4j::ops::clip_by_global_norm op; + sd::ops::clip_by_global_norm op; auto result = op.evaluate({&x}, {0.8}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1315,7 +1315,7 @@ TEST_F(DeclarableOpsTests6, ClipByGlobalNorm_2) { ); - nd4j::ops::clip_by_global_norm op; + sd::ops::clip_by_global_norm op; auto result = op.evaluate({&x, &a}, {1.8}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1345,7 +1345,7 @@ TEST_F(DeclarableOpsTests6, ClipByGlobalNorm_3) { 
0.2612789, 0., 0.} ); - nd4j::ops::clip_by_global_norm op; + sd::ops::clip_by_global_norm op; auto result = op.evaluate({&x, &a}, {0.8}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1371,7 +1371,7 @@ TEST_F(DeclarableOpsTests6, MatrixDeterminant_1) { auto x = NDArrayFactory::create('c', {2, 3, 3}, {-3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, -3.0, 4.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 0.0, 4.0}); auto exp = NDArrayFactory::create({36.0, -48.0}); - nd4j::ops::matrix_determinant op; + sd::ops::matrix_determinant op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1392,7 +1392,7 @@ TEST_F(DeclarableOpsTests6, MatrixDeterminant_2) { auto x = NDArrayFactory::create('c', {2, 2, 2}, {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}); auto exp = NDArrayFactory::create({-2.0, -2.0}); - nd4j::ops::matrix_determinant op; + sd::ops::matrix_determinant op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1413,7 +1413,7 @@ TEST_F(DeclarableOpsTests6, MatrixDeterminant_3) { auto x = NDArrayFactory::create('c', {1, 3, 3}, {3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 3.0}); NDArray exp('c', {1}, std::vector{-54.0}); - nd4j::ops::matrix_determinant op; + sd::ops::matrix_determinant op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1434,7 +1434,7 @@ TEST_F(DeclarableOpsTests6, MatrixDeterminant_4) { auto x = NDArrayFactory::create('c', {1, 3, 3}, {12.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 13.0}); auto exp = NDArrayFactory::create('c', {1}, {189.0}); - nd4j::ops::matrix_determinant op; + sd::ops::matrix_determinant op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1458,7 +1458,7 @@ TEST_F(DeclarableOpsTests6, MatrixDeterminant_5) { x.p(5, 4.0); x.p(12, 12.0); - nd4j::ops::matrix_determinant op; + sd::ops::matrix_determinant op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1482,7 
+1482,7 @@ TEST_F(DeclarableOpsTests6, MatrixDeterminant_6) { x.p(5, 4.0); x.p(12, 12.0); - nd4j::ops::matrix_determinant op; + sd::ops::matrix_determinant op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1504,7 +1504,7 @@ TEST_F(DeclarableOpsTests6, LogMatrixDeterminant_1) { auto x = NDArrayFactory::create('c', {2, 3, 3}, {-3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, -3.0, 4.0, 0.0, 0.0, 0.0, -3.0, 0.0, 0.0, 0.0, 4.0}); auto exp = NDArrayFactory::create({3.58351893845611, 3.871201010907891}); - nd4j::ops::log_matrix_determinant op; + sd::ops::log_matrix_determinant op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1523,7 +1523,7 @@ TEST_F(DeclarableOpsTests6, LogDet_1) { auto x = NDArrayFactory::create('c', {2, 3, 3}, {4,12,-16,12,37,-43,-16,-43,98, 4,1.2,-1.6,1.2,3.7,-4.3,-1.6,-4.3,9.8}); auto exp = NDArrayFactory::create({ 3.5835189, 4.159008}); - nd4j::ops::logdet op; + sd::ops::logdet op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1541,7 +1541,7 @@ TEST_F(DeclarableOpsTests6, LogDet_2) { auto x = NDArrayFactory::create('c', {1, 3, 3}, {4,12,-16,12,37,-43,-16,-43,98}); auto exp = NDArrayFactory::create('c', {1}, { 3.5835189}); - nd4j::ops::logdet op; + sd::ops::logdet op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1560,7 +1560,7 @@ TEST_F(DeclarableOpsTests6, LogDet_3) { auto x = NDArrayFactory::create('c', {3, 3}, {4,12,-16,12,37,-43,-16,-43,98}); auto exp = NDArrayFactory::create( 3.5835189); - nd4j::ops::logdet op; + sd::ops::logdet op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1604,7 +1604,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) { -27.0f, 0.0f, 1.0f, -2.0f, 1.f, }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1623,7 +1623,7 @@ 
TEST_F(DeclarableOpsTests6, MatrixInverse_010) { auto x = NDArrayFactory::create('c', {1, 5, 5}, {1.f, 0.f, 0.f, 0.f, 0.f, 2.f, 1.f, 0.f, 0.f, 0.f, 30.f, 2.f, 1.f, 0.f, 0.f, 4.f, 3.f, 2.f, 1.f, 0.f, 5.f, 4.f, 3.f, 2.f, 1.f, }); auto exp = NDArrayFactory::create('c', {1, 5, 5}, {1.0f, 0.0f, 0.0f, 0.0f, 0.f, -2.0f, 1.0f, 0.f, 0.f, 0.f, -26.0f, -2.0f, 1.f, 0.f, 0.f, 54.0f, 1.0f, -2.0f, 1.f, 0.f, -27.0f, 0.0f, 1.0f, -2.0f, 1.f}); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1642,7 +1642,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_01) { auto x = NDArrayFactory::create('c', {1, 5, 5}, {2.f, 4.f, 60.f, 8.f, 10.f, 0.f, 1.f, 2.f, 3.f, 4.f, 0.f, 0.f, 2.f, 4.f, 6.f, 0.f, 0.f, 0.f, 1.f, 2.f, 0.f, 0.f, 0.f, 0.f, 4.f }); auto exp = NDArrayFactory::create('c', {1, 5, 5}, {0.5f, -2.0f, -13.0f, 54.0f, -6.75f, 0.0f, 1.0f, -1.0f, 1.0f, 0.0f, 0.f, 0.f, 0.5f, -2.0f, 0.25f, 0.f, 0.f, 0.f, 1.0f, -0.5f, 0.f, 0.f, 0.f, 0.f, 0.25f }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1661,7 +1661,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_02) { auto x = NDArrayFactory::create('c', {1, 5, 5}, {1.f, 0.f, 0.f, 0.f, 0.f, 2.f, 1.f, 0.f, 0.f, 0.f, 30.f, 2.f, 1.f, 0.f, 0.f, 4.f, 3.f, 2.f, 1.f, 0.f, 5.f, 4.f, 3.f, 2.f, 1.f }); auto exp = NDArrayFactory::create('c', {1, 5, 5}, {1.0f, 0.0f, 0.0f, 0.0f, 0.f, -2.0f, 1.0f, 0.f, 0.f, 0.f, -26.0f, -2.0f, 1.f, 0.f, 0.f, 54.0f, 1.0f, -2.0f, 1.f, 0.f, -27.0f, 0.0f, 1.0f, -2.0f, 1.f }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1706,7 +1706,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_2) { -6.75, 0.0, 1.0, -1.0, 0.33333333 }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, 
result->status()); @@ -1739,7 +1739,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_03) { -6.75f, 0.0f, 1.0f, -1.0f, 0.33333333f }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1773,7 +1773,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_3) { -6.75f, 0.0f, 1.0f, -1.0f, 0.33333333f }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1807,7 +1807,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_4) { 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1841,7 +1841,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_04) { 0.0f, 0.0f, 0.0f, 0.0f, 1.0f }); - nd4j::ops::matrix_inverse op; + sd::ops::matrix_inverse op; auto result = op.evaluate({&x}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1869,7 +1869,7 @@ TEST_F(DeclarableOpsTests6, ReluLayer_1) { 23.8, 31.05, 56.5, 26.2, 31.65, 60.7}); - nd4j::ops::relu_layer op; + sd::ops::relu_layer op; auto result = op.evaluate({&x, &w, &b}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1922,7 +1922,7 @@ TEST_F(DeclarableOpsTests6, static_rnn_test1) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97732812, 0.97732812, 0.97732812, 0.97732812, 0.93751527, 0.93751527, 0.93751527, 0.93751527}); - nd4j::ops::static_rnn op; + sd::ops::static_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b, &h0, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1965,7 +1965,7 @@ TEST_F(DeclarableOpsTests6, static_rnn_test2) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.98000654, 0.98000654, 0.98000654, 0.98000654,0.98112648, 0.98112648, 0.98112648, 0.98112648}); - nd4j::ops::static_rnn op; + sd::ops::static_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b, &h0}, {}, 
{}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2008,7 +2008,7 @@ TEST_F(DeclarableOpsTests6, static_rnn_test3) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97732812, 0.97732812, 0.97732812, 0.97732812, 0.2 , 0.2 , 0.2 , 0.2}); - nd4j::ops::static_rnn op; + sd::ops::static_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b, &h0, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2050,7 +2050,7 @@ TEST_F(DeclarableOpsTests6, static_rnn_test4) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97688859, 0.97688859, 0.97688859, 0.97688859, 0.88400882, 0.88400882, 0.88400882, 0.88400882}); - nd4j::ops::static_rnn op; + sd::ops::static_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2092,7 +2092,7 @@ TEST_F(DeclarableOpsTests6, static_rnn_test5) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97997868, 0.97997868, 0.97997868, 0.97997868, 0.98110653, 0.98110653, 0.98110653, 0.98110653}); - nd4j::ops::static_rnn op; + sd::ops::static_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2143,7 +2143,7 @@ TEST_F(DeclarableOpsTests6, static_bidir_rnn_test1) { auto expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.9555734 , 0.9555734 , 0.9555734 , 0.77843476, 0.77843476, 0.77843476, 0.51241561, 0.51241561, 0.51241561, 0.2, 0.2, 0.2}); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.86708881, 0.86708881, 0.86708881, 0.78347842, 0.78347842, 0.78347842, 0.55529176, 0.55529176, 0.55529176, 0.25, 0.25, 0.25}); - nd4j::ops::static_bidirectional_rnn op; + sd::ops::static_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW, &h0FW, &h0BW, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2196,7 +2196,7 @@ TEST_F(DeclarableOpsTests6, static_bidir_rnn_test2) { auto 
expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.95177305, 0.95177305, 0.95177305, 0.66138054, 0.66138054, 0.66138054, 0.31492203, 0.31492203, 0.31492203, 0. , 0. , 0.}); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.86518273, 0.86518273, 0.86518273, 0.66617761, 0.66617761, 0.66617761, 0.31492203, 0.31492203, 0.31492203, 0. , 0. , 0.}); - nd4j::ops::static_bidirectional_rnn op; + sd::ops::static_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2249,7 +2249,7 @@ TEST_F(DeclarableOpsTests6, static_bidir_rnn_test3) { auto expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.97269956, 0.97269956, 0.97269956, 0.97557464, 0.97557464, 0.97557464, 0.97806922, 0.97806922, 0.97806922, 0.98026195, 0.98026195, 0.98026195}); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.86841012, 0.86841012, 0.86841012, 0.88207531, 0.88207531, 0.88207531, 0.8941667 , 0.8941667 , 0.8941667 , 0.90489713, 0.90489713, 0.90489713}); - nd4j::ops::static_bidirectional_rnn op; + sd::ops::static_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2295,7 +2295,7 @@ TEST_F(DeclarableOpsTests6, dynamic_rnn_test1) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97732812, 0.97732812, 0.97732812, 0.97732812, 0.93751527, 0.93751527, 0.93751527, 0.93751527}); - nd4j::ops::dynamic_rnn op; + sd::ops::dynamic_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b, &h0, &maxTimeStep}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2340,7 +2340,7 @@ TEST_F(DeclarableOpsTests6, dynamic_rnn_test2) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97309129, 0.97309129, 0.97309129, 0.97309129, 0.98120782, 0.98120782, 0.98120782, 0.98120782}); - nd4j::ops::dynamic_rnn op; + sd::ops::dynamic_rnn op; 
auto results = op.evaluate({&x, &Wx, &Wh, &b, &h0, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2382,7 +2382,7 @@ TEST_F(DeclarableOpsTests6, dynamic_rnn_test3) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97491207, 0.97491207, 0.97491207, 0.97491207, 0.98120782, 0.98120782, 0.98120782, 0.98120782}); - nd4j::ops::dynamic_rnn op; + sd::ops::dynamic_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b, &h0}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2423,7 +2423,7 @@ TEST_F(DeclarableOpsTests6, dynamic_rnn_test4) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.9724738 , 0.9724738 , 0.9724738 , 0.9724738 ,0.57368608, 0.57368608, 0.57368608, 0.57368608}); - nd4j::ops::dynamic_rnn op; + sd::ops::dynamic_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2464,7 +2464,7 @@ TEST_F(DeclarableOpsTests6, dynamic_rnn_test5) { auto expHFinal = NDArrayFactory::create('c', {bS, numUnits}, {0.97486307, 0.97486307, 0.97486307, 0.97486307,0.98119833, 0.98119833, 0.98119833, 0.98119833}); - nd4j::ops::dynamic_rnn op; + sd::ops::dynamic_rnn op; auto results = op.evaluate({&x, &Wx, &Wh, &b}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2520,7 +2520,7 @@ TEST_F(DeclarableOpsTests6, dynamic_bidir_rnn_test1) { auto expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.9555734 , 0.9555734 , 0.9555734 , 0.77843476, 0.77843476, 0.77843476, 0.51241561, 0.51241561, 0.51241561, 0.2 , 0.2 , 0.2}); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.86708881, 0.86708881, 0.86708881, 0.78347842, 0.78347842, 0.78347842, 0.55529176, 0.55529176, 0.55529176, 0.25 , 0.25 , 0.25}); - nd4j::ops::dynamic_bidirectional_rnn op; + sd::ops::dynamic_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW, &h0FW, &h0BW, &maxTimeStep}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, 
results->status()); @@ -2580,7 +2580,7 @@ TEST_F(DeclarableOpsTests6, dynamic_bidir_rnn_test2) { auto expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.87294706, 0.87294706, 0.87294706,0.84851124, 0.84851124, 0.84851124,0.73978305, 0.73978305, 0.73978305,0.2 , 0.2 , 0.2}); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.84345207, 0.84345207, 0.84345207, 0.85615841, 0.85615841, 0.85615841, 0.76576202, 0.76576202, 0.76576202, 0.25 , 0.25 , 0.25}); - nd4j::ops::dynamic_bidirectional_rnn op; + sd::ops::dynamic_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW, &h0FW, &h0BW, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2636,7 +2636,7 @@ TEST_F(DeclarableOpsTests6, dynamic_bidir_rnn_test3) { auto expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.84784327, 0.84784327, 0.84784327, 0.7793996 , 0.7793996 , 0.7793996 , 0.61067683, 0.61067683, 0.61067683, 0. , 0. , 0.}); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.82273707, 0.82273707, 0.82273707, 0.77843476, 0.77843476, 0.77843476, 0.61067683, 0.61067683, 0.61067683, 0. , 0. 
, 0.}); - nd4j::ops::dynamic_bidirectional_rnn op; + sd::ops::dynamic_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW, &maxTimeStep}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2695,7 +2695,7 @@ TEST_F(DeclarableOpsTests6, dynamic_bidir_rnn_test4) { auto expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.89948899, 0.89948899, 0.89948899, 0.94544483, 0.94544483, 0.94544483, 0.96797541, 0.96797541, 0.96797541, 0.9807326 , 0.9807326 , 0.9807326 }); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.85301722, 0.85301722, 0.85301722, 0.91888753, 0.91888753, 0.91888753, 0.95254269, 0.95254269, 0.95254269, 0.97154357, 0.97154357, 0.97154357}); - nd4j::ops::dynamic_bidirectional_rnn op; + sd::ops::dynamic_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW, &h0FW, &h0BW}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2748,7 +2748,7 @@ TEST_F(DeclarableOpsTests6, dynamic_bidir_rnn_test5) { auto expHFWfinal = NDArrayFactory::create('c', {bS, numUnitsFW}, {0.89357928, 0.89357928, 0.89357928, 0.94518339, 0.94518339, 0.94518339, 0.96795929, 0.96795929, 0.96795929, 0.98073144, 0.98073144, 0.98073144}); auto expHBWfinal = NDArrayFactory::create('c', {bS, numUnitsBW}, {0.84882345, 0.84882345, 0.84882345, 0.91865453, 0.91865453, 0.91865453, 0.95252666, 0.95252666, 0.95252666, 0.97154234, 0.97154234, 0.97154234}); - nd4j::ops::dynamic_bidirectional_rnn op; + sd::ops::dynamic_bidirectional_rnn op; auto results = op.evaluate({&x, &WxFW,&WhFW,&bFW, &WxFW,&WhFW,&bFW}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2775,7 +2775,7 @@ TEST_F(DeclarableOpsTests6, Test_Diag_119_1) { auto x = NDArrayFactory::create('c', {3}, {0.15f, 0.25f, 0.35f}); auto e = NDArrayFactory::create('c', {3, 3}, {0.15f, 0.0f, 0.0f, 0.0f, 0.25f, 0.0f, 0.0f, 0.0f, 0.35f}); - nd4j::ops::diag op; + sd::ops::diag op; auto result = op.evaluate({&x}, {}, {}); 
ASSERT_EQ(Status::OK(), result->status()); @@ -2788,7 +2788,7 @@ TEST_F(DeclarableOpsTests6, Test_Diag_119_2) { auto x = NDArrayFactory::create('c', {1}, {0.15f}); auto e = NDArrayFactory::create('c', {1, 1}, {0.15f}); - nd4j::ops::diag op; + sd::ops::diag op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2801,7 +2801,7 @@ TEST_F(DeclarableOpsTests6, Test_Diag_119_3) { auto x = NDArrayFactory::create(0.15f); auto e = NDArrayFactory::create('c', {1, 1}, {0.15f}); - nd4j::ops::diag op; + sd::ops::diag op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index 7a9bc1648..65ee84030 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -22,13 +22,13 @@ #include "testlayers.h" #include #include -#include +#include #include -#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class DeclarableOpsTests7 : public testing::Test { @@ -59,7 +59,7 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_SCALAR_LARGE) { }; auto x = NDArrayFactory::create(inputData,'c',{1,149}); - nd4j::ops::choose op; + sd::ops::choose op; //greater than test auto result = op.evaluate({&x}, {0.0},{3}); ASSERT_EQ(Status::OK(), result->status()); @@ -82,7 +82,7 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_SCALAR_ZERO) { auto x = NDArrayFactory::create('c',{1,4},data); - nd4j::ops::choose op; + sd::ops::choose op; //greater than test auto result = op.evaluate({&x}, {0.0},{3}); ASSERT_EQ(Status::OK(), result->status()); @@ -107,7 +107,7 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_SCALAR) { auto x = NDArrayFactory::create('c',{1,4},data); auto scalar = NDArrayFactory::create('c',{1,1},{0.0}); - nd4j::ops::choose op; + sd::ops::choose op; //greater than test auto result = 
op.evaluate({&x,&scalar}, {1.0},{3}); ASSERT_EQ(Status::OK(), result->status()); @@ -131,7 +131,7 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_SCALAR_LEFT) { auto x = NDArrayFactory::create('c',{1,4},data); auto scalar = NDArrayFactory::create('c',{1,1},{0.0}); - nd4j::ops::choose op; + sd::ops::choose op; //greater than test auto result = op.evaluate({&scalar,&x}, {1.0},{3}); ASSERT_EQ(Status::OK(), result->status()); @@ -154,7 +154,7 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_ONLY_SCALAR) { auto x = NDArrayFactory::create('c',{1,4},data); - nd4j::ops::choose op; + sd::ops::choose op; //greater than test auto result = op.evaluate({&x}, {1.0},{3}); ASSERT_EQ(Status::OK(), result->status()); @@ -177,7 +177,7 @@ TEST_F(DeclarableOpsTests7, Test_CHOOSE_ONLY_SCALAR_GTE) { auto x = NDArrayFactory::create('c',{1,4},data); - nd4j::ops::choose op; + sd::ops::choose op; //greater than test auto result = op.evaluate({&x}, {1.0},{5}); ASSERT_EQ(Status::OK(), result->status()); @@ -219,7 +219,7 @@ TEST_F(DeclarableOpsTests7, TEST_WHERE) { auto maskArr = NDArrayFactory::create('c',{1,4},mask); auto putArr = NDArrayFactory::create('c',{1,4},put); auto resultArr = NDArrayFactory::create('c',{1,4},resultData); - nd4j::ops::where_np op; + sd::ops::where_np op; //greater than test // Nd4jStatus execute(std::initializer_list*> inputs, std::initializer_list*> outputs , std::initializer_list tArgs, std::initializer_list iArgs, bool isInplace = false); @@ -242,19 +242,19 @@ TEST_F(DeclarableOpsTests7, TEST_WHERE_MASK) { double assertion[300] = 
{1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00
,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,1.000000000000000000e+00,9.966611049434810354e-01,9.867111603284486332e-01,9.768605487739230320e-01,9.671082786103732953e-01,9.574533680683808834e-01,9.478948451798039354e-01,9.384317476799283186e-01,9.290631229105962285e-01,9.197880277243004610e-01,9.106055283892373620e-01,9.015147004953073528e-01,8.925146288610534828e-01,8.836044074415293492e-01,8.747831392370875037e-01,8.660499362030764647e-01,8.574039191604412302e-01,8.488442177072155204e-01,8.403699701308978698e-01,8.319803233217017979e-01,8.236744326866727306e-01,8.154514620646623468e-01,8.073105836421510251e-01,7.992509778699116163e-01,7.912718333805045523e-01,7.833723469065965173e-01,7.755517232000953554e-01,7.678091749520912224e-01,7.601439227135980969e-01,7.525551948170853267e-01,7.450422272987937689e-01,7.376042638218265335e-01,7.302405556000080011e-01,7.229503613225031211e-01,7.157329470791886639e-01,7.085875862867698771e-01,7.015135596156351072e-01,6.945101549174396149e-01,6.875766671534137009e-01,6.807123983233853703e-01,6.739166573955123196e-01,6.671887602367149173e-01,6.605280295438040739e-01,6.539337947752965619e-01,6.474053920839111242e-01,6.409421642497381555e-01,6.345434606140767375e-01,6.282086370139332576e-01,6.219370557171712832e-01,6.157280853583116942e-01,6.095811008749726367e-01,6.034954834449430816e-01,5.974706204238864338e-01,5.915059052836644238e-01,5.856007375512777280e-01,5.797545227484157682e-01,5.739666723316099173e-01,5.682366036329845604e-01,5.625637398015992385e-01,5.569475097453767676e-01,5.513873480736106725e-01
,5.458826950400470501e-01,5.404329964865340896e-01,5.350377037872348085e-01,5.296962737933965659e-01,5.244081687786711354e-01,5.191728563849821176e-01,5.139898095689314772e-01,5.088585065487419845e-01,5.037784307517284565e-01,4.987490707622945774e-01,4.937699202704479151e-01,4.888404780208293054e-01,4.839602477622509946e-01,4.791287381977387683e-01,4.743454629350723484e-01,4.696099404378203390e-01,4.649216939768630041e-01,4.602802515824001017e-01,4.556851459964368911e-01,4.511359146257447605e-01,4.466320994952920342e-01,4.421732472021388527e-01,4.377589088697927955e-01,4.333886401030203062e-01,4.290620009431086457e-01,4.247785558235752101e-01,4.205378735263185508e-01,4.163395271382073215e-01,4.121830940081024908e-01,4.080681557043087104e-01,4.039942979724505667e-01,3.999611106937689398e-01,3.959681878438343627e-01,3.920151274516718853e-01,3.881015315592946102e-01,3.842270061816405180e-01,3.803911612669100828e-01,3.765936106572991271e-01,3.728339720501240850e-01,3.691118669593352886e-01,3.654269206774144463e-01,3.617787622376523182e-01,3.581670243768036999e-01,3.545913434981138868e-01,3.510513596347161203e-01,3.475467164133922426e-01,3.440770610186974499e-01,3.406420441574410929e-01,3.372413200235238606e-01,3.338745462631242389e-01,3.305413839402346898e-01,3.272414975025391692e-01,3.239745547476344245e-01,3.207402267895853032e-01,3.175381880258169032e-01,3.143681161043347383e-01,3.112296918912743071e-01,3.081225994387726264e-01,3.050465259531625062e-01,3.020011617634821843e-01,2.989862002903017069e-01,2.960013380148582840e-01,2.930462744485015647e-01,2.901207121024425017e-01,2.872243564578055852e-01,2.843569159359789489e-01,2.815181018692606840e-01,2.787076284717992514e-01,2.759252128108221624e-01,2.731705747781537075e-01,2.704434370620155681e-01,2.677435251191103149e-01,2.650705671469821278e-01,2.624242940566549609e-01,2.598044394455423789e-01,2.572107395706292876e-01,2.546429333219200064e-01,2.521007621961529055e-01,2.495839702707757235e-01,2.470923041781825646e-01
,2.446255130802063582e-01,2.421833486428674187e-01,2.397655650113727777e-01,2.373719187853666479e-01,2.350021689944260528e-01,2.326560770738031469e-01,2.303334068404078172e-01,2.280339244690317291e-01,2.257573984688081292e-01,2.235035996599082919e-01,2.212723011504689752e-01,2.190632783137518302e-01,2.168763087655291855e-01,2.147111723416972873e-01,2.125676510761114746e-01,2.104455291786438698e-01,2.083445930134591173e-01,2.062646310775079761e-01,2.042054339792348794e-01,2.021667944174980747e-01,2.001485071607009836e-01,1.981503690261307848e-01,1.961721788595043592e-01,1.942137375147174327e-01,1.922748478337968081e-01,1.903553146270518526e-01,1.884549446534251604e-01,1.865735466010380594e-01,1.847109310679319050e-01,1.828669105430000552e-01,1.810412993871116094e-01,1.792339138144224131e-01,1.774445718738737465e-01,1.756730934308744496e-01,1.739193001491673995e-01,1.721830154728755669e-01,1.704640646087285105e-01,1.687622745084652875e-01,1.670774738514141378e-01,1.654094930272448083e-01,1.637581641188943782e-01,1.621233208856623365e-01,1.605047987464754966e-01,1.589024347633189727e-01,1.573160676248336609e-01,1.557455376300762306e-01,1.541906866724424563e-01,1.526513582237501165e-01,1.511273973184814046e-01,1.496186505381822129e-01,1.481249659960175158e-01,1.466461933214808777e-01,1.451821836452561187e-01,1.437327895842310799e-01,1.422978652266598532e-01,1.408772661174743090e-01,1.394708492437411185e-01,1.380784730202649913e-01,1.366999972753347725e-01,1.353352832366127023e-01}; Nd4jLong threeHundredShapePointer[8] = {2,1,300,1,1,0,1,99}; Nd4jLong twoHundredShapePointer[8] = {2,1,200,1,1,0,1,99}; - nd4j::ops::where_np op; - ArrayOptions::setDataType(threeHundredShapePointer, nd4j::DataType::DOUBLE); - ArrayOptions::setDataType(twoHundredShapePointer, nd4j::DataType::DOUBLE); + sd::ops::where_np op; + ArrayOptions::setDataType(threeHundredShapePointer, sd::DataType::DOUBLE); + ArrayOptions::setDataType(twoHundredShapePointer, sd::DataType::DOUBLE); NDArray 
xArr(x,threeHundredShapePointer); NDArray putArr(put,twoHundredShapePointer); NDArray resultArr(z,threeHundredShapePointer); resultArr.assign(0.0); - ArrayOptions::setDataType(threeHundredShapePointer, nd4j::DataType::BOOL); + ArrayOptions::setDataType(threeHundredShapePointer, sd::DataType::BOOL); NDArray maskArr(mask,threeHundredShapePointer); - ArrayOptions::setDataType(threeHundredShapePointer, nd4j::DataType::DOUBLE); + ArrayOptions::setDataType(threeHundredShapePointer, sd::DataType::DOUBLE); NDArray assertArr(assertion, threeHundredShapePointer); Nd4jStatus result = op.execute({&maskArr, &xArr, &putArr},{&resultArr},{},{},{}); ASSERT_EQ(Status::OK(),result); @@ -290,7 +290,7 @@ TEST_F(DeclarableOpsTests7, TEST_WHERE_SCALAR) { auto maskArr = NDArrayFactory::create('c',{1,4},mask); auto putArr = NDArrayFactory::create('c',{1,1},put); auto resultArr = NDArrayFactory::create('c',{1,4},resultData); - nd4j::ops::where_np op; + sd::ops::where_np op; //greater than test // Nd4jStatus execute(std::initializer_list*> inputs, std::initializer_list*> outputs , std::initializer_list tArgs, std::initializer_list iArgs, bool isInplace = false); @@ -312,7 +312,7 @@ TEST_F(DeclarableOpsTests7, TestMatrixDiagPart_1) { auto z = NDArrayFactory::create('c', {2, 4}, {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}); - nd4j::ops::matrix_diag_part op; + sd::ops::matrix_diag_part op; auto result = op.evaluate({&x}, {}, {}); @@ -328,7 +328,7 @@ TEST_F(DeclarableOpsTests7, TestMatrixDiagPart_2) { auto z = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 5, 6, 7}); - nd4j::ops::matrix_diag_part op; + sd::ops::matrix_diag_part op; auto result = op.evaluate({&x}, {}, {}); @@ -344,7 +344,7 @@ TEST_F(DeclarableOpsTests7, TestMatrixDiag_1) { auto x = NDArrayFactory::create('c', {2, 4}, {1, 2, 3, 4, 5, 6, 7, 8}); - nd4j::ops::matrix_diag op; + sd::ops::matrix_diag op; auto result = op.evaluate({&x}, {}, {}); @@ -359,7 +359,7 @@ TEST_F(DeclarableOpsTests7, TestMatrixDiag_2) { auto z = 
NDArrayFactory::create('c', {2, 3, 3}, {1., 0., 0., 0., 2., 0., 0., 0., 3.,5., 0., 0., 0., 6., 0.,0., 0., 7.}); auto x = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 5, 6, 7}); - nd4j::ops::matrix_diag op; + sd::ops::matrix_diag op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -373,7 +373,7 @@ TEST_F(DeclarableOpsTests7, TestMatrixDiag_2) { TEST_F(DeclarableOpsTests7, TestRandomCrop_1) { auto x = NDArrayFactory::create('c', {2, 2, 4}, {1.8, 2.5, 4., 9., 2.1, 2.4, 3., 9.,2.1, 2.1, 0.7, 0.1,3., 4.2, 2.2, 1. }); auto shape = NDArrayFactory::create({1, 2, 3}); - nd4j::ops::random_crop op; + sd::ops::random_crop op; auto result = op.evaluate({&x, &shape}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -387,7 +387,7 @@ TEST_F(DeclarableOpsTests7, TestRandomCrop_1) { TEST_F(DeclarableOpsTests7, TestRandomCrop_2) { auto x = NDArrayFactory::create('c', {2, 2, 4}, {1.8, 2.5, 4., 9., 2.1, 2.4, 3., 9.,2.1, 2.1, 0.7, 0.1,3., 4.2, 2.2, 1. }); auto shape = NDArrayFactory::create({2, 2, 2}); - nd4j::ops::random_crop op; + sd::ops::random_crop op; auto result = op.evaluate({&x, &shape}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -425,7 +425,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119) { 89.f, 90.f, 91.f, 92.f,93.f, 94.f, 95.f, 96.f,97.f, 98.f, 99.f, 100.f,41.f, 42.f, 43.f, 44.f,45.f, 46.f, 47.f, 48.f,49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f,57.f, 58.f, 59.f, 60.f,21.f, 22.f, 23.f, 24.f,25.f, 26.f, 27.f, 28.f,29.f, 30.f, 31.f, 32.f,33.f, 34.f, 35.f, 36.f,37.f, 38.f, 39.f, 40.f}); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&indices0, &indices1, &indices2, &data0, &data1, &data2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); // result->at(0)->printIndexedBuffer("Output"); @@ -463,7 +463,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_Prof_1) { 89.f, 90.f, 91.f, 92.f,93.f, 94.f, 95.f, 96.f,97.f, 98.f, 99.f, 100.f,41.f, 42.f, 43.f, 44.f,45.f, 46.f, 
47.f, 48.f,49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f,57.f, 58.f, 59.f, 60.f,21.f, 22.f, 23.f, 24.f,25.f, 26.f, 27.f, 28.f,29.f, 30.f, 31.f, 32.f,33.f, 34.f, 35.f, 36.f,37.f, 38.f, 39.f, 40.f}); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&indices0, &indices1, &indices2, &data0, &data1, &data2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); // result->at(0)->printIndexedBuffer("Output"); @@ -566,7 +566,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_1) { data0.linspace(1); data1.linspace(21); data2.linspace(141); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&indices0, &indices1, &indices2, &data0, &data1, &data2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -657,7 +657,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Stitch_119_2) { data0.linspace(1); data1.linspace(41); data2.linspace(161); - nd4j::ops::dynamic_stitch op; + sd::ops::dynamic_stitch op; auto result = op.evaluate({&indices0, &indices1, &indices2, &data0, &data1, &data2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -675,7 +675,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Partition_119) { auto e = NDArrayFactory::create('c', {5, 11}); x.assign(1.f); e.assign(1.f); - nd4j::ops::dynamic_partition op; + sd::ops::dynamic_partition op; auto result = op.evaluate({&x, &y}, {}, {4}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(4, result->size()); @@ -694,7 +694,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Partition_119_1) { // x.assign(1.f); // e.assign(1.f); - nd4j::ops::dynamic_partition op; + sd::ops::dynamic_partition op; auto result = op.evaluate({&x, &y}, {}, {3}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(3, result->size()); @@ -737,7 +737,7 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Partition_119_2) { std::vector e({&e1, &e2, &e3, &e4}); x.linspace(1.f); //.assign(1.f); - nd4j::ops::dynamic_partition op; + sd::ops::dynamic_partition op; auto result = 
op.evaluate({&x, &y}, {}, {4}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(4, result->size()); @@ -767,7 +767,7 @@ TEST_F(DeclarableOpsTests7, Test_SequenceMask_1) { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }); - nd4j::ops::sequence_mask op; + sd::ops::sequence_mask op; auto result = op.evaluate({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -789,7 +789,7 @@ TEST_F(DeclarableOpsTests7, Test_SequenceMask_2) { 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - nd4j::ops::sequence_mask op; + sd::ops::sequence_mask op; auto result = op.evaluate({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -810,8 +810,8 @@ TEST_F(DeclarableOpsTests7, Test_SequenceMask_3) { 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - nd4j::ops::sequence_mask op; - auto result = op.evaluate({&input}, {nd4j::DataType::INT32}); + sd::ops::sequence_mask op; + auto result = op.evaluate({&input}, {sd::DataType::INT32}); ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); @@ -830,8 +830,8 @@ TEST_F(DeclarableOpsTests7, Test_SequenceMask_4) { 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 1.f, 0.f, 0.f, 1.f, 1.f, 0.f, 0.f, 0.f }); - nd4j::ops::sequence_mask op; - auto result = op.evaluate({&input, &maxLen}, {nd4j::DataType::FLOAT32}); + sd::ops::sequence_mask op; + auto result = op.evaluate({&input, &maxLen}, {sd::DataType::FLOAT32}); ASSERT_EQ(Status::OK(), 
result->status()); auto z = result->at(0); @@ -849,8 +849,8 @@ TEST_F(DeclarableOpsTests7, Test_SequenceMask_5) { 1.f, 0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 1.f, 0.f, 0.f, 1.f, 1.f, 0.f, 0.f, 0.f }); - nd4j::ops::sequence_mask op; - auto result = op.evaluate({&input}, {5, (int)nd4j::DataType::FLOAT32}); + sd::ops::sequence_mask op; + auto result = op.evaluate({&input}, {5, (int)sd::DataType::FLOAT32}); ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); @@ -868,7 +868,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMax_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({2.5, 9, 3, 9, 4.2}); - nd4j::ops::segment_max op; + sd::ops::segment_max op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -884,7 +884,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMax_01) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5,5, 5}); auto exp = NDArrayFactory::create({2.5, 9, 3, 9, 4.2, 40}); - nd4j::ops::segment_max op; + sd::ops::segment_max op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -900,7 +900,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMaxBP_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({0., 1., 0., 2., 0., 0., 3., 4., 0., 0.,0., 0., 0., 5., 0.,0.}); auto eps = NDArrayFactory::create('c', {5}); - nd4j::ops::segment_max_bp op; + sd::ops::segment_max_bp op; eps.linspace(1); auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -923,7 +923,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMax_2) { 3, 4.2, 2.2, 1.}); //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::segment_max op; + sd::ops::segment_max op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -947,7 +947,7 @@ 
TEST_F(DeclarableOpsTests7, TestSegmentMaxBP_2) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::segment_max_bp op; + sd::ops::segment_max_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -975,7 +975,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMax_3) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::segment_max op; + sd::ops::segment_max op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1003,7 +1003,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMax_4) { 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. , 119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. }); - nd4j::ops::segment_max op; + sd::ops::segment_max op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1023,7 +1023,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMax_1) { auto idx = NDArrayFactory::create({4, 4, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 0, 0}); auto exp = NDArrayFactory::create({2.2, 9., 3., 9., 4.2}); - nd4j::ops::unsorted_segment_max op; + sd::ops::unsorted_segment_max op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1038,7 +1038,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMaxBP_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({0., 1., 0., 2., 0., 0., 3., 4., 0., 0.,0., 0., 0., 5., 0.,0.}); auto eps = NDArrayFactory::create('c', {5}); - nd4j::ops::segment_max_bp op; + sd::ops::segment_max_bp op; eps.linspace(1); auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1053,7 +1053,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMaxBP_2) { auto idx = NDArrayFactory::create({2, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = 
NDArrayFactory::create({3., 0., 1., 0., 2., 0., 0., 4., 0., 0.,0., 0., 0., 5., 0.,0.}); auto eps = NDArrayFactory::create('c', {5}); - nd4j::ops::segment_max_bp op; + sd::ops::segment_max_bp op; eps.linspace(1); auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1070,7 +1070,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMax_2) { auto idx = NDArrayFactory::create({4, 4, 1, 1, 1, 1, 3, 3, 3, 3, 4, 4, 4, 4, 0, 0}); auto exp = NDArrayFactory::create({2.2, 9., -DataTypeUtils::max(), 9., 4.2}); - nd4j::ops::unsorted_segment_max op; + sd::ops::unsorted_segment_max op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1088,7 +1088,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMax_3) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::unsorted_segment_max op; + sd::ops::unsorted_segment_max op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1111,7 +1111,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMax_4) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::unsorted_segment_max op; + sd::ops::unsorted_segment_max op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1128,7 +1128,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({1.8, 2.1, 3., 2.1, 0.1}); - nd4j::ops::segment_min op; + sd::ops::segment_min op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1145,7 +1145,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_01) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({-2.5, -9, -3., -9, -4.2}); - nd4j::ops::segment_min op; + sd::ops::segment_min op; auto result = op.evaluate({&x, &idx}, {}, 
{}); ASSERT_EQ(result->status(), Status::OK()); @@ -1161,7 +1161,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_02) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({-2.5f, -9.f, -3.f, -9.f, -4.2f}); - nd4j::ops::segment_min op; + sd::ops::segment_min op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1179,7 +1179,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMinBP_1) { auto exp = NDArrayFactory::create({ 1., 0., 0., 0., 2., 0., 3., 0., 4., 4., 0., 5., 0., 0., 0., 0.}); auto eps = NDArrayFactory::create('c', {5}); eps.linspace(1); - nd4j::ops::segment_min_bp op; + sd::ops::segment_min_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1196,7 +1196,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMinBP_1) { auto exp = NDArrayFactory::create({ 1., 0., 0., 0., 2., 0., 3., 0., 4., 4., 0., 5., 0., 0., 0., 0.}); auto eps = NDArrayFactory::create('c', {5}); eps.linspace(1); - nd4j::ops::unsorted_segment_min_bp op; + sd::ops::unsorted_segment_min_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1215,7 +1215,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMinBP_2) { auto exp = NDArrayFactory::create({3., 1., 0., 0., 0., 2., 0., 0., 4., 4., 0., 5., 0., 0., 0., 0.}); auto eps = NDArrayFactory::create('c', {5}); eps.linspace(1); - nd4j::ops::unsorted_segment_min_bp op; + sd::ops::unsorted_segment_min_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1235,7 +1235,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_2) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::segment_min op; + sd::ops::segment_min op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1256,7 +1256,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMinBP_2) 
{ //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::segment_min_bp op; + sd::ops::segment_min_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1281,7 +1281,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_3) { auto exp = NDArrayFactory::create('c', {3, 4, 4}, {91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. ,31. , 22. , 67. , 24. , 15.1, 46.4, 73. , 28. ,109.1, 12.1, 12.7, 13.1,14. , 14.2, 16.2, 11. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. }); - nd4j::ops::segment_min op; + sd::ops::segment_min op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1311,7 +1311,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMin_4) { 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. 
}); - nd4j::ops::segment_min op; + sd::ops::segment_min op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1331,7 +1331,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMin_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({1.8, 2.1, 3., 2.1, 0.1}); - nd4j::ops::unsorted_segment_min op; + sd::ops::unsorted_segment_min op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1345,7 +1345,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMin_01) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({1.8, 2.1, 3., 2.1, 0.1}); - nd4j::ops::unsorted_segment_min op; + sd::ops::unsorted_segment_min op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1362,7 +1362,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMin_2) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::unsorted_segment_min op; + sd::ops::unsorted_segment_min op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1386,7 +1386,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMin_3) { auto exp = NDArrayFactory::create('c', {3, 4, 4}, {91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. ,31. , 22. , 67. , 24. , 15.1, 46.4, 73. , 28. ,109.1, 12.1, 12.7, 13.1,14. , 14.2, 16.2, 11. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. }); - nd4j::ops::unsorted_segment_min op; + sd::ops::unsorted_segment_min op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1427,7 +1427,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMin_4) { principalMax, principalMax, principalMax, principalMax, principalMax, principalMax, 91., 82. , 37. , 64. 
,55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. }); - nd4j::ops::unsorted_segment_min op; + sd::ops::unsorted_segment_min op; auto result = op.evaluate({&x, &idx}, {}, {8}); ASSERT_EQ(result->status(), Status::OK()); @@ -1447,7 +1447,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({2.15, 4.375, 3., 4.4, 1.8666667}); - nd4j::ops::segment_mean op; + sd::ops::segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1461,7 +1461,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_2) { auto idx = NDArrayFactory::create({0, 0, 1, 2}); auto exp = NDArrayFactory::create('c', {3, 4}, { 1.95, 2.45, 3.5, 9., 2.1, 2.1, 0.7, 0.1, 3. , 4.2, 2.2, 1.}); - nd4j::ops::segment_mean op; + sd::ops::segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1479,7 +1479,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_02) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 2,2}); auto exp = NDArrayFactory::create('c', {3, 3}, { 2.5, 3.5, 4.5, 8.5, 9.5, 10.5, 14.5, 15.5, 16.5}); - nd4j::ops::segment_mean op; + sd::ops::segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1494,7 +1494,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_021) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 2,2}); auto exp = NDArrayFactory::create('c', {3, 3}, { 2.5f, 3.5f, 4.5f, 8.5f, 9.5f, 10.5f, 14.5f, 15.5f, 16.5f}); - nd4j::ops::segment_mean op; + sd::ops::segment_mean op; x.linspace(1.); auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1510,7 +1510,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_022) { auto z = NDArrayFactory::create('c', {3, 3}); //, { 2.5, 3.5, 4.5, 8.5, 9.5, 10.5, 14.5, 15.5, 16.5}); auto exp = NDArrayFactory::create('c', {3, 3}, { 2.5f, 
3.5f, 4.5f, 8.5f, 9.5f, 10.5f, 14.5f, 15.5f, 16.5f}); - nd4j::ops::segment_mean op; + sd::ops::segment_mean op; x.linspace(1.); auto result = op.execute({&x, &idx}, {&z}); ASSERT_EQ(result, Status::OK()); @@ -1528,7 +1528,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) { auto exp = NDArrayFactory::create('c', {4, 4}, { 0.5, 1., 1.5, 2., 0.5, 1., 1.5, 2., 5., 6., 7., 8., 9., 10., 11., 12.}); eps.linspace(1); - nd4j::ops::segment_mean_bp op; + sd::ops::segment_mean_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1553,7 +1553,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_3) { 41. , 32. , 77. , 34. ,35.1 , 51.4 , 83. , 28. ,114.1 , 47.1 , 62.7, 63.1,64. , 64.2 , 66.2 , 64. , 91. , 82. , 37. , 64. ,55.1 , 46.4 , 73. , 28. ,119.1 , 12.1 , 112.7 , 13.1,14. , 114.2 , 16.2 , 117. }); - nd4j::ops::segment_mean op; + sd::ops::segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1584,7 +1584,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_4) { 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. , 119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. 
}); - nd4j::ops::segment_mean op; + sd::ops::segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1604,7 +1604,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMean_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({2.15, 4.375, 3., 4.4, 1.8666667}); - nd4j::ops::unsorted_segment_mean op; + sd::ops::unsorted_segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1620,7 +1620,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_1) { auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({1./2., 1./2., 2./4., 2./4., 2./4., 2./4, 3., 4./3., 4./3., 4./3., 5./6., 5./6., 5./6., 5./6., 5./6., 5./6.}); - nd4j::ops::segment_mean_bp op; + sd::ops::segment_mean_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1636,7 +1636,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMeanBP_1) { auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({1./2., 1./2., 2./4., 2./4., 2./4., 2./4, 3., 4./3., 4./3., 4./3., 5./6., 5./6., 5./6., 5./6., 5./6., 5./6.}); - nd4j::ops::unsorted_segment_mean_bp op; + sd::ops::unsorted_segment_mean_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1652,7 +1652,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMeanBP_2) { auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({3., 1./2., 1./2., 2./4., 2./4., 2./4., 2./4, 4./3., 4./3., 4./3., 5./6., 5./6., 5./6., 5./6., 5./6., 5./6.}); - nd4j::ops::unsorted_segment_mean_bp op; + sd::ops::unsorted_segment_mean_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1667,7 +1667,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMean_2) { auto idx 
= NDArrayFactory::create({0, 0, 1, 2}); auto exp = NDArrayFactory::create('c', {3, 4}, { 1.95, 2.45, 3.5, 9., 2.1, 2.1, 0.7, 0.1, 3. , 4.2, 2.2, 1.}); - nd4j::ops::unsorted_segment_mean op; + sd::ops::unsorted_segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1695,7 +1695,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMean_3) { 41. , 32. , 77. , 34. ,35.1 , 51.4 , 83. , 28. ,114.1 , 47.1 , 62.7, 63.1,64. , 64.2 , 66.2 , 64. , 91. , 82. , 37. , 64. ,55.1 , 46.4 , 73. , 28. ,119.1 , 12.1 , 112.7 , 13.1,14. , 114.2 , 16.2 , 117. }); - nd4j::ops::unsorted_segment_mean op; + sd::ops::unsorted_segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1726,7 +1726,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentMean_4) { 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. , 119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. 
}); - nd4j::ops::unsorted_segment_mean op; + sd::ops::unsorted_segment_mean op; auto result = op.evaluate({&x, &idx}, {}, {8}); ASSERT_EQ(result->status(), Status::OK()); @@ -1746,7 +1746,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSqrtN_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({3.0405593, 8.75, 3., 7.621024, 4.5723805}); - nd4j::ops::unsorted_segment_sqrt_n op; + sd::ops::unsorted_segment_sqrt_n op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1762,7 +1762,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSqrtN_BP_1) { auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); // NDArray exp({3.0405593, 8.75, 3., 7.621024, 4.5723805}); auto exp = NDArrayFactory::create({3., 0.707107, 0.707107, 1., 1., 1., 1., 2.309401, 2.309401, 2.309401, 2.041241, 2.041241, 2.041241, 2.041241, 2.041241, 2.041241}); - nd4j::ops::unsorted_segment_sqrt_n_bp op; + sd::ops::unsorted_segment_sqrt_n_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1781,7 +1781,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSqrtN_2) { 3. , 4.2, 2.2, 1. }); - nd4j::ops::unsorted_segment_sqrt_n op; + sd::ops::unsorted_segment_sqrt_n op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1809,7 +1809,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSqrtN_3) { 57.982758, 45.254833, 108.89445, 48.083263, 49.638893, 72.69058, 117.37973, 39.59798, 161.36177, 66.60946, 88.67119, 89.23688, 90.50967, 90.79251, 93.62093, 90.50967, 91. , 82. , 37. , 64. ,55.1 , 46.4 , 73. , 28. ,119.1 , 12.1 , 112.7 , 13.1,14. , 114.2 , 16.2 , 117. 
}); - nd4j::ops::unsorted_segment_sqrt_n op; + sd::ops::unsorted_segment_sqrt_n op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -1840,7 +1840,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSqrtN_4) { 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. , 119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. }); - nd4j::ops::unsorted_segment_sqrt_n op; + sd::ops::unsorted_segment_sqrt_n op; auto result = op.evaluate({&x, &idx}, {}, {8}); ASSERT_EQ(result->status(), Status::OK()); @@ -1860,7 +1860,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSqrtN_5) { auto idx = NDArrayFactory::create({3, 1, 0, 0, 2, 0, 3, 2}); //NDArray exp({1.7320508075688772, 1., 1.4142135623730951, 1.4142135623730951}); auto exp = NDArrayFactory::create({7.5055537, 2., 4.9497476, 2.828427}); - nd4j::ops::unsorted_segment_sqrt_n op; + sd::ops::unsorted_segment_sqrt_n op; auto result = op.evaluate({&x, &idx}, {}, {4}); ASSERT_EQ(result->status(), Status::OK()); @@ -1877,7 +1877,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentSum_1) { auto idx = NDArrayFactory::create({ 0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({4.3, 17.5, 3., 13.2, 11.2}); - nd4j::ops::segment_sum op; + sd::ops::segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1893,7 +1893,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentSumBP_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({ 1., 1., 2., 2., 2., 2., 3., 4., 4., 4., 5., 5., 5., 5., 5., 5.}); - nd4j::ops::segment_sum_bp op; + sd::ops::segment_sum_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1907,7 +1907,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSumBP_1) { auto idx = 
NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto eps = NDArrayFactory::create({1, 2, 3, 4, 5}); auto exp = NDArrayFactory::create({ 1., 1., 2., 2., 2., 2., 3., 4., 4., 4., 5., 5., 5., 5., 5., 5.}); - nd4j::ops::unsorted_segment_sum_bp op; + sd::ops::unsorted_segment_sum_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1921,7 +1921,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSumBP_2) { auto idx = NDArrayFactory::create({2, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({ 3., 1., 1., 2., 2., 2., 2., 4., 4., 4., 5., 5., 5., 5., 5., 5.}); - nd4j::ops::unsorted_segment_sum_bp op; + sd::ops::unsorted_segment_sum_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -1936,7 +1936,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentSum_2) { auto idx = NDArrayFactory::create({0, 0, 1, 2}); auto exp = NDArrayFactory::create('c', {3, 4}, {3.9 , 4.9, 7. , 18.,2.1 , 2.1, 0.7, 0.1,3. , 4.2, 2.2, 1.}); - nd4j::ops::segment_sum op; + sd::ops::segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1956,7 +1956,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentSumBP_2) { auto eps = NDArrayFactory::create('c', {3, 4}); eps.linspace(1); - nd4j::ops::segment_sum_bp op; + sd::ops::segment_sum_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -1983,7 +1983,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentSum_3) { 70.2, 102.8, 166. , 56. ,228.2, 94.2, 125.4, 126.2 ,128. , 128.4, 132.4, 128. ,91. , 82. , 37. , 64. , 55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. 
}); - nd4j::ops::segment_sum op; + sd::ops::segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2014,7 +2014,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentSum_4) { 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. , 119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. }); - nd4j::ops::segment_sum op; + sd::ops::segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2034,7 +2034,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({4.3, 17.5, 3., 13.2, 11.2}); - nd4j::ops::unsorted_segment_sum op; + sd::ops::unsorted_segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -2049,7 +2049,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_2) { auto idx = NDArrayFactory::create({0, 0, 1, 2}); auto exp = NDArrayFactory::create('c', {3, 4}, {3.9 , 4.9, 7. , 18.,2.1 , 2.1, 0.7, 0.1,3. , 4.2, 2.2, 1.}); - nd4j::ops::unsorted_segment_sum op; + sd::ops::unsorted_segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -2074,7 +2074,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_3) { 70.2, 102.8, 166. , 56. ,228.2, 94.2, 125.4, 126.2 ,128. , 128.4, 132.4, 128. ,91. , 82. , 37. , 64. , 55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1,14. , 114.2, 16.2, 117. }); - nd4j::ops::unsorted_segment_sum op; + sd::ops::unsorted_segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -2104,7 +2104,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentSum_4) { 0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,0. , 0. , 0. , 0. ,91. , 82. , 37. , 64. ,55.1, 46.4, 73. , 28. , 119.1, 12.1, 112.7, 13.1,14. 
, 114.2, 16.2, 117. }); - nd4j::ops::unsorted_segment_sum op; + sd::ops::unsorted_segment_sum op; auto result = op.evaluate({&x, &idx}, {}, {8}); ASSERT_EQ(result->status(), Status::OK()); @@ -2124,7 +2124,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({4.5, 181.44, 3., 39.69, 1.9404}); - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2139,7 +2139,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProdBP_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({2.5, 1.8, 90.72, 40.32, 172.8, 151.2, 3., 17.64, 75.6, 75.6, 13.86, 97.02, 3.234, 2.31, 4.41, 9.702}); - nd4j::ops::segment_prod_bp op; + sd::ops::segment_prod_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2157,7 +2157,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProdBP_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({2.5, 1.8, 90.72, 40.32, 172.8, 151.2, 3., 17.64, 75.6, 75.6, 13.86, 97.02, 3.234, 2.31, 4.41, 9.702}); - nd4j::ops::segment_prod_bp op; + sd::ops::segment_prod_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2176,7 +2176,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProdBP_2) { auto eps = NDArrayFactory::create({1., 2., 3., 4., 5.}); auto exp = NDArrayFactory::create({3., 2.5, 1.8, 90.72, 40.32, 172.8, 151.2, 17.64, 75.6, 75.6, 13.86, 97.02, 3.234, 2.31, 4.41, 9.702}); auto n = NDArrayFactory::create(5LL); - nd4j::ops::unsorted_segment_prod_bp op; + sd::ops::unsorted_segment_prod_bp op; auto 
result = op.evaluate({&x, &idx, &eps, &n}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -2197,7 +2197,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_2) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2221,7 +2221,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProdBP_2) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} eps.linspace(1); - nd4j::ops::segment_prod_bp op; + sd::ops::segment_prod_bp op; auto result = op.evaluate({&x, &idx, &eps}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2248,7 +2248,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_3) { 1581, 924, 5829, 1056,832.01001, 2616.9602, 6789, 784, 12993.810, 993.41003, 1431.2899, 1481.61, 1596, 1621.64, 1882.4401, 1287, 91. , 82. , 37. , 64. , 55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1, 14. , 114.2, 16.2, 117. }); - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2270,7 +2270,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_04) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2287,7 +2287,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_05) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2306,7 +2306,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_05_1) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - nd4j::ops::segment_prod op; 
+ sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2325,7 +2325,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_06) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2342,7 +2342,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_07) { auto idx = NDArrayFactory::create({0,0,1,2,2,2,3,3}); auto exp = NDArrayFactory::create({ 2, 3, 120, 56}); - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2359,7 +2359,7 @@ TEST_F(DeclarableOpsTests7, TestSegmentProd_08) { auto idx = NDArrayFactory::create({0,0,2,2,2,2,3,3,3,3}); auto exp = NDArrayFactory::create({ 2, 1,360, 5040}); - nd4j::ops::segment_prod op; + sd::ops::segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {}); ASSERT_EQ(result->status(), Status::OK()); @@ -2374,7 +2374,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_1) { auto idx = NDArrayFactory::create({0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({4.5, 181.44, 3., 39.69, 1.9404}); - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -2389,7 +2389,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_11) { auto idx = NDArrayFactory::create({2, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 4, 4, 4}); auto exp = NDArrayFactory::create({4.5, 181.44, 3., 39.69, 1.9404}); - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {5}); ASSERT_EQ(result->status(), Status::OK()); @@ -2406,7 +2406,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_2) { //{ 2.1, 2.5, 
4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -2428,7 +2428,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_12) { //{ 2.1, 2.5, 4., 9., 2.1, 2.1, 0.7, 0.1, 3., 4.2, 2.2, 1.} - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -2447,7 +2447,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_08) { auto idx = NDArrayFactory::create({0,0,2,2,2,2,3,3,3,3}); auto exp = NDArrayFactory::create({ 2, 1,360, 5040}); - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {4}); ASSERT_EQ(result->status(), Status::OK()); @@ -2471,7 +2471,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_3) { 1581, 924, 5829, 1056,832.01001, 2616.9602, 6789, 784, 12993.810, 993.41003, 1431.2899, 1481.61, 1596.0000, 1621.6399, 1882.4401, 1287, 91. , 82. , 37. , 64. , 55.1, 46.4, 73. , 28. ,119.1, 12.1, 112.7, 13.1, 14. , 114.2, 16.2, 117. 
}); - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -2502,7 +2502,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_4) { 91., 82., 37., 64, 55.1, 46.400002, 73, 28, 119.1, 12.1, 112.7, 13.1, 14, 114.2, 16.2, 117}); - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {3}); ASSERT_EQ(result->status(), Status::OK()); @@ -2537,7 +2537,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProd_5) { 946., 1044., 1144., 1246., 1350.}); x.linspace(1.); - nd4j::ops::unsorted_segment_prod op; + sd::ops::unsorted_segment_prod op; auto result = op.evaluate({&x, &idx}, {}, {4}); ASSERT_EQ(result->status(), Status::OK()); @@ -2568,7 +2568,7 @@ TEST_F(DeclarableOpsTests7, TestUnsortedSegmentProdBP_4) { // // 91., 82., 37., 64, 55.1, 46.400002, 73, 28, 119.1, 12.1, 112.7, 13.1, 14, 114.2, 16.2, 117}); - nd4j::ops::unsorted_segment_prod_bp op; + sd::ops::unsorted_segment_prod_bp op; auto result = op.evaluate({&x, &idx, &gradO}, {}, {4}); ASSERT_EQ(result->status(), Status::OK()); @@ -2606,7 +2606,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_1) { 31., 22., 87., 44., 55., 46., 73., 28., 119., 12., 112., 13., 14., 114., 16., 117., 91., 82., 37., 64., 55.1, 46.4, 73., 28., 119., 12., 112., 13., 140., 110., 160., 107.}); - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {1,1,1,1,1,1,0}); ASSERT_EQ(result->status(), Status::OK()); @@ -2643,7 +2643,7 @@ auto exp = NDArrayFactory::create('c', {3, 1, 1, 12}, { 211., 12., 13., 12., 213., 14., 21., 2., 3., 2., 3., 24. 
}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,2, 3,3, 1,1,0}); ASSERT_EQ(result->status(), Status::OK()); @@ -2677,7 +2677,7 @@ auto exp = NDArrayFactory::create('c', {3, 1, 2, 6}, { 6., 7., 15., 216., 17., 35., 36., 327. }); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,1,3,2,2,2,0}); ASSERT_EQ(result->status(), Status::OK()); @@ -2716,7 +2716,7 @@ auto exp = NDArrayFactory::create('c', {3, 3, 4, 3}, { 21., 2., 3., 2., 3., 24., 21., 22., 223., 22., 223., 24., 25., 6., 7., 8., 9., 20., 35., 36., 327., 38., 239., 40.}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {1,1,1,1,1,1,0}); ASSERT_EQ(result->status(), Status::OK()); @@ -2755,7 +2755,7 @@ auto exp = NDArrayFactory::create('c', {3, 1, 1, 18}, { }); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {3,2,3,2,1,2,0}); ASSERT_EQ(result->status(), Status::OK()); @@ -2784,7 +2784,7 @@ auto exp = NDArrayFactory::create('c', {2, 1, 4, 4}, { 21.11, 21.12, 22.11, 22.12, 21.21, 21.22, 22.21, 22.22, 21.31, 21.32, 22.31, 22.32, 21.41, 21.42, 22.41, 22.42 }); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,1, 1,1, 1,1,0}); ASSERT_EQ(result->status(), Status::OK()); @@ -2806,7 +2806,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_7) { 1., 2., 4., 5., 2., 3., 5., 6., 3., 0., 6., 0., 4., 5., 7., 8., 5., 6., 8., 9., 6., 0., 9., 0., 7., 8., 0., 0., 
8., 9., 0., 0., 9., 0., 0., 0. }); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,2, 1,1, 1,1, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="SAME" ASSERT_EQ(result->status(), Status::OK()); @@ -2837,7 +2837,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_8) { 7, 8, 9, 10, 13, 14, 15, 16, 9, 10, 11, 12, 15, 16, 17, 18, 11, 12, 0, 0, 17, 18, 0, 0, 13, 14, 15, 16, 0, 0, 0, 0, 15, 16, 17, 18, 0, 0, 0, 0, 17, 18, 0, 0, 0, 0, 0, 0 }); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,2, 1,1, 1,1, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="SAME" ASSERT_EQ(result->status(), Status::OK()); @@ -2901,7 +2901,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_9) { 55., 56., 57., 58., 59., 60., 67., 68., 69., 70., 71., 72., 0., 0., 0., 0., 0., 0., 57., 58., 59., 60., 0., 0., 69., 70., 71., 72., 0., 0., 0., 0., 0., 0., 0., 0.}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {3,3, 1,1, 1,1, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="SAME" ASSERT_EQ(result->status(), Status::OK()); @@ -2942,7 +2942,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_9_1) { }); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,2, 1,1, 1,1, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="SAME" ASSERT_EQ(result->status(), Status::OK()); @@ -2988,7 +2988,7 @@ TEST_F(DeclarableOpsTests7, 
TestExtractImagePatches_SGO_10) { 41., 42., 43., 44., 45., 46., 53., 54., 55., 56., 57., 58., 65., 66., 67., 68., 69., 70., 43., 44., 45., 46., 47., 48., 55., 56., 57., 58., 59., 60., 67., 68., 69., 70., 71., 72.}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; //x.printIndexedBuffer("Images"); //x.printBuffer("Images linear"); auto result = op.evaluate({&x}, {}, {3,3, 1,1, 1,1, 0}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="VALID" @@ -3017,7 +3017,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_010) { 1, 2, 5, 6, 2, 3, 6, 7, 3, 4, 7, 8, 5, 6, 9, 10, 6, 7, 10, 11, 7, 8, 11, 12, 9, 10, 13, 14, 10, 11, 14, 15, 11, 12, 15, 16}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; //x.printIndexedBuffer("Images"); //x.printBuffer("Images linear"); auto result = op.evaluate({&x}, {}, {2,2, 1,1, 1,1, 0}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="VALID" @@ -3047,7 +3047,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_010_1) { 7, 8, 11, 12, 8, 0, 12, 0, 9, 10, 13, 14, 10, 11, 14, 15, 11, 12, 15, 16, 12, 0, 16, 0, 13, 14, 0, 0, 14, 15, 0, 0, 15, 16, 0, 0, 16, 0, 0, 0}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; //x.printIndexedBuffer("Images"); //x.printBuffer("Images linear"); auto result = op.evaluate({&x}, {}, {2,2, 1,1, 1,1, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="VALID" @@ -3079,7 +3079,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_011) { 1, 3, 9, 11, 2, 4, 10, 12, 5, 7, 13, 15, 6, 8, 14, 16, }); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; 
//x.printIndexedBuffer("Images"); //x.printBuffer("Images linear"); auto result = op.evaluate({&x}, {}, {2,2, 1,1, 2,2, 0}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="VALID" @@ -3116,7 +3116,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_11) { 117, 118, 119, 120, 105, 106, 107, 108, 121, 122, 123, 124, 109, 110, 111, 112, 125, 126, 127, 128}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,2, 2,2, 1,1, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="SAME" ASSERT_EQ(result->status(), Status::OK()); @@ -3173,7 +3173,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_12) { 0, 0, 105, 106, 109, 110, 0, 0, 0, 0, 107, 108, 111, 112, 0, 0, 0, 0, 109, 110, 0, 0, 0, 0, 0, 0}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,2, 1,1, 2,2, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,2,2,1], padding="SAME" ASSERT_EQ(result->status(), Status::OK()); @@ -3203,7 +3203,7 @@ TEST_F(DeclarableOpsTests7, TestExtractImagePatches_SGO_13) { 15., 16., 17., 18., 11., 12., 0., 0., 17., 18., 0., 0., 13., 14., 15., 16., 0., 0., 0., 0., 15., 16., 17., 18., 0., 0., 0., 0., 17., 18., 0., 0., 0., 0., 0., 0. 
}); // ---------------------------------------------------------------- - nd4j::ops::extract_image_patches op; + sd::ops::extract_image_patches op; auto result = op.evaluate({&x}, {}, {2,2, 1,1, 1,1, 1}); // equiv TF ksizes=[1,2,2,1], strides=[1,1,1,1], rates=[1,1,1,1], padding="SAME" ASSERT_EQ(result->status(), Status::OK()); @@ -3228,7 +3228,7 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { 21.41, 21.42, 22.11, 22.12 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {6}); ASSERT_EQ(result->status(), Status::OK()); @@ -3250,7 +3250,7 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { 22.11, 22.12, 22.21, 22.22, 22.31, 22.32, 22.41, 22.42, 11.11, 11.12, 11.21, 11.22, 11.31, 11.32, 11.41, 11.42 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {-8}); ASSERT_EQ(result->status(), Status::OK()); @@ -3271,7 +3271,7 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { 22.11, 22.12, 22.21, 22.22, 22.31, 22.32, 22.41, 22.42, 11.11, 11.12, 11.21, 11.22, 11.31, 11.32, 11.41, 11.42 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {-40}); ASSERT_EQ(result->status(), Status::OK()); @@ -3294,7 +3294,7 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { 21.41, 21.42, 22.11, 22.12 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {38}); ASSERT_EQ(result->status(), Status::OK()); @@ -3319,7 +3319,7 @@ auto exp = NDArrayFactory::create('c', {2, 2, 4, 2}, { 21.41, 21.42, 22.11, 22.12 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; NDArray* y = nullptr; auto result = op.execute({&x}, {y}, {}, {38}, 
{}, {}, true); ASSERT_EQ(result, Status::OK()); @@ -3342,7 +3342,7 @@ auto exp = NDArrayFactory::create('c', {3, 4}, { // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {2, 1}); ASSERT_EQ(result->status(), Status::OK()); @@ -3365,7 +3365,7 @@ auto exp = NDArrayFactory::create('c', {2, 3, 2}, { 1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10. }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {1, 2}); ASSERT_EQ(result->status(), Status::OK()); @@ -3388,7 +3388,7 @@ auto exp = NDArrayFactory::create('c', {2, 3, 2}, { 11., 10., 7., 6., 9., 8., 5., 4., 1., 0., 3., 2. }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {1, 2, 1, 0}); ASSERT_EQ(result->status(), Status::OK()); @@ -3411,7 +3411,7 @@ auto exp = NDArrayFactory::create('c', {2, 3, 2}, { 11., 10., 7., 6., 9., 8., 5., 4., 1., 0., 3., 2. }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; NDArray* y = nullptr; auto result = op.execute({&x}, {y}, {}, {1, 2, 1, 0}, {}, {}, true); ASSERT_EQ(result, Status::OK()); @@ -3434,7 +3434,7 @@ auto exp = NDArrayFactory::create('c', {2, 3, 3}, { 6., 7., 8., 0., 1., 2., 3., 4., 5., 15., 16., 17., 9., 10., 11., 12., 13., 14. }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; NDArray* y = nullptr; auto result = op.execute({&x}, {y}, {}, {1, 1}, {}, {}, true); ASSERT_EQ(result, Status::OK()); @@ -3454,7 +3454,7 @@ TEST_F(DeclarableOpsTests7, TestRoll_10) { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24. 
}); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x}, {}, {3, 1}); ASSERT_EQ(result->status(), Status::OK()); auto out = result->at(0); @@ -3478,7 +3478,7 @@ TEST_F(DeclarableOpsTests7, TestRoll_11) { 17., 18., 19., 20., 21., 22., 23., 24., 13., 14., 15., 16., 5., 6., 7, 8, 9, 10, 11, 12, 1, 2, 3, 4 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; NDArray* y = nullptr; auto result = op.evaluate({&x, &shift, &axis}); ASSERT_EQ(result->status(), Status::OK()); @@ -3504,7 +3504,7 @@ TEST_F(DeclarableOpsTests7, TestRoll_12) { 24, 21, 22, 23, 16, 13, 14, 15, 20, 17, 18, 19, 12, 9, 10, 11, 4, 1, 2, 3, 8, 5, 6, 7 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; NDArray* y = nullptr; auto result = op.evaluate({&x, &shift, &axis}); ASSERT_EQ(result->status(), Status::OK()); @@ -3527,7 +3527,7 @@ TEST_F(DeclarableOpsTests7, TestRoll_13) { 2,3,4,1,6,7,8,5,10,11,12,9,14, 15, 16, 13, 18, 19, 20, 17, 22, 23, 24, 21 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; NDArray* y = nullptr; auto result = op.evaluate({&x}, {}, {3,2}); ASSERT_EQ(result->status(), Status::OK()); @@ -3550,7 +3550,7 @@ TEST_F(DeclarableOpsTests7, TestRoll_14) { 24, 21, 22, 23, 16, 13, 14, 15, 20, 17, 18, 19, 12, 9, 10, 11, 4, 1, 2, 3, 8, 5, 6, 7 }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x, &shift, &axis}); ASSERT_EQ(result->status(), Status::OK()); @@ -3571,7 +3571,7 @@ TEST_F(DeclarableOpsTests7, TestRoll_15) { auto exp = NDArrayFactory::create({0.7244f, 0.2309f, 0.7788f, 0.8012f }); // ---------------------------------------------------------------- - nd4j::ops::roll op; + sd::ops::roll op; auto result = op.evaluate({&x, 
&shift, &axis}); ASSERT_EQ(result->status(), Status::OK()); @@ -3596,7 +3596,7 @@ TEST_F(DeclarableOpsTests7, percentile_test1) { 82., 90., 91., 89., 92., 34., 35., 33., 36.}); auto expected = NDArrayFactory::create(50.); - nd4j::ops::percentile op; + sd::ops::percentile op; auto result = op.evaluate({&input}, {50.}, {}); auto output = result->at(0); @@ -3620,7 +3620,7 @@ TEST_F(DeclarableOpsTests7, percentile_test2) { 82., 90., 91., 89., 92., 34., 35., 33., 36.}); auto expected = NDArrayFactory::create('c', {1,1,1}, {11.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 2, 1}, {}); auto output = result->at(0); @@ -3644,7 +3644,7 @@ TEST_F(DeclarableOpsTests7, percentile_test3) { 82., 90., 91., 89., 92., 34., 35., 33., 36.}); auto expected = NDArrayFactory::create('c', {1,1,1}, {10.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 0, 1}, {}); auto output = result->at(0); @@ -3668,7 +3668,7 @@ TEST_F(DeclarableOpsTests7, percentile_test4) { 82., 90., 91., 89., 92., 34., 35., 33., 36.}); auto expected = NDArrayFactory::create('c', {1,1,1}, {11.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 1, 1}, {}); auto output = result->at(0); @@ -3692,7 +3692,7 @@ TEST_F(DeclarableOpsTests7, percentile_test5) { auto expected = NDArrayFactory::create('c', {1,1,4}, {12., 7., 11., 10.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 0, 1}, {0,1}); auto output = result->at(0); @@ -3716,7 +3716,7 @@ TEST_F(DeclarableOpsTests7, percentile_test6) { auto expected = NDArrayFactory::create('c', {1,1,4}, {16., 14., 15., 13.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 1, 1}, {0,1}); auto output = 
result->at(0); @@ -3740,7 +3740,7 @@ TEST_F(DeclarableOpsTests7, percentile_test7) { auto expected = NDArrayFactory::create('c', {1,1,4}, {12., 7., 11., 10.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 2, 1}, {0,1}); auto output = result->at(0); @@ -3764,7 +3764,7 @@ TEST_F(DeclarableOpsTests7, percentile_test8) { auto expected = NDArrayFactory::create('c', {4}, {12., 7., 11., 10.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 2, 0}, {0,1}); auto output = result->at(0); @@ -3788,7 +3788,7 @@ TEST_F(DeclarableOpsTests7, percentile_test9) { auto expected = NDArrayFactory::create(11.); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 2, 0}, {0}); auto output = result->at(0); @@ -3812,7 +3812,7 @@ TEST_F(DeclarableOpsTests7, percentile_test10) { auto expected = NDArrayFactory::create('c', {1}, {11.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 2, 1}, {0}); auto output = result->at(0); @@ -3832,7 +3832,7 @@ TEST_F(DeclarableOpsTests7, percentile_test11) { auto expected = NDArrayFactory::create('c', {1}, {100.}); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 2, 1}, {0}); auto output = result->at(0); @@ -3852,7 +3852,7 @@ TEST_F(DeclarableOpsTests7, percentile_test12) { auto expected = NDArrayFactory::create(100.); - nd4j::ops::percentile op; + sd::ops::percentile op; //q, interpolation, keepDims auto result = op.evaluate({&input}, {10, 2, 0}, {}); auto output = result->at(0); @@ -3869,7 +3869,7 @@ TEST_F(DeclarableOpsTests7, transpose_test3) { auto input = NDArrayFactory::create('c', {5, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}); auto exp 
= NDArrayFactory::create('c', {3, 5}, {1.f, 4.f, 7.f, 10.f, 13.f, 2.f, 5.f, 8.f, 11.f, 14.f, 3.f, 6.f, 9.f, 12.f, 15.f}); - nd4j::ops::transpose op; + sd::ops::transpose op; auto result = op.evaluate({&input}, {}, {}); auto output = result->at(0); @@ -3885,7 +3885,7 @@ TEST_F(DeclarableOpsTests7, rationaltanh_test1) { auto input = NDArrayFactory::create('c', {8}, {0, 1, 2, 3, 4, 5, 6, 7}); NDArray exp = NDArrayFactory::create({0.000000, 0.998222, 1.516093, 1.658054, 1.695077, 1.706884, 1.711427, 1.713446}); - nd4j::ops::rationaltanh op; + sd::ops::rationaltanh op; auto result = op.evaluate({&input}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Output rationaltanh"); @@ -3901,7 +3901,7 @@ TEST_F(DeclarableOpsTests7, rationaltanh_test2) { auto input = NDArrayFactory::create('c', {2,2,2}, {0, 1, 2, 3, 4, 5, 6, 7}); NDArray exp = NDArrayFactory::create('c', {2,2,2}, {0.000000, 0.998222, 1.516093, 1.658054, 1.695077, 1.706884, 1.711427, 1.713446}); - nd4j::ops::rationaltanh op; + sd::ops::rationaltanh op; auto result = op.evaluate({&input}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Output rationaltanh"); @@ -3918,7 +3918,7 @@ TEST_F(DeclarableOpsTests7, rationaltanh_test3) { auto eps = NDArrayFactory::create('c', {2,2,2}, {1, 2, 3, 4, 5, 6, 7, 8}); NDArray exp = NDArrayFactory::create('c', {2,2,2}, {1.143933, 1.605747, 0.795557, 0.261710, 0.095832, 0.041218, 0.020221, 0.010971}); - nd4j::ops::rationaltanh_bp op; + sd::ops::rationaltanh_bp op; auto result = op.evaluate({&input, &eps}, {}, {}); auto output = result->at(0); // output->printBuffer("Output rationaltanh BP"); @@ -3934,7 +3934,7 @@ TEST_F(DeclarableOpsTests7, rectifiedtanh_test1) { auto input = NDArrayFactory::create('c', {2,2,2}, {0, 1, 2, 3, 4, 5, 6, 7}); NDArray exp = NDArrayFactory::create('c', {2,2,2}, {0.000000, 0.761594, 0.964028, 0.995055, 0.999329, 0.999909, 0.999988, 0.999998}); - nd4j::ops::rectifiedtanh op; + sd::ops::rectifiedtanh op; auto result 
= op.evaluate({&input}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Output rectifiedtanh"); @@ -3951,7 +3951,7 @@ TEST_F(DeclarableOpsTests7, rectifiedtanh_test2) { auto eps = NDArrayFactory::create('c', {2,2,2}, {1, 2, 3, 4, 5, 6, 7, 8}); NDArray exp = NDArrayFactory::create('c', {2,2,2}, {0.000000, 0.839949, 0.211952, 0.039464, 0.006705, 0.001089, 0.000172, 0.000027}); - nd4j::ops::rectifiedtanh_bp op; + sd::ops::rectifiedtanh_bp op; auto result = op.evaluate({&input, &eps}, {}, {}); auto output = result->at(0); // output->printBuffer("Output rectifiedtanh BP"); @@ -3967,7 +3967,7 @@ TEST_F(DeclarableOpsTests7, RealDiv_1) { NDArray y = NDArrayFactory::create('c', {1, 2}, {1.f,2.f}); NDArray e = NDArrayFactory::create('c', {1, 2, 2}, {2.f, 1.f, 4.f, 2.f}); - nd4j::ops::realdiv op; + sd::ops::realdiv op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -3989,7 +3989,7 @@ TEST_F(DeclarableOpsTests7, RealDiv_BP_1) { NDArray e1 = NDArrayFactory::create('c', {1, 2}, {-14.f, -5.f}); NDArray eps = NDArrayFactory::create('c', {1, 2, 2}, {1.f, 2.f, 3.f, 4.f}); - nd4j::ops::realdiv_bp op; + sd::ops::realdiv_bp op; auto result = op.evaluate({&x, &y, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4014,7 +4014,7 @@ TEST_F(DeclarableOpsTests7, ShapesOf_1) { // NDArray y = NDArrayFactory::create('c', {1, 2}, {1,2}); NDArray e = NDArrayFactory::create({1, 2, 1}); - nd4j::ops::shapes_of op; + sd::ops::shapes_of op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4035,7 +4035,7 @@ TEST_F(DeclarableOpsTests7, ShapesOf_2) { NDArray e0 = NDArrayFactory::create({1, 2, 1}); NDArray e1 = NDArrayFactory::create({1, 2}); - nd4j::ops::shapes_of op; + sd::ops::shapes_of op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4057,7 +4057,7 @@ TEST_F(DeclarableOpsTests7, Size_1) { NDArray y = NDArrayFactory::create('c', {5, 2}, 
{1.f, 2.f, 3.f, 4.f, 5.f, 7.f, 9.f, 10.f, 10.f, 11.f}); NDArray e = NDArrayFactory::create(2); - nd4j::ops::size op; + sd::ops::size op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4076,7 +4076,7 @@ TEST_F(DeclarableOpsTests7, Size_2) { NDArray y = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); NDArray e = NDArrayFactory::create(10); - nd4j::ops::size op; + sd::ops::size op; auto result = op.evaluate({&y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4094,7 +4094,7 @@ TEST_F(DeclarableOpsTests7, Softplus_1) { NDArray x = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); NDArray e = NDArrayFactory::create('c', {5, 2}, {1.3132616, 2.126928, 3.0485873, 4.01815, 5.0067153, 7.0009117, 9.000123, 10.000046, 10.000046, 11.000016}); - nd4j::ops::softplus op; + sd::ops::softplus op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4112,8 +4112,8 @@ TEST_F(DeclarableOpsTests7, Softplus_BP_1) { NDArray x = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); // NDArray e = NDArrayFactory::create('c', {5, 2}, {1.3132616, 2.126928, 3.0485873, 4.01815, 5.0067153, 7.0009117, 9.000123, 10.000046, 10.000046, 11.000016}); NDArray eps = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,6,7,8, 9, 10}); - nd4j::ops::softplus ffOP; - nd4j::ops::softplus_bp bpOp; + sd::ops::softplus ffOP; + sd::ops::softplus_bp bpOp; const OpArgsHolder argsHolderFF({&x}, {}, {}); const OpArgsHolder argsHolderBP({&x, &eps}, {}, {}); @@ -4134,7 +4134,7 @@ TEST_F(DeclarableOpsTests7, Softsign_1) { NDArray x = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); NDArray e = NDArrayFactory::create('c', {5, 2}, {0.5, 0.6666667, 0.75, 0.8, 0.8333333, 0.875, 0.9, 0.90909094, 0.90909094, 0.9166667}); - nd4j::ops::softsign op; + sd::ops::softsign op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4152,8 +4152,8 @@ 
TEST_F(DeclarableOpsTests7, Softsign_BP_1) { NDArray x = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); // NDArray e = NDArrayFactory::create('c', {5, 2}, {1.3132616f, 2.126928f, 3.0485873f, 4.01815f, 5.0067153f, 7.0009117f, 9.000123f, 10.000046f, 10.000046f, 11.000016f}); NDArray eps = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,6,7,8, 9, 10}); - nd4j::ops::softsign ffOP; - nd4j::ops::softsign_bp bpOp; + sd::ops::softsign ffOP; + sd::ops::softsign_bp bpOp; const OpArgsHolder argsHolderFF({&x}, {}, {}); const OpArgsHolder argsHolderBP({&x, &eps}, {}, {}); @@ -4169,7 +4169,7 @@ TEST_F(DeclarableOpsTests7, fill_test2) { auto v = NDArrayFactory::create(42.); auto exp = NDArrayFactory::create('c', {2, 2},{42.f, 42.f, 42.f, 42.f}); - nd4j::ops::fill op; + sd::ops::fill op; auto result = op.evaluate({&x, &v}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -4189,7 +4189,7 @@ TEST_F(DeclarableOpsTests7, fill_test3) { auto v = NDArrayFactory::create(42.); auto exp = NDArrayFactory::create('c', {2, 2}, {42.f, 42.f, 42.f, 42.f}); - nd4j::ops::fill op; + sd::ops::fill op; auto result = op.evaluate({&x, &v}, {}, {}); auto output = result->at(0); @@ -4207,7 +4207,7 @@ TEST_F(DeclarableOpsTests7, ToggleBits_test1) { auto x = NDArrayFactory::create('c', {2}, {2, 2}); auto exp = NDArrayFactory::create('c', {2}, {-3, -3}); - nd4j::ops::toggle_bits op; + sd::ops::toggle_bits op; auto result = op.evaluate({&x}); auto output = result->at(0); @@ -4227,7 +4227,7 @@ TEST_F(DeclarableOpsTests7, ToggleBits_test2) { auto exp0 = NDArrayFactory::create('c', {2}, {-3, -3}); auto exp1 = NDArrayFactory::create('c', {2}, {-2, -2}); - nd4j::ops::toggle_bits op; + sd::ops::toggle_bits op; auto result = op.evaluate({&x, &y}); auto output = result->at(0); auto z = result->at(1); @@ -4248,7 +4248,7 @@ TEST_F(DeclarableOpsTests7, Truncatediv_test1) { NDArray y = NDArrayFactory::create('c', {5, 2}, {2,2,2,2,2,2,2,2, 2, 2}); NDArray exp = NDArrayFactory::create('c', {5, 2}, 
{0.5, 1., 1.5, 2., 2.5, 3.5, 4.5, 5., 5., 5.5}); - nd4j::ops::truncatediv op; + sd::ops::truncatediv op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -4264,7 +4264,7 @@ TEST_F(DeclarableOpsTests7, Truncatediv_test2) { NDArray y = NDArrayFactory::create('c', {1, 2}, {2,2}); NDArray exp = NDArrayFactory::create('c', {5, 2}, {0.5, 1., 1.5, 2., 2.5, 3.5, 4.5, 5., 5., 5.5}); - nd4j::ops::truncatediv op; + sd::ops::truncatediv op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -4282,8 +4282,8 @@ TEST_F(DeclarableOpsTests7, TypesConversion_test1) { NDArray expF = NDArrayFactory::create('c', {5, 2}, {1.f,2.f,3.f,4.f,5.f,7.f,9.f,10.f, 10.f, 11.f}); NDArray expF16 = NDArrayFactory::create('c', {5, 2}, {1.f,2.f,3.f,4.f,5.f,7.f,9.f,10.f, 10.f, 11.f}); - nd4j::ops::to_int32 op32; - nd4j::ops::to_int64 op64; + sd::ops::to_int32 op32; + sd::ops::to_int64 op64; auto result32 = op32.evaluate({&x}, {}, {}); auto result64 = op64.evaluate({&x}, {}, {}); @@ -4308,8 +4308,8 @@ TEST_F(DeclarableOpsTests7, TypesConversion_test2) { NDArray expF = NDArrayFactory::create('c', {5, 2}, {1.f,2.f,3.f,4.f,5.f,7.f,9.f,10.f, 10.f, 11.f}); NDArray expH = NDArrayFactory::create('c', {5, 2}, {1.f,2.f,3.f,4.f,5.f,7.f,9.f,10.f, 10.f, 11.f}); - nd4j::ops::to_float32 op32; - nd4j::ops::to_float16 op16; + sd::ops::to_float32 op32; + sd::ops::to_float16 op16; auto result32 = op32.evaluate({&x}, {}, {}); auto result16 = op16.evaluate({&x}, {}, {}); @@ -4334,8 +4334,8 @@ TEST_F(DeclarableOpsTests7, TypesConversion_test3) { NDArray exp32 = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); NDArray exp64 = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); - nd4j::ops::to_uint32 op32; - nd4j::ops::to_uint64 op64; + sd::ops::to_uint32 op32; + sd::ops::to_uint64 op64; auto result32 = op32.evaluate({&x}, {}, {}); auto result64 = 
op64.evaluate({&x}, {}, {}); @@ -4360,8 +4360,8 @@ TEST_F(DeclarableOpsTests7, TypesConversion_test4) { NDArray exp32 = NDArrayFactory::create('c', {5, 2}, {1.f,2.f,3.f,4.f,5.f,7.f,9.f,10.f, 10.f, 11.f}); NDArray exp64 = NDArrayFactory::create('c', {5, 2}, {1,2,3,4,5,7,9,10, 10, 11}); - nd4j::ops::to_float32 op32; - nd4j::ops::to_double op64; + sd::ops::to_float32 op32; + sd::ops::to_double op64; auto result32 = op32.evaluate({&x}, {}, {}); auto result64 = op64.evaluate({&x}, {}, {}); @@ -4385,7 +4385,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test1) { auto exp = NDArrayFactory::create('c', {4, 7}, {2, 1, 1, 2, 3, 3, 2, 2, 1, 1, 2, 3, 3, 2, 5, 4, 4, 5, 6, 6, 5, 5, 4, 4, 5, 6, 6, 5}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4403,7 +4403,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test2) { auto exp = NDArrayFactory::create('c', {4, 7}, {6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1, 6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {0}); auto output = result->at(0); @@ -4421,7 +4421,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test3) { auto exp = NDArrayFactory::create('c', {7}, {2, 1, 1, 2, 3, 3, 2}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4439,7 +4439,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test4) { auto exp = NDArrayFactory::create('c', {8}, {2, 1, 1, 2, 3, 3, 2, 1}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4457,7 +4457,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test5) { auto exp = NDArrayFactory::create('c', {7}, {3, 2, 1, 2, 3, 2, 1}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {0}); auto output = 
result->at(0); ASSERT_TRUE(exp.isSameShape(output)); @@ -4474,7 +4474,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test6) { auto exp = NDArrayFactory::create('c', {3}, {1,1,1}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4492,7 +4492,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test7) { auto exp = NDArrayFactory::create('c', {3}, {1,1,1}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4510,7 +4510,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test8) { auto exp = NDArrayFactory::create('c', {3,9}, {3, 2, 1, 1, 2, 3, 3, 2, 1, 3, 2, 1, 1, 2, 3, 3, 2, 1, 3, 2, 1, 1, 2, 3, 3, 2, 1}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); ASSERT_EQ(result->status(), Status::OK()); @@ -4529,7 +4529,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test9) { auto exp = NDArrayFactory::create('c', {6, 9}, {6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1, 3, 2, 1, 1, 2, 3, 3, 2, 1, 6, 5, 4, 4, 5, 6, 6, 5, 4, 6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4547,7 +4547,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test10) { auto exp = NDArrayFactory::create('c', {1,3}, {1., 2., 3.}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4565,7 +4565,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test11) { auto exp = NDArrayFactory::create('c', {1,3}, {1., 2., 3.}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {0}); auto output = result->at(0); @@ -4583,7 +4583,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test12) { auto exp = 
NDArrayFactory::create('c', {3}, {1., 2., 3.}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {0}); auto output = result->at(0); @@ -4601,7 +4601,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test13) { auto exp = NDArrayFactory::create('c', {2, 3}, {1., 2., 3., 4., 5., 6.}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {0}); auto output = result->at(0); @@ -4619,7 +4619,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test14) { auto exp = NDArrayFactory::create('c', {3, 4}, {4, 5, 6, 5, 1, 2, 3, 2, 4, 5, 6, 5}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {0}); auto output = result->at(0); @@ -4637,7 +4637,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test15) { auto exp = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {1}); auto output = result->at(0); @@ -4660,7 +4660,7 @@ TEST_F(DeclarableOpsTests7, mirrorPad_test16) { 12., 11., 12., 11.,10., 9., 10., 9., 8., 7., 8., 7.,10., 9., 10., 9.,12., 11., 12., 11.,10., 9., 10., 9., 8., 7., 8., 7., 6., 5., 6., 5., 4., 3., 4., 3., 2., 1., 2., 1., 4., 3., 4., 3., 6., 5., 6., 5., 4., 3., 4., 3., 2., 1., 2., 1.}); input.linspace(1.); - nd4j::ops::mirror_pad op; + sd::ops::mirror_pad op; auto result = op.evaluate({&input, &paddings}, {}, {0}); ASSERT_EQ(result->status(), Status::OK()); auto output = result->at(0); @@ -4680,7 +4680,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_1) { auto exp = NDArrayFactory::create(120.f); //************************************// - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4697,7 +4697,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_2) { auto exp = NDArrayFactory::create({15.f, 40.f, 65.f}); 
//************************************// - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -4714,7 +4714,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_1) { auto exp = NDArrayFactory::create(1307674368000.f); //************************************// - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -4731,7 +4731,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_2) { auto exp = NDArrayFactory::create({120.f, 30240.f, 360360.f}); //************************************// - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -4748,7 +4748,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_01) { auto exp = NDArrayFactory::create('c', {4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4767,7 +4767,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_02) { auto exp = NDArrayFactory::create('c', {1,1,4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4787,7 +4787,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_3) { auto exp = NDArrayFactory::create('c', {3}, {68.f, 100.f, 132.f}); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4807,7 +4807,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {68.f, 100.f, 132.f}); x.linspace(1); - nd4j::ops::reduce_sum op; + 
sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4827,7 +4827,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_5) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4847,7 +4847,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_6) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4867,7 +4867,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_7) { auto exp = NDArrayFactory::create('c', {1,1,1}, {300.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4887,7 +4887,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_01) { auto exp = NDArrayFactory::create('c', {2}, {10395.f, 46080.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4906,7 +4906,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_02) { auto exp = NDArrayFactory::create('c', {1,1,2}, {10395.f, 46080.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4926,7 +4926,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_3) { auto exp = NDArrayFactory::create('c', {3}, {112.f, 1080.f, 3960.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto 
result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4946,7 +4946,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {112.f, 1080.f, 3960.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4966,7 +4966,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_5) { auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -4986,7 +4986,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_6) { auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5006,7 +5006,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {479001600.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5023,7 +5023,7 @@ TYPED_TEST(TypedDeclarableOpsTests7, Test_Pnorm_Once_Again) { auto input = NDArrayFactory::create('c', {1, 1, 5, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f}); auto exp = NDArrayFactory::create('c', {1, 1, 5, 5}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f}); - 
nd4j::ops::pnormpool2d op; + sd::ops::pnormpool2d op; auto result = op.evaluate({&input}, {}, {1,1, 1,1, 0,0, 1,1,1, 3, 0}); ASSERT_EQ(Status::OK(), result->status()); @@ -5039,7 +5039,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_1) { auto exp = NDArrayFactory::create('c', {4}, {1.f, 2.f, 3.f, 4.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5058,7 +5058,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {1.f, 2.f, 3.f, 4.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5078,7 +5078,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_3) { auto exp = NDArrayFactory::create('c', {3}, {1.f, 5.f, 9.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5098,7 +5098,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {1.f, 5.f, 9.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5118,7 +5118,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_5) { auto exp = NDArrayFactory::create(1.f); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5138,7 +5138,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_6) { auto exp = NDArrayFactory::create(1.f); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto 
output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5158,7 +5158,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {1.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5178,7 +5178,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_1) { auto exp = NDArrayFactory::create('c', {4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5198,7 +5198,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5218,7 +5218,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_3) { auto exp = NDArrayFactory::create('c', {3}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5238,7 +5238,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5258,7 +5258,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_5) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {}); auto output = 
result->at(0); // output->printIndexedBuffer("Result is"); @@ -5278,7 +5278,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_6) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5298,7 +5298,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5318,7 +5318,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_1) { auto exp = NDArrayFactory::create('c', {4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5337,7 +5337,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5357,7 +5357,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_3) { auto exp = NDArrayFactory::create('c', {3}, {68.f, 100.f, 132.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5377,7 +5377,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {68.f, 100.f, 132.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {1.}, 
{0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5397,7 +5397,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_5) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5417,7 +5417,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_6) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5437,7 +5437,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {300.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5456,7 +5456,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_1) { auto exp = NDArrayFactory::create('c', {4}, {31.7175f, 33.823071f, 35.97221f, 38.15757f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5475,7 +5475,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {31.7175f, 33.823071f, 35.97221f, 38.15757f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5495,7 +5495,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_3) { auto exp = NDArrayFactory::create('c', {3}, {29.597298f, 39.344631f, 49.759422f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + 
sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5515,7 +5515,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {29.597298f, 39.344631f, 49.759422f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5535,7 +5535,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_5) { auto exp = NDArrayFactory::create(70.f); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5555,7 +5555,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_6) { auto exp = NDArrayFactory::create(70.f); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5575,7 +5575,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {70.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5595,7 +5595,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_1) { auto exp = NDArrayFactory::create('c', {4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5614,7 +5614,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - 
nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5633,7 +5633,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_3) { auto exp = NDArrayFactory::create('c', {3}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5652,7 +5652,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_4) { auto exp = NDArrayFactory::create('c', {1, 3, 1}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5671,7 +5671,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_5) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5691,7 +5691,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_6) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {0, 1, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5711,7 +5711,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {1.f}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5731,7 +5731,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_1) { auto exp = NDArrayFactory::create('c', {4}, {1006.f, 1144.f, 1294.f, 1456.f}); x.linspace(1); - 
nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5750,7 +5750,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {1006.f, 1144.f, 1294.f, 1456.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5769,7 +5769,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_3) { auto exp = NDArrayFactory::create('c', {3}, {876.f, 1548.f, 2476.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5788,7 +5788,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_4) { auto exp = NDArrayFactory::create('c', {1, 3, 1}, {876.f, 1548.f, 2476.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5807,7 +5807,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_5) { auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5827,7 +5827,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_6) { auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {0, 1, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5847,7 +5847,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {4900.f}); 
x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {1.f}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -5868,7 +5868,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_BP_1) { auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -5889,7 +5889,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_BP_2) { 0.5f, 0.5f, 0.5f,0.5f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {1.f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -5910,7 +5910,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_BP_3) { 1.f, 2.f, 3.f, 4.f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -5931,7 +5931,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Sum_BP_4) { 1.f, 2.f, 3.f, 4.f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {1.f}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -5956,7 +5956,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_BP_1) { 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); - nd4j::ops::reduce_prod_bp op; + sd::ops::reduce_prod_bp op; auto result = op.evaluate({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -5977,8 +5977,8 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_BP_2) { //************************************// auto exp = 
NDArrayFactory::create('c', {3, 4}); - nd4j::ops::reduce_prod_bp op; - nd4j::ops::reduce_prod op_exp; + sd::ops::reduce_prod_bp op; + sd::ops::reduce_prod op_exp; auto res = op_exp.evaluate({&input}); auto result = op.evaluate({&input, &eps}, {}, {}); exp.assign(res->at(0)->e(0)); @@ -6002,8 +6002,8 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_BP_3) { //************************************// auto exp = NDArrayFactory::create('c', {3, 4}, {45.f, 120.f, 231.f, 384.f, 9.f, 40.f, 99.f, 192.f, 5.f, 24.f, 63.f, 128.f}); - nd4j::ops::reduce_prod_bp op; - //nd4j::ops::reduce_prod op_exp; + sd::ops::reduce_prod_bp op; + //sd::ops::reduce_prod op_exp; auto result = op.evaluate({&input, &eps}, {1.f}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -6023,8 +6023,8 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_BP_03) { //************************************// auto exp = NDArrayFactory::create('c', {3, 4}, {45.f, 120.f, 231.f, 384.f, 9.f, 40.f, 99.f, 192.f, 5.f, 24.f, 63.f, 128.f}); auto axis = NDArrayFactory::create('c', {1}, {ax}); - nd4j::ops::reduce_prod_bp op; - //nd4j::ops::reduce_prod op_exp; + sd::ops::reduce_prod_bp op; + //sd::ops::reduce_prod op_exp; auto result = op.evaluate({&input, &eps, &axis}, {}, {}, {true}); ASSERT_EQ(Status::OK(), result->status()); @@ -6044,8 +6044,8 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_BP_4) { //************************************// auto exp = NDArrayFactory::create('c', {3, 4}, {45.f, 120.f, 231.f, 384.f, 9.f, 40.f, 99.f, 192.f, 5.f, 24.f, 63.f, 128.f}); - nd4j::ops::reduce_prod_bp op; - nd4j::ops::reduce_prod op_exp; + sd::ops::reduce_prod_bp op; + sd::ops::reduce_prod op_exp; // auto res = op_exp.execute({&input}, {}, {}); auto result = op.evaluate({&input, &eps}, {0.f}, {0}); @@ -6067,8 +6067,8 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Prod_BP_5) { //************************************// auto exp = NDArrayFactory::create('c', {3, 4}, {24.f, 12.f, 8.f, 6.f, 672.f, 560.f, 480.f, 420.f, 3960.f, 3564.f, 3240.f, 
2970.f}); - nd4j::ops::reduce_prod_bp op; - nd4j::ops::reduce_prod op_exp; + sd::ops::reduce_prod_bp op; + sd::ops::reduce_prod op_exp; // auto res = op_exp.execute({&input}, {}, {}); auto result = op.evaluate({&input, &eps}, {0.f}, {1}); @@ -6095,7 +6095,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_BP_1) { x.linspace(1); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_min_bp op; + sd::ops::reduce_min_bp op; auto result = op.evaluate({&x, &eps}, {}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6120,7 +6120,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_BP_2) { x.linspace(1); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_min_bp op; + sd::ops::reduce_min_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6145,7 +6145,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_BP_02) { x.linspace(1); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_min_bp op; + sd::ops::reduce_min_bp op; auto result = op.evaluate({&x, &eps, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6168,7 +6168,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_BP_3) { exp.p(2,2, 0.5f); //x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_min_bp op; + sd::ops::reduce_min_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6191,7 +6191,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_BP_4) { exp.p(2,2, 0.5f); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_min_bp op; + sd::ops::reduce_min_bp op; auto result = op.evaluate({&x, &eps}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result 
is"); @@ -6221,7 +6221,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_BP_5) { // exp(2,2) = 0.5f; // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_min_bp op; + sd::ops::reduce_min_bp op; auto result = op.evaluate({&x, &eps}, {}, {0}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6251,7 +6251,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Min_BP_6) { // exp(2,2) = 0.5f; // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_min_bp op; + sd::ops::reduce_min_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6276,7 +6276,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_1) { x.linspace(1); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_max_bp op; + sd::ops::reduce_max_bp op; auto result = op.evaluate({&x, &eps}, {}, {0, 1}); auto output = result->at(0); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -6300,7 +6300,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_2) { x.linspace(1); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_max_bp op; + sd::ops::reduce_max_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6326,7 +6326,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_02) { x.linspace(1); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_max_bp op; + sd::ops::reduce_max_bp op; auto result = op.evaluate({&x, &eps, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6355,7 +6355,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_3) { exp.p(3,3, 4.f); // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_max_bp op; + 
sd::ops::reduce_max_bp op; auto result = op.evaluate({&x, &eps}, {}, {0}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6385,7 +6385,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Max_BP_4) { // x.printIndexedBuffer("Input is"); // exp.printIndexedBuffer("Expected "); - nd4j::ops::reduce_max_bp op; + sd::ops::reduce_max_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6409,7 +6409,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_1) { exp.assign(5.f); exp.p(12, -exp.e(12)); exp.p(20, -exp.e(20)); - nd4j::ops::reduce_norm1_bp op; + sd::ops::reduce_norm1_bp op; auto result = op.evaluate({&x, &eps}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6428,7 +6428,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_2) { auto eps = NDArrayFactory::create({1.f, 2.f, 3.f, 4.f}); x.linspace(1); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f,1.f, 2.f, 3.f, 4.f,1.f, 2.f, 3.f, 4.f}); - nd4j::ops::reduce_norm1_bp op; + sd::ops::reduce_norm1_bp op; auto result = op.evaluate({&x, &eps}, {}, {0,1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -6448,7 +6448,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_02) { x.linspace(1); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f,1.f, 2.f, 3.f, 4.f,1.f, 2.f, 3.f, 4.f}); auto axes = NDArrayFactory::create({0,1}); - nd4j::ops::reduce_norm1_bp op; + sd::ops::reduce_norm1_bp op; auto result = op.evaluate({&x, &eps, &axes}, {}, {}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -6465,7 +6465,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm1_BP_3) { auto eps = NDArrayFactory::create('c', {1, 1, 4}, {1.f, 2.f, 3.f, 4.f}); x.linspace(1); auto exp = 
NDArrayFactory::create('c', {2, 3, 4}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f,1.f, 2.f, 3.f, 4.f,1.f, 2.f, 3.f, 4.f}); - nd4j::ops::reduce_norm1_bp op; + sd::ops::reduce_norm1_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0,1}); auto output = result->at(0); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -6483,7 +6483,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_BP_1) { auto eps = NDArrayFactory::create('c', {4}, {31.7175f, 33.823071f, 35.97221f, 38.15757f}); x.linspace(1); - nd4j::ops::reduce_norm2_bp op; + sd::ops::reduce_norm2_bp op; auto result = op.evaluate({&x, &eps}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6502,7 +6502,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_BP_2) { auto eps = NDArrayFactory::create('c', {1, 1, 4}, {31.7175f, 33.823071f, 35.97221f, 38.15757f}); x.linspace(1); - nd4j::ops::reduce_norm2_bp op; + sd::ops::reduce_norm2_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6522,7 +6522,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_BP_02) { auto axes = NDArrayFactory::create({0, 1}); x.linspace(1); - nd4j::ops::reduce_norm2_bp op; + sd::ops::reduce_norm2_bp op; auto result = op.evaluate({&x, &eps, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6541,7 +6541,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_BP_3) { auto eps = NDArrayFactory::create('c', {3}, {29.597298f, 39.344631f, 49.759422f}); x.linspace(1); - nd4j::ops::reduce_norm2_bp op; + sd::ops::reduce_norm2_bp op; auto result = op.evaluate({&x, &eps}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6561,7 +6561,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Norm2_BP_4) { auto eps = NDArrayFactory::create('c', {1,3,1}, {29.597298f, 39.344631f, 49.759422f}); x.linspace(1); - 
nd4j::ops::reduce_norm2_bp op; + sd::ops::reduce_norm2_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6587,7 +6587,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_BP_1) { 42.f, 88.f, 138.f, 192.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm_bp op; + sd::ops::reduce_sqnorm_bp op; auto result = op.evaluate({&x, &eps}, {}, {0,1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -6613,7 +6613,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_SquaredNorm_BP_01) { auto axes = NDArrayFactory::create({0, 1}); x.linspace(1); - nd4j::ops::reduce_sqnorm_bp op; + sd::ops::reduce_sqnorm_bp op; auto result = op.evaluate({&x, &eps, &axes}, {}, {}, {false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -6637,7 +6637,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_1) { exp.p(22, 3.f); exp.p(23, 4.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6661,7 +6661,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_2) { exp.p(22, 3.f); exp.p(23, 4.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6686,7 +6686,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_02) { exp.p(22, 3.f); exp.p(23, 4.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6709,7 +6709,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_3) { exp.p(19, 2.f); exp.p(23, 3.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps}, {}, {0,2}); auto output = result->at(0); // 
output->printIndexedBuffer("Result is"); @@ -6731,7 +6731,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_4) { exp.p(15, 1.f); exp.p(19, 2.f); exp.p(23, 3.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6751,7 +6751,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_5) { auto exp = NDArrayFactory::create('c', {2, 3, 4}); x.linspace(1); exp.p(23, 1.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6773,7 +6773,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_6) { x.linspace(1); exp.p(23, 1.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps}, {}, {0, 1, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6794,7 +6794,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_NormMax_BP_7) { auto exp = NDArrayFactory::create('c', {2, 3, 4}); x.linspace(1); exp.p(23, 1.f); - nd4j::ops::reduce_norm_max_bp op; + sd::ops::reduce_norm_max_bp op; auto result = op.evaluate({&x, &eps}, {1.f}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -6819,7 +6819,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Dot_BP_1) { y.linspace(2); - nd4j::ops::reduce_dot_bp op; + sd::ops::reduce_dot_bp op; auto result = op.evaluate({&x, &y, &eps}, {}, {}); auto output = result->at(0); auto outputX = result->at(1); @@ -6850,7 +6850,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Dot_BP_2) { x.assign(1.f); eps.linspace(1); y.assign(2.f); - nd4j::ops::reduce_dot_bp op; + sd::ops::reduce_dot_bp op; auto result = op.evaluate({&x, &y, &eps}, {}, {1}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); ASSERT_EQ(result->size(), 2); @@ -6882,7 +6882,7 @@ TEST_F(DeclarableOpsTests7, 
Test_Reduce_Dot_BP_02) { x.assign(1.f); eps.linspace(1); y.assign(2.f); - nd4j::ops::reduce_dot_bp op; + sd::ops::reduce_dot_bp op; auto result = op.evaluate({&x, &y, &eps, &axis}, {}, {}, {false}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); ASSERT_EQ(result->size(), 2); @@ -6909,7 +6909,7 @@ TEST_F(DeclarableOpsTests7, Test_Reduce_Dot_BP_3) { eps.linspace(1); y.assign(2.f); - nd4j::ops::reduce_dot_bp op; + sd::ops::reduce_dot_bp op; auto result = op.evaluate({&x,&y, &eps}, {}, {1}); auto outputX = result->at(0); auto outputY = result->at(1); @@ -6931,7 +6931,7 @@ TEST_F(DeclarableOpsTests7, cumsum_bp_1) { x.linspace(1); eps.assign(1.f); - nd4j::ops::cumsum_bp op; + sd::ops::cumsum_bp op; auto result = op.evaluate({&x, &eps}, {}, {0,0}); auto output = result->at(0); @@ -6953,7 +6953,7 @@ TEST_F(DeclarableOpsTests7, cumsum_bp_2) { eps.assign(1.f); - nd4j::ops::cumsum_bp op; + sd::ops::cumsum_bp op; auto result = op.evaluate({&x, &eps}, {}, {1,0}); auto output = result->at(0); @@ -6975,7 +6975,7 @@ TEST_F(DeclarableOpsTests7, cumsum_bp_3) { exp.linspace(0); eps.assign(1.f); - nd4j::ops::cumsum_bp op; + sd::ops::cumsum_bp op; auto result = op.evaluate({&x, &eps}, {}, {1,1}); auto output = result->at(0); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp index 002a31d6e..17158de2a 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests8.cpp @@ -21,11 +21,11 @@ #include "testlayers.h" #include -#include -#include +#include +#include // #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests8 : public testing::Test { @@ -56,7 +56,7 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {4}, {602.2222f, 
727.13885f, 993.5555f, 755.8889f}); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); @@ -74,7 +74,7 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {1,1,4}, {602.2222f, 727.13885f, 993.5555f, 755.8889f}); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x}, {1.}, {0,1}); auto output = result->at(0); @@ -92,7 +92,7 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {3}, {900.9375f, 969.8594f, 424.1875f}); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x}, {}, {0,2}); auto output = result->at(0); @@ -110,7 +110,7 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create('c', {1,3,1}, {900.9375f, 969.8594f, 424.1875f}); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x}, {1.}, {0,2}); auto output = result->at(0); @@ -128,7 +128,7 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.f}); auto exp = NDArrayFactory::create(788.6927f); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); @@ -146,7 +146,7 @@ 
TEST_F(DeclarableOpsTests8, reduceVariance_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(788.6927f); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); @@ -164,7 +164,7 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {788.6927f}); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); @@ -182,7 +182,7 @@ TEST_F(DeclarableOpsTests8, reduceVariance_test8) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {788.6927f}); auto axes = NDArrayFactory::create({0, 1, 2}); - nd4j::ops::reduce_variance op; + sd::ops::reduce_variance op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); @@ -200,7 +200,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test1) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); @@ -218,7 +218,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test2) { auto x = NDArrayFactory::create('c', {2,3,4}, 
{27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,4}, {24.54022f, 26.96551f, 31.52072f, 27.49343f}); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x}, {1.}, {0,1}); auto output = result->at(0); @@ -236,7 +236,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test3) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {3}, {30.01562f, 31.14257f, 20.59581f}); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x}, {}, {0,2}); auto output = result->at(0); @@ -254,7 +254,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test4) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,3,1}, {30.01562f, 31.14257f, 20.59581f}); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x}, {1.}, {0,2}); auto output = result->at(0); @@ -272,7 +272,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test5) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); @@ -290,7 +290,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test6) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create(28.08367f); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = 
op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); @@ -308,7 +308,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test7) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {1,1,1}, {28.08367f}); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x}, {1.f}, {0,1,2}); auto output = result->at(0); @@ -326,7 +326,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test8) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {26.88246f, 29.53924f, 34.52921f, 30.11755f}); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x}, {0.f,1.f}, {0,1}); auto output = result->at(0); @@ -344,7 +344,7 @@ TEST_F(DeclarableOpsTests8, reduceStDev_test08) { auto x = NDArrayFactory::create('c', {2,3,4}, {27.f,34.f,5.f,4.f,54.f,6.f,65.f,8.f,37.f,45.f,8.f,67.f,96.f,10.f,65.f,41.f,33.f,85.f,92.f,24.f,25.f,55.f,49.f,76.}); auto exp = NDArrayFactory::create('c', {4}, {26.88246f, 29.53924f, 34.52921f, 30.11755f}); auto axes = NDArrayFactory::create({0,1}); - nd4j::ops::reduce_stdev op; + sd::ops::reduce_stdev op; auto result = op.evaluate({&x, &axes}, {}, {}, {false, true}); auto output = result->at(0); @@ -367,7 +367,7 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test1) { x.linspace(1); - nd4j::ops::reduce_variance_bp op; + sd::ops::reduce_variance_bp op; auto result = op.evaluate({&x, &gradO2}, {0,1}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -410,7 +410,7 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test2) { x.linspace(1); - nd4j::ops::reduce_variance_bp op; + sd::ops::reduce_variance_bp op; auto result = op.evaluate({&x, &gradO2}, {0,0}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -453,7 +453,7 
@@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test02) { auto axes = NDArrayFactory::create({(int)0,}); x.linspace(1); - nd4j::ops::reduce_variance_bp op; + sd::ops::reduce_variance_bp op; auto result = op.evaluate({&x, &gradO2, &axes}, {}, {}, {false, false}); ASSERT_EQ(Status::OK(), result->status()); @@ -500,7 +500,7 @@ TEST_F(DeclarableOpsTests8, reduceVarianceBP_test3) { x.linspace(1); - nd4j::ops::reduce_variance_bp op; + sd::ops::reduce_variance_bp op; auto result = op.evaluate({&x, &gradO2}, {0, 0}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -542,7 +542,7 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test1) { x.linspace(1); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO2}, {0,1}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -585,7 +585,7 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test2) { x.linspace(1); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO2}, {0,0}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -628,7 +628,7 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test02) { auto axis = NDArrayFactory::create('c', {1}, {ax}); x.linspace(1); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO2, &axis}, {}, {}, {false, false}); ASSERT_EQ(Status::OK(), result->status()); @@ -670,7 +670,7 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test3) { x.linspace(1); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO2}, {0,0}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -710,7 +710,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_1) { auto exp = NDArrayFactory::create(120.f); //************************************// - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -727,7 +727,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_2) { auto exp = 
NDArrayFactory::create({15.f, 40.f, 65.f}); //************************************// - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -745,7 +745,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_03) { auto axis = NDArrayFactory::create('c', {1}, {1}); //************************************// - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&input, &axis}, {}, {}, {false}); ASSERT_EQ(Status::OK(), result->status()); @@ -762,7 +762,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_1) { auto exp = NDArrayFactory::create(1307674368000.f); //************************************// - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -779,7 +779,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_2) { auto exp = NDArrayFactory::create({120.f, 30240.f, 360360.f}); //************************************// - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -796,7 +796,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_01) { auto exp = NDArrayFactory::create('c', {4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -815,7 +815,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_02) { auto exp = NDArrayFactory::create('c', {1,1,4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -835,7 +835,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_3) { auto exp = NDArrayFactory::create('c', {3}, {68.f, 100.f, 132.f}); x.linspace(1); - 
nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -855,7 +855,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {68.f, 100.f, 132.f}); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -875,7 +875,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_5) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -895,7 +895,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_6) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -915,7 +915,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_7) { auto exp = NDArrayFactory::create('c', {1,1,1}, {300.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_sum op; + sd::ops::reduce_sum op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -935,7 +935,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_01) { auto exp = NDArrayFactory::create('c', {2}, {10395.f, 46080.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -954,7 +954,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_02) { auto exp = NDArrayFactory::create('c', {1,1,2}, {10395.f, 46080.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod 
op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -974,7 +974,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_3) { auto exp = NDArrayFactory::create('c', {3}, {112.f, 1080.f, 3960.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -994,7 +994,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {112.f, 1080.f, 3960.f}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1015,7 +1015,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_04) { auto axes = NDArrayFactory::create({0, 2}); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1035,7 +1035,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_5) { auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1055,7 +1055,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_6) { auto exp = NDArrayFactory::create(479001600.f); x.linspace(1); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1075,7 +1075,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {479001600.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_prod op; + sd::ops::reduce_prod op; auto result 
= op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1095,7 +1095,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_1) { auto exp = NDArrayFactory::create('c', {4}, {1.f, 2.f, 3.f, 4.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1114,7 +1114,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {1.f, 2.f, 3.f, 4.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1134,7 +1134,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_3) { auto exp = NDArrayFactory::create('c', {3}, {1.f, 5.f, 9.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1154,7 +1154,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {1.f, 5.f, 9.f}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1175,7 +1175,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_04) { auto axes = NDArrayFactory::create({0, 2}); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1195,7 +1195,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_5) { auto exp = NDArrayFactory::create(1.f); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // 
output->printIndexedBuffer("Result is"); @@ -1215,7 +1215,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_6) { auto exp = NDArrayFactory::create(1.f); x.linspace(1); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1235,7 +1235,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Min_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {1.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_min op; + sd::ops::reduce_min op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1255,7 +1255,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_1) { auto exp = NDArrayFactory::create('c', {4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1275,7 +1275,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1295,7 +1295,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_3) { auto exp = NDArrayFactory::create('c', {3}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1315,7 +1315,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // 
output->printIndexedBuffer("Result is"); @@ -1336,7 +1336,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_04) { auto axes = NDArrayFactory::create({0, 2}); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1356,7 +1356,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_5) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1376,7 +1376,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_6) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1396,7 +1396,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Max_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_max op; + sd::ops::reduce_max op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1415,7 +1415,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_1) { auto exp = NDArrayFactory::create('c', {4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1434,7 +1434,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {66.f, 72.f, 78.f, 84.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result 
is"); @@ -1454,7 +1454,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_3) { auto exp = NDArrayFactory::create('c', {3}, {68.f, 100.f, 132.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1474,7 +1474,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {68.f, 100.f, 132.f}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1495,7 +1495,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_04) { auto axes = NDArrayFactory::create({0, 2}); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1515,7 +1515,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_5) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1535,7 +1535,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_6) { auto exp = NDArrayFactory::create(300.f); x.linspace(1); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1555,7 +1555,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm1_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {300.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_norm1 op; + sd::ops::reduce_norm1 op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1574,7 
+1574,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_1) { auto exp = NDArrayFactory::create('c', {4}, {31.7175f, 33.823071f, 35.97221f, 38.15757f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1593,7 +1593,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {31.7175f, 33.823071f, 35.97221f, 38.15757f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {1.}, {0, 1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1613,7 +1613,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_3) { auto exp = NDArrayFactory::create('c', {3}, {29.597298f, 39.344631f, 49.759422f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1633,7 +1633,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_4) { auto exp = NDArrayFactory::create('c', {1,3,1}, {29.597298f, 39.344631f, 49.759422f}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {1.}, {0, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1654,7 +1654,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_04) { auto axes = NDArrayFactory::create({0,2}); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1674,7 +1674,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_5) { auto exp = NDArrayFactory::create(70.f); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // 
output->printIndexedBuffer("Result is"); @@ -1694,7 +1694,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_6) { auto exp = NDArrayFactory::create(70.f); x.linspace(1); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1714,7 +1714,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Norm2_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {70.f}); x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::reduce_norm2 op; + sd::ops::reduce_norm2 op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1734,7 +1734,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_1) { auto exp = NDArrayFactory::create('c', {4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1753,7 +1753,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {21.f, 22.f, 23.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1772,7 +1772,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_3) { auto exp = NDArrayFactory::create('c', {3}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1791,7 +1791,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_4) { auto exp = NDArrayFactory::create('c', {1, 3, 1}, {16.f, 20.f, 24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = 
op.evaluate({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1811,7 +1811,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_04) { auto axes = NDArrayFactory::create({0,2}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1830,7 +1830,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_5) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1850,7 +1850,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_6) { auto exp = NDArrayFactory::create(24.f); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {}, {0, 1, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1870,7 +1870,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_NormMax_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {24.f}); x.linspace(1); - nd4j::ops::reduce_norm_max op; + sd::ops::reduce_norm_max op; auto result = op.evaluate({&x}, {1.f}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1890,7 +1890,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_1) { auto exp = NDArrayFactory::create('c', {4}, {1006.f, 1144.f, 1294.f, 1456.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1909,7 +1909,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_2) { auto exp = NDArrayFactory::create('c', {1,1,4}, {1006.f, 1144.f, 1294.f, 1456.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = 
op.evaluate({&x}, {1.f}, {0,1}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1928,7 +1928,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_3) { auto exp = NDArrayFactory::create('c', {3}, {876.f, 1548.f, 2476.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1947,7 +1947,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_4) { auto exp = NDArrayFactory::create('c', {1, 3, 1}, {876.f, 1548.f, 2476.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {1.f}, {0,2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1967,7 +1967,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_04) { auto axes = NDArrayFactory::create({0, 2}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -1986,7 +1986,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_5) { auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -2006,7 +2006,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_6) { auto exp = NDArrayFactory::create(4900.f); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = op.evaluate({&x}, {}, {0, 1, 2}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -2026,7 +2026,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_SquaredNorm_7) { auto exp = NDArrayFactory::create('c', {1, 1, 1}, {4900.f}); x.linspace(1); - nd4j::ops::reduce_sqnorm op; + sd::ops::reduce_sqnorm op; auto result = 
op.evaluate({&x}, {1.f}, {}); auto output = result->at(0); // output->printIndexedBuffer("Result is"); @@ -2047,7 +2047,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_1) { auto exp = NDArrayFactory::create('c', {3, 4}, {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,0.5f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2068,7 +2068,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_2) { 0.5f, 0.5f, 0.5f,0.5f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {1.f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2089,7 +2089,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_3) { 1.f, 2.f, 3.f, 4.f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -2110,7 +2110,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_4) { 1.f, 2.f, 3.f, 4.f}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps}, {1.f}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -2133,7 +2133,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Sum_BP_04) { auto axis = NDArrayFactory::create('c', {1}, {ax}); //************************************// - nd4j::ops::reduce_sum_bp op; + sd::ops::reduce_sum_bp op; auto result = op.evaluate({&input, &eps, &axis}, {}, {}, {true}); ASSERT_EQ(Status::OK(), result->status()); @@ -2158,7 +2158,7 @@ TEST_F(DeclarableOpsTests8, Test_Reduce_Prod_BP_1) { 171001227491294996070400.f, 155455648254341989531648.f, 142501016904612993564672.f, 131539399526781282156544.f, 122143728775382565912576.f, 114000815325130245799936.f}); - nd4j::ops::reduce_prod_bp op; + 
sd::ops::reduce_prod_bp op; auto result = op.evaluate({&input, &eps}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2177,7 +2177,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test1) { x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x}, {}, {0,1}); auto output = result->at(0); @@ -2197,7 +2197,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test2) { x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x}, {1.}, {0,1}); auto output = result->at(0); @@ -2217,7 +2217,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test3) { x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x}, {}, {0,2}); auto output = result->at(0); @@ -2237,7 +2237,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test4) { x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x}, {1.f}, {0,2}); auto output = result->at(0); @@ -2257,7 +2257,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test5) { x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x}, {}, {}); auto output = result->at(0); @@ -2276,7 +2276,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test6) { auto exp = NDArrayFactory::create(12.5f); x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x}, {}, {0,1,2}); auto output = result->at(0); @@ -2295,7 +2295,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test7) { auto exp = NDArrayFactory::create('c', {1,1,1}, {12.5f}); x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); auto output = result->at(0); @@ -2315,7 +2315,7 @@ TEST_F(DeclarableOpsTests8, reduceMean_test8) { auto axes = NDArrayFactory::create({0, 1, 2}); x.linspace(1); - nd4j::ops::reduce_mean op; + sd::ops::reduce_mean op; auto result = op.evaluate({&x, &axes}, {}, {}, {true}); auto output = 
result->at(0); @@ -2337,7 +2337,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test1) { x.linspace(1); - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO1}, {0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2368,7 +2368,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test2) { x.linspace(1); - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO1}, {0}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -2396,7 +2396,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test02) { auto axis = NDArrayFactory::create('c', {1}, {ax}); x.linspace(1); - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO1, &axis}, {}, {}, {false}); ASSERT_EQ(Status::OK(), result->status()); @@ -2423,7 +2423,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test3) { x.linspace(1); - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO1}, {0}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -2447,7 +2447,7 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test4) { auto gradO = NDArrayFactory::create(0.5f); auto exp = NDArrayFactory::create('c', {3}, {-0.25f, 0.f, 0.25f}); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO}, {0,1}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2468,7 +2468,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test1) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2490,7 +2490,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test2) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = 
op.evaluate({&logits, &labels}, {}, {0}); ASSERT_EQ(Status::OK(), results->status()); @@ -2512,7 +2512,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test3) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {1}); ASSERT_EQ(Status::OK(), results->status()); @@ -2534,7 +2534,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test4) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2556,7 +2556,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test5) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {0}); ASSERT_EQ(Status::OK(), results->status()); @@ -2578,7 +2578,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test6) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {0}); ASSERT_EQ(Status::OK(), results->status()); @@ -2600,7 +2600,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test7) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {1}); ASSERT_EQ(Status::OK(), results->status()); @@ -2622,7 +2622,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test8) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; 
auto results = op.evaluate({&logits, &labels}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2642,7 +2642,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test9) { auto logits = NDArrayFactory::create('c', {1}, {0.2}); auto expected = NDArrayFactory::create(0.); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -2664,7 +2664,7 @@ TEST_F(DeclarableOpsTests8, softmax_cross_entropy_loss_with_logits_test10) { logits.linspace(0.1, 0.1); - nd4j::ops::softmax_cross_entropy_loss_with_logits op; + sd::ops::softmax_cross_entropy_loss_with_logits op; auto results = op.evaluate({&logits, &labels}, {}, {0}); ASSERT_EQ(Status::OK(), results->status()); @@ -2683,7 +2683,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test4) { auto x = NDArrayFactory::create('c', {3, 5}, {0.7044955, 0.55606544, 0.15833677, 0.001874401, 0.61595726, 0.3924779, 0.7414847, 0.4127324, 0.24026828, 0.26093036, 0.46741188, 0.01863421, 0.08528871, 0.529365, 0.5510694}); auto exp = NDArrayFactory::create('c', {3, 5}, {0.405392, 0.319980, 0.091113, 0.001079, 0.354444, 0.225846, 0.426676, 0.237501, 0.138259, 0.150149, 0.268965, 0.010723, 0.049078, 0.304615, 0.317105}); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {1.f}, {}); auto output = result->at(0); @@ -2703,7 +2703,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test5) { x.linspace(1); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {15.f}, {0}); auto output = result->at(0); @@ -2721,7 +2721,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test6) { x.linspace(1); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {15.f}, {1}); auto output = result->at(0); @@ -2739,7 +2739,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test7) { x.linspace(1); - nd4j::ops::clipbynorm 
op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {15.f}, {0,1}); auto output = result->at(0); @@ -2757,7 +2757,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test8) { x.linspace(1); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {15.}, {}); auto output = result->at(0); @@ -2773,7 +2773,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test9) { auto x = NDArrayFactory::create('c', {2}, {3., 4.}); auto exp = NDArrayFactory::create('c', {2}, {2.4, 3.2}); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {4.}, {}); auto output = result->at(0); @@ -2789,7 +2789,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test10) { auto x = NDArrayFactory::create(6.); auto exp = NDArrayFactory::create(5.); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {5.}, {}); auto output = result->at(0); @@ -2808,7 +2808,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test11) { x.linspace(1); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {35.}, {0, 2}); auto output = result->at(0); @@ -2823,7 +2823,7 @@ TEST_F(DeclarableOpsTests8, clipbynorm_test_tf_119_1) { auto x = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 4, 5,6, 7, 8, 9}); auto e = NDArrayFactory::create('c', {3, 3}, {0.03198684, 0.06397368, 0.09596053, 0.12794736, 0.15993419, 0.19192106, 0.22390789, 0.25589472, 0.28788155}); - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&x}, {0.54}, {}); ASSERT_EQ(e, *result->at(0)); @@ -2839,7 +2839,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test4) { auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333, 0.333333, 0.666667, 1.000000, 1.333333}); - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO1}, {0}, {0}); ASSERT_EQ(Status::OK(), 
result->status()); @@ -2865,7 +2865,7 @@ TEST_F(DeclarableOpsTests8, reduceMeanBP_test5) { auto gradO2 = NDArrayFactory::create('c', {3, 1}, {1., 2., 3.}); auto exp = NDArrayFactory::create('c', {3,4}, {0.2500,0.2500,0.2500,0.2500, 0.5000,0.5000,0.5000,0.5000, 0.7500,0.7500,0.7500,0.7500}); - nd4j::ops::reduce_mean_bp op; + sd::ops::reduce_mean_bp op; auto result = op.evaluate({&x, &gradO1}, {0}, {1}); ASSERT_EQ(Status::OK(), result->status()); @@ -2891,7 +2891,7 @@ TEST_F(DeclarableOpsTests8, reduceStDevBP_test5) { auto gradO2 = NDArrayFactory::create('c', {1, 4}, {1., 2., 3., 4.}); auto exp = NDArrayFactory::create('c', {3,4}, {-0.408248, -0.816497, -1.224745, -1.632993, 0.000000, 0.000000, 0.000000, 0.000000, 0.408248, 0.816497, 1.224745, 1.632993}); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO1}, {0}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -2915,7 +2915,7 @@ TEST_F(DeclarableOpsTests8, zeros_as_test1) { auto y = NDArrayFactory::create(100.f); auto exp = NDArrayFactory::create(0.f); - nd4j::ops::zeros_as op; + sd::ops::zeros_as op; Nd4jStatus status = op.execute({&x}, {&y}, {}, {}, {}); ASSERT_EQ(Status::OK(), status); @@ -2932,7 +2932,7 @@ TEST_F(DeclarableOpsTests8, zeros_as_test2) { //auto y = NDArrayFactory::create(100.f); auto exp = NDArrayFactory::create(0.f); - nd4j::ops::zeros_as op; + sd::ops::zeros_as op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -2950,7 +2950,7 @@ TEST_F(DeclarableOpsTests8, ones_as_test1) { auto y = NDArrayFactory::create(100.); auto exp = NDArrayFactory::create(1.); - nd4j::ops::ones_as op; + sd::ops::ones_as op; Nd4jStatus status = op.execute({&x}, {&y}); ASSERT_EQ(Status::OK(), status); @@ -2967,7 +2967,7 @@ TEST_F(DeclarableOpsTests8, ones_as_test2) { //auto y = NDArrayFactory::create(100.); auto exp = NDArrayFactory::create(1.); - nd4j::ops::ones_as op; + sd::ops::ones_as op; auto results = op.evaluate({&x}); 
ASSERT_EQ(Status::OK(), results->status()); @@ -2985,9 +2985,9 @@ TEST_F(DeclarableOpsTests8, ones_as_test3) { //auto y = NDArrayFactory::create(100.); auto exp = NDArrayFactory::create(1.); - nd4j::ops::ones_as op; + sd::ops::ones_as op; - auto results = op.evaluate({&x}, {}, {}, {}, {nd4j::DataType::INT32}); + auto results = op.evaluate({&x}, {}, {}, {}, {sd::DataType::INT32}); ASSERT_EQ(Status::OK(), results->status()); auto y = results->at(0); ASSERT_TRUE(y->isSameShape(exp)); @@ -3015,10 +3015,10 @@ TEST_F(DeclarableOpsTests8, NormalizeMoments_SGO_1) { auto ssSquared = squared.reduceAlongDimension(reduce::Sum, {0}); // ssSquared->printBuffer("Sum squared"); // squared.printBuffer("Squared"); - nd4j::ops::normalize_moments op; + sd::ops::normalize_moments op; auto results = op.evaluate({&counts, &means, &ssSquared}, {0.0}, {0}); means /= counts; -// nd4j::ops::normalize_moments op; +// sd::ops::normalize_moments op; // auto results = op.evaluate({&counts, means, deviance}, {0.0}, {}); ASSERT_EQ(Status::OK(), results->status()); @@ -3052,7 +3052,7 @@ TEST_F(DeclarableOpsTests8, Test_Moments_1) { auto expVariance = NDArrayFactory::create('c', {4}, {46.666668f, 46.666668f, 46.66666f, 46.666668f}); x.linspace(1); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {}, {0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -3083,7 +3083,7 @@ TEST_F(DeclarableOpsTests8, Test_Moments_2) { auto expVariance = NDArrayFactory::create('c', {1,1,4}, {46.666668f, 46.666668f, 46.66666f, 46.666668f}); x.linspace(1); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {1.}, {0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -3112,7 +3112,7 @@ TEST_F(DeclarableOpsTests8, Test_Moments_3) { auto expVariance = NDArrayFactory::create('c', {3}, {37.25f, 37.25f, 37.25f}); x.linspace(1); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {}, {0, 2}); ASSERT_EQ(Status::OK(), result->status()); @@ 
-3141,7 +3141,7 @@ TEST_F(DeclarableOpsTests8, Test_Moments_4) { auto expVariance = NDArrayFactory::create('c', {1,3,1}, {37.25f, 37.25f, 37.25f}); x.linspace(1); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {1.}, {0, 2}); ASSERT_EQ(Status::OK(), result->status()); @@ -3170,7 +3170,7 @@ TEST_F(DeclarableOpsTests8, Test_Moments_6) { auto x = NDArrayFactory::create('c', {2, 3, 4}); x.linspace(1); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {}, {0,1,2}); ASSERT_EQ(Status::OK(), result->status()); @@ -3199,7 +3199,7 @@ TEST_F(DeclarableOpsTests8, Test_Moments_7) { x.linspace(1); // x.printIndexedBuffer("Input with shape (2, 3, 4) is"); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x}, {1.}, {0,1,2}); ASSERT_EQ(Status::OK(), result->status()); @@ -3228,7 +3228,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_01) { // 0.7581754, 0.58321184, 0.86747235, 0.4048204} ); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -3250,7 +3250,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_02) { 0.2581989f, 0.3592106f, 0.40089184f, 0.4193139f, 0.5360563f, 0.67936623f} ); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -3269,7 +3269,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_03) { auto x = NDArrayFactory::create('c', {1, 1, 1, 10}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f}); auto exp = NDArrayFactory::create('c', {1, 1, 1, 10}, {0.10425719f, 0.16843036f, 0.2095291f, 0.23652494f, 0.25449327f, 0.3053919f, 0.35675305f, 0.4098524f, 0.46662825f, 0.52999896f}); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {5}); auto out = results->at(0); @@ -3298,7 +3298,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_1) { 0.7581754f, 0.58321184f, 0.86747235f, 0.4048204f} ); - nd4j::ops::lrn 
op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -3375,7 +3375,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_2) { 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -3452,7 +3452,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_3) { 0.57474375f, 0.49886885f, 0.44720373f, 0.50111103f, 0.5799219f } ); // - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -3472,7 +3472,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_4) { auto x = NDArrayFactory::create('c', {2, 8, 16, 16}); x.linspace(1); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -3494,7 +3494,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_4_119) { auto z = NDArrayFactory::create('c', {2, 8, 16, 16}); x.linspace(1); - nd4j::ops::lrn op; + sd::ops::lrn op; op.execute({&x}, {&z}, {1.0, 1.0, 0.5}, {2}); @@ -3518,7 +3518,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_5) { auto x = NDArrayFactory::create('f', {8, 32, 64, 64}); x.linspace(1); - nd4j::ops::lrn op; + sd::ops::lrn op; auto results = op.evaluate({&x}, {1.0, 1.0, 0.5}, {2}); auto out = results->at(0); @@ -3541,7 +3541,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_01) { // 0.238337, 0.309664, 0.334077, 0.376534, 0.342926, 0.370734, 0.362017, 0.354182, 0.379140, 0.376275, 0.380027, 0.368347, 0.356401, 0.378316, 0.381315, 0.382465, 0.370592, 0.357055, 0.377670, 0.382950, 0.383445, 0.371718, 0.357332, 0.377217, 0.383677, 0.383933, 0.372391, 0.357475, 0.376891, 0.384062, 0.384212, 0.372837, 0.357557, 0.376646, 0.384290, 0.384385, 0.373153, 0.357610, 0.376457, 0.384436, 0.384500, 0.373389, 0.357645, 0.376306, 0.384536, 0.384581, 0.373572, 0.357670, 0.376184, 0.384606, 0.384639, 0.373718, 
0.357688, 0.376082, 0.384658, 0.384683, 0.373837, 0.357702, 0.375996, 0.384698, 0.384717, 0.373935, 0.357712, 0.375923, 0.384728, 0.384743, 0.374019, 0.357721, 0.375860, 0.384752, 0.384764, 0.374090, 0.357727, 0.375804, 0.384771, 0.384781, 0.374152, 0.357733, 0.375756, 0.384787, 0.384795, 0.374205, 0.357737, 0.375713, 0.384800, 0.384807, 0.374253, 0.357741, 0.375674, 0.384811, 0.384817, 0.374295, 0.357744, 0.375640, 0.384820, 0.384825, 0.374333, 0.357747, 0.375609, 0.384828, 0.384832, 0.374366, 0.357749, 0.375581, 0.384835, 0.384839, 0.374397, 0.357751, 0.375555, 0.384841, 0.384844, 0.374425, 0.357753, 0.375531, 0.384846, 0.384849, 0.374450, 0.357754, 0.375510, 0.384850, 0.384853, 0.374473, 0.357756, 0.375490, 0.384854, 0.384856, 0.374494, 0.357757, 0.375471, 0.384858, 0.384860, 0.374514, 0.357758, 0.375454, 0.384861, 0.384863, 0.374532, 0.357759, 0.375438, 0.384864, 0.384865, 0.374549, 0.357760, 0.375423, 0.384866, 0.384868, 0.374565, 0.357760, 0.375410, 0.384868, 0.384870, 0.374579, 0.357761, 0.375397, 0.384870, 0.384872, 0.374593, 0.357762, 0.375384, 0.384872, 0.384873, 0.374606, 0.357762, 0.375373, 0.384874, 0.384875, 0.374618, 0.357763, 0.375362, 0.384875, 0.384876, 0.374629, 0.357763, 0.375352, 0.384877, 0.384878, 0.374640, 0.357764, 0.375342, 0.384878, 0.384879, 0.374650, 0.357764, 0.375333, 0.384879, 0.384880, 0.374660, 0.357764, 0.375325, 0.384880, 0.384881, 0.374669, 0.357765, 0.375316, 0.384881, 0.384882, 0.374677, 0.357765, 0.375309, 0.384882, 0.384883, 0.374685, 0.357765, 0.375301, 0.384883, 0.384884, 0.374693, 0.357765, 0.375294, 0.384884, 0.384884, 0.374700, 0.357766, 0.375287, 0.384885, 0.384885, 0.374707, 0.357766, 0.375281, 0.384885, 0.384886, 0.374714, 0.357766, 0.375275, 0.384886} // ); /// - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&x, &eps}, {1.0, 1.0, 0.5}, {5}); auto out = results->at(0); @@ -3566,14 +3566,14 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_02) { // 0.238337, 0.309664, 0.334077, 0.376534, 
0.342926, 0.370734, 0.362017, 0.354182, 0.379140, 0.376275, 0.380027, 0.368347, 0.356401, 0.378316, 0.381315, 0.382465, 0.370592, 0.357055, 0.377670, 0.382950, 0.383445, 0.371718, 0.357332, 0.377217, 0.383677, 0.383933, 0.372391, 0.357475, 0.376891, 0.384062, 0.384212, 0.372837, 0.357557, 0.376646, 0.384290, 0.384385, 0.373153, 0.357610, 0.376457, 0.384436, 0.384500, 0.373389, 0.357645, 0.376306, 0.384536, 0.384581, 0.373572, 0.357670, 0.376184, 0.384606, 0.384639, 0.373718, 0.357688, 0.376082, 0.384658, 0.384683, 0.373837, 0.357702, 0.375996, 0.384698, 0.384717, 0.373935, 0.357712, 0.375923, 0.384728, 0.384743, 0.374019, 0.357721, 0.375860, 0.384752, 0.384764, 0.374090, 0.357727, 0.375804, 0.384771, 0.384781, 0.374152, 0.357733, 0.375756, 0.384787, 0.384795, 0.374205, 0.357737, 0.375713, 0.384800, 0.384807, 0.374253, 0.357741, 0.375674, 0.384811, 0.384817, 0.374295, 0.357744, 0.375640, 0.384820, 0.384825, 0.374333, 0.357747, 0.375609, 0.384828, 0.384832, 0.374366, 0.357749, 0.375581, 0.384835, 0.384839, 0.374397, 0.357751, 0.375555, 0.384841, 0.384844, 0.374425, 0.357753, 0.375531, 0.384846, 0.384849, 0.374450, 0.357754, 0.375510, 0.384850, 0.384853, 0.374473, 0.357756, 0.375490, 0.384854, 0.384856, 0.374494, 0.357757, 0.375471, 0.384858, 0.384860, 0.374514, 0.357758, 0.375454, 0.384861, 0.384863, 0.374532, 0.357759, 0.375438, 0.384864, 0.384865, 0.374549, 0.357760, 0.375423, 0.384866, 0.384868, 0.374565, 0.357760, 0.375410, 0.384868, 0.384870, 0.374579, 0.357761, 0.375397, 0.384870, 0.384872, 0.374593, 0.357762, 0.375384, 0.384872, 0.384873, 0.374606, 0.357762, 0.375373, 0.384874, 0.384875, 0.374618, 0.357763, 0.375362, 0.384875, 0.384876, 0.374629, 0.357763, 0.375352, 0.384877, 0.384878, 0.374640, 0.357764, 0.375342, 0.384878, 0.384879, 0.374650, 0.357764, 0.375333, 0.384879, 0.384880, 0.374660, 0.357764, 0.375325, 0.384880, 0.384881, 0.374669, 0.357765, 0.375316, 0.384881, 0.384882, 0.374677, 0.357765, 0.375309, 0.384882, 0.384883, 0.374685, 0.357765, 0.375301, 
0.384883, 0.384884, 0.374693, 0.357765, 0.375294, 0.384884, 0.384884, 0.374700, 0.357766, 0.375287, 0.384885, 0.384885, 0.374707, 0.357766, 0.375281, 0.384885, 0.384886, 0.374714, 0.357766, 0.375275, 0.384886} // ); /// - nd4j::ops::lrn opFF; - nd4j::ops::lrn_bp opBP; + sd::ops::lrn opFF; + sd::ops::lrn_bp opBP; const OpArgsHolder argsHolderFF({&x}, {1., 1., 0.5}, {5}); const OpArgsHolder argsHolderBP({&x, &eps}, {1., 1., 0.5}, {5}); bool gradOK = true; //GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); - //auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {5}, {}, false, nd4j::DataType::DOUBLE); + //auto results = op.execute({&x, &eps}, {1.0, 1.0, 0.5}, {5}, {}, false, sd::DataType::DOUBLE); //auto out = results->at(0); //ASSERT_EQ(Status::OK(), results->status()); @@ -3597,7 +3597,7 @@ auto exp = NDArrayFactory::create('c', {3,3,5,5}, { 0.238337f, 0.309664f, 0.334077f, 0.376534f, 0.342926f, 0.370734f, 0.362017f, 0.354182f, 0.379140f, 0.376275f, 0.380027f, 0.368347f, 0.356401f, 0.378316f, 0.381315f, 0.382465f, 0.370592f, 0.357055f, 0.377670f, 0.382950f, 0.383445f, 0.371718f, 0.357332f, 0.377217f, 0.383677f, 0.383933f, 0.372391f, 0.357475f, 0.376891f, 0.384062f, 0.384212f, 0.372837f, 0.357557f, 0.376646f, 0.384290f, 0.384385f, 0.373153f, 0.357610f, 0.376457f, 0.384436f, 0.384500f, 0.373389f, 0.357645f, 0.376306f, 0.384536f, 0.384581f, 0.373572f, 0.357670f, 0.376184f, 0.384606f, 0.384639f, 0.373718f, 0.357688f, 0.376082f, 0.384658f, 0.384683f, 0.373837f, 0.357702f, 0.375996f, 0.384698f, 0.384717f, 0.373935f, 0.357712f, 0.375923f, 0.384728f, 0.384743f, 0.374019f, 0.357721f, 0.375860f, 0.384752f, 0.384764f, 0.374090f, 0.357727f, 0.375804f, 0.384771f, 0.384781f, 0.374152f, 0.357733f, 0.375756f, 0.384787f, 0.384795f, 0.374205f, 0.357737f, 0.375713f, 0.384800f, 0.384807f, 0.374253f, 0.357741f, 0.375674f, 0.384811f, 0.384817f, 0.374295f, 0.357744f, 0.375640f, 0.384820f, 0.384825f, 0.374333f, 0.357747f, 0.375609f, 0.384828f, 0.384832f, 0.374366f, 
0.357749f, 0.375581f, 0.384835f, 0.384839f, 0.374397f, 0.357751f, 0.375555f, 0.384841f, 0.384844f, 0.374425f, 0.357753f, 0.375531f, 0.384846f, 0.384849f, 0.374450f, 0.357754f, 0.375510f, 0.384850f, 0.384853f, 0.374473f, 0.357756f, 0.375490f, 0.384854f, 0.384856f, 0.374494f, 0.357757f, 0.375471f, 0.384858f, 0.384860f, 0.374514f, 0.357758f, 0.375454f, 0.384861f, 0.384863f, 0.374532f, 0.357759f, 0.375438f, 0.384864f, 0.384865f, 0.374549f, 0.357760f, 0.375423f, 0.384866f, 0.384868f, 0.374565f, 0.357760f, 0.375410f, 0.384868f, 0.384870f, 0.374579f, 0.357761f, 0.375397f, 0.384870f, 0.384872f, 0.374593f, 0.357762f, 0.375384f, 0.384872f, 0.384873f, 0.374606f, 0.357762f, 0.375373f, 0.384874f, 0.384875f, 0.374618f, 0.357763f, 0.375362f, 0.384875f, 0.384876f, 0.374629f, 0.357763f, 0.375352f, 0.384877f, 0.384878f, 0.374640f, 0.357764f, 0.375342f, 0.384878f, 0.384879f, 0.374650f, 0.357764f, 0.375333f, 0.384879f, 0.384880f, 0.374660f, 0.357764f, 0.375325f, 0.384880f, 0.384881f, 0.374669f, 0.357765f, 0.375316f, 0.384881f, 0.384882f, 0.374677f, 0.357765f, 0.375309f, 0.384882f, 0.384883f, 0.374685f, 0.357765f, 0.375301f, 0.384883f, 0.384884f, 0.374693f, 0.357765f, 0.375294f, 0.384884f, 0.384884f, 0.374700f, 0.357766f, 0.375287f, 0.384885f, 0.384885f, 0.374707f, 0.357766f, 0.375281f, 0.384885f, 0.384886f, 0.374714f, 0.357766f, 0.375275f, 0.384886f} ); /// - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, {}, false); auto out = results->at(0); @@ -3677,7 +3677,7 @@ TYPED_TEST(TypedDeclarableOpsTests8, LrnTest_BP_2) { // 0.009859f, 0.013075f, 0.013874f, 0.017893f, 0.022344f, 0.014551f, 0.012859f, 0.011511f, 0.013311f, 0.015834f, 0.012025f, 0.010047f, 0.008601f, 0.009920f, 0.011885f, 0.009505f, 0.007636f, 0.006299f, 0.007413f, 0.009095f, 0.007446f, 0.005743f, 0.004540f, 0.005533f, 0.007033f, 0.005821f, 0.004282f, 0.003209f, 0.004123f, 0.005491f, 0.004577f, 0.003198f, 0.002247f, 0.003097f, 0.004355f, 0.003652f, 0.002412f, 
0.001565f, 0.002357f, 0.003517f, 0.002965f, 0.001844f, 0.001084f, 0.001821f, 0.002893f, 0.002451f, 0.001430f, 0.000741f, 0.001428f, 0.002422f, -0.111434f, -0.105946f, -0.100351f, -0.091868f, -0.083323f, -0.078775f, -0.076222f, -0.073291f, -0.067635f, -0.061692f, -0.058943f, -0.057832f, -0.056263f, -0.052198f, -0.047768f, -0.046002f, -0.045655f, -0.044839f, -0.041748f, -0.038271f, -0.037084f, -0.037161f, -0.036786f, -0.034331f, -0.031495f, 0.000077f, -0.000673f, -0.001181f, -0.000667f, 0.000079f, -0.000089f, -0.000802f, -0.001285f, -0.000793f, -0.000079f, -0.000228f, -0.000908f, -0.001368f, -0.000896f, -0.000212f, -0.000345f, -0.000996f, -0.001434f, -0.000981f, -0.000325f, -0.000444f, -0.001067f, -0.001487f, -0.001051f, -0.000421f, 0.000697f, 0.000188f, -0.000152f, 0.000210f, 0.000731f, 0.000650f, 0.000165f, -0.000161f, 0.000185f, 0.000683f, 0.000610f, 0.000145f, -0.000168f, 0.000164f, 0.000641f, 0.000574f, 0.000128f, -0.000172f, 0.000146f, 0.000604f, 0.000542f, 0.000113f, -0.000175f, 0.000131f, 0.000571f, -0.009490f, -0.010070f, -0.010409f, -0.009734f, -0.008834f, -0.008785f, -0.009351f, -0.009687f, -0.009054f, -0.008207f, -0.008167f, -0.008718f, -0.009050f, -0.008455f, -0.007654f, -0.007622f, -0.008159f, -0.008485f, -0.007924f, -0.007164f, -0.007138f, -0.007661f, -0.007981f, -0.007450f, -0.006728f, -0.000901f, -0.001327f, -0.001614f, -0.001310f, -0.000869f, -0.000913f, -0.001328f, -0.001607f, -0.001310f, -0.000882f, -0.000922f, -0.001326f, -0.001598f, -0.001309f, -0.000892f, -0.000930f, -0.001323f, -0.001588f, -0.001306f, -0.000900f, -0.000936f, -0.001319f, -0.001577f, -0.001302f, -0.000906f, 0.000339f, 0.000038f, -0.000164f, 0.000048f, 0.000355f, 0.000328f, 0.000035f, -0.000162f, 0.000045f, 0.000343f, 0.000318f, 0.000033f, -0.000159f, 0.000041f, 0.000332f, 0.000308f, 0.000030f, -0.000157f, 0.000039f, 0.000322f, 0.000299f, 0.000028f, -0.000155f, 0.000036f, 0.000312f, -0.004085f, -0.004479f, -0.004733f, -0.004396f, -0.003925f, -0.003925f, -0.004309f, -0.004558f, 
-0.004232f, -0.003775f, -0.003776f, -0.004151f, -0.004395f, -0.004079f, -0.003636f, -0.003637f, -0.004004f, -0.004242f, -0.003936f, -0.003505f, -0.003507f, -0.003866f, -0.004100f, -0.003802f, -0.003383f} ); - nd4j::ops::lrn_bp op; + sd::ops::lrn_bp op; auto results = op.evaluate({&x, &eps}, {1.0, 1.0, 0.5}, {2}, {}, {}, false); auto out = results->at(0); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 773e1dc18..a1c5ac832 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -21,13 +21,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTests9 : public testing::Test { @@ -49,7 +49,7 @@ TEST_F(DeclarableOpsTests9, reduceStDevBP_test3) { x.linspace(1); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO2}, {0,0}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -78,7 +78,7 @@ TEST_F(DeclarableOpsTests9, reduceStDevBP_test03) { auto axis = NDArrayFactory::create('c', {1}, {1}); x.linspace(1); - nd4j::ops::reduce_stdev_bp op; + sd::ops::reduce_stdev_bp op; auto result = op.evaluate({&x, &gradO2, &axis}, {}, {}, {false, false}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -110,7 +110,7 @@ TEST_F(DeclarableOpsTests9, exponentialDistributionInv_test1) { double extraParams[] = {lambda}; Nd4jLong *buffer = new Nd4jLong[N]; - auto rng = (nd4j::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); + auto rng = (sd::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); if (rng == nullptr) throw std::runtime_error("DeclarableOpsTests9.exponentialDistributionInv_test1: RNG initialization failed !"); @@ -141,7 +141,7 @@ TEST_F(DeclarableOpsTests9, exponentialDistributionInv_test2) { Nd4jLong *buffer = new 
Nd4jLong[N]; - auto rng = (nd4j::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); + auto rng = (sd::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); if (rng == nullptr) throw std::runtime_error("DeclarableOpsTests9.exponentialDistributionInv_test2: RNG initialization failed !"); @@ -170,7 +170,7 @@ TEST_F(DeclarableOpsTests9, exponentialDistribution_test1) { double extraParams[] = {lambda}; Nd4jLong *buffer = new Nd4jLong[N]; - auto rng = (nd4j::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); + auto rng = (sd::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); if (rng == nullptr) throw std::runtime_error("DeclarableOpsTests9.exponentialDistribution_test1: RNG initialization failed !"); @@ -203,7 +203,7 @@ TEST_F(DeclarableOpsTests9, exponentialDistribution_test2) { Nd4jLong *buffer = new Nd4jLong[N]; // Nd4jPointer extra[2]; #ifndef __CUDABLAS__ - nd4j::random::RandomBuffer* rng = (nd4j::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); + sd::random::RandomBuffer* rng = (sd::random::RandomBuffer *) initRandom(nullptr, 123, N, (Nd4jPointer) buffer); if (rng == nullptr) throw std::runtime_error("DeclarableOpsTests9.exponentialDistribution_test2: RNG initialization failed !"); @@ -246,7 +246,7 @@ TEST_F(DeclarableOpsTests9, concat_test1) { x1.linspace(1); x2.linspace(1); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -270,7 +270,7 @@ TEST_F(DeclarableOpsTests9, concat_test2) { x1.linspace(1); x2.linspace(1); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -294,7 +294,7 @@ TEST_F(DeclarableOpsTests9, concat_test3) { x1.linspace(1); x2.linspace(1); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {0}); 
ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -316,7 +316,7 @@ TEST_F(DeclarableOpsTests9, concat_test4) { auto x2 = NDArrayFactory::create('c', {1,1,1}, {3.f}); auto exp = NDArrayFactory::create('c', {1,3,1}, {1.f, 2.f, 3.f}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -336,7 +336,7 @@ TEST_F(DeclarableOpsTests9, concat_test5) { auto x2 = NDArrayFactory::create(3.f); auto exp = NDArrayFactory::create('c', {3}, {1.f, 2.f, 3.f}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -356,7 +356,7 @@ TEST_F(DeclarableOpsTests9, concat_test6) { auto x2 = NDArrayFactory::create(3.f); auto exp = NDArrayFactory::create('c', {4}, {1.f, 2.f, 20.f, 3.f}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -376,7 +376,7 @@ TEST_F(DeclarableOpsTests9, concat_test7) { auto x2 = NDArrayFactory::create(3.f); auto exp = NDArrayFactory::create('c', {3}, {1.f, 2.f, 3.f}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -394,7 +394,7 @@ TEST_F(DeclarableOpsTests9, concat_test8) { auto x0 = NDArrayFactory::create(1.f); auto exp = NDArrayFactory::create('c', {1}, {1.f}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -412,7 +412,7 @@ TEST_F(DeclarableOpsTests9, concat_test9) { auto x0 = NDArrayFactory::create('c', {1}, {1.f}); auto exp = NDArrayFactory::create('c', {1}, {1.f}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -437,7 +437,7 @@ TEST_F(DeclarableOpsTests9, concat_test10) { x1.linspace(1); x2.linspace(1); - 
nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -462,7 +462,7 @@ TEST_F(DeclarableOpsTests9, concat_test11) { x1.linspace(1); x2.linspace(1); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -487,7 +487,7 @@ TEST_F(DeclarableOpsTests9, concat_test12) { x1.linspace(1); x2.linspace(1); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -512,7 +512,7 @@ TEST_F(DeclarableOpsTests9, concat_test13) { x1.linspace(1); x2.linspace(1); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -527,13 +527,13 @@ TEST_F(DeclarableOpsTests9, concat_test13) { TEST_F(DeclarableOpsTests9, concat_test14) { - NDArray x0('c', {1, 40, 60}, nd4j::DataType::DOUBLE); - NDArray x1('c', {1, 40, 60}, nd4j::DataType::DOUBLE); + NDArray x0('c', {1, 40, 60}, sd::DataType::DOUBLE); + NDArray x1('c', {1, 40, 60}, sd::DataType::DOUBLE); x0 = 1.; x1 = 2.; - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1}, {}, {0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -556,7 +556,7 @@ TEST_F(DeclarableOpsTests9, concat_test15) { auto y = NDArrayFactory::create (3.0f); auto exp = NDArrayFactory::create('c', {3}, {1, 0, 3}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x, &y}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -575,7 +575,7 @@ TEST_F(DeclarableOpsTests9, concat_test16) { auto y = NDArrayFactory::create('c', {0,2,3}); auto exp = NDArrayFactory::create('c', {0,2,3}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x, &y}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -589,13 +589,13 @@ 
TEST_F(DeclarableOpsTests9, concat_test16) { ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, concat_test17) { - NDArray x0('c', {1, 55, 40}, nd4j::DataType::DOUBLE); - NDArray x1('c', {1, 55, 40}, nd4j::DataType::DOUBLE); + NDArray x0('c', {1, 55, 40}, sd::DataType::DOUBLE); + NDArray x1('c', {1, 55, 40}, sd::DataType::DOUBLE); x0 = 1.; x1 = 2.; - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1}, {}, {0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -631,7 +631,7 @@ TEST_F(DeclarableOpsTests9, concat_test18) { context.setOutputArray(0, &z, false); context.setIArguments(&axis, 1); - nd4j::ops::concat op; + sd::ops::concat op; op.execute(&context); for (int e = 0; e < 2000; e++) { @@ -659,7 +659,7 @@ TEST_F(DeclarableOpsTests9, concat_test19) { context.setOutputArray(0, &z, false); context.setIArguments(&axis, 1); - nd4j::ops::concat op; + sd::ops::concat op; op.execute(&context); for (int e = 0; e < 10; e++) @@ -678,7 +678,7 @@ TEST_F(DeclarableOpsTests9, concat_test20) { x2.assign(3.0); x3.assign(4.0); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &x2, &x3}, {}, {0}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -699,14 +699,14 @@ TEST_F(DeclarableOpsTests9, concat_test20) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, concat_test21) { - NDArray x0('c', {1,4,5}, nd4j::DataType::FLOAT32); - NDArray x1('c', {2,4,5}, nd4j::DataType::FLOAT32); - NDArray z('f', {3,4,5}, nd4j::DataType::FLOAT32); + NDArray x0('c', {1,4,5}, sd::DataType::FLOAT32); + NDArray x1('c', {2,4,5}, sd::DataType::FLOAT32); + NDArray z('f', {3,4,5}, sd::DataType::FLOAT32); x0 = 0.; x1 = 1.; - nd4j::ops::concat op; + sd::ops::concat op; auto status = op.execute({&x0, &x1}, {&z}, {}, {0}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); } @@ -716,10 +716,10 @@ TEST_F(DeclarableOpsTests9, concat_test22) { NDArray 
x0('c', {1,6}, {1,2,3,4,5,6}); NDArray x1('c', {1,6}, {7,8,9,10,11,12}); - NDArray output('f', {2,6}, nd4j::DataType::DOUBLE); + NDArray output('f', {2,6}, sd::DataType::DOUBLE); NDArray exp('c', {2,6}, {1,2,3,4,5,6,7,8,9,10,11,12}); - nd4j::ops::concat op; + sd::ops::concat op; auto status = op.execute({&x0, &x1}, {&output}, {}, {0}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -732,10 +732,10 @@ TEST_F(DeclarableOpsTests9, concat_test23) { NDArray x0('c', {1,4}, {1,2,3,4}); NDArray x1('c', {1,4}, {5,6,7,8}); - NDArray output('c', {2,4}, nd4j::DataType::DOUBLE); + NDArray output('c', {2,4}, sd::DataType::DOUBLE); NDArray exp('c', {2,4}, {1,2,3,4,5,6,7,8}); - nd4j::ops::concat op; + sd::ops::concat op; auto status = op.execute({&x0, &x1}, {&output}, {}, {0}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -750,7 +750,7 @@ TEST_F(DeclarableOpsTests9, concat_test24) { auto e = NDArrayFactory::create('c', {2, 2}, {1, 0, 1, 0}); auto z = NDArrayFactory::create('c', {2, 2}); - nd4j::ops::concat op; + sd::ops::concat op; auto status = op.execute({&x, &y}, {&z}, {}, {1}, {}); ASSERT_EQ(Status::OK(), status); @@ -765,7 +765,7 @@ TEST_F(DeclarableOpsTests9, concat_test25) { auto axis = NDArrayFactory::create('c', {1}, {0.}); auto exp = NDArrayFactory::create('c', {2,4}, {1,2,3,4,5,6,7,8}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x0, &x1, &axis}, {}, {}, {true}); @@ -787,7 +787,7 @@ TEST_F(DeclarableOpsTests9, tile_bp_test1) { gradO.linspace(0.01, 0.01); - nd4j::ops::tile_bp op; + sd::ops::tile_bp op; auto results = op.evaluate({&input, &gradO}, {}, {2, 3}); auto gradI = results->at(0); @@ -807,7 +807,7 @@ TEST_F(DeclarableOpsTests9, tile_bp_test2) { gradO.linspace(0.01, 0.01); - nd4j::ops::tile_bp op; + sd::ops::tile_bp op; auto results = op.evaluate({&input, &gradO}, {}, {1, 3}); auto gradI = results->at(0); ASSERT_EQ(Status::OK(), results->status()); @@ -826,7 +826,7 @@ TEST_F(DeclarableOpsTests9, tile_bp_test3) { gradO.linspace(0.01, 0.01); - 
nd4j::ops::tile_bp op; + sd::ops::tile_bp op; auto results = op.evaluate({&input, &gradO}, {}, {1, 1}); auto gradI = results->at(0); @@ -846,7 +846,7 @@ TEST_F(DeclarableOpsTests9, tile_bp_test4) { gradO.linspace(0.01, 0.01); - nd4j::ops::tile_bp op; + sd::ops::tile_bp op; auto results = op.evaluate({&input, &gradO}, {}, {2}); auto gradI = results->at(0); @@ -866,7 +866,7 @@ TEST_F(DeclarableOpsTests9, tile_bp_test5) { gradO.linspace(0.01, 0.01); - nd4j::ops::tile_bp op; + sd::ops::tile_bp op; auto results = op.evaluate({&input, &gradO}, {}, {1}); auto gradI = results->at(0); @@ -886,7 +886,7 @@ TEST_F(DeclarableOpsTests9, tile_bp_test6) { gradO.linspace(0.01, 0.01); - nd4j::ops::tile_bp op; + sd::ops::tile_bp op; auto results = op.evaluate({&input, &gradO}, {}, {1, 3, 2}); auto gradI = results->at(0); @@ -907,7 +907,7 @@ TEST_F(DeclarableOpsTests9, tile_bp_test7) { gradO.linspace(0.01, 0.01); - nd4j::ops::tile_bp op; + sd::ops::tile_bp op; auto results = op.evaluate({&input, &reps, &gradO}, {}, {}); auto gradI = results->at(0); @@ -925,7 +925,7 @@ TEST_F(DeclarableOpsTests9, tile_test1) { auto reps = NDArrayFactory::create('c', {1, 2}, {2, 1}); auto expOut = NDArrayFactory::create('c', {2, 6,}, {1.,2.,3.,4.,5.,6., 1.,2.,3.,4.,5.,6.}); - nd4j::ops::tile op; + sd::ops::tile op; auto results = op.evaluate({&input, &reps}, {}, {}); auto out = results->at(0); @@ -942,7 +942,7 @@ TEST_F(DeclarableOpsTests9, TestDropout_BP_1) { NDArray x('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); NDArray errs('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); NDArray shape('c', {2}, {2, 2}); - nd4j::ops::dropout_bp op; + sd::ops::dropout_bp op; auto ress = op.evaluate({&x, &errs, &shape}, {0.2f}, {113}); @@ -956,10 +956,10 @@ TEST_F(DeclarableOpsTests9, TestDropout_BP_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, TestDropout_1) { - NDArray x('c', {10, 10}, nd4j::DataType::FLOAT32); + NDArray 
x('c', {10, 10}, sd::DataType::FLOAT32); // NDArray errs('c', {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); //NDArray shape({2.f, 2.f}); - nd4j::ops::dropout op; + sd::ops::dropout op; x.linspace(1); auto ress = op.evaluate({&x}, {0.2f}, {113}); @@ -986,8 +986,8 @@ TEST_F(DeclarableOpsTests9, TestDropout_1) { } TEST_F(DeclarableOpsTests9, Test_DropoutInverted_01) { - NDArray x0('c', {10, 10}, nd4j::DataType::FLOAT32); - NDArray x1('c', {10, 10}, nd4j::DataType::FLOAT32); + NDArray x0('c', {10, 10}, sd::DataType::FLOAT32); + NDArray x1('c', {10, 10}, sd::DataType::FLOAT32); x0.linspace(1); x1.linspace(1); @@ -995,7 +995,7 @@ TEST_F(DeclarableOpsTests9, Test_DropoutInverted_01) { float prob[] = {0.5f}; Nd4jLong* _bufferA = new Nd4jLong[100000]; long _seed = 119L; - auto _rngA = (nd4j::random::RandomBuffer *) initRandom(nullptr, _seed, 100000, (Nd4jPointer) _bufferA); + auto _rngA = (sd::random::RandomBuffer *) initRandom(nullptr, _seed, 100000, (Nd4jPointer) _bufferA); x0. applyTransform(random::DropOutInverted, &x0, prob); // x1.template applyRandom>(_rngB, nullptr, &x1, prob); @@ -1014,7 +1014,7 @@ TEST_F(DeclarableOpsTests9, Test_DropoutInverted_01) { destroyRandom(_rngA); delete [] _bufferA; */ - nd4j::ops::dropout op; + sd::ops::dropout op; auto ress = op.evaluate({&x1}, {0.5f}, {119}); @@ -1023,11 +1023,11 @@ TEST_F(DeclarableOpsTests9, Test_DropoutInverted_01) { auto count = ress->at(0)->reduceNumber(reduce::CountNonZero); // nd4j_printf("\n01Dropout count %i\n\n", count); - nd4j::ops::dropout_bp op2; + sd::ops::dropout_bp op2; //NDArray exp('c', {10,10}, {4.f, 0.f, 12.f, 0.f, 20.f, 24.f, 0.f, 32.f, 0.f, 0.f, 0.f, 0.f, 52.f, 56.f, 60.f, 0.f, 0.f, 0.f, 0.f, 0.f, 84.f, 88.f, 0.f, 0.f, 0.f, 0.f, 108.f, 0.f, 0.f, 120.f, 0.f, 0.f, 132.f, 0.f, 0.f, 0.f, 0.f, 0.f, 156.f, 0.f, 164.f, 168.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 200.f, 204.f, 0.f, 0.f, 0.f, 220.f, 0.f, 0.f, 232.f, 236.f, 240.f, 0.f, 248.f, 0.f, 0.f, 260.f, 0.f, 0.f, 0.f, 276.f, 0.f, 0.f, 0.f, 0.f, 
0.f, 0.f, 0.f, 0.f, 0.f, 316.f, 0.f, 324.f, 0.f, 0.f, 336.f, 0.f, 0.f, 0.f, 0.f, 356.f, 0.f, 0.f, 368.f, 0.f, 0.f, 0.f, 384.f, 388.f, 0.f, 0.f, 400.f}); //02Dropout result is [4.000000, 0.000000, 12.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 36.000000, 0.000000, 0.000000, 0.000000, 0.000000, 56.000000, 60.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 88.000000, 0.000000, 96.000000, 0.000000, 0.000000, 108.000000, 0.000000, 0.000000, 120.000000, 0.000000, 128.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 156.000000, 0.000000, 164.000000, 0.000000, 0.000000, 0.000000, 0.000000, 184.000000, 0.000000, 0.000000, 0.000000, 200.000000, 0.000000, 0.000000, 0.000000, 216.000000, 0.000000, 0.000000, 0.000000, 232.000000, 0.000000, 240.000000, 0.000000, 248.000000, 0.000000, 0.000000, 260.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 308.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 348.000000, 0.000000, 356.000000, 0.000000, 0.000000, 0.000000, 0.000000, 376.000000, 0.000000, 384.000000, 0.000000, 0.000000, 0.000000, 400.000000] - auto ressX = op2.evaluate({&x1, &x1}, {0.5f}, {119}); // , false, nd4j::DataType::FLOAT32); // skipped due given by default + auto ressX = op2.evaluate({&x1, &x1}, {0.5f}, {119}); // , false, sd::DataType::FLOAT32); // skipped due given by default //x0.printIndexedBuffer("X0"); //x1.printIndexedBuffer("X1"); ASSERT_EQ(ND4J_STATUS_OK, ressX->status()); @@ -1060,18 +1060,18 @@ TEST_F(DeclarableOpsTests9, Test_DropoutInverted_01) { } TEST_F(DeclarableOpsTests9, Test_Dropout_BP_2) { - NDArray x('c', {10, 10}, nd4j::DataType::FLOAT32); + NDArray x('c', {10, 10}, sd::DataType::FLOAT32); x.linspace(1); - nd4j::ops::dropout op; + sd::ops::dropout op; auto ress = op.evaluate({&x}, {0.5f}, {119}); ASSERT_EQ(ND4J_STATUS_OK, ress->status()); // 
ress->at(0)->printIndexedBuffer("01Dropout result is "); - nd4j::ops::dropout_bp op2; + sd::ops::dropout_bp op2; auto ressX = op2.evaluate({&x, &x}, {0.5f}, {119}); @@ -1101,13 +1101,13 @@ TEST_F(DeclarableOpsTests9, Test_Dropout_BP_2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, Test_AlphaDropout_BP_1) { - NDArray x('c', {10, 10}, nd4j::DataType::FLOAT32); - NDArray eps('c', {10, 10}, nd4j::DataType::FLOAT32); + NDArray x('c', {10, 10}, sd::DataType::FLOAT32); + NDArray eps('c', {10, 10}, sd::DataType::FLOAT32); x.linspace(1); eps.linspace(1); - nd4j::ops::alpha_dropout_bp op; + sd::ops::alpha_dropout_bp op; auto ress = op.evaluate({&x, &eps}, {0.5f, 0.5f, 1.5f, 1.6f}, {119}); @@ -1132,7 +1132,7 @@ TEST_F(DeclarableOpsTests9, test_range_int_1) { auto x1 = NDArrayFactory::create(2); auto x2 = NDArrayFactory::create(1); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -1146,7 +1146,7 @@ TEST_F(DeclarableOpsTests9, test_range_empty_1) { auto x1 = NDArrayFactory::create(0); auto x2 = NDArrayFactory::create(1); - nd4j::ops::range op; + sd::ops::range op; auto result = op.evaluate({&x0, &x1, &x2}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -1185,7 +1185,7 @@ TEST_F(DeclarableOpsTests9, test_unstack_1) { auto x = NDArrayFactory::create('c', {5, 5}); x.linspace(1.0); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(5, result->size()); @@ -1203,7 +1203,7 @@ TEST_F(DeclarableOpsTests9, test_unstack_SGO_1) { auto z4 = NDArrayFactory::create(4); auto z5 = NDArrayFactory::create(5); std::vector z({&z1, &z2, &z3, &z4, &z5}); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); ASSERT_EQ(5, result->size()); @@ -1240,7 +1240,7 @@ 
TEST_F(DeclarableOpsTests9, clipbynorm_test12) { expect({0,0, j,j+1}).assign ( yCol * (clip / norm2Col) ); } - nd4j::ops::clipbynorm op; + sd::ops::clipbynorm op; auto result = op.evaluate({&y}, {clip}, {axis}); auto outFF = result->at(0); @@ -1264,8 +1264,8 @@ TEST_F(DeclarableOpsTests9, clipbynorm_bp_test1) { const OpArgsHolder argsHolderFF({&x}, {clip}, {}); const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {}); - nd4j::ops::clipbynorm opFF; - nd4j::ops::clipbynorm_bp opBP; + sd::ops::clipbynorm opFF; + sd::ops::clipbynorm_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1286,8 +1286,8 @@ TEST_F(DeclarableOpsTests9, clipbynorm_bp_test2) { const OpArgsHolder argsHolderFF({&x}, {clip}, {axis}); const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {axis}); - nd4j::ops::clipbynorm opFF; - nd4j::ops::clipbynorm_bp opBP; + sd::ops::clipbynorm opFF; + sd::ops::clipbynorm_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1309,8 +1309,8 @@ TEST_F(DeclarableOpsTests9, clipbynorm_bp_test3) { const OpArgsHolder argsHolderFF({&x}, {clip}, {axis}); const OpArgsHolder argsHolderBP({&x, &gradO}, {clip}, {axis}); - nd4j::ops::clipbynorm opFF; - nd4j::ops::clipbynorm_bp opBP; + sd::ops::clipbynorm opFF; + sd::ops::clipbynorm_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1334,7 +1334,7 @@ TEST_F(DeclarableOpsTests9, cumprod_1) { //************************************// exclusive = 0; reverse = 0; - nd4j::ops::cumprod op; + sd::ops::cumprod op; auto result = op.evaluate({&inputC, &axis}, {}, {exclusive, reverse}); ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); @@ -1373,13 +1373,13 @@ TEST_F(DeclarableOpsTests9, cumprod_1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, cumprod_2) { - NDArray x('c', {2, 1500}, nd4j::DataType::FLOAT32); + 
NDArray x('c', {2, 1500}, sd::DataType::FLOAT32); NDArray x0 = x(0, {0}); NDArray x1 = x(1, {0}); x0.linspace(1, 0.1); x1.linspace(1, 0.1); - NDArray exp('c', {2, 1500}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 1500}, sd::DataType::FLOAT32); NDArray exp0 = exp(0, {0}); NDArray exp1 = exp(1, {0}); @@ -1392,7 +1392,7 @@ TEST_F(DeclarableOpsTests9, cumprod_2) { exp1.p(i, prev * x1.e(i)); } - nd4j::ops::cumprod op; + sd::ops::cumprod op; auto result = op.evaluate({&x}, {}, {0, 0, 1}); ASSERT_EQ(Status::OK(), result->status()); @@ -1414,8 +1414,8 @@ TEST_F(DeclarableOpsTests9, cumprod_bp_check_1) { const OpArgsHolder argsHolderFF({&x}, {}, {0, 0}); const OpArgsHolder argsHolderBP({&x, &gradO}, {}, {0, 0}); - nd4j::ops::cumprod opFF; - nd4j::ops::cumprod_bp opBP; + sd::ops::cumprod opFF; + sd::ops::cumprod_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); @@ -1433,8 +1433,8 @@ TEST_F(DeclarableOpsTests9, cumprod_bp_check_2) { const OpArgsHolder argsHolderFF({&x}, {}, {1, 1}); const OpArgsHolder argsHolderBP({&x, &gradO}, {}, {1, 1}); - nd4j::ops::cumprod opFF; - nd4j::ops::cumprod_bp opBP; + sd::ops::cumprod opFF; + sd::ops::cumprod_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); @@ -1452,8 +1452,8 @@ TEST_F(DeclarableOpsTests9, cumprod_bp_check_3) { const OpArgsHolder argsHolderFF({&x}, {}, {1, 0}); const OpArgsHolder argsHolderBP({&x, &gradO}, {}, {1, 0}); - nd4j::ops::cumprod opFF; - nd4j::ops::cumprod_bp opBP; + sd::ops::cumprod opFF; + sd::ops::cumprod_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); @@ -1471,8 +1471,8 @@ TEST_F(DeclarableOpsTests9, cumprod_bp_check_4) { const OpArgsHolder argsHolderFF({&x}, {}, {0, 1}); const OpArgsHolder argsHolderBP({&x, &gradO}, {}, {0, 1}); - nd4j::ops::cumprod opFF; - 
nd4j::ops::cumprod_bp opBP; + sd::ops::cumprod opFF; + sd::ops::cumprod_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); @@ -1491,8 +1491,8 @@ TEST_F(DeclarableOpsTests9, cumsum_bp_check_2) { const OpArgsHolder argsHolderFF({&x}, {}, {1, 1}); const OpArgsHolder argsHolderBP({&x, &gradO}, {}, {1, 1}); - nd4j::ops::cumsum opFF; - nd4j::ops::cumsum_bp opBP; + sd::ops::cumsum opFF; + sd::ops::cumsum_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); @@ -1520,8 +1520,8 @@ TEST_F(DeclarableOpsTests9, cumprod_test1) { const OpArgsHolder argsHolderFF({&inputC, &axis}, {}, {exclusive, reverse}); const OpArgsHolder argsHolderBP({&inputC, &axis, &gradO}, {}, {exclusive, reverse}); - nd4j::ops::cumprod opFF; - nd4j::ops::cumprod_bp opBP; + sd::ops::cumprod opFF; + sd::ops::cumprod_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1}, {1, 1},GradCheck::MEAN); @@ -1544,8 +1544,8 @@ TEST_F(DeclarableOpsTests9, cumprod_test2) { const OpArgsHolder argsHolderFF({&inputC, &axis}, {}, {exclusive, reverse}); const OpArgsHolder argsHolderBP({&inputC, &axis, &gradO}, {}, {exclusive, reverse}); - nd4j::ops::cumprod opFF; - nd4j::ops::cumprod_bp opBP; + sd::ops::cumprod opFF; + sd::ops::cumprod_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1, 1, 1}, {1, 1},GradCheck::MEAN); @@ -1559,7 +1559,7 @@ TEST_F(DeclarableOpsTests9, prelu_test1) { auto alpha = NDArrayFactory::create('c', {3, 4}, {-0.6f, -0.5f, -0.4f, -0.3f, -0.2f, -0.1f, 0.f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {7.2f, 5.5f, 4.f, 2.7f, 1.6f, 0.7f, 0.f, -0.5f,-0.8f, -0.9f, -0.8f, -0.5f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, 
&alpha}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1578,7 +1578,7 @@ TEST_F(DeclarableOpsTests9, prelu_test2) { auto alpha = NDArrayFactory::create('c', {3}, {-0.6f, 2.f, 4.f}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {7.2f, 6.6f, 6.f, 5.4f, -16.f, -14.f, -12.f, -10.f, -16.f, -12.f, -8.f, -4.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1596,7 +1596,7 @@ TEST_F(DeclarableOpsTests9, prelu_test3) { auto alpha = NDArrayFactory::create('c', {3,1}, {-0.6f, 2.f, 4.f}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {7.2f, 6.6f, 6.f, 5.4f, -16.f, -14.f, -12.f, -10.f, -16.f, -12.f, -8.f, -4.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1614,7 +1614,7 @@ TEST_F(DeclarableOpsTests9, prelu_test4) { auto alpha = NDArrayFactory::create('c', {1, 3}, {-0.6f, 2.f, 4.f}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {7.2f, 6.6f, 6.f, 5.4f, -16.f, -14.f, -12.f, -10.f, -16.f, -12.f, -8.f, -4.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1632,7 +1632,7 @@ TEST_F(DeclarableOpsTests9, prelu_test5) { auto alpha = NDArrayFactory::create('c', {4}, {-0.6f, 2.f, 4.f, -1.f}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {7.2f, -22.f, -40.f, 9.f, 4.8f, -14.f, -24.f, 5.f, 2.4f, -6.f, -8.f, 1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); 
auto output = result->at(0); @@ -1650,7 +1650,7 @@ TEST_F(DeclarableOpsTests9, prelu_test6) { auto alpha = NDArrayFactory::create('c', {1,1,1}, {-2.}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {24.f, 22.f, 20.f, 18.f, 16.f, 14.f, 12.f, 10.f, 8.f, 6.f, 4.f, 2.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {1,0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1669,7 +1669,7 @@ TEST_F(DeclarableOpsTests9, prelu_test7) { auto alpha = NDArrayFactory::create(-2.f); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {24.f, 22.f, 20.f, 18.f, 16.f, 14.f, 12.f, 10.f, 8.f, 6.f, 4.f, 2.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {1,0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1687,7 +1687,7 @@ TEST_F(DeclarableOpsTests9, prelu_test8) { auto alpha = NDArrayFactory::create(-2.f); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {24.f, 22.f, 20.f, 18.f, 16.f, 14.f, 12.f, 10.f, 8.f, 6.f, 4.f, 2.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {1,0,1,0,1,0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1705,7 +1705,7 @@ TEST_F(DeclarableOpsTests9, prelu_test9) { auto alpha = NDArrayFactory::create(-2.f); auto exp = NDArrayFactory::create('c', {2, 4}, {8.f, 6.f, 4.f, 2.f,0.f, 1.f, 2.f, 3.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1723,7 +1723,7 @@ TEST_F(DeclarableOpsTests9, prelu_test10) { auto alpha = NDArrayFactory::create(-2.f); auto exp = NDArrayFactory::create('c', {2, 4}, {8.f, 6.f, 4.f, 2.f,0.f, 1.f, 2.f, 3.f}); - 
nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1748,7 +1748,7 @@ TEST_F(DeclarableOpsTests9, prelu_test11) { 43.f, 44.f, 45.f, 46.f, 47.f, 48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, 64.f, 65.f, 66.f, 67.f, 68.f, 69.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {1,3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1772,7 +1772,7 @@ TEST_F(DeclarableOpsTests9, prelu_test12) { 31.f, 32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, 40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f, 48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, 64.f, 65.f, 66.f, 67.f, 68.f, 69.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {-1, 2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1796,7 +1796,7 @@ TEST_F(DeclarableOpsTests9, prelu_test13) { 31.f, 32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, 40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f, 48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, 64.f, 65.f, 66.f, 67.f, 68.f, 69.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {-1, 2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1821,7 +1821,7 @@ TEST_F(DeclarableOpsTests9, prelu_test14) { 37.f, 38.f, 39.f, 40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f, 48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, 56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, 64.f, 65.f, 66.f, 67.f, 68.f, 69.f}); - nd4j::ops::prelu op; + sd::ops::prelu op; auto result = op.evaluate({&x, &alpha}, {}, {-2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); @@ -1839,7 +1839,7 @@ TEST_F(DeclarableOpsTests9, 
thresholdedrelu_test1) { auto x = NDArrayFactory::create('c', {2, 3, 4}, {-12.f, -11.f, -10.f, -9.f, -8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {0.f, 0.f, 0.f, 0.f,0.f, 0.f, 0.f, 0.f,0.f, 0.f, 0.f, 0.f,0.f, 0.f, 0.f, 3.f,4.f, 5.f, 6.f, 7.f,8.f, 9.f,10.f,11.f}); - nd4j::ops::thresholdedrelu op; + sd::ops::thresholdedrelu op; auto result = op.evaluate({&x}, {theta}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1859,7 +1859,7 @@ TEST_F(DeclarableOpsTests9, compare_and_bitpack_test1) { auto exp = NDArrayFactory::create('c', {2, 3, 4}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - nd4j::ops::compare_and_bitpack op; + sd::ops::compare_and_bitpack op; auto result = op.evaluate({&x, &threshold}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1877,7 +1877,7 @@ TEST_F(DeclarableOpsTests9, thresholdedrelu_test2) { auto x = NDArrayFactory::create('c', {2, 3, 4}, {0.f,-4.f, -10.f, -8.f, 0.f, -9.f, -8.f, 5.f, 6.f, 6.f, 9.f, 6.f, -8.f, 5.f, 10.f, -2.f, 3.f, -7.f, 4.f, -8.f, -4.f, -9.f, -9.f, 3.f}); auto exp = NDArrayFactory::create('c', {2, 3, 4}, {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 5.f, 6.f, 6.f, 9.f, 6.f, 0.f, 5.f, 10.f, 0.f, 3.f, 0.f, 4.f, 0.f, 0.f, 0.f, 0.f, 3.f}); - nd4j::ops::thresholdedrelu op; + sd::ops::thresholdedrelu op; auto result = op.evaluate({&x}, {theta}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1899,8 +1899,8 @@ TEST_F(DeclarableOpsTests9, prelu_bp_test1) { const OpArgsHolder argsHolderFF({&x, &alpha}, {}, {}); const OpArgsHolder argsHolderBP({&x, &alpha, &dLdO}, {}, {}); - nd4j::ops::prelu opFF; - nd4j::ops::prelu_bp opBP; + sd::ops::prelu opFF; + sd::ops::prelu_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1917,8 +1917,8 @@ TEST_F(DeclarableOpsTests9, prelu_bp_test2) { const OpArgsHolder argsHolderFF({&x, &alpha}, {}, {1}); const 
OpArgsHolder argsHolderBP({&x, &alpha, &dLdO}, {}, {1}); - nd4j::ops::prelu opFF; - nd4j::ops::prelu_bp opBP; + sd::ops::prelu opFF; + sd::ops::prelu_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1937,8 +1937,8 @@ TEST_F(DeclarableOpsTests9, prelu_bp_test3) { const OpArgsHolder argsHolderFF({&x, &alpha}, {}, {-1, 2}); const OpArgsHolder argsHolderBP({&x, &alpha, &dLdO}, {}, {-1, 2}); - nd4j::ops::prelu opFF; - nd4j::ops::prelu_bp opBP; + sd::ops::prelu opFF; + sd::ops::prelu_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1957,8 +1957,8 @@ TEST_F(DeclarableOpsTests9, prelu_bp_test4) { const OpArgsHolder argsHolderFF({&x, &alpha}, {}, {-2}); const OpArgsHolder argsHolderBP({&x, &alpha, &dLdO}, {}, {-2}); - nd4j::ops::prelu opFF; - nd4j::ops::prelu_bp opBP; + sd::ops::prelu opFF; + sd::ops::prelu_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1976,8 +1976,8 @@ TEST_F(DeclarableOpsTests9, thresholdedrelu_bp_test1) { const OpArgsHolder argsHolderFF({&x}, {theta}, {}); const OpArgsHolder argsHolderBP({&x, &dLdO}, {theta}, {}); - nd4j::ops::thresholdedrelu opFF; - nd4j::ops::thresholdedrelu_bp opBP; + sd::ops::thresholdedrelu opFF; + sd::ops::thresholdedrelu_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1993,7 +1993,7 @@ TEST_F(DeclarableOpsTests9, multiply_test1) { x.linspace(1.f); y.linspace(0.1f, 0.1f); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -2013,7 +2013,7 @@ TEST_F(DeclarableOpsTests9, multiply_test2) { x.linspace(1.f); // y.linspace(0.1f, 0.1f); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&y, &x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -2033,7 
+2033,7 @@ TEST_F(DeclarableOpsTests9, multiply_test3) { x.linspace(1.f); y.linspace(0.1f, 0.1f); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -2052,7 +2052,7 @@ TEST_F(DeclarableOpsTests9, multiply_test4) { auto exp = NDArrayFactory::create('c', {1, 1}, {0.1f}); x.linspace(1.f); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -2070,7 +2070,7 @@ TEST_F(DeclarableOpsTests9, multiply_test5) { auto y = NDArrayFactory::create(0.1f); auto exp = NDArrayFactory::create(0.1f); - nd4j::ops::multiply op; + sd::ops::multiply op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); @@ -2091,8 +2091,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test1) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp opBP; auto resFF = opFF.evaluate({&x, &y}, {}, {}); auto resBP = opBP.evaluate({&x, &y, &dLdz}, {}, {}); // resFF->at(0)->printIndexedBuffer("Multiply 1x1"); @@ -2114,8 +2114,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test2) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -2132,8 +2132,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test3) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp 
opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -2150,8 +2150,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test4) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -2168,8 +2168,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test5) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -2186,8 +2186,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test6) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -2204,8 +2204,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test7) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -2224,8 +2224,8 @@ TEST_F(DeclarableOpsTests9, multiply_bp_test8) { const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); - nd4j::ops::multiply opFF; - nd4j::ops::multiply_bp opBP; + sd::ops::multiply opFF; + sd::ops::multiply_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, 
argsHolderBP); @@ -2245,11 +2245,11 @@ TEST_F(DeclarableOpsTests9, Floormod_BP_Test_2) { // const OpArgsHolder argsHolderFF({&x, &y}, {}, {}); // const OpArgsHolder argsHolderBP({&x, &y, &dLdz}, {}, {}); -// nd4j::ops::floormod opFF; +// sd::ops::floormod opFF; // auto resFF = opFF.execute({&x, &y}, {}, {}); // resFF->at(0)->printIndexedBuffer("FF floormod"); // delete resFF; - nd4j::ops::floormod_bp opBP; + sd::ops::floormod_bp opBP; auto resBP = opBP.evaluate({&x, &y, &dLdz}, {}, {}); ASSERT_TRUE(resBP->status() == ND4J_STATUS_OK); @@ -2281,10 +2281,10 @@ TEST_F(DeclarableOpsTests9, Dynamic_Partition_BP_1) { dLdzY.assign(2); dLdzZ.assign(3); - nd4j::ops::dynamic_partition op1; + sd::ops::dynamic_partition op1; auto res1 = op1.evaluate({&x, &y}, {}, {3}); - nd4j::ops::dynamic_partition_bp op2; + sd::ops::dynamic_partition_bp op2; auto res2 = op2.evaluate({&x, &y, &dLdzX, &dLdzY, &dLdzZ}, {}, {3}); ASSERT_TRUE(res2->status() == ND4J_STATUS_OK); ASSERT_TRUE(res2->size() == 2); @@ -2311,8 +2311,8 @@ TEST_F(DeclarableOpsTests9, Dynamic_Partition_BP_1) { // const OpArgsHolder argsHolderFF({&x, &y}, {}, {3}); // const OpArgsHolder argsHolderBP({&x, &y, &dLdzX, &dLdzY, &dLdzZ}, {}, {3}); // -// nd4j::ops::dynamic_partition opFF; -// nd4j::ops::dynamic_partition_bp opBP; +// sd::ops::dynamic_partition opFF; +// sd::ops::dynamic_partition_bp opBP; // // const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); // @@ -2327,7 +2327,7 @@ TEST_F(DeclarableOpsTests9, Floormod_BP_Test_4) { auto exp = NDArrayFactory::create('c', {1, 3}, {-1., 0., -1.}); auto eps = NDArrayFactory::create('c', {2, 1, 3}); eps.assign(1.f); - nd4j::ops::floormod_bp op; + sd::ops::floormod_bp op; auto result = op.evaluate({&x, &y, &eps}, {}, {}); @@ -2353,16 +2353,16 @@ TEST_F(DeclarableOpsTests9, gru_cell_bp_test1) { const int iS = 3; const int nU = 4; - NDArray x('c', {bS, iS}, nd4j::DataType::DOUBLE); - NDArray hi('c', {bS, nU}, nd4j::DataType::DOUBLE); - NDArray 
W('c', {iS+nU, 2*nU}, nd4j::DataType::DOUBLE); - NDArray Wc('c', {iS+nU, nU}, nd4j::DataType::DOUBLE); - NDArray b('c', {2*nU}, nd4j::DataType::DOUBLE); - NDArray bc('c', {nU}, nd4j::DataType::DOUBLE); - NDArray dLdr('c', {bS, nU}, nd4j::DataType::DOUBLE); - NDArray dLdu('c', {bS, nU}, nd4j::DataType::DOUBLE); - NDArray dLdc('c', {bS, nU}, nd4j::DataType::DOUBLE); - NDArray dLdh('c', {bS, nU}, nd4j::DataType::DOUBLE); + NDArray x('c', {bS, iS}, sd::DataType::DOUBLE); + NDArray hi('c', {bS, nU}, sd::DataType::DOUBLE); + NDArray W('c', {iS+nU, 2*nU}, sd::DataType::DOUBLE); + NDArray Wc('c', {iS+nU, nU}, sd::DataType::DOUBLE); + NDArray b('c', {2*nU}, sd::DataType::DOUBLE); + NDArray bc('c', {nU}, sd::DataType::DOUBLE); + NDArray dLdr('c', {bS, nU}, sd::DataType::DOUBLE); + NDArray dLdu('c', {bS, nU}, sd::DataType::DOUBLE); + NDArray dLdc('c', {bS, nU}, sd::DataType::DOUBLE); + NDArray dLdh('c', {bS, nU}, sd::DataType::DOUBLE); x.linspace(-5, 0.5); hi = 1.; @@ -2373,7 +2373,7 @@ TEST_F(DeclarableOpsTests9, gru_cell_bp_test1) { const OpArgsHolder argsHolderFF({&x, &hi, &W, &Wc, &b, &bc}, {}, {}); - nd4j::ops::gruCell op; + sd::ops::gruCell op; auto results = op.evaluate(argsHolderFF); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -2397,10 +2397,10 @@ TEST_F(DeclarableOpsTests9, gru_cell_bp_test1) { const OpArgsHolder argsHolderBP({&x, &hi, &W, &Wc, &b, &bc, &dLdr, &dLdu, &dLdc, &dLdh}, {}, {}); - nd4j::ops::gruCell opFF; - nd4j::ops::gruCell_bp opBP; + sd::ops::gruCell opFF; + sd::ops::gruCell_bp opBP; - const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1, 1, 1 , 1, 1}, {0., 1.}, nd4j::GradCheck::LossFunc::SUM, true); + const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1, 1, 1 , 1, 1}, {0., 1.}, sd::GradCheck::LossFunc::SUM, true); ASSERT_TRUE(isGradCorrect); } @@ -2412,7 +2412,7 @@ TEST_F(DeclarableOpsTests9, Cholesky_Test_1) { NDArray x = NDArrayFactory::create('c', {3, 3}, 
{4,12,-16, 12 ,37,-43, -16, -43, 98}); NDArray exp = NDArrayFactory::create('c', {3,3}, {2., 0., 0., 6., 1., 0., -8., 5., 3.}); - nd4j::ops::cholesky op; + sd::ops::cholesky op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); @@ -2428,7 +2428,7 @@ TEST_F(DeclarableOpsTests9, Cholesky_Test_2) { NDArray x = NDArrayFactory::create('c', {2, 3, 3}, {4, 12,-16, 12 ,37,-43, -16, -43, 98, 1, 1, 1, 1, 2, 2, 1, 2., 6}); NDArray exp = NDArrayFactory::create('c', {2, 3, 3}, {2., 0., 0., 6., 1., 0., -8., 5., 3., 1., 0., 0., 1., 1., 0,1., 1., 2.}); - nd4j::ops::cholesky op; + sd::ops::cholesky op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); @@ -2444,7 +2444,7 @@ TEST_F(DeclarableOpsTests9, Cholesky_Test_3) { NDArray x = NDArrayFactory::create('c', {2, 3, 3}, {4.f, 12.f, -16.f, 12.f, 37.f, -43.f, -16.f, -43.f, 98.f, 1.f, 1.f, 1.f, 1.f, 2.f, 2.f, 1.f, 2.f, 6.f}); NDArray exp = NDArrayFactory::create('c', {2, 3, 3}, {2.f, 0.f, 0.f, 6.f, 1.f, 0.f, -8.f, 5.f, 3.f, 1.f, 0.f, 0.f, 1.f, 1.f, 0.f, 1.f, 1.f, 2.f}); - nd4j::ops::cholesky op; + sd::ops::cholesky op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(result->status(), ND4J_STATUS_OK); @@ -2478,8 +2478,8 @@ TEST_F(DeclarableOpsTests9, Cholesky_Test_3) { // const OpArgsHolder argsHolderFF({&x, &h0, &Wx, &Wh, &b}, {}, {}); // const OpArgsHolder argsHolderBP({&x, &h0, &Wx, &Wh, &b, &dLdh}, {}, {}); -// nd4j::ops::gru opFF; -// nd4j::ops::gru_bp opBP; +// sd::ops::gru opFF; +// sd::ops::gru_bp opBP; // const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu index b7907ce1d..f0230efb4 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu @@ -21,13 +21,13 @@ #include "testlayers.h" #include -#include +#include 
#include -#include +#include #include -using namespace nd4j; +using namespace sd; class DeclarableOpsTestsCuda1 : public testing::Test { @@ -48,7 +48,7 @@ TEST_F(DeclarableOpsTestsCuda1, Test_CHOOSE_SCALAR_LARGE) { auto precursor = NDArrayFactory::create(inputData,'c',{1,149}); NDArray x(nullptr, precursor.specialBuffer(), precursor.shapeInfo()); - nd4j::ops::choose op; + sd::ops::choose op; //greater than test auto result = op.evaluate({&x}, {0.0},{3}); ASSERT_EQ(Status::OK(), result->status()); @@ -67,7 +67,7 @@ TEST_F(DeclarableOpsTestsCuda1, Test_Reverse_TAD_1) { auto z = x.like(); x.linspace(1.0f); - nd4j::ops::reverse op; + sd::ops::reverse op; auto timeStart = std::chrono::system_clock::now(); auto status = op.execute({&x}, {&z}, {}, {1}, {}); auto timeEnd = std::chrono::system_clock::now(); diff --git a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp index a234e6d50..4dc7ae102 100644 --- a/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/EmptyTests.cpp @@ -20,10 +20,10 @@ #include "testlayers.h" #include -#include +#include // #include -using namespace nd4j; +using namespace sd; class EmptyTests : public testing::Test { @@ -60,12 +60,12 @@ TEST_F(EmptyTests, Test_Create_Empty_2) { TEST_F(EmptyTests, Test_Concat_1) { // auto empty = NDArrayFactory::empty_(); - auto empty = new NDArray('c', {0}, nd4j::DataType::FLOAT32);//NDArrayFactory::create_('c', {(Nd4jLong)0}}; + auto empty = new NDArray('c', {0}, sd::DataType::FLOAT32);//NDArrayFactory::create_('c', {(Nd4jLong)0}}; auto vector = NDArrayFactory::create_('c', {1}, {1.0f}); ASSERT_TRUE(empty->isEmpty()); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({empty, vector}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -83,14 +83,14 @@ TEST_F(EmptyTests, Test_Concat_1) { TEST_F(EmptyTests, Test_Concat_2) { - auto empty = new NDArray('c', {0}, nd4j::DataType::FLOAT32); //NDArrayFactory::empty_(); + 
auto empty = new NDArray('c', {0}, sd::DataType::FLOAT32); //NDArrayFactory::empty_(); auto scalar1 = NDArrayFactory::create_('c', {1}, {1.0f}); auto scalar2 = NDArrayFactory::create_('c', {1}, {2.0f}); auto exp = NDArrayFactory::create('c', {2}, {1.f, 2.f}); ASSERT_TRUE(empty->isEmpty()); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({empty, scalar1, scalar2}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -115,7 +115,7 @@ TEST_F(EmptyTests, Test_Concat_3) { ASSERT_TRUE(empty.isEmpty()); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&empty, &scalar1, &scalar2}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -134,7 +134,7 @@ TEST_F(EmptyTests, Test_Concat_4) { ASSERT_TRUE(empty.isEmpty()); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&scalar1, &empty, &scalar2}, {}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -150,7 +150,7 @@ TEST_F(EmptyTests, Test_Reshape_1) { auto exp = NDArrayFactory::create(119.f); auto empty = NDArrayFactory::empty_(); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&vector, empty}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -166,7 +166,7 @@ TEST_F(EmptyTests, Test_Reshape_3) { auto y = NDArrayFactory::create('c', {2}, {10, 0}); auto e = NDArrayFactory::create('c', {10, 0}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -195,7 +195,7 @@ TEST_F(EmptyTests, test_empty_scatter_1) { x.linspace(1.0f); - nd4j::ops::scatter_upd op; + sd::ops::scatter_upd op; auto result = op.evaluate({&x, &indices, &updates}, {}, {}, {true}); ASSERT_EQ(Status::OK(), result->status()); @@ -221,7 +221,7 @@ TEST_F(EmptyTests, test_empty_scatter_2) { bool args[] = {true}; ctx.setBArguments(args, 1); - nd4j::ops::scatter_upd op; + sd::ops::scatter_upd op; auto result = op.execute(&ctx); ASSERT_EQ(Status::OK(), result); @@ -232,7 +232,7 
@@ TEST_F(EmptyTests, test_shaped_empty_1) { auto empty = NDArrayFactory::create('c', {2, 0, 3}); std::vector shape = {2, 0, 3}; - ASSERT_EQ(nd4j::DataType::FLOAT32, empty.dataType()); + ASSERT_EQ(sd::DataType::FLOAT32, empty.dataType()); ASSERT_EQ(0, empty.lengthOf()); ASSERT_TRUE(empty.isEmpty()); ASSERT_EQ(shape, empty.getShapeAsVector()); @@ -243,7 +243,7 @@ TEST_F(EmptyTests, test_shaped_empty_2) { auto empty = NDArrayFactory::create('c', {0, 3}); std::vector shape = {0, 3}; - ASSERT_EQ(nd4j::DataType::FLOAT32, empty.dataType()); + ASSERT_EQ(sd::DataType::FLOAT32, empty.dataType()); ASSERT_EQ(0, empty.lengthOf()); ASSERT_TRUE(empty.isEmpty()); ASSERT_EQ(shape, empty.getShapeAsVector()); @@ -254,7 +254,7 @@ TEST_F(EmptyTests, test_shaped_empty_3) { auto empty = NDArrayFactory::create('c', {0}); std::vector shape = {0}; - ASSERT_EQ(nd4j::DataType::FLOAT32, empty.dataType()); + ASSERT_EQ(sd::DataType::FLOAT32, empty.dataType()); ASSERT_EQ(0, empty.lengthOf()); ASSERT_TRUE(empty.isEmpty()); ASSERT_EQ(shape, empty.getShapeAsVector()); @@ -262,8 +262,8 @@ TEST_F(EmptyTests, test_shaped_empty_3) { } TEST_F(EmptyTests, test_shaped_empty_4) { - auto shape = ConstantShapeHelper::getInstance()->vectorShapeInfo(0, nd4j::DataType::FLOAT32); - NDArray array(shape, true, nd4j::LaunchContext::defaultContext()); + auto shape = ConstantShapeHelper::getInstance()->vectorShapeInfo(0, sd::DataType::FLOAT32); + NDArray array(shape, true, sd::LaunchContext::defaultContext()); std::vector shapeOf({0}); ASSERT_TRUE(array.isEmpty()); @@ -293,7 +293,7 @@ TEST_F(EmptyTests, test_empty_reshape_1) { auto e0 = NDArrayFactory::create('c', {2, 0, 0}); auto e1 = NDArrayFactory::create('c', {0, 1}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result0 = op.evaluate({&x0, &shape0}, {}, {}); ASSERT_EQ(Status::OK(), result0->status()); auto z0 = result0->at(0); @@ -314,7 +314,7 @@ TEST_F(EmptyTests, test_empty_matmul_1) { auto y = NDArrayFactory::create('c', {1, 0}); auto e = 
NDArrayFactory::create('c', {0, 0}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -329,7 +329,7 @@ TEST_F(EmptyTests, test_empty_matmul_2) { auto y = NDArrayFactory::create('c', {1, 4, 0}); auto e = NDArrayFactory::create('c', {1, 0, 0}); - nd4j::ops::matmul op; + sd::ops::matmul op; auto result = op.evaluate({&x, &y}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/ExtraArgumentsTests.cpp b/libnd4j/tests_cpu/layers_tests/ExtraArgumentsTests.cpp index 8e18051a5..87ac750b2 100644 --- a/libnd4j/tests_cpu/layers_tests/ExtraArgumentsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ExtraArgumentsTests.cpp @@ -22,7 +22,7 @@ #include #include -using namespace nd4j; +using namespace sd; class ExtraArgumentsTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/FlatBuffersTests.cpp b/libnd4j/tests_cpu/layers_tests/FlatBuffersTests.cpp index 49dd0657d..bdb8bde68 100644 --- a/libnd4j/tests_cpu/layers_tests/FlatBuffersTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/FlatBuffersTests.cpp @@ -25,11 +25,11 @@ #include #include #include -#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class FlatBuffersTest : public testing::Test { public: @@ -94,10 +94,10 @@ TEST_F(FlatBuffersTest, FlatGraphTest1) { auto fShape = builder.CreateVector(array->getShapeInfoAsFlatVector()); auto fBuffer = builder.CreateVector(array->asByteVector()); - auto fArray = CreateFlatArray(builder, fShape, fBuffer, nd4j::graph::DType::DType_FLOAT); + auto fArray = CreateFlatArray(builder, fShape, fBuffer, sd::graph::DType::DType_FLOAT); auto fVid = CreateIntPair(builder, -1); - auto fVar = CreateFlatVariable(builder, fVid, 0, nd4j::graph::DType::DType_FLOAT, 0, fArray); + auto fVar = CreateFlatVariable(builder, fVid, 0, sd::graph::DType::DType_FLOAT, 0, fArray); 
std::vector outputs1, outputs2, inputs1, inputs2; outputs1.push_back(2); @@ -182,9 +182,9 @@ TEST_F(FlatBuffersTest, FlatGraphTest1) { ASSERT_TRUE(var != nullptr); ASSERT_EQ(-2.0, var->reduceNumber(reduce::Mean).e(0)); - nd4j::graph::GraphExecutioner::execute(&graph); + sd::graph::GraphExecutioner::execute(&graph); - auto resultWrapper = nd4j::graph::GraphExecutioner::executeFlatBuffer((Nd4jPointer) buf); + auto resultWrapper = sd::graph::GraphExecutioner::executeFlatBuffer((Nd4jPointer) buf); auto flatResults = GetFlatResult(resultWrapper->pointer()); @@ -265,7 +265,7 @@ TEST_F(FlatBuffersTest, ExplicitOutputTest1) { auto name1 = builder.CreateString("wow1"); - auto node1 = CreateFlatNode(builder, 1, name1, OpType_TRANSFORM, 0, in1, 0, nd4j::graph::DType::FLOAT); + auto node1 = CreateFlatNode(builder, 1, name1, OpType_TRANSFORM, 0, in1, 0, sd::graph::DType::FLOAT); std::vector> variables_vector; variables_vector.push_back(fXVar); @@ -316,7 +316,7 @@ TEST_F(FlatBuffersTest, ExplicitOutputTest1) { /* TEST_F(FlatBuffersTest, ReadFile1) { - uint8_t* data = nd4j::graph::readFlatBuffers("./resources/adam_sum.fb"); + uint8_t* data = sd::graph::readFlatBuffers("./resources/adam_sum.fb"); auto fg = GetFlatGraph(data); auto restoredGraph = new Graph(fg); @@ -341,7 +341,7 @@ TEST_F(FlatBuffersTest, ReadFile1) { } TEST_F(FlatBuffersTest, ReadFile2) { - uint8_t* data = nd4j::graph::readFlatBuffers("./resources/adam_sum.fb"); + uint8_t* data = sd::graph::readFlatBuffers("./resources/adam_sum.fb"); Nd4jPointer result = GraphExecutioner::executeFlatBuffer((Nd4jPointer) data); ResultSet arrays(GetFlatResult(result)); @@ -466,7 +466,7 @@ TEST_F(FlatBuffersTest, ReadTensorArrayLoop_1) { TEST_F(FlatBuffersTest, ReadLoops_NestedWhile_1) { // TF graph: // https://gist.github.com/raver119/2aa49daf7ec09ed4ddddbc6262f213a0 - nd4j::ops::assign op1; + sd::ops::assign op1; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/nested_while.fb"); @@ -605,7 +605,7 @@ 
TEST_F(FlatBuffersTest, ReduceDim_2) { #ifdef GRAPH_FILES_OK TEST_F(FlatBuffersTest, Ae_00) { - nd4j::ops::rank op1; + sd::ops::rank op1; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/ae_00.fb"); @@ -629,7 +629,7 @@ TEST_F(FlatBuffersTest, Ae_00) { } TEST_F(FlatBuffersTest, expand_dims) { - nd4j::ops::rank op1; + sd::ops::rank op1; auto exp = NDArrayFactory::create('c', {3, 1, 4}, {-0.95938617f, -1.20301781f, 1.22260064f, 0.50172403f, 0.59972949f, 0.78568028f, 0.31609724f, 1.51674747f, 0.68013491f, -0.05227458f, 0.25903158f, 1.13243439f}); @@ -650,7 +650,7 @@ TEST_F(FlatBuffersTest, expand_dims) { } TEST_F(FlatBuffersTest, transpose) { - nd4j::ops::rank op1; + sd::ops::rank op1; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/transpose.fb"); @@ -663,7 +663,7 @@ TEST_F(FlatBuffersTest, transpose) { } TEST_F(FlatBuffersTest, Test_Stitches) { - nd4j::ops::realdiv op0; + sd::ops::realdiv op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/partition_stitch_misc.fb"); //graph->printOut(); @@ -676,8 +676,8 @@ TEST_F(FlatBuffersTest, Test_Stitches) { } TEST_F(FlatBuffersTest, Test_GruDynamicMnist) { - nd4j::Environment::getInstance()->setDebug(false); - nd4j::Environment::getInstance()->setVerbose(false); + sd::Environment::getInstance()->setDebug(false); + sd::Environment::getInstance()->setVerbose(false); auto graph = GraphExecutioner::importFromFlatBuffers("./resources/gru_dynamic_mnist.fb"); //graph->printOut(); @@ -696,9 +696,9 @@ TEST_F(FlatBuffersTest, Test_GruDynamicMnist) { } TEST_F(FlatBuffersTest, Test_Non2D_2) { - nd4j::Environment::getInstance()->setDebug(false); - nd4j::Environment::getInstance()->setVerbose(false); - nd4j::ops::realdiv op0; + sd::Environment::getInstance()->setDebug(false); + sd::Environment::getInstance()->setVerbose(false); + sd::ops::realdiv op0; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/non2d_2.fb"); //graph->printOut(); @@ -764,7 +764,7 @@ 
TEST_F(FlatBuffersTest, Test_MNIST_1) { /* // FIXME: uncomment this test once conv_0 fb reexported TEST_F(FlatBuffersTest, nhwc_conv_0) { - nd4j::ops::rank op1; + sd::ops::rank op1; auto exp('c', {4, 2}, {2.958640f, 0.602521f, 7.571267f, 1.496686f, -2.292647f, -1.791460f, 13.055838f, 4.278642f}); diff --git a/libnd4j/tests_cpu/layers_tests/FlatUtilsTests.cpp b/libnd4j/tests_cpu/layers_tests/FlatUtilsTests.cpp index 31aa997c6..f31a1c7ec 100644 --- a/libnd4j/tests_cpu/layers_tests/FlatUtilsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/FlatUtilsTests.cpp @@ -18,13 +18,13 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include "testlayers.h" #include -#include +#include -using namespace nd4j; +using namespace sd; class FlatUtilsTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/GraphExecutionerTests.cpp b/libnd4j/tests_cpu/layers_tests/GraphExecutionerTests.cpp index 1e34a62d9..7a2856dc0 100644 --- a/libnd4j/tests_cpu/layers_tests/GraphExecutionerTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/GraphExecutionerTests.cpp @@ -25,11 +25,11 @@ #include #include #include -#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class GraphExecutionerTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/GraphHolderTests.cpp b/libnd4j/tests_cpu/layers_tests/GraphHolderTests.cpp index f144a7b84..f1f7195e7 100644 --- a/libnd4j/tests_cpu/layers_tests/GraphHolderTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/GraphHolderTests.cpp @@ -21,9 +21,9 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class GraphHolderTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/GraphRandomGeneratorTests.cpp b/libnd4j/tests_cpu/layers_tests/GraphRandomGeneratorTests.cpp 
index 10fac836f..8fe46cd2f 100644 --- a/libnd4j/tests_cpu/layers_tests/GraphRandomGeneratorTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/GraphRandomGeneratorTests.cpp @@ -20,8 +20,8 @@ #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class GraphRandomGeneratorTests : public testing::Test { public: @@ -29,8 +29,8 @@ public: }; TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_1) { - nd4j::graph::RandomGenerator g0(119); - nd4j::graph::RandomGenerator g1(119); + sd::graph::RandomGenerator g0(119); + sd::graph::RandomGenerator g1(119); auto i0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto i1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -39,8 +39,8 @@ TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_1) { } TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_2) { - nd4j::graph::RandomGenerator g0(119); - nd4j::graph::RandomGenerator g1(117); + sd::graph::RandomGenerator g0(119); + sd::graph::RandomGenerator g1(117); auto i0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto i1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -49,8 +49,8 @@ TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_2) { } TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_3) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(119, 10); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(119, 10); auto i0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto i1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -59,8 +59,8 @@ TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_3) { } TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_4) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(117, 5); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(117, 5); auto i0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto i1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -69,8 +69,8 @@ 
TEST_F(GraphRandomGeneratorTests, Reproducibility_Test_4) { } TEST_F(GraphRandomGeneratorTests, Sequential_Test_1) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(119, 5); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(119, 5); auto v0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto v1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -89,8 +89,8 @@ TEST_F(GraphRandomGeneratorTests, Sequential_Test_1) { } TEST_F(GraphRandomGeneratorTests, Sequential_Test_2) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(119, 5); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(119, 5); auto v0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto v1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -110,8 +110,8 @@ TEST_F(GraphRandomGeneratorTests, Sequential_Test_2) { } TEST_F(GraphRandomGeneratorTests, Sequential_Test_3) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(119, 5); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(119, 5); auto v0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto v1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -131,8 +131,8 @@ TEST_F(GraphRandomGeneratorTests, Sequential_Test_3) { } TEST_F(GraphRandomGeneratorTests, Sequential_Test_4) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(119, 5); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(119, 5); auto v0 = g0.relativeT(15, 0, DataTypeUtils::max()); auto v1 = g1.relativeT(15, 0, DataTypeUtils::max()); @@ -171,8 +171,8 @@ TEST_F(GraphRandomGeneratorTests, Sequential_Test_4) { //#ifndef __clang__ TEST_F(GraphRandomGeneratorTests, Long_Test_1) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(119, 5); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(119, 5); std::array z0, z1, z2, z3; @@ -215,8 +215,8 @@ TEST_F(GraphRandomGeneratorTests, 
Long_Test_1) { TEST_F(GraphRandomGeneratorTests, FloatingPoint_Test_1) { - nd4j::graph::RandomGenerator g0(119, 5); - nd4j::graph::RandomGenerator g1(119, 5); + sd::graph::RandomGenerator g0(119, 5); + sd::graph::RandomGenerator g1(119, 5); std::array z0, z1, z2, z3; diff --git a/libnd4j/tests_cpu/layers_tests/GraphStateTests.cpp b/libnd4j/tests_cpu/layers_tests/GraphStateTests.cpp index f03ed4b97..878b05712 100644 --- a/libnd4j/tests_cpu/layers_tests/GraphStateTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/GraphStateTests.cpp @@ -23,10 +23,10 @@ #include #include #include -#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class GraphStateTests : public testing::Test { public: @@ -57,8 +57,8 @@ TEST_F(GraphStateTests, Basic_Tests_1) { // this call will create scope internally state->registerScope(119); - nd4j::ops::add opA; - nd4j::ops::LegacyTransformSameOp opB(transform::Neg); // simdOps::Neg + sd::ops::add opA; + sd::ops::LegacyTransformSameOp opB(transform::Neg); // simdOps::Neg ArgumentsList argsA; ArgumentsList argsB; @@ -81,8 +81,8 @@ TEST_F(GraphStateTests, Basic_Tests_2) { // this call will create scope internally state->registerScope(119); - nd4j::ops::add opA; - nd4j::ops::LegacyTransformSameOp opB(transform::Neg); // simdOps::Neg + sd::ops::add opA; + sd::ops::LegacyTransformSameOp opB(transform::Neg); // simdOps::Neg ArgumentsList argsA; ArgumentsList argsB; @@ -147,8 +147,8 @@ TEST_F(GraphStateTests, Stateful_Execution_3) { // conditional scope state->registerScope(22); - nd4j::ops::LegacyReduceSameOp op1(reduce::Sum); - nd4j::ops::lt_scalar op2; + sd::ops::LegacyReduceSameOp op1(reduce::Sum); + sd::ops::lt_scalar op2; // while sum(var0) < var1 // this op takes sum @@ -170,8 +170,8 @@ TEST_F(GraphStateTests, Stateful_Execution_3) { // this op is result of previous op + 1 ArgumentsList args4({{3, 0}, {0, 2}}); - nd4j::ops::add op3; - nd4j::ops::add op4; + sd::ops::add op3; + 
sd::ops::add op4; state->attachOpToScope(33, 3, &op3, args3); state->attachOpToScope(33, 4, &op4, args4); @@ -225,8 +225,8 @@ TEST_F(GraphStateTests, Stateful_Execution_4) { // conditional scope state->registerScope(22); - nd4j::ops::LegacyReduceSameOp op1(reduce::Sum); - nd4j::ops::lt_scalar op2; + sd::ops::LegacyReduceSameOp op1(reduce::Sum); + sd::ops::lt_scalar op2; // if sum(var0) < var1 // this op takes sum @@ -242,7 +242,7 @@ TEST_F(GraphStateTests, Stateful_Execution_4) { state->registerScope(33); ArgumentsList args3({{0, 0}, {0, 1}}); - nd4j::ops::subtract op3; + sd::ops::subtract op3; state->attachOpToScope(33, 3, &op3, args3); // return for false scope @@ -253,7 +253,7 @@ TEST_F(GraphStateTests, Stateful_Execution_4) { state->registerScope(44); ArgumentsList args4({{0, 0}, {0, 1}}); - nd4j::ops::add op4; + sd::ops::add op4; state->attachOpToScope(44, 4, &op4, args4); // return for false scope @@ -299,8 +299,8 @@ TEST_F(GraphStateTests, Stateful_Execution_5) { // conditional scope state->registerScope(22); - nd4j::ops::LegacyReduceSameOp op1(reduce::Sum); - nd4j::ops::gt_scalar op2; + sd::ops::LegacyReduceSameOp op1(reduce::Sum); + sd::ops::gt_scalar op2; // if sum(var0) < var1 // this op takes sum @@ -316,7 +316,7 @@ TEST_F(GraphStateTests, Stateful_Execution_5) { state->registerScope(33); ArgumentsList args3({{0, 0}, {0, 1}}); - nd4j::ops::subtract op3; + sd::ops::subtract op3; state->attachOpToScope(33, 3, &op3, args3); // return for false scope @@ -327,7 +327,7 @@ TEST_F(GraphStateTests, Stateful_Execution_5) { state->registerScope(44); ArgumentsList args4({{0, 0}, {0, 1}}); - nd4j::ops::add op4; + sd::ops::add op4; state->attachOpToScope(44, 4, &op4, args4); // return for false scope diff --git a/libnd4j/tests_cpu/layers_tests/GraphTests.cpp b/libnd4j/tests_cpu/layers_tests/GraphTests.cpp index 180d8fef4..73aac9c3b 100644 --- a/libnd4j/tests_cpu/layers_tests/GraphTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/GraphTests.cpp @@ -25,12 +25,12 @@ 
#include #include #include -#include +#include #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class GraphTests : public testing::Test { public: @@ -394,7 +394,7 @@ TEST_F(GraphTests, ReductionsTest1) { graph->getVariableSpace()->putVariable(-1, x); graph->getVariableSpace()->putVariable(-2, z); -// nd4j::graph::Node::Node(OpType opType, int opNum, int id, std::initializer_list input, std::initializer_list output, std::initializer_list dimensions, float scalar, std::initializer_list tArgs, std::initializer_list iArgs) { +// sd::graph::Node::Node(OpType opType, int opNum, int id, std::initializer_list input, std::initializer_list output, std::initializer_list dimensions, float scalar, std::initializer_list tArgs, std::initializer_list iArgs) { auto nodeA = new Node(OpType_REDUCE_FLOAT, reduce::Mean, 1, {-1}, {2}, {1}, {}); auto nodeB = new Node(OpType_TRANSFORM_SAME, transform::Abs, 2, {1}, {-2}); @@ -891,7 +891,7 @@ TEST_F(GraphTests, OutputValidation6) { } TEST_F(GraphTests, TestMultiOutput1) { - nd4j::ops::testop2i2o op1; + sd::ops::testop2i2o op1; auto graph = new Graph(); auto x = NDArrayFactory::create_('c', {5, 5}); @@ -910,7 +910,7 @@ TEST_F(GraphTests, TestMultiOutput1) { auto nodeB0 = new Node(OpType_TRANSFORM_SAME, transform::Abs, 2, {-2}, {11}); nodeB0->markInplace(false); - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation("testop2i2o"); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation("testop2i2o"); // this op will add 1.0 to first input, and 2.0 for second input auto nodeT = new Node(op, 11, {1, 2}, {21, 31}, {}, 0.0f); @@ -951,7 +951,7 @@ TEST_F(GraphTests, TestMultiOutput1) { } TEST_F(GraphTests, TestDivergentNode1) { - auto op = nd4j::ops::OpRegistrator::getInstance()->getOperation("Switch"); + auto op = sd::ops::OpRegistrator::getInstance()->getOperation("Switch"); auto nodeY = new Node(op, 1); ASSERT_TRUE(nodeY->isDivergencePoint()); @@ -1065,7 
+1065,7 @@ TEST_F(GraphTests, MemoryEstimationTest5) { graph.getVariableSpace()->putVariable(-1, x); - nd4j::ops::testcustom op; + sd::ops::testcustom op; auto nodeA0 = new Node(OpType_TRANSFORM_SAME, transform::Abs, 1, {-1}, {2}); auto nodeA1 = new Node(OpType_TRANSFORM_SAME, transform::Abs, 2, {1}, {3}); @@ -1409,7 +1409,7 @@ TEST_F(GraphTests, OpListTest_1) { GraphUtils::filterOperations(ops); ASSERT_TRUE(ops.size() == 7); - std::string exp(" -g \"-DLIBND4J_OPS_LIST='-DOP_rank=true -DOP_range=true -DOP_subtract=true -DOP_permute=true -DOP_matmul=true -DOP_biasadd=true -DOP_TRANSFORM{15}=true '\""); + std::string exp(" -g \"-DSD_OPS_LIST='-DOP_rank=true -DOP_range=true -DOP_subtract=true -DOP_permute=true -DOP_matmul=true -DOP_biasadd=true -DOP_TRANSFORM{15}=true '\""); std::string out = GraphUtils::makeCommandLine(ops); // nd4j_printf("EXP: >%s<\n", exp.c_str()); // nd4j_printf("OUT: >%s<\n", out.c_str()); @@ -1434,7 +1434,7 @@ TEST_F(GraphTests, OpListTest_2) { GraphUtils::filterOperations(ops); - std::string exp = " -g \"-DLIBND4J_OPS_LIST='-DOP_rank=true -DOP_range=true -DOP_subtract=true -DOP_permute=true -DOP_matmul=true -DOP_biasadd=true -DOP_TRANSFORM{15}=true -DOP_strided_slice=true -DOP_ACCUMULATION{1}=true '\""; + std::string exp = " -g \"-DSD_OPS_LIST='-DOP_rank=true -DOP_range=true -DOP_subtract=true -DOP_permute=true -DOP_matmul=true -DOP_biasadd=true -DOP_TRANSFORM{15}=true -DOP_strided_slice=true -DOP_ACCUMULATION{1}=true '\""; ASSERT_TRUE(ops.size() == 9); ASSERT_EQ(exp, GraphUtils::makeCommandLine(ops)); @@ -1570,7 +1570,7 @@ TEST_F(GraphTests, Test_Inplace_Outputs_1) { auto exp = NDArrayFactory::create('c', {6}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); auto z = NDArrayFactory::create('c', {2, 3}); - nd4j::ops::test_output_reshape op; + sd::ops::test_output_reshape op; auto result = op.execute({&x}, {&z}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); @@ -1587,7 +1587,7 @@ TEST_F(GraphTests, Test_Inplace_Outputs_2) { auto z = NDArrayFactory::create('c', 
{3, 3}); bool failed = false; - nd4j::ops::test_output_reshape op; + sd::ops::test_output_reshape op; try { op.execute({&x}, {&z}, {}, {}, {}); diff --git a/libnd4j/tests_cpu/layers_tests/HashUtilsTests.cpp b/libnd4j/tests_cpu/layers_tests/HashUtilsTests.cpp index fd4baaafc..da513f7d4 100644 --- a/libnd4j/tests_cpu/layers_tests/HashUtilsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/HashUtilsTests.cpp @@ -29,7 +29,7 @@ class HashUtilsTests : public testing::Test { TEST_F(HashUtilsTests, TestEquality1) { std::string str("Conv2D"); - Nd4jLong hash1 = nd4j::ops::HashHelper::getInstance()->getLongHash(str); + Nd4jLong hash1 = sd::ops::HashHelper::getInstance()->getLongHash(str); ASSERT_EQ(-1637140380760460323L, hash1); } @@ -38,6 +38,6 @@ TEST_F(HashUtilsTests, TestEquality1) { TEST_F(HashUtilsTests, TestEquality2) { std::string str("switch"); - Nd4jLong hash1 = nd4j::ops::HashHelper::getInstance()->getLongHash(str); + Nd4jLong hash1 = sd::ops::HashHelper::getInstance()->getLongHash(str); ASSERT_EQ(-1988317239813741487L, hash1); } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp index 0bf9a1eb7..b06d6b96d 100644 --- a/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/HelpersTests1.cpp @@ -16,24 +16,24 @@ ******************************************************************************/ #include "testlayers.h" -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include -#include +#include #include #include #include #include -#include -#include +#include +#include #include #include -using namespace nd4j; +using namespace sd; class HelpersTests1 : public testing::Test { public: @@ -50,14 +50,14 @@ public: TEST_F(HelpersTests1, test_binary_search_1) { std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - auto idx = nd4j::ops::helpers::binarySearch(array.data(), 2, 10); + auto idx = 
sd::ops::helpers::binarySearch(array.data(), 2, 10); ASSERT_EQ(2, idx); } TEST_F(HelpersTests1, test_binary_search_2) { std::array array = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - auto idx = nd4j::ops::helpers::binarySearch(array.data(), 18, 10); + auto idx = sd::ops::helpers::binarySearch(array.data(), 18, 10); ASSERT_EQ(-1, idx); } @@ -1440,7 +1440,7 @@ TEST_F(HelpersTests1, SVD_test17) { // auto outArr = NDArrayFactory::create('c', {2,5}); // // -// ops::helpers::reverseArray(nd4j::LaunchContext ::defaultContext(), inArr.getBuffer(), inArr.getShapeInfo(), outArr.getBuffer(), outArr.getShapeInfo()); +// ops::helpers::reverseArray(sd::LaunchContext ::defaultContext(), inArr.getBuffer(), inArr.getShapeInfo(), outArr.getBuffer(), outArr.getShapeInfo()); // // ASSERT_TRUE(outArr.equalsTo(&exp)); // ASSERT_TRUE(outArr.isSameShapeStrict(exp)); @@ -1454,7 +1454,7 @@ TEST_F(HelpersTests1, SVD_test17) { // auto exp = NDArrayFactory::create('c', {2,5}, {10,9,8,7,6,5,4,3,2,1}); // // -// ops::helpers::reverseArray(nd4j::LaunchContext ::defaultContext(), inArr.getBuffer(), inArr.getShapeInfo(), inArr.getBuffer(), inArr.getShapeInfo()); +// ops::helpers::reverseArray(sd::LaunchContext ::defaultContext(), inArr.getBuffer(), inArr.getShapeInfo(), inArr.getBuffer(), inArr.getShapeInfo()); // // ASSERT_TRUE(inArr.equalsTo(&exp)); // ASSERT_TRUE(inArr.isSameShapeStrict(exp)); @@ -1468,7 +1468,7 @@ TEST_F(HelpersTests1, SVD_test17) { // auto exp = NDArrayFactory::create('c', {2,5}, {5,4,3,2,1,6,7,8,9,10}); // auto outArr = NDArrayFactory::create('c', {2,5}); // -// ops::helpers::reverseArray(nd4j::LaunchContext ::defaultContext(), inArr.getBuffer(), inArr.getShapeInfo(), outArr.getBuffer(), outArr.getShapeInfo(), 5); +// ops::helpers::reverseArray(sd::LaunchContext ::defaultContext(), inArr.getBuffer(), inArr.getShapeInfo(), outArr.getBuffer(), outArr.getShapeInfo(), 5); // // ASSERT_TRUE(outArr.equalsTo(&exp)); // ASSERT_TRUE(outArr.isSameShapeStrict(exp)); @@ -1481,12 +1481,12 @@ 
TEST_F(HelpersTests1, rnnCell_test1) { const int inSize = 4; const int numUnits = 4; - NDArray xt('c', {bS, inSize}, nd4j::DataType::DOUBLE); - NDArray ht_1('c', {bS, numUnits}, nd4j::DataType::DOUBLE); - NDArray Wx('c', {inSize, numUnits}, nd4j::DataType::DOUBLE); - NDArray Wh('c', {numUnits, numUnits}, nd4j::DataType::DOUBLE); + NDArray xt('c', {bS, inSize}, sd::DataType::DOUBLE); + NDArray ht_1('c', {bS, numUnits}, sd::DataType::DOUBLE); + NDArray Wx('c', {inSize, numUnits}, sd::DataType::DOUBLE); + NDArray Wh('c', {numUnits, numUnits}, sd::DataType::DOUBLE); NDArray b ('c', {2*numUnits}, {0.0,0.0,0.0,0.0, 0.1,0.2,0.3,0.4}); - NDArray ht('c', {bS, numUnits}, nd4j::DataType::DOUBLE); + NDArray ht('c', {bS, numUnits}, sd::DataType::DOUBLE); xt.assign(0.1); ht_1.assign(0.2); @@ -1495,7 +1495,7 @@ TEST_F(HelpersTests1, rnnCell_test1) { NDArray expHt('c', {bS, numUnits}, {0.492988, 0.56489956, 0.6291452 , 0.6858091,0.492988, 0.56489956, 0.6291452 , 0.6858091}); - ops::helpers::rnnCell(nd4j::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, &ht); + ops::helpers::rnnCell(sd::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, &ht); ASSERT_TRUE(expHt.isSameShape(ht)); ASSERT_TRUE(expHt.equalsTo(ht)); @@ -1524,7 +1524,7 @@ TEST_F(HelpersTests1, rnnCell_test2) { auto expHt = NDArrayFactory::create('c', {bS, numUnits}, {0.6169093,0.67506987,0.72589741,0.76986654,0.6169093,0.67506987,0.72589741,0.76986654}); - ops::helpers::rnnCell(nd4j::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, &ht); + ops::helpers::rnnCell(sd::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, &ht); ASSERT_TRUE(expHt.isSameShape(ht)); ASSERT_TRUE(expHt.equalsTo(ht)); @@ -1552,7 +1552,7 @@ TEST_F(HelpersTests1, rnnCell_test3) { auto expHt = NDArrayFactory::create('c', {bS, numUnits}, {0.5915195, 0.6043678, 0.6169093, 0.6291452,0.5915195, 0.6043678, 0.6169093, 0.6291452}); - ops::helpers::rnnCell(nd4j::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, 
&ht); + ops::helpers::rnnCell(sd::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, &ht); ASSERT_TRUE(expHt.isSameShape(ht)); ASSERT_TRUE(expHt.equalsTo(ht)); @@ -1581,7 +1581,7 @@ TEST_F(HelpersTests1, rnnCell_test4) { auto expHt = NDArrayFactory::create('c', {bS, numUnits}, {0.68474828, 0.68474828, 0.68474828, 0.68474828,0.69882484, 0.69882484, 0.69882484, 0.69882484}); - ops::helpers::rnnCell(nd4j::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, &ht); + ops::helpers::rnnCell(sd::LaunchContext ::defaultContext(), &xt, &Wx, &Wh, &b, &ht_1, &ht); ASSERT_TRUE(expHt.isSameShape(ht)); ASSERT_TRUE(expHt.equalsTo(ht)); @@ -1887,7 +1887,7 @@ TEST_F(HelpersTests1, OpArgsHolder_test3) { gradO.linspace(0.01, 0.01); OpArgsHolder holderFF({&input}, {}, {2, 3}); - nd4j::ops::tile opFF; // the kind of op doesn't matter, we simply check here whether op.execute() works with OpArgsHolder correctly + sd::ops::tile opFF; // the kind of op doesn't matter, we simply check here whether op.execute() works with OpArgsHolder correctly auto results = opFF.execute(holderFF); auto tiled = results->at(0); ASSERT_EQ(Status::OK(), results->status()); @@ -1896,7 +1896,7 @@ TEST_F(HelpersTests1, OpArgsHolder_test3) { delete results; OpArgsHolder holderBP = holderFF.createArgsHolderForBP({&gradO}, true); - nd4j::ops::tile_bp opBP; + sd::ops::tile_bp opBP; results = opBP.execute(holderBP); auto gradI = results->at(0); ASSERT_EQ(Status::OK(), results->status()); @@ -1915,8 +1915,8 @@ TEST_F(HelpersTests1, checkGrad_test1) { const OpArgsHolder argsHolderFF({&x}, {}, {}); const OpArgsHolder argsHolderBP({&x, &gradO}, {}, {}); - nd4j::ops::sigmoid opFF; - nd4j::ops::sigmoid_bp opBP; + sd::ops::sigmoid opFF; + sd::ops::sigmoid_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1937,8 +1937,8 @@ TEST_F(HelpersTests1, checkGrad_test2) { const OpArgsHolder argsHolderFF({&x, &weights}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); const 
OpArgsHolder argsHolderBP({&x, &weights, &gradO}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); - nd4j::ops::conv2d opFF; - nd4j::ops::conv2d_bp opBP; + sd::ops::conv2d opFF; + sd::ops::conv2d_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1961,8 +1961,8 @@ TEST_F(HelpersTests1, checkGrad_test3) { const OpArgsHolder argsHolderFF({&x, &weights, &bias}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); const OpArgsHolder argsHolderBP({&x, &weights, &bias, &gradO}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); - nd4j::ops::conv2d opFF; - nd4j::ops::conv2d_bp opBP; + sd::ops::conv2d opFF; + sd::ops::conv2d_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); @@ -1985,8 +1985,8 @@ TEST_F(HelpersTests1, checkGrad_test4) { const OpArgsHolder argsHolderFF({&x, &weights, &bias}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); const OpArgsHolder argsHolderBP({&x, &weights, &bias, &gradO}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); - nd4j::ops::conv2d opFF; - nd4j::ops::conv2d_bp opBP; + sd::ops::conv2d opFF; + sd::ops::conv2d_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 0, 1}); @@ -2009,8 +2009,8 @@ TEST_F(HelpersTests1, checkGrad_test5) { const OpArgsHolder argsHolderFF({&x, &weights, &bias}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); const OpArgsHolder argsHolderBP({&x, &weights, &bias, &gradO}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); - nd4j::ops::conv2d opFF; - nd4j::ops::conv2d_bp opBP; + sd::ops::conv2d opFF; + sd::ops::conv2d_bp opBP; const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1, 1}, {0.5, 1}); @@ -2033,8 +2033,8 @@ TEST_F(HelpersTests1, checkGrad_test6) { const OpArgsHolder argsHolderFF({&x, &weights, &bias}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); const OpArgsHolder argsHolderBP({&x, &weights, &bias, &gradO}, {}, {2, 2, 1, 1, 0, 0, 1, 1, 1}); - nd4j::ops::conv2d opFF; - nd4j::ops::conv2d_bp opBP; + sd::ops::conv2d opFF; + sd::ops::conv2d_bp opBP; 
const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 0, 1}, {0.5, 1}, GradCheck::MEAN); @@ -2049,7 +2049,7 @@ TEST_F(HelpersTests1, softMaxForVector_test1) { auto expOutput = NDArrayFactory::create('c', {1,5}); expOutput = 1; - ops::helpers::softmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::softmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2061,7 +2061,7 @@ TEST_F(HelpersTests1, softMaxForVector_test2) { auto output = NDArrayFactory::create('c', {5,1}); auto expOutput = NDArrayFactory::create('c', {5,1}, {0.01165623, 0.03168492, 0.08612854, 0.23412166, 0.63640865}); - ops::helpers::softmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::softmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2073,7 +2073,7 @@ TEST_F(HelpersTests1, softMaxForVector_test3) { auto output = NDArrayFactory::create('c', {5}); auto expOutput = NDArrayFactory::create('c', {5}, {0.01165623, 0.03168492, 0.08612854, 0.23412166, 0.63640865}); - ops::helpers::softmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::softmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2081,8 +2081,8 @@ TEST_F(HelpersTests1, softMaxForVector_test3) { ////////////////////////////////////////////////////////////////////// TEST_F(HelpersTests1, softMaxForVector_test4) { - NDArray input('c', {1500}, nd4j::DataType::DOUBLE); - NDArray output('c', {1500}, nd4j::DataType::DOUBLE); + NDArray input('c', {1500}, sd::DataType::DOUBLE); + NDArray output('c', {1500}, sd::DataType::DOUBLE); NDArray expOutput('c', {1500}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -2103,10 +2103,10 @@ TEST_F(HelpersTests1, softMaxForVector_test4) { 0.001103, 0.001114, 0.001125, 0.001136, 0.001148, 0.001159, 0.001171, 0.001182, 0.001194, 0.001206, 0.001218, 0.001231, 0.001243, 0.001256, 0.001268, 0.001281, 0.001294, 0.001307, 0.001320, 0.001333, 0.001347, 0.001360, 0.001374, 0.001388, 0.001402, 0.001416, 0.001430, 0.001444,0.001459, 0.001473, 0.001488, 0.001503, 0.001518, 0.001534, 0.001549, 0.001565, 0.001580, 0.001596, 0.001612, 0.001628, 0.001645, 0.001661, 0.001678, 0.001695, 0.001712, 0.001729, 0.001746, 0.001764, 0.001782, 0.001800, 0.001818, 0.001836, 0.001854, 0.001873, 0.001892, 0.001911, 0.001930, 0.001950, 0.001969, 0.001989, 0.002009, 0.002029, 0.002049, 0.002070, 0.002091, 0.002112, 0.002133, 0.002155, 0.002176, 0.002198, 0.002220, 0.002242, 0.002265, 0.002288, 0.002311, 
0.002334, 0.002357, 0.002381, 0.002405, 0.002429, 0.002454, 0.002478, 0.002503, 0.002528,0.002554, 0.002579, 0.002605, 0.002632, 0.002658, 0.002685, 0.002712, 0.002739, 0.002767, 0.002794, 0.002822, 0.002851, 0.002879, 0.002908, 0.002938, 0.002967, 0.002997, 0.003027, 0.003057, 0.003088, 0.003119, 0.003151, 0.003182, 0.003214, 0.003247, 0.003279, 0.003312, 0.003345, 0.003379, 0.003413, 0.003447, 0.003482, 0.003517, 0.003552, 0.003588, 0.003624, 0.003660, 0.003697, 0.003734, 0.003772, 0.003810, 0.003848, 0.003887, 0.003926, 0.003965, 0.004005, 0.004045, 0.004086, 0.004127, 0.004169, 0.004211, 0.004253, 0.004296, 0.004339, 0.004382, 0.004426,0.004471, 0.004516, 0.004561, 0.004607, 0.004653, 0.004700, 0.004747, 0.004795, 0.004843, 0.004892, 0.004941, 0.004991, 0.005041, 0.005092, 0.005143, 0.005194, 0.005247, 0.005299, 0.005353, 0.005406, 0.005461, 0.005516, 0.005571, 0.005627, 0.005684, 0.005741, 0.005798, 0.005857, -0.005916, 0.005975, 0.006035, 0.006096, 0.006157, 0.006219, 0.006281, 0.006345, 0.006408, 0.006473, 0.006538, 0.006603, 0.006670, 0.006737, 0.006805, 0.006873, 0.006942, 0.007012, 0.007082, 0.007153, 0.007225, 0.007298, 0.007371, 0.007445, 0.007520, 0.007596, 0.007672, 0.007749,0.007827, 0.007906, 0.007985, 0.008065, 0.008147, 0.008228, 0.008311, 0.008395, 0.008479, 0.008564, 0.008650, 0.008737, 0.008825, 0.008914, 0.009003, 0.009094, 0.009185, 0.009277, 0.009371, 0.009465, 0.009560, 0.009656, 0.009753, 0.009851, 0.009950}, nd4j::DataType::DOUBLE); +0.005916, 0.005975, 0.006035, 0.006096, 0.006157, 0.006219, 0.006281, 0.006345, 0.006408, 0.006473, 0.006538, 0.006603, 0.006670, 0.006737, 0.006805, 0.006873, 0.006942, 0.007012, 0.007082, 0.007153, 0.007225, 0.007298, 0.007371, 0.007445, 0.007520, 0.007596, 0.007672, 0.007749,0.007827, 0.007906, 0.007985, 0.008065, 0.008147, 0.008228, 0.008311, 0.008395, 0.008479, 0.008564, 0.008650, 0.008737, 0.008825, 0.008914, 0.009003, 0.009094, 0.009185, 0.009277, 0.009371, 0.009465, 0.009560, 0.009656, 0.009753, 
0.009851, 0.009950}, sd::DataType::DOUBLE); input.linspace(0.01, 0.01); - ops::helpers::softmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::softmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2119,7 +2119,7 @@ TEST_F(HelpersTests1, logSoftMaxForVector_test1) { auto expOutput = NDArrayFactory::create('c', {1,5}); expOutput = 0; - ops::helpers::logSoftmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::logSoftmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2131,7 +2131,7 @@ TEST_F(HelpersTests1, logSoftMaxForVector_test2) { auto output = NDArrayFactory::create('c', {5,1}); auto expOutput = NDArrayFactory::create('c', {5,1}, {-4.4519144, -3.4519144, -2.4519144, -1.4519144, -0.4519144}); - ops::helpers::logSoftmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::logSoftmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2143,7 +2143,7 @@ TEST_F(HelpersTests1, logSoftMaxForVector_test3) { auto output = NDArrayFactory::create('c', {5}); auto expOutput = NDArrayFactory::create('c', {5}, {-4.4519144, -3.4519144, -2.4519144, -1.4519144, -0.4519144}); - ops::helpers::logSoftmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::logSoftmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2151,8 +2151,8 @@ TEST_F(HelpersTests1, logSoftMaxForVector_test3) { ////////////////////////////////////////////////////////////////////// TEST_F(HelpersTests1, logSoftMaxForVector_test4) { - NDArray input('c', {1500}, nd4j::DataType::DOUBLE); - NDArray output('c', {1500}, nd4j::DataType::DOUBLE); + NDArray input('c', {1500}, sd::DataType::DOUBLE); + NDArray output('c', {1500}, sd::DataType::DOUBLE); NDArray expOutput('c', {1500}, {-8.154773, -8.153772, 
-8.152773, -8.151772, -8.150773, -8.149773, -8.148773, -8.147773, -8.146772, -8.145773, -8.144773, -8.143773, -8.142773, -8.141773, -8.140773, -8.139772, -8.138773, -8.137773, -8.136773, -8.135773, -8.134773, -8.133773, -8.132772, -8.131773, -8.130773, -8.129773, -8.128773, -8.127772, -8.126773, -8.125772, -8.124773, -8.123773, -8.122773, -8.121773, -8.120772, -8.119773, -8.118773, -8.117773, -8.116773, -8.115773, -8.114773, -8.113772, -8.112773, -8.111773, -8.110773, -8.109773, -8.108773, -8.107773, -8.106772, -8.105773, -8.104773, -8.103773, -8.102773, -8.101772, -8.100773, -8.099772, -8.098773, -8.097773, -8.096773, -8.095773, -8.094772, -8.093773, -8.092772, -8.091773, -8.090773, -8.089773, -8.088773, -8.087772, -8.086773, -8.085773, -8.084773, -8.083773, -8.082773, -8.081773, -8.080772, -8.079773, -8.078773, -8.077773, -8.076773, -8.075773, -8.074773, -8.073772, -8.072773, -8.071773, -8.070773, -8.069773, -8.068772, -8.067773, -8.066772, -8.065773, -8.064773, -8.063773, -8.062773, -8.061772, -8.060773, -8.059772, -8.058773, -8.057773, -8.056773, -8.055773, -8.054772, -8.053773, -8.052773, -8.051773, -8.050773, -8.049773, -8.048773, -8.047772, -8.046773, -8.045773, -8.044773, -8.043773, -8.042773, -8.041773, -8.040772, -8.039773, -8.038773, -8.037773, -8.036773, -8.035772, -8.034773, -8.033772, -8.032773, -8.031773, -8.030773, -8.029773, -8.028772, -8.027773, -8.026772, -8.025773, -8.024773, -8.023773, -8.022773, -8.021772, -8.020773, -8.019773, -8.018773, -8.017773, -8.016773, -8.015773, -8.014772, -8.013773, -8.012773, -8.011773, -8.010773, -8.009773, -8.008773, -8.007772, -8.006773, -8.005773, -8.004773, -8.003773, -8.002772, -8.001773, -8.000772, -7.999773, -7.998773, -7.997773, -7.996773, -7.995773, -7.994773, -7.993773, -7.992773, -7.991773, -7.990773, -7.989773, -7.988773, -7.987773, -7.986773, -7.985773, -7.984773, -7.983773, -7.982773, -7.981773, -7.980773, -7.979773, -7.978773, -7.977773, -7.976773, -7.975773, -7.974773, -7.973773, -7.972773, 
-7.971773, -7.970773, -7.969773, -7.968773, -7.967773, -7.966773, -7.965773, -7.964773, -7.963773, -7.962773, -7.961773, -7.960773, -7.959773, -7.958773, -7.957773, -7.956773, -7.955773, -7.954773, -7.953773, -7.952773, -7.951773, -7.950773, -7.949773, -7.948773, -7.947773, -7.946773, -7.945773, -7.944773, -7.943773, -7.942773, -7.941773, -7.940773, -7.939773, -7.938773, -7.937773, -7.936773, -7.935773, -7.934773, -7.933773, -7.932773, -7.931773, -7.930773, -7.929773, -7.928773, -7.927773, -7.926773, -7.925773, -7.924773, -7.923773, -7.922773, -7.921773, -7.920773, -7.919773, -7.918773, -7.917773, -7.916773, -7.915773, -7.914773, -7.913773, -7.912773, -7.911773, -7.910773, -7.909773, -7.908773, -7.907773, -7.906773, -7.905773, -7.904773, -7.903773, -7.902773, -7.901773, -7.900773, -7.899773, -7.898773, -7.897773, -7.896773, -7.895773, -7.894773, -7.893773, -7.892773, -7.891773, -7.890773, -7.889773, -7.888773, -7.887773, -7.886773, -7.885773, -7.884773, -7.883773, -7.882773, -7.881773, -7.880773, -7.879773, -7.878773, -7.877773, -7.876773, -7.875773, -7.874773, -7.873773, -7.872773, -7.871773, -7.870773, -7.869773, -7.868773, -7.867773, -7.866773, -7.865773, -7.864773, -7.863773, -7.862773, -7.861773, -7.860773, -7.859773, -7.858773, -7.857773, -7.856773, -7.855773, -7.854773, -7.853773, -7.852773, -7.851773, -7.850773, -7.849773, @@ -2167,10 +2167,10 @@ TEST_F(HelpersTests1, logSoftMaxForVector_test4) { -7.024773, -7.023773, -7.022773, -7.021773, -7.020773, -7.019773, -7.018773, -7.017773, -7.016773, -7.015773, -7.014773, -7.013773, -7.012773, -7.011773, -7.010773, -7.009773, -7.008773, -7.007773, -7.006773, -7.005773, -7.004773, -7.003773, -7.002773, -7.001773, -7.000773, -6.999773, -6.998773, -6.997773, -6.996773, -6.995773, -6.994773, -6.993773, -6.992773, -6.991773, -6.990773, -6.989773, -6.988773, -6.987773, -6.986773, -6.985773, -6.984773, -6.983773, -6.982773, -6.981773, -6.980773, -6.979773, -6.978773, -6.977773, -6.976773, -6.975773, -6.974773, -6.973773, 
-6.972773, -6.971773, -6.970773, -6.969773, -6.968773, -6.967773, -6.966773, -6.965773, -6.964773, -6.963773, -6.962773, -6.961773, -6.960773, -6.959773, -6.958773, -6.957773, -6.956773, -6.955773, -6.954773, -6.953773, -6.952773, -6.951773, -6.950773, -6.949773, -6.948773, -6.947773, -6.946773, -6.945773, -6.944773, -6.943773, -6.942773, -6.941773, -6.940773, -6.939773, -6.938773, -6.937773, -6.936773, -6.935773, -6.934773, -6.933773, -6.932773, -6.931773, -6.930773, -6.929773, -6.928773, -6.927773, -6.926773, -6.925773, -6.924773, -6.923773, -6.922773, -6.921773, -6.920773, -6.919773, -6.918773, -6.917773, -6.916773, -6.915773, -6.914773, -6.913773, -6.912773, -6.911773, -6.910773, -6.909773, -6.908773, -6.907773, -6.906773, -6.905773, -6.904773, -6.903773, -6.902773, -6.901773, -6.900773, -6.899773, -6.898773, -6.897773, -6.896773, -6.895773, -6.894773, -6.893773, -6.892773, -6.891773, -6.890773, -6.889773, -6.888773, -6.887773, -6.886773, -6.885773, -6.884773, -6.883773, -6.882773, -6.881773, -6.880773, -6.879773, -6.878773, -6.877773, -6.876773, -6.875773, -6.874773, -6.873773, -6.872773, -6.871773, -6.870773, -6.869773, -6.868773, -6.867773, -6.866773, -6.865773, -6.864773, -6.863773, -6.862773, -6.861773, -6.860773, -6.859773, -6.858773, -6.857773, -6.856773, -6.855773, -6.854773, -6.853773, -6.852773, -6.851773, -6.850773, -6.849773, -6.848773, -6.847773, -6.846773, -6.845773, -6.844773, -6.843773, -6.842773, -6.841773, -6.840773, -6.839773, -6.838773, -6.837773, -6.836773, -6.835773, -6.834773, -6.833773, -6.832773, -6.831773, -6.830773, -6.829773, -6.828773, -6.827773, -6.826773, -6.825773, -6.824773, -6.823773, -6.822773, -6.821773, -6.820773, -6.819773, -6.818773, -6.817773, -6.816773, -6.815773, -6.814773, -6.813773, -6.812773, -6.811773, -6.810773, -6.809773, -6.808773, -6.807773, -6.806773, -6.805773, -6.804773, -6.803773, -6.802773, -6.801773, -6.800773, -6.799773, -6.798773, -6.797773, -6.796773, -6.795773, -6.794773, -6.793773, -6.792773, 
-6.791773, -6.790773, -6.789773, -6.788773, -6.787773, -6.786773, -6.785773, -6.784773, -6.783773, -6.782773, -6.781773, -6.780773, -6.779773, -6.778773, -6.777773, -6.776773, -6.775773, -6.774773, -6.773773, -6.772773, -6.771773, -6.770773, -6.769773, -6.768773, -6.767773, -6.766773, -6.765773, -6.764773, -6.763773, -6.762773, -6.761773, -6.760773, -6.759773, -6.758773, -6.757773, -6.756773, -6.755773, -6.754773, -6.753773, -6.752773, -6.751773, -6.750773, -6.749773, -6.748773, -6.747773, -6.746773, -6.745773, -6.744773, -6.743773, -6.742773, -6.741773, -6.740773, -6.739773, -6.738773, -6.737773, -6.736773, -6.735773, -6.734773, -6.733773, -6.732773, -6.731773, -6.730773, -6.729773, -6.728773, -6.727773, -6.726773, -6.725773, -6.724773, -6.723773, -6.722773, -6.721773, -6.720773, -6.719773, -6.718773, -6.717773, -6.716773, -6.715773, --6.714773, -6.713773, -6.712773, -6.711773, -6.710773, -6.709773, -6.708773, -6.707773, -6.706773, -6.705773, -6.704773, -6.703773, -6.702773, -6.701773, -6.700773, -6.699773, -6.698773, -6.697773, -6.696773, -6.695773, -6.694773, -6.693773, -6.692773, -6.691773, -6.690773, -6.689773, -6.688773, -6.687773, -6.686773, -6.685773, -6.684773, -6.683773, -6.682773, -6.681773, -6.680773, -6.679773, -6.678773, -6.677773, -6.676773, -6.675773, -6.674773, -6.673773, -6.672773, -6.671773, -6.670773, -6.669773, -6.668773, -6.667773, -6.666773, -6.665773, -6.664773, -6.663773, -6.662773, -6.661773, -6.660773, -6.659773, -6.658773, -6.657773, -6.656773, -6.655773}, nd4j::DataType::DOUBLE); +-6.714773, -6.713773, -6.712773, -6.711773, -6.710773, -6.709773, -6.708773, -6.707773, -6.706773, -6.705773, -6.704773, -6.703773, -6.702773, -6.701773, -6.700773, -6.699773, -6.698773, -6.697773, -6.696773, -6.695773, -6.694773, -6.693773, -6.692773, -6.691773, -6.690773, -6.689773, -6.688773, -6.687773, -6.686773, -6.685773, -6.684773, -6.683773, -6.682773, -6.681773, -6.680773, -6.679773, -6.678773, -6.677773, -6.676773, -6.675773, -6.674773, -6.673773, 
-6.672773, -6.671773, -6.670773, -6.669773, -6.668773, -6.667773, -6.666773, -6.665773, -6.664773, -6.663773, -6.662773, -6.661773, -6.660773, -6.659773, -6.658773, -6.657773, -6.656773, -6.655773}, sd::DataType::DOUBLE); input.linspace(0.01, 0.001); - ops::helpers::logSoftmax(nd4j::LaunchContext ::defaultContext(), input, output, 0); + ops::helpers::logSoftmax(sd::LaunchContext ::defaultContext(), input, output, 0); ASSERT_TRUE(output.equalsTo(&expOutput)); } @@ -2182,14 +2182,14 @@ TEST_F(HelpersTests1, mmulMxV_1) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); - NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray a('f', {M,N}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); + NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(6, {0,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {5.5, 5.1, 4.7}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {5.5, 5.1, 4.7}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -2199,15 +2199,15 @@ TEST_F(HelpersTests1, mmulMxV_2) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {M,N,5}, 
{16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {M,N,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(6, {0,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {5.1, 3.3, 1.5}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {5.1, 3.3, 1.5}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -2217,15 +2217,15 @@ TEST_F(HelpersTests1, mmulMxV_3) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {N,M,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {N,M,5}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(4, {1,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {6.2, 4.5, 1.7}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {6.2, 4.5, 1.7}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -2235,15 +2235,15 @@ TEST_F(HelpersTests1, mmulMxV_4) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('f', {N,M}, 
{1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('f', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(3, {0,1}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {1.5, 1.8, 1.5}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {1.5, 1.8, 1.5}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -2253,15 +2253,15 @@ TEST_F(HelpersTests1, mmulMxV_5) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('f', {5,M,N}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(2, {0,1}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-0.3, 0.3, 0.9}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); 
+ sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -2271,15 +2271,15 @@ TEST_F(HelpersTests1, mmulMxV_6) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(13, {0,2}); - NDArray y('f', {M}, nd4j::DataType::DOUBLE); + NDArray y('f', {M}, sd::DataType::DOUBLE); - NDArray exp('f', {M}, {-12.1, -10.9, -9.7}, nd4j::DataType::DOUBLE); + NDArray exp('f', {M}, {-12.1, -10.9, -9.7}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } @@ -2289,28 +2289,28 @@ TEST_F(HelpersTests1, mmulMxV_7) { const Nd4jLong M = 3; const Nd4jLong N = 4; - NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, nd4j::DataType::DOUBLE); + NDArray a('c', {N,M}, {1.2,1.1,1.0,0.9,0.8,0.7,0.5,0.4,0.3,0.2,0.1,0}, sd::DataType::DOUBLE); a.permutei({1,0}); - NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, nd4j::DataType::DOUBLE); + NDArray temp('c', {5,N,M}, {16,2,-6,7,2,-2,4,-7,6,4,4,6,-3,1,3,9,1,4,9,10,-10,-3,-8,7,-7,-7,6,9,7,-6,8,7,-3,-3,4,-2,5,-3,-3,4,6,-5,-1,7,-5,4,-10,-1,8,0,-7,4,-10,-7,-8,-9,2,9,7,9}, sd::DataType::DOUBLE); NDArray x = temp(10, {0,2}); - NDArray y('c', {M}, nd4j::DataType::DOUBLE); + 
NDArray y('c', {M}, sd::DataType::DOUBLE); - NDArray exp('c', {M}, {3.3, 3.3, 3.3}, nd4j::DataType::DOUBLE); + NDArray exp('c', {M}, {3.3, 3.3, 3.3}, sd::DataType::DOUBLE); - nd4j::MmulHelper::mmul(&a, &x, &y, 1., 0.); + sd::MmulHelper::mmul(&a, &x, &y, 1., 0.); ASSERT_TRUE(y.equalsTo(&exp)); } ////////////////////////////////////////////////////////////////////// TEST_F(HelpersTests1, softmaxDerivative_1) { - NDArray input('c', {3,3}, {-1, 1, -2, 2, -3, 3, -4, 4, 5.}, nd4j::DataType::DOUBLE); - NDArray expOutput('c', {3,3}, {0.04508, 0.04514, 0.0008 , 0.0472 , 0.00087, 0.10492, 0.00235, 0.04592, 0.10553}, nd4j::DataType::DOUBLE); - NDArray output('c', {3,3}, nd4j::DataType::DOUBLE); + NDArray input('c', {3,3}, {-1, 1, -2, 2, -3, 3, -4, 4, 5.}, sd::DataType::DOUBLE); + NDArray expOutput('c', {3,3}, {0.04508, 0.04514, 0.0008 , 0.0472 , 0.00087, 0.10492, 0.00235, 0.04592, 0.10553}, sd::DataType::DOUBLE); + NDArray output('c', {3,3}, sd::DataType::DOUBLE); - // input.applyTransform(nd4j::transform::SoftMaxDerivative, &output); + // input.applyTransform(sd::transform::SoftMaxDerivative, &output); - nd4j::ops::helpers::softmaxDerivative(input.getContext(), input, output, 0); + sd::ops::helpers::softmaxDerivative(input.getContext(), input, output, 0); ASSERT_TRUE(expOutput.isSameShape(output)); ASSERT_TRUE(expOutput.equalsTo(output)); } @@ -2318,15 +2318,15 @@ TEST_F(HelpersTests1, softmaxDerivative_1) { ////////////////////////////////////////////////////////////////////// TEST_F(HelpersTests1, softmaxDerivative_2) { - NDArray input('c', {3,3,3}, {-1, 1, -2, 2, -3, 3, -4, 4, -5,5 ,-6,6, -7,7, -8,8, -9,9, -10,10, -11,11, -12,12, -13,13, 14.}, nd4j::DataType::DOUBLE); + NDArray input('c', {3,3,3}, {-1, 1, -2, 2, -3, 3, -4, 4, -5,5 ,-6,6, -7,7, -8,8, -9,9, -10,10, -11,11, -12,12, -13,13, 14.}, sd::DataType::DOUBLE); NDArray expOutput('c', {3,3,3}, {4.50755e-02, 4.51394e-02, 6.64586e-03,4.72027e-02, 8.67128e-04, 6.97440e-03,2.35008e-03, 4.59243e-02, 3.32995e-04, 
4.51766e-02, 2.26032e-06, 4.51767e-02,2.91394e-07, 2.37285e-06, 3.94360e-08,4.51769e-02, 1.12535e-07, 4.51767e-02, - 7.58256e-10, 4.51767e-02, 1.22325e-11,7.96007e-10, 1.32293e-11, 1.04994e-01,3.77513e-11, 4.51767e-02, 1.04994e-01}, nd4j::DataType::DOUBLE); - NDArray output('c', {3,3,3}, nd4j::DataType::DOUBLE); + 7.58256e-10, 4.51767e-02, 1.22325e-11,7.96007e-10, 1.32293e-11, 1.04994e-01,3.77513e-11, 4.51767e-02, 1.04994e-01}, sd::DataType::DOUBLE); + NDArray output('c', {3,3,3}, sd::DataType::DOUBLE); - // input.applyTransform(nd4j::transform::SoftMaxDerivative, &output); + // input.applyTransform(sd::transform::SoftMaxDerivative, &output); - nd4j::ops::helpers::softmaxDerivative(input.getContext(), input, output, 1); + sd::ops::helpers::softmaxDerivative(input.getContext(), input, output, 1); ASSERT_TRUE(expOutput.isSameShape(output)); ASSERT_TRUE(expOutput.equalsTo(output)); } @@ -2334,13 +2334,13 @@ TEST_F(HelpersTests1, softmaxDerivative_2) { ////////////////////////////////////////////////////////////////////// TEST_F(HelpersTests1, softmaxDerivative_3) { - NDArray input('c', {5}, {-1., 1, -2, 2, 3}, nd4j::DataType::DOUBLE); - NDArray expOutput('c', {5}, {0.01184, 0.08071, 0.00439, 0.18277, 0.22618}, nd4j::DataType::DOUBLE); - NDArray output('c', {5}, nd4j::DataType::DOUBLE); + NDArray input('c', {5}, {-1., 1, -2, 2, 3}, sd::DataType::DOUBLE); + NDArray expOutput('c', {5}, {0.01184, 0.08071, 0.00439, 0.18277, 0.22618}, sd::DataType::DOUBLE); + NDArray output('c', {5}, sd::DataType::DOUBLE); - // input.applyTransform(nd4j::transform::SoftMaxDerivative, &output); + // input.applyTransform(sd::transform::SoftMaxDerivative, &output); - nd4j::ops::helpers::softmaxDerivative(input.getContext(), input, output, 0); + sd::ops::helpers::softmaxDerivative(input.getContext(), input, output, 0); ASSERT_TRUE(expOutput.isSameShape(output)); ASSERT_TRUE(expOutput.equalsTo(output)); } @@ -2364,19 +2364,19 @@ TEST_F(HelpersTests1, lstmLayerCell_1) { const float outAlpha = 0; 
// alpha value for output activation, not required for tanh const float outBeta = 0; // beta value for output activation, not required for tanh - NDArray x ('c', {bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b ('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {3*nOut}, nd4j::DataType::FLOAT32); + NDArray x ('c', {bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b ('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray Wp('c', {3*nOut}, sd::DataType::FLOAT32); - NDArray h('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray c('c', {bS, nOut}, nd4j::DataType::FLOAT32); + NDArray h('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray c('c', {bS, nOut}, sd::DataType::FLOAT32); - NDArray expH('c', {bS, nOut}, {0.999288, 0.999288, 0.999288, 0.999288, 0.999288, 0.999288, 0.999288, 0.999288}, nd4j::DataType::FLOAT32); - NDArray expC('c', {bS, nOut}, {3.999778, 3.999778, 3.999778, 3.999778, 3.999778, 3.999778, 3.999778, 3.999778}, nd4j::DataType::FLOAT32); + NDArray expH('c', {bS, nOut}, {0.999288, 0.999288, 0.999288, 0.999288, 0.999288, 0.999288, 0.999288, 0.999288}, sd::DataType::FLOAT32); + NDArray expC('c', {bS, nOut}, {3.999778, 3.999778, 3.999778, 3.999778, 3.999778, 3.999778, 3.999778, 3.999778}, sd::DataType::FLOAT32); std::vector params = {dataFormat, 0, cellClip, gateAct, gateAlpha, gateBeta, cellAct, cellAlpha, cellBeta, outAct, outAlpha, outBeta}; @@ -2388,7 +2388,7 @@ TEST_F(HelpersTests1, lstmLayerCell_1) { Wp = 0.3; b = 0.7; - nd4j::ops::helpers::lstmLayerCell(&x, &Wx, &Wr, &b, &hI, &cI, &Wp, params, &h, &c); + 
sd::ops::helpers::lstmLayerCell(&x, &Wx, &Wr, &b, &hI, &cI, &Wp, params, &h, &c); ASSERT_TRUE(expH.isSameShape(h)); ASSERT_TRUE(expH.equalsTo(h)); @@ -2415,19 +2415,19 @@ TEST_F(HelpersTests1, lstmLayerCell_2) { const float outAlpha = 0; // alpha value for output activation, not required for tanh const float outBeta = 0; // beta value for output activation, not required for tanh - NDArray x ('c', {bS, nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b ('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {3*nOut}, nd4j::DataType::FLOAT32); + NDArray x ('c', {bS, nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b ('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray Wp('c', {3*nOut}, sd::DataType::FLOAT32); - NDArray h('c', {bS, nOut}, nd4j::DataType::FLOAT32); - NDArray c('c', {bS, nOut}, nd4j::DataType::FLOAT32); + NDArray h('c', {bS, nOut}, sd::DataType::FLOAT32); + NDArray c('c', {bS, nOut}, sd::DataType::FLOAT32); - NDArray expH('c', {bS, nOut}, {0.995, 0.995, 0.995, 0.995, 0.995, 0.995, 0.995, 0.995}, nd4j::DataType::FLOAT32); - NDArray expC('c', {bS, nOut}, {3., 3., 3., 3., 3., 3., 3., 3.}, nd4j::DataType::FLOAT32); + NDArray expH('c', {bS, nOut}, {0.995, 0.995, 0.995, 0.995, 0.995, 0.995, 0.995, 0.995}, sd::DataType::FLOAT32); + NDArray expC('c', {bS, nOut}, {3., 3., 3., 3., 3., 3., 3., 3.}, sd::DataType::FLOAT32); std::vector params = {dataFormat, 0, cellClip, gateAct, gateAlpha, gateBeta, cellAct, cellAlpha, cellBeta, outAct, outAlpha, outBeta}; @@ -2439,7 +2439,7 @@ TEST_F(HelpersTests1, lstmLayerCell_2) { Wp = 0.3; b = 
0.7; - nd4j::ops::helpers::lstmLayerCell(&x, &Wx, &Wr, &b, &hI, &cI, &Wp, params, &h, &c); + sd::ops::helpers::lstmLayerCell(&x, &Wx, &Wr, &b, &hI, &cI, &Wp, params, &h, &c); ASSERT_TRUE(expH.isSameShape(h)); ASSERT_TRUE(expH.equalsTo(h)); @@ -2465,19 +2465,19 @@ TEST_F(HelpersTests1, lstmLayerCell_3) { const float outAlpha = 0; // alpha value for output activation, not required for tanh const float outBeta = 0; // beta value for output activation, not required for tanh - NDArray x ('c', {nIn}, nd4j::DataType::FLOAT32); - NDArray Wx('c', {nIn, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray Wr('c', {nOut, 4*nOut}, nd4j::DataType::FLOAT32); - NDArray b ('c', {4*nOut}, nd4j::DataType::FLOAT32); - NDArray hI('c', {nOut}, nd4j::DataType::FLOAT32); - NDArray cI('c', {nOut}, nd4j::DataType::FLOAT32); - NDArray Wp('c', {3*nOut}, nd4j::DataType::FLOAT32); + NDArray x ('c', {nIn}, sd::DataType::FLOAT32); + NDArray Wx('c', {nIn, 4*nOut}, sd::DataType::FLOAT32); + NDArray Wr('c', {nOut, 4*nOut}, sd::DataType::FLOAT32); + NDArray b ('c', {4*nOut}, sd::DataType::FLOAT32); + NDArray hI('c', {nOut}, sd::DataType::FLOAT32); + NDArray cI('c', {nOut}, sd::DataType::FLOAT32); + NDArray Wp('c', {3*nOut}, sd::DataType::FLOAT32); - NDArray h('c', {nOut}, nd4j::DataType::FLOAT32); - NDArray c('c', {nOut}, nd4j::DataType::FLOAT32); + NDArray h('c', {nOut}, sd::DataType::FLOAT32); + NDArray c('c', {nOut}, sd::DataType::FLOAT32); - NDArray expH('c', {nOut}, {0.999288, 0.999288, 0.999288, 0.999288}, nd4j::DataType::FLOAT32); - NDArray expC('c', {nOut}, {3.999778, 3.999778, 3.999778, 3.999778}, nd4j::DataType::FLOAT32); + NDArray expH('c', {nOut}, {0.999288, 0.999288, 0.999288, 0.999288}, sd::DataType::FLOAT32); + NDArray expC('c', {nOut}, {3.999778, 3.999778, 3.999778, 3.999778}, sd::DataType::FLOAT32); std::vector params = {dataFormat, 0, cellClip, gateAct, gateAlpha, gateBeta, cellAct, cellAlpha, cellBeta, outAct, outAlpha, outBeta}; @@ -2489,7 +2489,7 @@ TEST_F(HelpersTests1, 
lstmLayerCell_3) { Wp = 0.3; b = 0.7; - nd4j::ops::helpers::lstmLayerCell(&x, &Wx, &Wr, &b, &hI, &cI, &Wp, params, &h, &c); + sd::ops::helpers::lstmLayerCell(&x, &Wx, &Wr, &b, &hI, &cI, &Wp, params, &h, &c); ASSERT_TRUE(expH.isSameShape(h)); ASSERT_TRUE(expH.equalsTo(h)); diff --git a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp index d26fbd122..c43fc6a7b 100644 --- a/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/IndexingTests.cpp @@ -20,11 +20,11 @@ #include "testlayers.h" #include -#include -#include +#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class IndexingTests : public testing::Test { public: @@ -44,7 +44,7 @@ TEST_F(IndexingTests, StridedSlice_1) { auto strides = NDArrayFactory::create({1,1,1}); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &begin, &end, &strides}, {}, {0,0,0,0,0}); //, 2,2,0, 3,3,3, 1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -63,7 +63,7 @@ TEST_F(IndexingTests, StridedSlice_2) { x.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x}, {}, {0,0,0,0,0, 3,2,0, 5,5,3, 1,1,1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -83,7 +83,7 @@ TEST_F(IndexingTests, StridedSlice_3) { x.linspace(1); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x}, {}, {0,0,0,0,0, 3,2,0, 5,5,3, 1,1,2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -106,7 +106,7 @@ TEST_F(IndexingTests, SimpleSlice_1) { exp.p(1, 3.0f); exp.p(2, 3.0f); - nd4j::ops::slice op; + sd::ops::slice op; auto result = op.evaluate({&input}, {}, {1,0,0, 1,1,3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -132,7 +132,7 @@ TEST_F(IndexingTests, SimpleSlice_2) { exp.p(4, 4.0f); exp.p(5, 4.0f); - nd4j::ops::slice op; + sd::ops::slice op; auto result = op.evaluate({&input}, {}, 
{1,0,0, 1,2,3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -157,7 +157,7 @@ TEST_F(IndexingTests, SimpleSlice_3) { exp.p(4, 5.0f); exp.p(5, 5.0f); - nd4j::ops::slice op; + sd::ops::slice op; auto result = op.evaluate({&input}, {}, {1,0,0, 2,1,3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -177,7 +177,7 @@ TEST_F(IndexingTests, SimpleSlice_4) { auto stop = NDArrayFactory::create('c', {3}, {2.0, 1.0, 3.0}); auto exp = NDArrayFactory::create('c', {2, 1, 3}, {3.0, 3.0, 3.0, 5.0, 5.0, 5.0}); - nd4j::ops::slice op; + sd::ops::slice op; auto result = op.evaluate({&input, &start, &stop}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -201,7 +201,7 @@ TEST_F(IndexingTests, MaskedSlice_0) { auto exp = NDArrayFactory::create('c', {1, 5}); exp.assign(2.0f); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix}, {}, {0,0,0,0,0, 1, 2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -227,7 +227,7 @@ TEST_F(IndexingTests, MaskedSlice_00) { auto exp = NDArrayFactory::create('c', {1, 2}, {2, 2}); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix}, {}, {0,0,0,0,0, 1, 1, 2, 3, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -251,7 +251,7 @@ TEST_F(IndexingTests, MaskedSlice_1) { auto exp = NDArrayFactory::create('c', {5}); exp.assign(2.0f); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix}, {}, {0,0,0,0,1, 1, 2, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -272,7 +272,7 @@ TEST_F(IndexingTests, MaskedSlice_2) { auto exp = NDArrayFactory::create('c', {3, 3}, {4.000000f, 4.200000f, 4.300000f, 5.000000f, 5.200000f, 5.300000f, 6.000000f, 6.200000f, 6.300000f}); // output = tf.strided_slice(a, [1, 0, 0], [3, 3, 3], shrink_axis_mask=5) - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix}, {}, {0,0,0,0,1, 1, 0, 0, 3, 3, 3, 1, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ 
-292,7 +292,7 @@ TEST_F(IndexingTests, MaskedSlice_3) { auto exp = NDArrayFactory::create('c', {2, 3}, { 4.f, 4.2f, 4.3f, 7.f, 7.2f, 7.3f}); // output = tf.strided_slice(a, [1, 0, 0], [3, 3, 3], shrink_axis_mask=5) - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix}, {}, {0,0,0,0,2, 1, 0, 0, 3, 3, 3, 1, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -312,7 +312,7 @@ TEST_F(IndexingTests, MaskedSlice_4) { auto exp = NDArrayFactory::create('c', {3}, { 4.f, 4.2f, 4.3f}); // output = tf.strided_slice(a, [1, 0, 0], [3, 3, 3], shrink_axis_mask=5) - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix}, {}, {0,0,0,0, 3, 1, 0, 0, 3, 3, 3, 1, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -335,7 +335,7 @@ TEST_F(IndexingTests, Live_Slice_1) { auto stride = NDArrayFactory::create('c', {3}, {1.0f, 1.0f, 1.0f}); // output = tf.strided_slice(a, [1, 0, 0], [3, 3, 3], shrink_axis_mask=5) - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&matrix, &begin, &end, &stride}, {}, {0,0,0,0,3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -358,7 +358,7 @@ TEST_F(IndexingTests, Test_StridedSlice_1) { auto c = NDArrayFactory::create('c', {1}, {1.f}); auto exp = NDArrayFactory::create({5.0f, 2}); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &a, &b, &c}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -378,7 +378,7 @@ TEST_F(IndexingTests, Test_StridedSlice_2) { auto c = NDArrayFactory::create('c', {2}, {1, 1}); auto exp = NDArrayFactory::create('c', {1}, {5.0}); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &a, &b, &c}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -401,7 +401,7 @@ TEST_F(IndexingTests, Test_StridedSlice_3) { auto c = NDArrayFactory::create('c', {2}, {1, 1}); auto exp = NDArrayFactory::create('c', 
{1}, {6.0}); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &a, &b, &c}, {}, {0, 0, 0, 0, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -422,7 +422,7 @@ TEST_F(IndexingTests, Test_StridedSlice_4) { auto c = NDArrayFactory::create('c', {1}, {1}); auto exp = NDArrayFactory::create({5.0f, 2}); - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.evaluate({&x, &a, &b, &c}, {}, {0, 0, 0, 0, 1}); // auto result = op.execute({&x, &a, &b, &c}, {}, {0, 0, 0, 0, 1, 0, 1, 1}); @@ -455,7 +455,7 @@ TEST_F(IndexingTests, MaskedSlice_5) { auto exp('c', {2, 3}, { 4.f, 4.2f, 4.3f, 7.f, 7.2f, 7.3f}); // output = tf.strided_slice(a, [1, 0, 0], [3, 3, 3], shrink_axis_mask=5) - nd4j::ops::strided_slice op; + sd::ops::strided_slice op; auto result = op.execute({&matrix}, {}, {0,0,0,0,2, 1, 0, 0, 3, 3, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu index f442c0bb9..aa2c13eb5 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropCudaTests.cu @@ -26,8 +26,8 @@ #include #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class JavaInteropCudaTests : public testing::Test { public: @@ -41,7 +41,7 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_1) { x.assign(1.f); e.assign(2.f); - nd4j::ops::add op; + sd::ops::add op; Context context(1); context.setCudaContext(LaunchContext::defaultContext()->getCudaStream(), LaunchContext::defaultContext()->getReductionPointer(), LaunchContext::defaultContext()->getAllocationPointer()); @@ -59,16 +59,16 @@ TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_1) { } TEST_F(JavaInteropCudaTests, test_DeclarableOp_execution_2) { - NDArray x('c', {3, 1, 2}, nd4j::DataType::FLOAT32); - NDArray y('c', {2, 2}, nd4j::DataType::FLOAT32); 
- NDArray z('c', {3, 2, 2}, nd4j::DataType::BOOL); - NDArray e('c', {3, 2, 2}, nd4j::DataType::BOOL); + NDArray x('c', {3, 1, 2}, sd::DataType::FLOAT32); + NDArray y('c', {2, 2}, sd::DataType::FLOAT32); + NDArray z('c', {3, 2, 2}, sd::DataType::BOOL); + NDArray e('c', {3, 2, 2}, sd::DataType::BOOL); x.assign(1.f); y.assign(2.f); e.assign(false); - nd4j::ops::equals op; + sd::ops::equals op; Context context(1); context.setCudaContext(LaunchContext::defaultContext()->getCudaStream(), LaunchContext::defaultContext()->getReductionPointer(), LaunchContext::defaultContext()->getAllocationPointer()); diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp index 2f3f93d56..740e33110 100644 --- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp @@ -18,8 +18,8 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include #include #include @@ -27,8 +27,8 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class JavaInteropTests : public testing::Test { public: @@ -41,7 +41,7 @@ TEST_F(JavaInteropTests, TestShapeExposure1) { auto weights = NDArrayFactory::create('c', {2, 2, 2, 3}); auto exp = NDArrayFactory::create('c', {1, 3, 5, 4}); - nd4j::ops::conv2d op; + sd::ops::conv2d op; std::vector tArgs({}); std::vector iArgs({2, 2, 1, 1, 0, 0, 1, 1, 1}); @@ -70,7 +70,7 @@ TEST_F(JavaInteropTests, TestShapeExposure2) { auto input = NDArrayFactory::create('c', {1, 2, 5, 4}); auto exp = NDArrayFactory::create('c', {4}, {1, 2, 5, 4}); - nd4j::ops::shape_of op; + sd::ops::shape_of op; std::vector tArgs({}); std::vector iArgs({}); @@ -107,7 +107,7 @@ TEST_F(JavaInteropTests, TestShapeExposure3) { Nd4jPointer inputBuffers[] = {x.buffer(), sizes.buffer(), x.getSpecialBuffer(), sizes.getSpecialBuffer()}; Nd4jPointer inputShapes[] = {x.shapeInfo(), sizes.shapeInfo(), 
x.getSpecialShapeInfo(), sizes.getSpecialShapeInfo()}; - nd4j::ops::split_v op; + sd::ops::split_v op; Nd4jLong iArgs[] = {1}; auto hash = op.getOpHash(); @@ -128,7 +128,7 @@ TEST_F(JavaInteropTests, Test_Squeeze_1) { auto z = NDArrayFactory::create('c', {6}); auto e = NDArrayFactory::create('c', {6}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); - nd4j::ops::squeeze op; + sd::ops::squeeze op; Nd4jPointer ptrsInBuffer[] = {(Nd4jPointer) x.getBuffer(), x.getSpecialBuffer()}; Nd4jPointer ptrsInShapes[] = {(Nd4jPointer) x.getShapeInfo(), x.getSpecialShapeInfo()}; @@ -149,7 +149,7 @@ TEST_F(JavaInteropTests, Test_RDiv_1) { NDArray::prepareSpecialUse({&z}, {&x, &y}); - nd4j::ops::reversedivide op; + sd::ops::reversedivide op; Nd4jPointer ptrsInBuffer[] = {(Nd4jPointer) x.getBuffer(), (Nd4jPointer) y.getBuffer(), x.getSpecialBuffer(), y.getSpecialBuffer()}; Nd4jPointer ptrsInShapes[] = {(Nd4jPointer) x.getShapeInfo(), (Nd4jPointer) y.getShapeInfo(), x.getSpecialShapeInfo(), y.getSpecialShapeInfo()}; @@ -182,7 +182,7 @@ TEST_F(JavaInteropTests, TestSconv2d_1) { auto expOutput = NDArrayFactory::create('c', {3, 2, 8, 8}); - nd4j::ops::sconv2d op; + sd::ops::sconv2d op; NDArray::prepareSpecialUse({&output}, {&input, &weightsD, &weightsP, &bias}); @@ -217,7 +217,7 @@ TEST_F(JavaInteropTests, TestSconv2d_2) { auto expOutput = NDArrayFactory::create('c', {3, 3, 8, 8}); - nd4j::ops::sconv2d op; + sd::ops::sconv2d op; NDArray::prepareSpecialUse({&output}, {&input, &weightsD}); @@ -253,7 +253,7 @@ TEST_F(JavaInteropTests, TestMaxPooling2d_1) { std::vector iArgs({2, 2, 1, 1, 0, 0, 1, 1, 1}); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; Nd4jStatus status = execCustomOp(nullptr, op.getOpHash(), ptrsInBuffer, ptrsInShapes, 1, ptrsOutBuffers, ptrsOutShapes, 1, nullptr, 0, iArgs.data(), 9, nullptr, 0, false); @@ -282,7 +282,7 @@ TEST_F(JavaInteropTests, TestCol2Im_1) { Nd4jPointer ptrsOutBuffers[] = {(Nd4jPointer) output.getBuffer(), output.getSpecialBuffer()}; Nd4jPointer ptrsOutShapes[] = 
{(Nd4jPointer) output.getShapeInfo(), output.getSpecialShapeInfo()}; - nd4j::ops::col2im op; + sd::ops::col2im op; Nd4jLong exp[] = {1, 1, 1, 1, 4, 5, 1, 1, 1}; @@ -312,7 +312,7 @@ TEST_F(JavaInteropTests, TestPNorm_1) { NDArray::prepareSpecialUse({&output}, {&input}); - nd4j::ops::pnormpool2d op; + sd::ops::pnormpool2d op; Nd4jLong exp[] = {2, 2, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0}; @@ -338,7 +338,7 @@ TEST_F(JavaInteropTests, TestInplace_1) { NDArray::prepareSpecialUse({}, {&input}); - nd4j::ops::clipbyvalue op; + sd::ops::clipbyvalue op; double extras[] = {-1.0f, 1.0f}; @@ -408,7 +408,7 @@ TEST_F(JavaInteropTests, Test_FastPath_Validation_1) { ctx.setInputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j::ops::softmax op; + sd::ops::softmax op; auto status = op.execute(&ctx); ASSERT_NE(Status::OK(), status); } @@ -421,7 +421,7 @@ TEST_F(JavaInteropTests, Test_FastPath_Validation_2) { ctx.setInputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j::ops::softmax op; + sd::ops::softmax op; auto status = op.execute(&ctx); ASSERT_NE(Status::OK(), status); } @@ -442,7 +442,7 @@ TEST_F(JavaInteropTests, Test_FastPath_Validation_3) { ctx.setInputArray(2, max.buffer(), max.shapeInfo(), max.specialBuffer(), max.specialShapeInfo()); ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j::ops::fake_quant_with_min_max_vars_per_channel op; + sd::ops::fake_quant_with_min_max_vars_per_channel op; ASSERT_ANY_THROW(op.execute(&ctx)); } @@ -458,7 +458,7 @@ TEST_F(JavaInteropTests, Test_empty_cast_1) { ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); ctx.setIArguments(iArgs, 1); - nd4j::ops::cast op; + sd::ops::cast op; auto result = op.execute(&ctx); 
ASSERT_EQ(Status::OK(), result); ASSERT_EQ(e, z); @@ -477,7 +477,7 @@ TEST_F(JavaInteropTests, test_avgpooling_edge_1) { NDArray::prepareSpecialUse({&z}, {&x}); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; //auto result = op.execute({&x}, {}, {3,3, 1,1, 0,0, 1,1, 1, 0, 1}); Nd4jLong exp[] = {3,3, 1,1, 0,0, 1,1, 1, 0, 1}; @@ -511,11 +511,11 @@ TEST_F(JavaInteropTests, test_avgpooling_edge_1) { int hTo = hFrom + k; int wTo = wFrom + k; - hFrom = nd4j::math::nd4j_max(0, hFrom); - wFrom = nd4j::math::nd4j_max(0, wFrom); + hFrom = sd::math::nd4j_max(0, hFrom); + wFrom = sd::math::nd4j_max(0, wFrom); - hTo = nd4j::math::nd4j_min(inOutH, hTo); - wTo = nd4j::math::nd4j_min(inOutW, wTo); + hTo = sd::math::nd4j_min(inOutH, hTo); + wTo = sd::math::nd4j_min(inOutW, wTo); int idxOut[4]; int idxIn[4]; @@ -547,7 +547,7 @@ TEST_F(JavaInteropTests, test_avgpooling_edge_1) { for (int e = 0; e < z.lengthOf() && cnt < lim; e++) { auto _m = m.e(e); auto _z = z.e(e); - auto eq = nd4j::math::nd4j_eq(_m, _z, 1e-5); + auto eq = sd::math::nd4j_eq(_m, _z, 1e-5); if (!eq) { nd4j_printf("Difference at element e [%i]: <%f> vs <%f>\n", e, _m, _z); cnt++; @@ -559,7 +559,7 @@ TEST_F(JavaInteropTests, test_avgpooling_edge_1) { TEST_F(JavaInteropTests, Test_GraphReuse_1) { - uint8_t* data = nd4j::graph::readFlatBuffers("./resources/reduce_dim_false.fb"); + uint8_t* data = sd::graph::readFlatBuffers("./resources/reduce_dim_false.fb"); registerGraph(nullptr, 119, (Nd4jPointer) data); @@ -582,7 +582,7 @@ TEST_F(JavaInteropTests, Test_GraphReuse_2) { auto exp2 = NDArrayFactory::create('c', {3}, {9, 9, 9}); // we load graph from file, because we're not in java here, and dont have buffer ready - uint8_t* data = nd4j::graph::readFlatBuffers("./resources/reduce_dim_false.fb"); + uint8_t* data = sd::graph::readFlatBuffers("./resources/reduce_dim_false.fb"); // we ensure that there's no such a graph stored earlier ASSERT_FALSE(GraphHolder::getInstance()->hasGraph(119)); @@ -667,7 +667,7 @@ 
TEST_F(JavaInteropTests, Test_Greater_1) { NDArray::prepareSpecialUse({&o}, {&x, &y}); - nd4j::ops::greater op; + sd::ops::greater op; Nd4jPointer ptrsInBuffer[] = {(Nd4jPointer) x.getBuffer(), (Nd4jPointer) y.getBuffer(), x.getSpecialBuffer(), y.getSpecialBuffer()}; Nd4jPointer ptrsInShapes[] = {(Nd4jPointer) x.getShapeInfo(), (Nd4jPointer) y.getShapeInfo(), x.getSpecialShapeInfo(), y.getSpecialShapeInfo()}; @@ -689,7 +689,7 @@ TEST_F(JavaInteropTests, Test_Greater_2) { auto exp = NDArrayFactory::create('c', {2, 2}, {false, false, true, true}); - nd4j::ops::greater op; + sd::ops::greater op; NDArray::prepareSpecialUse({&o}, {&x, &y}); @@ -708,7 +708,7 @@ TEST_F(JavaInteropTests, Test_Greater_2) { TEST_F(JavaInteropTests, Test_Boolean_Op_1) { - nd4j::ops::is_non_decreasing op; + sd::ops::is_non_decreasing op; auto x = NDArrayFactory::create('c', {5}, {1.f, 2.f, 3.f, 4.f, 5.f}); auto o = NDArrayFactory::create(false); @@ -737,7 +737,7 @@ TEST_F(JavaInteropTests, Test_Inplace_Outputs_1) { auto exp = NDArrayFactory::create('c', {2, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); auto z = NDArrayFactory::create('c', {2, 3}); - nd4j::ops::test_output_reshape op; + sd::ops::test_output_reshape op; NDArray::prepareSpecialUse({&z}, {&x}); @@ -765,7 +765,7 @@ TEST_F(JavaInteropTests, Test_Inplace_Outputs_2) { auto e = NDArrayFactory::create('c', {2, 3}, {3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); - nd4j::ops::add op; + sd::ops::add op; NDArray::prepareSpecialUse({&z}, {&x, &y}); @@ -792,7 +792,7 @@ TEST_F(JavaInteropTests, Test_Inplace_Outputs_3) { auto output = NDArrayFactory::create('f', {2, 1, 6, 4}); auto e = NDArrayFactory::create('c', {2, 1, 6, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 9,10,11,12, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, 21,22,23,24, 17,18,19,20, 21,22,23,24}); - nd4j::ops::gather op; + sd::ops::gather op; NDArray::prepareSpecialUse({&output}, {&input, &indices}); @@ -823,15 +823,15 @@ TEST_F(JavaInteropTests, Test_Reduce3_EdgeCase) { auto dims = 
NDArrayFactory::create('c', {2}, {0, 1}); dims.syncToHost(); - nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(); + sd::LaunchContext* context = sd::LaunchContext::defaultContext(); Nd4jPointer* extraPointers = nullptr; #ifdef __CUDABLAS__ extraPointers = new Nd4jPointer[6] {nullptr, context->getCudaStream(), context->getScalarPointer(), nullptr, context->getCudaSpecialStream(), context->getReductionPointer()}; #endif - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {0,1}); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {0,1}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {0,1}); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {0,1}); NDArray::prepareSpecialUse({&z}, {&x, &y, &dims}); OpaqueDataBuffer xBuf(x.dataBuffer()); @@ -856,7 +856,7 @@ TEST_F(JavaInteropTests, Test_SimpleIf_Output) { Environment::getInstance()->setDebug(true); Environment::getInstance()->setVerbose(false); - auto pl = nd4j::graph::readFlatBuffers("./resources/simpleif_0_1.fb"); + auto pl = sd::graph::readFlatBuffers("./resources/simpleif_0_1.fb"); auto ptr = executeFlatGraph(nullptr, pl); Environment::getInstance()->setDebug(false); @@ -873,7 +873,7 @@ TEST_F(JavaInteropTests, Test_AveragePooling_FF_TF_double) { auto z = NDArrayFactory::create('c', {4, 4, 4, 3}); auto exp = NDArrayFactory::create('c', {4, 4, 4, 3}, {7.97172260, 0.06878620, 2.27749538, 7.29276514, -0.14074677, 0.65480286, 5.70313978, -0.06546132, 0.35443667, 3.70382833, -0.84020567, 0.63826996, 8.60301399, -0.38236514, 1.55177069, 7.37542057, -0.99374938, -0.29971302, 8.84352493, -0.67121059, 0.43132120, 4.78175592, -1.25070143, -1.91523600, 6.03855371, -0.00292124, -1.11214364, 7.90158176, -0.57949901, -0.96735370, 7.81192017, -0.53255427, -0.48009714, 3.16953635, 0.08353355, -1.54299748, 3.74821687, 1.69396687, 0.72724354, 5.42915201, 
-1.13686812, -0.71793109, 5.78376389, -0.72239977, -0.60055625, 2.53636408, 0.56777251, -2.07892323, 6.08064651, 0.68620735, 2.54017019, 5.65828180, -0.68255502, 1.47283304, 6.10842514, -0.39655915, 0.28380761, 1.96707797, -1.98206317, 0.94027776, 4.71811438, 0.32104525, -0.92409706, 8.34588146, -1.05581069, -0.55217457, 9.58440876, -0.96549922, 0.45820439, 5.65453672, -2.50953507, -0.71441835, 8.03059578, -0.21281289, 0.92125505, 9.26900673, -0.35963219, -0.70039093, 8.59924412, -1.22358346, 0.81318003, 3.85920119, -0.01305223, -1.09234154, 6.33158875, 1.28094780, -1.48926139, 4.94969177, -0.77126902, -1.97033751, 5.64381838, -0.16285487, -1.31277227, 2.39893222, -1.32902908, -1.39609122, 6.47572327, -0.45267010, 1.55727172, 6.70965624, -1.68735468, -0.05672536, 7.25092363, -0.64613032, 0.67050058, 3.60789680, -2.05948973, 2.22687531, 8.15202713, -0.70148355, 1.28314006, 8.14842319, -1.88807654, -1.04808438, 8.45500565, -0.76425624, 0.94542569, 4.56179953, -0.28786001, -2.04502511, 8.46278095, -0.31019822, 0.07339200, 9.34214592, -0.61948007, 0.52481830, 8.32515621, -1.52418160, 0.49678251, 5.11082315, -1.09908783, -0.52969611, 5.27806664, 0.88632923, 0.66754371, 4.75839233, 0.48928693, -0.68036932, 6.56925392, -0.02949905, -2.99189186, 4.46320581, -0.64534980, -0.29516968, 8.60809517, -1.13120568, 3.41720533, 5.84243155, -1.24109328, 0.89566326, 5.99578333, -0.42496428, 2.07076764, 3.17812920, -0.81566459, -0.14363396, 6.55184317, 0.39633346, -0.43852386, 8.70214558, -2.24613595, 0.30708700, 8.73882294, -0.53545928, 1.54409575, 4.49452257, -0.16509305, 0.19028664, 8.24897003, 0.44750381, 2.15448594, 8.97640514, -0.77728152, 0.57272542, 9.03467560, 0.47173575, -1.10807717, 3.30056310, -0.43268481, -0.41470885, 3.53798294, -0.08546703, -2.16840744, 6.18733406, -0.17871059, -2.59837723, 5.94218683, -1.02990067, -0.49760687, 3.76938033, 0.86383581, -1.91504073}); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; NDArray::prepareSpecialUse({&z}, {&input}); @@ -911,7 
+911,7 @@ TEST_F(JavaInteropTests, Test_MaxPool2D_float_1) { Nd4jLong iArgs[] = {2,2, 1,1, 1,1, 2,2,1, 0,0}; - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; auto hash = op.getOpHash(); auto status = execCustomOp(nullptr, hash, ptrsInBuffer, ptrsInShapes, 1, ptrsOutBuffers, ptrsOutShapes, 1, nullptr, 0, iArgs, 11, nullptr, 0, false); @@ -939,7 +939,7 @@ TEST_F(JavaInteropTests, Test_Unstack_1) { Nd4jLong iArgs[] = {0}; - nd4j::ops::unstack op; + sd::ops::unstack op; auto hash = op.getOpHash(); auto status = execCustomOp(nullptr, hash, ptrsInBuffer, ptrsInShapes, 1, ptrsOutBuffers, ptrsOutShapes, 5, nullptr, 0, iArgs, 1, nullptr, 0, false); @@ -954,7 +954,7 @@ TEST_F(JavaInteropTests, Test_AveragePooling_FF_TF_float) { auto z = NDArrayFactory::create('c', {4, 4, 4, 3}); auto exp = NDArrayFactory::create('c', {4, 4, 4, 3}, {7.97172260f, 0.06878620f, 2.27749538f, 7.29276514f, -0.14074677f, 0.65480286f, 5.70313978f, -0.06546132f, 0.35443667f, 3.70382833f, -0.84020567f, 0.63826996f, 8.60301399f, -0.38236514f, 1.55177069f, 7.37542057f, -0.99374938f, -0.29971302f, 8.84352493f, -0.67121059f, 0.43132120f, 4.78175592f, -1.25070143f, -1.91523600f, 6.03855371f, -0.00292124f, -1.11214364f, 7.90158176f, -0.57949901f, -0.96735370f, 7.81192017f, -0.53255427f, -0.48009714f, 3.16953635f, 0.08353355f, -1.54299748f, 3.74821687f, 1.69396687f, 0.72724354f, 5.42915201f, -1.13686812f, -0.71793109f, 5.78376389f, -0.72239977f, -0.60055625f, 2.53636408f, 0.56777251f, -2.07892323f, 6.08064651f, 0.68620735f, 2.54017019f, 5.65828180f, -0.68255502f, 1.47283304f, 6.10842514f, -0.39655915f, 0.28380761f, 1.96707797f, -1.98206317f, 0.94027776f, 4.71811438f, 0.32104525f, -0.92409706f, 8.34588146f, -1.05581069f, -0.55217457f, 9.58440876f, -0.96549922f, 0.45820439f, 5.65453672f, -2.50953507f, -0.71441835f, 8.03059578f, -0.21281289f, 0.92125505f, 9.26900673f, -0.35963219f, -0.70039093f, 8.59924412f, -1.22358346f, 0.81318003f, 3.85920119f, -0.01305223f, -1.09234154f, 6.33158875f, 1.28094780f, 
-1.48926139f, 4.94969177f, -0.77126902f, -1.97033751f, 5.64381838f, -0.16285487f, -1.31277227f, 2.39893222f, -1.32902908f, -1.39609122f, 6.47572327f, -0.45267010f, 1.55727172f, 6.70965624f, -1.68735468f, -0.05672536f, 7.25092363f, -0.64613032f, 0.67050058f, 3.60789680f, -2.05948973f, 2.22687531f, 8.15202713f, -0.70148355f, 1.28314006f, 8.14842319f, -1.88807654f, -1.04808438f, 8.45500565f, -0.76425624f, 0.94542569f, 4.56179953f, -0.28786001f, -2.04502511f, 8.46278095f, -0.31019822f, 0.07339200f, 9.34214592f, -0.61948007f, 0.52481830f, 8.32515621f, -1.52418160f, 0.49678251f, 5.11082315f, -1.09908783f, -0.52969611f, 5.27806664f, 0.88632923f, 0.66754371f, 4.75839233f, 0.48928693f, -0.68036932f, 6.56925392f, -0.02949905f, -2.99189186f, 4.46320581f, -0.64534980f, -0.29516968f, 8.60809517f, -1.13120568f, 3.41720533f, 5.84243155f, -1.24109328f, 0.89566326f, 5.99578333f, -0.42496428f, 2.07076764f, 3.17812920f, -0.81566459f, -0.14363396f, 6.55184317f, 0.39633346f, -0.43852386f, 8.70214558f, -2.24613595f, 0.30708700f, 8.73882294f, -0.53545928f, 1.54409575f, 4.49452257f, -0.16509305f, 0.19028664f, 8.24897003f, 0.44750381f, 2.15448594f, 8.97640514f, -0.77728152f, 0.57272542f, 9.03467560f, 0.47173575f, -1.10807717f, 3.30056310f, -0.43268481f, -0.41470885f, 3.53798294f, -0.08546703f, -2.16840744f, 6.18733406f, -0.17871059f, -2.59837723f, 5.94218683f, -1.02990067f, -0.49760687f, 3.76938033f, 0.86383581f, -1.91504073f}); - nd4j::ops::avgpool2d op; + sd::ops::avgpool2d op; NDArray::prepareSpecialUse({&z}, {&input}); @@ -1008,7 +1008,7 @@ TEST_F(JavaInteropTests, Test_Add_1) { NDArray::prepareSpecialUse({&x}, {&x, &y}); - nd4j::ops::add op; + sd::ops::add op; Nd4jPointer ptrsInBuffer[] = {(Nd4jPointer) x.getBuffer(), y.getBuffer(), x.getSpecialBuffer(), y.getSpecialBuffer()}; Nd4jPointer ptrsInShapes[] = {(Nd4jPointer) x.getShapeInfo(), y.getShapeInfo(), x.getSpecialShapeInfo(), y.getSpecialShapeInfo(),}; @@ -1031,7 +1031,7 @@ TEST_F(JavaInteropTests, zeta_test10) { auto e = 
NDArrayFactory::create('c', {3, 4}, {23.014574, 12.184081, 8.275731, 6.1532226, 4.776538, 3.7945523, 3.0541048, 2.4765317, 2.0163891, 205.27448, 21.090889, 19.477398}); - nd4j::ops::zeta op; + sd::ops::zeta op; NDArray::prepareSpecialUse({&z}, {&x, &q}); @@ -1064,7 +1064,7 @@ TEST_F(JavaInteropTests, Test_Boolean_Broadcastables_1) { Nd4jPointer ptrsInShapes[] = {reinterpret_cast(arrayX.shapeInfo()), reinterpret_cast(arrayY.shapeInfo()), arrayX.getSpecialShapeInfo(), arrayY.getSpecialShapeInfo()}; NDArray::prepareSpecialUse({}, {&arrayX, &arrayY}); - nd4j::ops::greater_equal op; + sd::ops::greater_equal op; auto shapeList = calculateOutputShapes2(nullptr, op.getOpHash(), ptrsInBuffer, ptrsInShapes, 2, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0); NDArray::registerSpecialUse({}, {&arrayX, &arrayY}); delete shapeList; @@ -1083,7 +1083,7 @@ TEST_F(JavaInteropTests, Test_L2_Loss_3) { Nd4jPointer ptrsOutBuffer[] = {reinterpret_cast(z.buffer()), z.getSpecialBuffer()}; Nd4jPointer ptrsOutShapes[] = {reinterpret_cast(z.shapeInfo()), z.getSpecialShapeInfo()}; - nd4j::ops::l2_loss op; + sd::ops::l2_loss op; auto status = execCustomOp(nullptr, op.getOpHash(), ptrsInBuffer, ptrsInShapes, 1, ptrsOutBuffer, ptrsOutShapes, 1, nullptr, 0, nullptr, 0, nullptr, 0, false); ASSERT_EQ(Status::OK(), status); @@ -1108,7 +1108,7 @@ TEST_F(JavaInteropTests, Test_Fastpath_3) { ASSERT_EQ(2, ctx.width()); - nd4j::ops::add op; + sd::ops::add op; execCustomOp2(nullptr, op.getOpHash(), &ctx); NDArray::registerSpecialUse({&z}, {&array0, &array1}); @@ -1130,7 +1130,7 @@ TEST_F(JavaInteropTests, Test_Fastpath_4) { ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); ctx.setIArguments(iArgs, 3); - nd4j::ops::tri op; + sd::ops::tri op; execCustomOp2(nullptr, op.getOpHash(), &ctx); NDArray::registerSpecialUse({&z}, {}); @@ -1153,7 +1153,7 @@ TEST_F(JavaInteropTests, Test_Fastpath_5) { ctx.setInputArray(1, b.buffer(), b.shapeInfo(), b.specialBuffer(), 
b.specialShapeInfo()); ctx.setOutputArray(0, c.buffer(), c.shapeInfo(), c.specialBuffer(), c.specialShapeInfo()); - nd4j::ops::matmul op; + sd::ops::matmul op; auto status = execCustomOp2(nullptr, op.getOpHash(), &ctx); NDArray::registerSpecialUse({&c}, {&b, &c}); @@ -1186,7 +1186,7 @@ TEST_F(JavaInteropTests, Test_Fastpath_6) { ctx.setIArguments(iArgs, 3); - nd4j::ops::matmul_bp op; + sd::ops::matmul_bp op; auto status = execCustomOp2(nullptr, op.getOpHash(), &ctx); NDArray::registerSpecialUse({&gA, &gB}, {&a, &b, &gI}); @@ -1207,7 +1207,7 @@ TEST_F(JavaInteropTests, Test_Fastpath_7) { ctx.setIArguments(iArgs, 1); - nd4j::ops::concat op; + sd::ops::concat op; ctx.setInputArray(0, a.buffer(), a.shapeInfo(), a.specialBuffer(), a.specialShapeInfo()); ctx.setInputArray(1, b.buffer(), b.shapeInfo(), b.specialBuffer(), b.specialShapeInfo()); @@ -1230,7 +1230,7 @@ TEST_F(JavaInteropTests, test_bfloat16_rng) { RandomGenerator rng(119, 323841120L); bfloat16 args[2] = {(bfloat16) 0.0f, (bfloat16) 1.0f}; OpaqueDataBuffer zBuf(z.dataBuffer()); - execRandom(nullptr, nd4j::random::Ops::UniformDistribution, &rng, &zBuf, z.shapeInfo(), z.specialShapeInfo(), args); + execRandom(nullptr, sd::random::Ops::UniformDistribution, &rng, &zBuf, z.shapeInfo(), z.specialShapeInfo(), args); //z.printIndexedBuffer("z"); ASSERT_TRUE(z.sumNumber().e(0) > 0); @@ -1254,7 +1254,7 @@ TEST_F(JavaInteropTests, test_ismax_view) { ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); ctx.setIArguments(iArgs, 1); - nd4j::ops::ismax op; + sd::ops::ismax op; op.execute(&ctx); ASSERT_EQ(e, z); @@ -1269,7 +1269,7 @@ TEST_F(JavaInteropTests, test_size_dtype_1) { ctx.setInputArray(0, x.buffer(), x.shapeInfo(), x.specialBuffer(), x.specialShapeInfo()); ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); - nd4j::ops::size op; + sd::ops::size op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); @@ -1278,7 +1278,7 @@ 
TEST_F(JavaInteropTests, test_size_dtype_1) { TEST_F(JavaInteropTests, test_expandable_array_op_1) { auto x = NDArrayFactory::string( {2}, {"first string", "second"}); - auto d = NDArrayFactory::string(" ", nd4j::DataType::UTF8); + auto d = NDArrayFactory::string(" ", sd::DataType::UTF8); auto z0 = NDArrayFactory::create('c', {6}); auto z1 = NDArrayFactory::string( {3}, {"", "", ""}); @@ -1295,7 +1295,7 @@ TEST_F(JavaInteropTests, test_expandable_array_op_1) { ctx.setOutputArray(0, &iz0, z0.shapeInfo(), z0.specialShapeInfo()); ctx.setOutputArray(1, &iz1, z1.shapeInfo(), z1.specialShapeInfo()); - nd4j::ops::compat_string_split op; + sd::ops::compat_string_split op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); @@ -1329,14 +1329,14 @@ TEST_F(JavaInteropTests, test_workspace_backed_arrays_1) { ctx.setIArguments({2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 0}); - nd4j::ops::maxpool2d_bp op; + sd::ops::maxpool2d_bp op; auto status = op.execute(&ctx); ASSERT_EQ(Status::OK(), status); } /* TEST_F(JavaInteropTests, Test_Results_Conversion_1) { - auto pl = nd4j::graph::readFlatBuffers("./resources/gru_dynamic_mnist.fb"); + auto pl = sd::graph::readFlatBuffers("./resources/gru_dynamic_mnist.fb"); auto ptr = executeFlatGraph(nullptr, pl); // at this point we have FlatResults @@ -1363,7 +1363,7 @@ TEST_F(JavaInteropTests, Test_Results_Conversion_1) { ASSERT_TRUE(shape->size() > 0 && rank >= 0 && rank < MAX_RANK); // building regular NDArray out of this FlatArray - auto ndarray = nd4j::graph::FlatUtils::fromFlatArray(flatArray); + auto ndarray = sd::graph::FlatUtils::fromFlatArray(flatArray); // rank should match FlatArray ASSERT_EQ(rank, ndarray->rankOf()); @@ -1395,7 +1395,7 @@ TEST_F(JavaInteropTests, Test_Results_Conversion_1) { // for (int e = 0; e < exp.size(); e++) { // auto f = static_cast(e); -// auto tmp = nd4j::math::nd4j_exp((f / 100000.0 * 2.0 - 1.0) * 6.0); +// auto tmp = sd::math::nd4j_exp((f / 100000.0 * 2.0 - 1.0) * 6.0); // exp[e] = static_cast(tmp / (tmp 
+ 1.0)); // } diff --git a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu index 5bf8c8b57..a114f7179 100644 --- a/libnd4j/tests_cpu/layers_tests/LambdaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/LambdaTests.cu @@ -24,7 +24,7 @@ #include #include -using namespace nd4j; +using namespace sd; class LambdaTests : public testing::Test { public: @@ -191,10 +191,10 @@ template void testPairwiseMy(NDArray &x, NDArray &y, NDArray &z) { auto f = LAMBDA_TT(x, y){ - return nd4j::math::nd4j_max(x, (T)0.f) + return sd::math::nd4j_max(x, (T)0.f) - x * y - + nd4j::math::nd4j_log((T)1.f - + nd4j::math::nd4j_exp(-nd4j::math::nd4j_abs(x))); + + sd::math::nd4j_log((T)1.f + + sd::math::nd4j_exp(-sd::math::nd4j_abs(x))); }; x.applyPairwiseLambda(y, f, z); @@ -204,8 +204,8 @@ void testPairwiseMy(NDArray &x, NDArray &y, NDArray &z) { TEST_F(LambdaTests, test_basic_9) { NDArray labels('c', {2,3,4},{0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0}); - NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray output('c', {2,3,4}, nd4j::DataType::DOUBLE); + NDArray logits('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray output('c', {2,3,4}, sd::DataType::DOUBLE); NDArray expected('c', {2,3,4}, {0.744397, 0.598139, 0.554355, 0.913015, 0.474077, 1.037488, 0.403186, 1.171101, 0.341154, 1.313262, 0.287335, 1.463282, 0.241008, 1.620417, 0.201413, 1.783901, 0.167786, 1.952978, 2.039387, 0.126928, 0.115520, 2.305083, 0.095545, 2.486836}); logits.linspace(0.1, 0.1); diff --git a/libnd4j/tests_cpu/layers_tests/LaunchContextCudaTests.cu b/libnd4j/tests_cpu/layers_tests/LaunchContextCudaTests.cu index d7632ace5..e16df80e6 100644 --- a/libnd4j/tests_cpu/layers_tests/LaunchContextCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/LaunchContextCudaTests.cu @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include @@ -34,8 +34,8 @@ #include #include -using namespace nd4j; -using 
namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class LaunchContextCudaTests : public testing::Test { // diff --git a/libnd4j/tests_cpu/layers_tests/LegacyOpsCudaTests.cu b/libnd4j/tests_cpu/layers_tests/LegacyOpsCudaTests.cu index 354051f81..53179cd68 100644 --- a/libnd4j/tests_cpu/layers_tests/LegacyOpsCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/LegacyOpsCudaTests.cu @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include @@ -32,8 +32,8 @@ #include #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class LegacyOpsCudaTests : public testing::Test { diff --git a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp index 35f46d739..9bb35dc65 100644 --- a/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/LegacyOpsTests.cpp @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include -#include -#include +#include +#include +#include #include #include #include @@ -32,8 +32,8 @@ #include #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class LegacyOpsTests : public testing::Test { @@ -47,7 +47,7 @@ TEST_F(LegacyOpsTests, TransformTests_1) { auto exp = NDArrayFactory::create('c', {5, 5}); exp.assign(-1.0); - nd4j::ops::LegacyTransformSameOp op(transform::Neg); // Neg + sd::ops::LegacyTransformSameOp op(transform::Neg); // Neg auto status = op.execute({&x}, {&z}, {}, {}, {}); ASSERT_EQ(status, ND4J_STATUS_OK); //z.printIndexedBuffer("Output NEG"); @@ -61,7 +61,7 @@ TEST_F(LegacyOpsTests, TransformTests_2) { auto exp = NDArrayFactory::create('c', {5, 5}); exp.assign(-1.0); - nd4j::ops::LegacyTransformSameOp op(transform::Neg); // Neg + sd::ops::LegacyTransformSameOp op(transform::Neg); // Neg auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(1, result->size()); @@ -80,7 +80,7 @@ 
TEST_F(LegacyOpsTests, Reciprocal_1) { auto ethalon = NDArrayFactory::create('c', {5, 5}); ethalon.assign(0.5f); - nd4j::ops::LegacyTransformSameOp op(transform::Reciprocal); // Reciprocal + sd::ops::LegacyTransformSameOp op(transform::Reciprocal); // Reciprocal Nd4jStatus status = op.execute({&x}, {&x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -98,7 +98,7 @@ TEST_F(LegacyOpsTests, PWT_Tests_1) { auto exp = NDArrayFactory::create('c', {5, 5}); exp.assign(6.0); - nd4j::ops::LegacyPairwiseTransformOp op(pairwise::Multiply); // Multiply + sd::ops::LegacyPairwiseTransformOp op(pairwise::Multiply); // Multiply Nd4jStatus status = op.execute({&x, &y}, {&x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -118,7 +118,7 @@ TEST_F(LegacyOpsTests, PWT_Tests_2) { auto exp = NDArrayFactory::create('c', {5, 5}); exp.assign(6.0); - nd4j::ops::LegacyPairwiseTransformOp op(pairwise::Multiply); // Multiply + sd::ops::LegacyPairwiseTransformOp op(pairwise::Multiply); // Multiply auto result = op.evaluate({&x, &y}, {}, {}); auto z = result->at(0); @@ -136,7 +136,7 @@ TEST_F(LegacyOpsTests, Scalar_Test_1) { auto exp = NDArrayFactory::create('c', {5, 5}); exp.assign(7.0); - nd4j::ops::LegacyScalarOp op(scalar::Add); + sd::ops::LegacyScalarOp op(scalar::Add); op.execute({&x}, {&x}, {5.0}, {}, {}); // ASSERT_TRUE(exp.equalsTo(&x)); @@ -151,7 +151,7 @@ TEST_F(LegacyOpsTests, Scalar_Test_2) { auto y = NDArrayFactory::create(5.0f); - nd4j::ops::LegacyScalarOp op(scalar::Add, y); + sd::ops::LegacyScalarOp op(scalar::Add, y); auto result = op.evaluate({&x}, {}, {}); auto z = result->at(0); @@ -165,7 +165,7 @@ TEST_F(LegacyOpsTests, ReduceTests_1) { auto x = NDArrayFactory::create('c', {5, 5}); x.assign(1.0); int opNum = reduce::Sum; - nd4j::ops::LegacyReduceSameOp op(opNum); + sd::ops::LegacyReduceSameOp op(opNum); auto result = op.evaluate({&x}, {}, {}); @@ -184,7 +184,7 @@ TEST_F(LegacyOpsTests, ReduceTests_2) { auto x = NDArrayFactory::create('c', {5, 5}); x.assign(1.0); - 
nd4j::ops::LegacyReduceSameOp op(reduce::Sum); + sd::ops::LegacyReduceSameOp op(reduce::Sum); auto axis = NDArrayFactory::create('c', {1}, {1}); auto result = op.evaluate({&x, &axis}, {}, {}); @@ -207,7 +207,7 @@ TEST_F(LegacyOpsTests, ReduceTests_3) { auto indices = NDArrayFactory::create('c', {1,1}, {1}); - nd4j::ops::LegacyReduceSameOp op(reduce::Sum); + sd::ops::LegacyReduceSameOp op(reduce::Sum); auto result = op.evaluate({&x, &indices}, {}, {}); auto z = result->at(0); auto exp = x.reduceAlongDimension(reduce::Sum,{1}); @@ -227,7 +227,7 @@ TEST_F(LegacyOpsTests, ReduceTests_4) { auto indices = NDArrayFactory::create('c', {1, 1}, {1}); - nd4j::ops::LegacyReduceSameOp op(reduce::Sum); + sd::ops::LegacyReduceSameOp op(reduce::Sum); auto result = op.evaluate({&x, &indices}, {}, {}, {true}); auto z = result->at(0); auto exp = x.reduceAlongDimension(reduce::Sum, {1}, true); @@ -245,7 +245,7 @@ TEST_F(LegacyOpsTests, ReduceTests_5) { auto x = NDArrayFactory::create('c', {5, 5}); x.assign(1.0); int opNum = reduce::Mean; - nd4j::ops::LegacyReduceFloatOp op(opNum); + sd::ops::LegacyReduceFloatOp op(opNum); auto result = op.evaluate({&x}); @@ -264,7 +264,7 @@ TEST_F(LegacyOpsTests, ReduceTests_6) { auto x = NDArrayFactory::create('c', {5, 5}); x.assign(1.0); auto axis = NDArrayFactory::create('c', {1}, {1}); - nd4j::ops::LegacyReduceFloatOp op(reduce::Mean); + sd::ops::LegacyReduceFloatOp op(reduce::Mean); auto result = op.evaluate({&x, &axis}, {}, {}); @@ -287,7 +287,7 @@ TEST_F(LegacyOpsTests, ReduceTests_7) { auto indices = NDArrayFactory::create('c', {1,1}, {1}); - nd4j::ops::LegacyReduceFloatOp op(reduce::Mean); + sd::ops::LegacyReduceFloatOp op(reduce::Mean); auto result = op.evaluate({&x, &indices}, {}, {}); auto z = result->at(0); auto exp = x.reduceAlongDimension(reduce::Mean,{1}); @@ -307,7 +307,7 @@ TEST_F(LegacyOpsTests, ReduceTests_8) { auto indices = NDArrayFactory::create('c', {1}, {1}); - nd4j::ops::LegacyReduceFloatOp op(reduce::Mean); + 
sd::ops::LegacyReduceFloatOp op(reduce::Mean); auto result = op.evaluate({&x, &indices}, {}, {}, {true}); auto z = result->at(0); auto exp = x.reduceAlongDimension(reduce::Mean, {1}, true); @@ -327,7 +327,7 @@ TEST_F(LegacyOpsTests, IndexReduceTests_1) { auto x = NDArrayFactory::create('c', {5, 5}); x.linspace(1); - nd4j::ops::LegacyIndexReduceOp op(indexreduce::IndexMax); + sd::ops::LegacyIndexReduceOp op(indexreduce::IndexMax); auto result = op.evaluate({&x}, {}, {}); @@ -347,7 +347,7 @@ TEST_F(LegacyOpsTests, IndexReduceTests_2) { auto indices = NDArrayFactory::create('c', {1}, {1}); x.linspace(1); auto exp = NDArrayFactory::create({4,4,4,4,4}); - nd4j::ops::LegacyIndexReduceOp op(indexreduce::IndexMax); + sd::ops::LegacyIndexReduceOp op(indexreduce::IndexMax); auto result = op.evaluate({&x, &indices}, {}, {}); @@ -372,7 +372,7 @@ TEST_F(LegacyOpsTests, BroadcastingTests_1) { auto row = NDArrayFactory::create('c', {1, 5}); row.linspace(1); auto axis = NDArrayFactory::create('c', {1}, {1}); - nd4j::ops::LegacyBroadcastOp op(broadcast::Add); + sd::ops::LegacyBroadcastOp op(broadcast::Add); Nd4jStatus status = op.execute({&x, &row, &axis}, {&x}, {}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, status); @@ -394,7 +394,7 @@ TEST_F(LegacyOpsTests, BroadcastingTests_2) { int axis = 1; // shape::printShapeInfoLinear("tad shape", tad.tadOnlyShapeInfo); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), {axis}); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), {axis}); NDArray::prepareSpecialUse({&y}, {&x}); @@ -433,11 +433,11 @@ TEST_F(LegacyOpsTests, reduce3_1) { std::vector dim = {1}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, yShape); - auto xShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 1, xShape); + auto shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, yShape); + auto xShapeBuffer = 
sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 1, xShape); //int *tadShapeBuffer = shape::computeResultShape(shapeBuffer,dimension,dimensionLength); - auto tadShapeBuffer = nd4j::ShapeUtils::evalReduceShapeInfo('c', dim, shapeBuffer, false, true, nullptr); + auto tadShapeBuffer = sd::ShapeUtils::evalReduceShapeInfo('c', dim, shapeBuffer, false, true, nullptr); functions::reduce3::Reduce3::exec(opNum, x, xShapeBuffer, extraVals, y, shapeBuffer, result, tadShapeBuffer, dimension, dimensionLength, 0, 4); float distancesAssertion[4] = {0.0,8.0,16.0,24.0}; @@ -459,15 +459,15 @@ TEST_F(LegacyOpsTests, Reduce3_2) { auto dim = NDArrayFactory::create('c', {1}, {1}); dim.syncToHost(); - nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(); + sd::LaunchContext* context = sd::LaunchContext::defaultContext(); Nd4jPointer* extraPointers = nullptr; #ifdef __CUDABLAS__ extraPointers = new Nd4jPointer[7] {nullptr, context->getCudaStream(), context->getScalarPointer(), nullptr, context->getCudaSpecialStream(), context->getReductionPointer(), context->getAllocationPointer()}; #endif - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); NDArray::prepareSpecialUse({&z}, {&x, &y, &dim}); OpaqueDataBuffer xBuf(x.dataBuffer()); @@ -499,15 +499,15 @@ TEST_F(LegacyOpsTests, Reduce3_3) { auto dim = NDArrayFactory::create('c', {1}, {1}); dim.syncToHost(); - nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(); + sd::LaunchContext* context = sd::LaunchContext::defaultContext(); Nd4jPointer* extraPointers = nullptr; #ifdef __CUDABLAS__ extraPointers = new Nd4jPointer[7] {nullptr, context->getCudaStream(), 
context->getScalarPointer(), nullptr, context->getCudaSpecialStream(), context->getReductionPointer(), context->getAllocationPointer()}; #endif - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); NDArray::prepareSpecialUse({&z}, {&x, &y, &dim}); OpaqueDataBuffer xBuf(x.dataBuffer()); @@ -539,15 +539,15 @@ TEST_F(LegacyOpsTests, Reduce3_4) { auto dim = NDArrayFactory::create('c', {1}, {1}); dim.syncToHost(); - nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(); + sd::LaunchContext* context = sd::LaunchContext::defaultContext(); Nd4jPointer* extraPointers = nullptr; #ifdef __CUDABLAS__ extraPointers = new Nd4jPointer[7] {nullptr, context->getCudaStream(), context->getScalarPointer(), nullptr, context->getCudaSpecialStream(), context->getReductionPointer(), context->getAllocationPointer()}; #endif - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); NDArray::prepareSpecialUse({&z}, {&x, &y, &dim}); OpaqueDataBuffer xBuf(x.dataBuffer()); @@ -581,15 +581,15 @@ TEST_F(LegacyOpsTests, Reduce3_5) { auto dim = NDArrayFactory::create('c', {1}, {1}); dim.syncToHost(); - nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(); + sd::LaunchContext* context = sd::LaunchContext::defaultContext(); Nd4jPointer* extraPointers = nullptr; #ifdef __CUDABLAS__ extraPointers = new Nd4jPointer[7] {nullptr, 
context->getCudaStream(), context->getScalarPointer(), nullptr, context->getCudaSpecialStream(), context->getReductionPointer(), context->getAllocationPointer()}; #endif - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); - auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); + auto packX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); + auto packY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.getShapeInfo(), {1}); NDArray::prepareSpecialUse({&z}, {&x, &y, &dim}); @@ -616,10 +616,10 @@ TEST_F(LegacyOpsTests, test_Reduce3_All_1) { auto z = NDArrayFactory::create('c', {1000, 1}); auto dim = NDArrayFactory::create('c', {1}, {-1}); - auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), -1); - auto tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), -1); + auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), -1); + auto tadPackY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), -1); - nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(); + sd::LaunchContext* context = sd::LaunchContext::defaultContext(); Nd4jPointer* extraPointers = nullptr; #ifdef __CUDABLAS__ @@ -652,7 +652,7 @@ TEST_F(LegacyOpsTests, test_inverse_broadcast_1) { auto e = NDArrayFactory::create('c', {3, 4}); e.assign(2.0f); - auto tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), 1); + auto tadPackY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), 1); y.tickWriteDevice(); @@ -680,7 +680,7 @@ TEST_F(LegacyOpsTests, test_inverse_broadcast_2) { auto erow = e.tensorAlongDimension(1, {1}); erow.assign(true); - auto tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), 1); + auto tadPackY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), 
1); z.tickWriteDevice(); diff --git a/libnd4j/tests_cpu/layers_tests/ListOperationsTests.cpp b/libnd4j/tests_cpu/layers_tests/ListOperationsTests.cpp index 625d9978f..ba019d9b0 100644 --- a/libnd4j/tests_cpu/layers_tests/ListOperationsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ListOperationsTests.cpp @@ -19,12 +19,12 @@ // #include "testlayers.h" -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class ListOperationsTests : public testing::Test { @@ -35,7 +35,7 @@ TEST_F(ListOperationsTests, BasicTest_Write_1) { auto x = NDArrayFactory::create('c', {128}); x.linspace(1); - nd4j::ops::write_list op; + sd::ops::write_list op; auto result = op.execute(&list, {&x}, {}, {1}); @@ -62,7 +62,7 @@ TEST_F(ListOperationsTests, BasicTest_Stack_1) { tads.at(e)->assign(row); } - nd4j::ops::stack_list op; + sd::ops::stack_list op; auto result = op.execute(&list, {}, {}, {1}); @@ -89,7 +89,7 @@ TEST_F(ListOperationsTests, BasicTest_UnStackList_1) { delete row; } - nd4j::ops::unstack_list op; + sd::ops::unstack_list op; auto result = op.execute(&list, {&x}, {}, {0}); @@ -122,7 +122,7 @@ TEST_F(ListOperationsTests, BasicTest_UnStackList_1) { // delete row; // } // -// nd4j::ops::unstack_list op; +// sd::ops::unstack_list op; // // auto result = op.execute(nullptr, {&x}, {}, {0}); // @@ -156,7 +156,7 @@ TEST_F(ListOperationsTests, BasicTest_Read_1) { delete row; } - nd4j::ops::read_list op; + sd::ops::read_list op; auto result = op.execute(&list, {}, {}, {4}); @@ -189,7 +189,7 @@ TEST_F(ListOperationsTests, BasicTest_Pick_1) { tads.at(3)->assign(3.0f); - nd4j::ops::pick_list op; + sd::ops::pick_list op; auto result = op.execute(&list, {}, {}, {1, 1, 3, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -213,7 +213,7 @@ TEST_F(ListOperationsTests, BasicTest_Size_1) { delete row; } - nd4j::ops::size_list op; + sd::ops::size_list op; auto result = op.execute(&list, {}, {}, {1}); @@ 
-231,7 +231,7 @@ TEST_F(ListOperationsTests, BasicTest_Create_1) { auto matrix = NDArrayFactory::create('c', {3, 2}); matrix.linspace(1); - nd4j::ops::create_list op; + sd::ops::create_list op; auto result = op.execute(nullptr, {&matrix}, {}, {1, 1}); @@ -281,7 +281,7 @@ TEST_F(ListOperationsTests, BasicTest_Split_1) { delete row; } - nd4j::ops::split_list op; + sd::ops::split_list op; auto result = op.execute(&list, {&matrix, &lengths}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -316,7 +316,7 @@ TEST_F(ListOperationsTests, BasicTest_Scatter_1) { for (int e = 0; e < matrix.rows(); e++) indices.p(e, 9 - e); - nd4j::ops::scatter_list op; + sd::ops::scatter_list op; auto result = op.execute(&list, {&indices, &matrix, &s}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -345,7 +345,7 @@ TEST_F(ListOperationsTests, BasicTest_Clone_1) { Context block(1, &variableSpace); block.pickInput(-1); - nd4j::ops::clone_list op; + sd::ops::clone_list op; ASSERT_TRUE(list == block.variable(0)->getNDArrayList()); @@ -382,7 +382,7 @@ TEST_F(ListOperationsTests, BasicTest_Gather_1) { auto indices = NDArrayFactory::create('c', {1, 10}); indices.linspace(9, -1); - nd4j::ops::gather_list op; + sd::ops::gather_list op; auto result = op.execute(&list, {&indices}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -428,17 +428,17 @@ TEST_F(ListOperationsTests, GraphTests_Sequential_1) { auto nodeA = new Node(OpType_TRANSFORM_SAME, 0, 1, {-1}); // creating list - nd4j::ops::create_list opB; + sd::ops::create_list opB; auto nodeB = new Node(&opB, 2, {1},{},{}, 0.0f, {}, {0, 1}); //nodeB->setCustomOp(&opB); // filling list with matrix - nd4j::ops::split_list opC; + sd::ops::split_list opC; auto nodeC = new Node(&opC, 3, {2, 1, -2}); //nodeC->setCustomOp(&opC); // reading chunks from List. 
We're adding op number 3 in inputs, to ensure graph will execute this node after split - nd4j::ops::read_list opD; + sd::ops::read_list opD; auto nodeD0 = new Node(&opD, 5, {2, 3}, {},{}, 0.0f, {}, {0}); auto nodeD1 = new Node(&opD, 6, {2, 3}, {},{}, 0.0f, {}, {1}); auto nodeD2 = new Node(&opD, 7, {2, 3}, {},{}, 0.0f, {}, {2}); @@ -447,12 +447,12 @@ TEST_F(ListOperationsTests, GraphTests_Sequential_1) { //nodeD2->setCustomOp(&opD); // using OneMinus on each chunk separately - auto nodeE0 = new Node(OpType_TRANSFORM_SAME, nd4j::transform::OneMinus, 10, {5}); - auto nodeE1 = new Node(OpType_TRANSFORM_SAME, nd4j::transform::OneMinus, 11, {6}); - auto nodeE2 = new Node(OpType_TRANSFORM_SAME, nd4j::transform::OneMinus, 12, {7}); + auto nodeE0 = new Node(OpType_TRANSFORM_SAME, sd::transform::OneMinus, 10, {5}); + auto nodeE1 = new Node(OpType_TRANSFORM_SAME, sd::transform::OneMinus, 11, {6}); + auto nodeE2 = new Node(OpType_TRANSFORM_SAME, sd::transform::OneMinus, 12, {7}); // writing chunks back to the List - nd4j::ops::write_list opF; + sd::ops::write_list opF; auto nodeF0 = new Node(&opF, 15, {2, 10}, {},{}, 0.0f, {}, {0}); auto nodeF1 = new Node(&opF, 16, {2, 11}, {},{}, 0.0f, {}, {1}); auto nodeF2 = new Node(&opF, 17, {2, 12}, {},{}, 0.0f, {}, {2}); @@ -462,7 +462,7 @@ TEST_F(ListOperationsTests, GraphTests_Sequential_1) { // nodeF2->setCustomOp(&opF); // now we're stacking chunks back to matrix state - nd4j::ops::stack_list opG; + sd::ops::stack_list opG; auto nodeG = new Node(&opG, 20, {2, 15, 16, 17}); //auto nodeG = new Node(OpType_CUSTOM, 0, 20, {2}); @@ -559,17 +559,17 @@ TEST_F(ListOperationsTests, GraphTests_Sequential_2) { auto nodeA = new Node(OpType_TRANSFORM_SAME, 0, 1, {-1}); // creating list - nd4j::ops::create_list opB; + sd::ops::create_list opB; auto nodeB = new Node(&opB, 2, {1},{},{}, 0.0f, {}, {0, 1}); // nodeB->setCustomOp(&opB); // filling list with matrix - nd4j::ops::scatter_list opC; + sd::ops::scatter_list opC; auto nodeC = new Node(&opC, 
3, {2, -2, 1, -3}); //nodeC->setCustomOp(&opC); - nd4j::ops::read_list opD; + sd::ops::read_list opD; auto nodeD0 = new Node(&opD, 5, {2, 3}, {},{}, 0.0f, {}, {0}); auto nodeD1 = new Node(&opD, 6, {2, 3, 15}, {},{}, 0.0f, {}, {1}); auto nodeD2 = new Node(&opD, 7, {2, 3, 16}, {},{}, 0.0f, {}, {2}); @@ -580,12 +580,12 @@ TEST_F(ListOperationsTests, GraphTests_Sequential_2) { // using OneMinus on each chunk separately - auto nodeE0 = new Node(OpType_TRANSFORM_SAME, nd4j::transform::OneMinus, 10, {5}); - auto nodeE1 = new Node(OpType_TRANSFORM_SAME, nd4j::transform::OneMinus, 11, {6}); - auto nodeE2 = new Node(OpType_TRANSFORM_SAME, nd4j::transform::OneMinus, 12, {7}); + auto nodeE0 = new Node(OpType_TRANSFORM_SAME, sd::transform::OneMinus, 10, {5}); + auto nodeE1 = new Node(OpType_TRANSFORM_SAME, sd::transform::OneMinus, 11, {6}); + auto nodeE2 = new Node(OpType_TRANSFORM_SAME, sd::transform::OneMinus, 12, {7}); // writing chunks back to the List - nd4j::ops::write_list opF; + sd::ops::write_list opF; auto nodeF0 = new Node(&opF, 15, {2, 10}, {},{}, 0.0f, {}, {0}); auto nodeF1 = new Node(&opF, 16, {2, 11}, {},{}, 0.0f, {}, {1}); auto nodeF2 = new Node(&opF, 17, {2, 12}, {},{}, 0.0f, {}, {2}); @@ -595,7 +595,7 @@ TEST_F(ListOperationsTests, GraphTests_Sequential_2) { // nodeF2->setCustomOp(&opF); // now we're gathering chunks back to matrix state - nd4j::ops::pick_list opG; + sd::ops::pick_list opG; auto nodeG = new Node(&opG, 20, {2, -2, 15, 16, 17}); //auto nodeG = new Node(OpType_CUSTOM, 0, 20, {2}); diff --git a/libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp index 1a65c09ae..976e89550 100644 --- a/libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/LoopCoordsHelperTests.cpp @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include +#include #include -using namespace nd4j; +using namespace sd; class LoopCoordsHelper : public testing::Test { public: diff --git 
a/libnd4j/tests_cpu/layers_tests/MemoryUtilsTests.cpp b/libnd4j/tests_cpu/layers_tests/MemoryUtilsTests.cpp index fd771231d..4bfe40405 100644 --- a/libnd4j/tests_cpu/layers_tests/MemoryUtilsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MemoryUtilsTests.cpp @@ -22,7 +22,7 @@ #include #include "testlayers.h" -using namespace nd4j::memory; +using namespace sd::memory; class MemoryUtilsTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp index b01c9f98a..a9b0d92e6 100644 --- a/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MklDnnTests.cpp @@ -34,7 +34,7 @@ public: }; -static void printer(std::initializer_list helpers) { +static void printer(std::initializer_list helpers) { for (auto v:helpers) { nd4j_printf("Initialized [%s]\n", v->name().c_str()); @@ -45,29 +45,29 @@ static void printer(std::initializer_list TEST_F(MklDnnTests, helpers_includer) { // we need this block, to make sure all helpers are still available within binary, and not optimized out by linker #ifdef HAVE_MKLDNN - nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d; - nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp; + sd::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv2d; + sd::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv2d_bp; - nd4j::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv3d; - nd4j::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv3d_bp; + sd::ops::platforms::PLATFORM_conv2d_ENGINE_CPU conv3d; + sd::ops::platforms::PLATFORM_conv2d_bp_ENGINE_CPU conv3d_bp; - nd4j::ops::platforms::PLATFORM_avgpool2d_ENGINE_CPU avgpool2d; - nd4j::ops::platforms::PLATFORM_avgpool2d_bp_ENGINE_CPU avgpool2d_bp; + sd::ops::platforms::PLATFORM_avgpool2d_ENGINE_CPU avgpool2d; + sd::ops::platforms::PLATFORM_avgpool2d_bp_ENGINE_CPU avgpool2d_bp; - nd4j::ops::platforms::PLATFORM_maxpool2d_ENGINE_CPU maxpool2d; - 
nd4j::ops::platforms::PLATFORM_maxpool2d_bp_ENGINE_CPU maxpool2d_bp; + sd::ops::platforms::PLATFORM_maxpool2d_ENGINE_CPU maxpool2d; + sd::ops::platforms::PLATFORM_maxpool2d_bp_ENGINE_CPU maxpool2d_bp; - nd4j::ops::platforms::PLATFORM_avgpool3dnew_ENGINE_CPU avgpool3d; - nd4j::ops::platforms::PLATFORM_avgpool3dnew_bp_ENGINE_CPU avgpool3d_bp; + sd::ops::platforms::PLATFORM_avgpool3dnew_ENGINE_CPU avgpool3d; + sd::ops::platforms::PLATFORM_avgpool3dnew_bp_ENGINE_CPU avgpool3d_bp; - nd4j::ops::platforms::PLATFORM_maxpool3dnew_ENGINE_CPU maxpool3d; - nd4j::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CPU maxpool3d_bp; + sd::ops::platforms::PLATFORM_maxpool3dnew_ENGINE_CPU maxpool3d; + sd::ops::platforms::PLATFORM_maxpool3dnew_bp_ENGINE_CPU maxpool3d_bp; - nd4j::ops::platforms::PLATFORM_lrn_ENGINE_CPU lrn; + sd::ops::platforms::PLATFORM_lrn_ENGINE_CPU lrn; - nd4j::ops::platforms::PLATFORM_batchnorm_ENGINE_CPU batchnorm; + sd::ops::platforms::PLATFORM_batchnorm_ENGINE_CPU batchnorm; - nd4j::ops::platforms::PLATFORM_matmul_ENGINE_CPU matmul; + sd::ops::platforms::PLATFORM_matmul_ENGINE_CPU matmul; printer({&conv2d, &conv2d_bp, &conv3d, &conv3d_bp, &avgpool2d, &avgpool2d_bp, &maxpool2d, &maxpool2d_bp, &avgpool3d, &avgpool3d_bp, &maxpool3d, &maxpool3d_bp, &lrn, &batchnorm, &matmul}); #endif diff --git a/libnd4j/tests_cpu/layers_tests/MmapTests.cpp b/libnd4j/tests_cpu/layers_tests/MmapTests.cpp index b8b298f9b..c1df42fd1 100644 --- a/libnd4j/tests_cpu/layers_tests/MmapTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MmapTests.cpp @@ -20,12 +20,12 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class MmapTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp b/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp index 127b3c7d3..0f5b5ed62 100644 --- 
a/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp @@ -20,13 +20,13 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -#include +#include -using namespace nd4j; +using namespace sd; class MultiDataTypeTests : public testing::Test { public: @@ -35,23 +35,23 @@ public: //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, DataTypeUtils_Test_1) { - auto dtype = DataTypeUtils::pickPairwiseResultType(nd4j::INT32, nd4j::FLOAT32); + auto dtype = DataTypeUtils::pickPairwiseResultType(sd::INT32, sd::FLOAT32); - ASSERT_EQ(nd4j::FLOAT32, dtype); + ASSERT_EQ(sd::FLOAT32, dtype); } //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, DataTypeUtils_Test_2) { - auto dtype = DataTypeUtils::pickPairwiseResultType(nd4j::INT32, nd4j::DOUBLE); - ASSERT_EQ(nd4j::DOUBLE, dtype); + auto dtype = DataTypeUtils::pickPairwiseResultType(sd::INT32, sd::DOUBLE); + ASSERT_EQ(sd::DOUBLE, dtype); - ASSERT_EQ(nd4j::DOUBLE, DataTypeUtils::pickPairwiseResultType(nd4j::DOUBLE, nd4j::INT32)); + ASSERT_EQ(sd::DOUBLE, DataTypeUtils::pickPairwiseResultType(sd::DOUBLE, sd::INT32)); } //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, DataTypeUtils_Test_3) { - auto dtype = DataTypeUtils::pickPairwiseResultType(nd4j::FLOAT32, nd4j::DOUBLE); - ASSERT_EQ(nd4j::FLOAT32, dtype); + auto dtype = DataTypeUtils::pickPairwiseResultType(sd::FLOAT32, sd::DOUBLE); + ASSERT_EQ(sd::FLOAT32, dtype); } //////////////////////////////////////////////////////////////////////////////// @@ -132,7 +132,7 @@ TEST_F(MultiDataTypeTests, Basic_Test_7) { auto y = NDArrayFactory::create('c', {2, 3}, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f}); auto e = NDArrayFactory::create('c', {2, 3}, {0.f, 2.f, 4.f, 6.f, 8.f, 10.f}); - nd4j::ops::add op; + sd::ops::add op; auto result = 
op.evaluate({&x, &y}); ASSERT_EQ(Status::OK(), result->status()); @@ -159,8 +159,8 @@ TEST_F(MultiDataTypeTests, Basic_Test_6) { //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_assign_number_test1) { - NDArray x('c', {2, 3}, {0, 1, 2, 3, 4, 5}, nd4j::DataType::UINT8); - NDArray exp('c', {2, 3}, {10, 10, 10, 10, 10, 10}, nd4j::DataType::UINT8); + NDArray x('c', {2, 3}, {0, 1, 2, 3, 4, 5}, sd::DataType::UINT8); + NDArray exp('c', {2, 3}, {10, 10, 10, 10, 10, 10}, sd::DataType::UINT8); const double number = 10.8; x = number; @@ -170,8 +170,8 @@ TEST_F(MultiDataTypeTests, ndarray_assign_number_test1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_assign_number_test2) { - NDArray x('c', {2, 3}, {0, 1, 2, 3, 4, 5}, nd4j::DataType::INT64); - NDArray exp('c', {2, 3}, {1, 1, 1, 1, 1, 1}, nd4j::DataType::INT64); + NDArray x('c', {2, 3}, {0, 1, 2, 3, 4, 5}, sd::DataType::INT64); + NDArray exp('c', {2, 3}, {1, 1, 1, 1, 1, 1}, sd::DataType::INT64); const bool number = 1000; x = number; @@ -181,8 +181,8 @@ TEST_F(MultiDataTypeTests, ndarray_assign_number_test2) { //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_assign_number_test3) { - NDArray x('c', {2, 3}, {0, 1, 0, 1, 0, 1}, nd4j::DataType::BOOL); - NDArray exp('c', {2, 3}, {1, 1, 1, 1, 1, 1}, nd4j::DataType::BOOL); + NDArray x('c', {2, 3}, {0, 1, 0, 1, 0, 1}, sd::DataType::BOOL); + NDArray exp('c', {2, 3}, {1, 1, 1, 1, 1, 1}, sd::DataType::BOOL); const int number = 1000; x = number; @@ -192,9 +192,9 @@ TEST_F(MultiDataTypeTests, ndarray_assign_number_test3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_repeat_test1) { - NDArray x('c', {2, 2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray y('c', {2, 4}, nd4j::DataType::HALF); - NDArray exp('c', 
{2, 4}, {0.5, 0.5, 1.5, 1.5, 2.5, 2.5, 3.5, 3.5}, nd4j::DataType::HALF); + NDArray x('c', {2, 2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray y('c', {2, 4}, sd::DataType::HALF); + NDArray exp('c', {2, 4}, {0.5, 0.5, 1.5, 1.5, 2.5, 2.5, 3.5, 3.5}, sd::DataType::HALF); x.repeat(1, {2}, y); @@ -203,8 +203,8 @@ TEST_F(MultiDataTypeTests, ndarray_repeat_test1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_bufferAsT_test1) { - NDArray x('f', {2}, {1.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray y('c', {}, std::vector{1.5}, nd4j::DataType::FLOAT32); + NDArray x('f', {2}, {1.5, 3.5}, sd::DataType::FLOAT32); + NDArray y('c', {}, std::vector{1.5}, sd::DataType::FLOAT32); const int* buffX = x.bufferAsT(); const int* buffY = y.bufferAsT(); @@ -214,11 +214,11 @@ TEST_F(MultiDataTypeTests, ndarray_bufferAsT_test1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_assign_test1) { - NDArray x('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::UINT8); - NDArray exp('c', {2,2}, {10, 10, 20, 20}, nd4j::DataType::UINT8); + NDArray x('c', {2,2}, {0, 1, 2, 3}, sd::DataType::UINT8); + NDArray exp('c', {2,2}, {10, 10, 20, 20}, sd::DataType::UINT8); - NDArray scalar1('c', {}, std::vector{10.5}, nd4j::DataType::FLOAT32); - NDArray scalar2('c', {}, std::vector{20.8}, nd4j::DataType::DOUBLE); + NDArray scalar1('c', {}, std::vector{10.5}, sd::DataType::FLOAT32); + NDArray scalar2('c', {}, std::vector{20.8}, sd::DataType::DOUBLE); x(0,{0}).assign(scalar1); x(1,{0}).assign(scalar2); @@ -232,67 +232,67 @@ TEST_F(MultiDataTypeTests, ndarray_assign_test1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_reduceAlongDimension_test1) { - NDArray x('f', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray exp1('c', {}, std::vector{3}, nd4j::DataType::INT64); - NDArray exp2('c', {1,1}, 
std::vector{1}, nd4j::DataType::INT64); - NDArray exp3('c', {2}, std::vector{1,2}, nd4j::DataType::INT64); + NDArray x('f', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray exp1('c', {}, std::vector{3}, sd::DataType::INT64); + NDArray exp2('c', {1,1}, std::vector{1}, sd::DataType::INT64); + NDArray exp3('c', {2}, std::vector{1,2}, sd::DataType::INT64); - auto scalar1 = x.reduceAlongDimension(nd4j::reduce::CountNonZero, {}/*whole range*/); + auto scalar1 = x.reduceAlongDimension(sd::reduce::CountNonZero, {}/*whole range*/); ASSERT_EQ(scalar1, exp1); - auto scalar2 = x.reduceAlongDimension(nd4j::reduce::CountZero, {}/*whole range*/, true); + auto scalar2 = x.reduceAlongDimension(sd::reduce::CountZero, {}/*whole range*/, true); ASSERT_EQ(scalar2, exp2); - auto scalar3 = x.reduceAlongDimension(nd4j::reduce::CountNonZero, {1}); + auto scalar3 = x.reduceAlongDimension(sd::reduce::CountNonZero, {1}); ASSERT_EQ(scalar3, exp3); } //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_reduceAlongDimension_test2) { - NDArray x('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT32); - NDArray exp1('c', {}, std::vector{1.5}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2}, {0.5,2.5}, nd4j::DataType::FLOAT32); + NDArray x('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT32); + NDArray exp1('c', {}, std::vector{1.5}, sd::DataType::FLOAT32); + NDArray exp2('c', {2}, {0.5,2.5}, sd::DataType::FLOAT32); - auto scalar1 = x.reduceAlongDimension(nd4j::reduce::Mean, {}/*whole range*/); + auto scalar1 = x.reduceAlongDimension(sd::reduce::Mean, {}/*whole range*/); // scalar1->printShapeInfo(); // scalar1->printIndexedBuffer(); ASSERT_EQ(scalar1, exp1); - auto scalar2 = x.reduceAlongDimension(nd4j::reduce::Mean, {1}); + auto scalar2 = x.reduceAlongDimension(sd::reduce::Mean, {1}); ASSERT_EQ(scalar2, exp2); } //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, 
ndarray_reduceAlongDimension_test3) { - NDArray x('c', {2, 2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray exp1('c', {}, std::vector{8.}, nd4j::DataType::HALF); - NDArray exp2('c', {2}, {2.,6.}, nd4j::DataType::HALF); + NDArray x('c', {2, 2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray exp1('c', {}, std::vector{8.}, sd::DataType::HALF); + NDArray exp2('c', {2}, {2.,6.}, sd::DataType::HALF); - auto scalar1 = x.reduceAlongDimension(nd4j::reduce::Sum, {}/*whole range*/); + auto scalar1 = x.reduceAlongDimension(sd::reduce::Sum, {}/*whole range*/); ASSERT_EQ(scalar1, exp1); - auto scalar2 = x.reduceAlongDimension(nd4j::reduce::Sum, {1}); + auto scalar2 = x.reduceAlongDimension(sd::reduce::Sum, {1}); ASSERT_EQ(scalar2, exp2); } //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_reduceAlongDimension_test4) { - NDArray x('c', {2, 2}, {10.5, 1.5, -2.5, -3.5}, nd4j::DataType::HALF); - NDArray exp1('c', {}, std::vector{1}, nd4j::DataType::BOOL); - NDArray exp2('c', {2}, std::vector{1, 0}, nd4j::DataType::BOOL); + NDArray x('c', {2, 2}, {10.5, 1.5, -2.5, -3.5}, sd::DataType::HALF); + NDArray exp1('c', {}, std::vector{1}, sd::DataType::BOOL); + NDArray exp2('c', {2}, std::vector{1, 0}, sd::DataType::BOOL); - auto scalar1 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {}/*whole range*/); + auto scalar1 = x.reduceAlongDimension(sd::reduce::IsPositive, {}/*whole range*/); ASSERT_EQ(scalar1, exp1); - auto scalar2 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {1}); + auto scalar2 = x.reduceAlongDimension(sd::reduce::IsPositive, {1}); ASSERT_EQ(scalar2, exp2); } //////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_varianceNumber_test1) { - NDArray x('f', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray exp1('c', {}, std::vector{1.666666667}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {}, std::vector{1.118033989}, 
nd4j::DataType::FLOAT32); + NDArray x('f', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray exp1('c', {}, std::vector{1.666666667}, sd::DataType::FLOAT32); + NDArray exp2('c', {}, std::vector{1.118033989}, sd::DataType::FLOAT32); auto scalar1 = x.varianceNumber(variance::SummaryStatsVariance); ASSERT_EQ(scalar1, exp1); @@ -306,11 +306,11 @@ TEST_F(MultiDataTypeTests, ndarray_operatorPlus_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2, 2}, {-1, -2, -1, -2}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2}, {-1, -2}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2, 2}, {-1, -2, -1, -2}, sd::DataType::FLOAT32); + NDArray x3('c', {2}, {-1, -2}, sd::DataType::FLOAT32); - NDArray exp('c', {2, 2}, {-1, -1, 1, 1}, nd4j::DataType::FLOAT32); + NDArray exp('c', {2, 2}, {-1, -1, 1, 1}, sd::DataType::FLOAT32); ASSERT_EQ(x1+x2, exp); ASSERT_EQ(x1+x3, exp); @@ -321,14 +321,14 @@ TEST_F(MultiDataTypeTests, ndarray_operatorPlus_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::HALF); + NDArray x1('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray x3('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::HALF); const double val1 = -2; const int val2 = -2; - NDArray exp1('c', {2,2}, {-2, -1, 0, 1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {-2, -1, 0, 1}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {2,2}, {-2, -1, 0, 1}, nd4j::DataType::HALF); + NDArray exp1('c', {2,2}, {-2, -1, 0, 1}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {-2, -1, 0, 1}, sd::DataType::FLOAT32); + NDArray exp3('c', {2,2}, {-2, -1, 0, 1}, sd::DataType::HALF); 
ASSERT_EQ(x1+val1, exp1); ASSERT_EQ(val1+x1, exp1); @@ -345,11 +345,11 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMinus_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2, 2}, {-1, -2, -1, -2}, nd4j::DataType::HALF); - NDArray x3('c', {2}, {-1, -2}, nd4j::DataType::HALF); + NDArray x1('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2, 2}, {-1, -2, -1, -2}, sd::DataType::HALF); + NDArray x3('c', {2}, {-1, -2}, sd::DataType::HALF); - NDArray exp('c', {2, 2}, {1, 3, 3, 5}, nd4j::DataType::HALF); + NDArray exp('c', {2, 2}, {1, 3, 3, 5}, sd::DataType::HALF); ASSERT_EQ(x1-x2, exp); ASSERT_EQ(x1-x3, exp); @@ -360,17 +360,17 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMinus_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::HALF); + NDArray x1('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray x3('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::HALF); const double val1 = 2; const int val2 = 2; - NDArray exp1('c', {2,2}, {-2, -1, 0, 1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {2, 1, 0, -1}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {2,2}, {-2, -1, 0, 1}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {-2, -1, 0, 1}, nd4j::DataType::HALF); - NDArray exp5('c', {2,2}, {2, 1, 0, -1}, nd4j::DataType::FLOAT32); - NDArray exp6('c', {2,2}, {2, 1, 0, -1}, nd4j::DataType::HALF); + NDArray exp1('c', {2,2}, {-2, -1, 0, 1}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {2, 1, 0, -1}, sd::DataType::DOUBLE); + NDArray exp3('c', {2,2}, {-2, -1, 0, 1}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {-2, -1, 0, 1}, sd::DataType::HALF); + NDArray exp5('c', {2,2}, {2, 
1, 0, -1}, sd::DataType::FLOAT32); + NDArray exp6('c', {2,2}, {2, 1, 0, -1}, sd::DataType::HALF); ASSERT_EQ(x1-val1, exp1); ASSERT_EQ(val1-x1, exp2); @@ -387,11 +387,11 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMultiply_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2, 2}, {-1, -2, -1, -2}, nd4j::DataType::DOUBLE); - NDArray x3('c', {2}, {-1, -2}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2, 2}, {-1, -2, -1, -2}, sd::DataType::DOUBLE); + NDArray x3('c', {2}, {-1, -2}, sd::DataType::DOUBLE); - NDArray exp('c', {2, 2}, {0, -2, -2, -6}, nd4j::DataType::DOUBLE); + NDArray exp('c', {2, 2}, {0, -2, -2, -6}, sd::DataType::DOUBLE); ASSERT_EQ(x1*x2, exp); ASSERT_EQ(x1*x3, exp); @@ -402,14 +402,14 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMultiply_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2, 2}, {0, 1, 2, 3}, nd4j::DataType::HALF); + NDArray x1('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray x3('c', {2, 2}, {0, 1, 2, 3}, sd::DataType::HALF); const double val1 = -2; const int val2 = -2; - NDArray exp1('c', {2,2}, {0, -2, -4, -6}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {0, -2, -4, -6}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {2,2}, {0, -2, -4, -6}, nd4j::DataType::HALF); + NDArray exp1('c', {2,2}, {0, -2, -4, -6}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {0, -2, -4, -6}, sd::DataType::FLOAT32); + NDArray exp3('c', {2,2}, {0, -2, -4, -6}, sd::DataType::HALF); ASSERT_EQ(x1*val1, exp1); ASSERT_EQ(val1*x1, exp1); @@ -427,12 +427,12 @@ TEST_F(MultiDataTypeTests, ndarray_operatorDivide_test1) { if 
(!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {4, 1, 2, 3}, nd4j::DataType::HALF); - NDArray x2('c', {2, 2}, {-1, -2, -1, -9}, nd4j::DataType::DOUBLE); - NDArray x3('c', {2}, {-1, -2}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2, 2}, {4, 1, 2, 3}, sd::DataType::HALF); + NDArray x2('c', {2, 2}, {-1, -2, -1, -9}, sd::DataType::DOUBLE); + NDArray x3('c', {2}, {-1, -2}, sd::DataType::FLOAT32); - NDArray exp1('c', {2, 2}, {-4, -0.5, -2, -0.3333333}, nd4j::DataType::HALF); - NDArray exp2('c', {2, 2}, {-0.25, -2, -0.5, -0.666667}, nd4j::DataType::HALF); + NDArray exp1('c', {2, 2}, {-4, -0.5, -2, -0.3333333}, sd::DataType::HALF); + NDArray exp2('c', {2, 2}, {-0.25, -2, -0.5, -0.666667}, sd::DataType::HALF); ASSERT_EQ(x1/x2, exp1); ASSERT_EQ(x3/x1, exp2); @@ -443,19 +443,19 @@ TEST_F(MultiDataTypeTests, ndarray_operatorDivide_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2, 2}, {1, 2, 3, 4}, nd4j::DataType::INT64); - NDArray x2('c', {2, 2}, {1, 2, 3, 4}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2, 2}, {1, 2, 3, 4}, nd4j::DataType::HALF); + NDArray x1('c', {2, 2}, {1, 2, 3, 4}, sd::DataType::INT64); + NDArray x2('c', {2, 2}, {1, 2, 3, 4}, sd::DataType::FLOAT32); + NDArray x3('c', {2, 2}, {1, 2, 3, 4}, sd::DataType::HALF); const double val1 = 2; const int val2 = -2; - NDArray exp1('c', {2,2}, {0.5, 1, 1.5, 2}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {2, 1, 0.666667, 0.5}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {2,2}, {0, -1, -1, -2}, nd4j::DataType::INT64); - NDArray exp4('c', {2,2}, {-2, -1, 0., 0.}, nd4j::DataType::INT64); - NDArray exp5('c', {2,2}, {-0.5, -1, -1.5, -2}, nd4j::DataType::FLOAT32); - NDArray exp6('c', {2,2}, {-2, -1, -0.666667, -0.5}, nd4j::DataType::FLOAT32); - NDArray exp7('c', {2,2}, {0.5, 1, 1.5, 2}, nd4j::DataType::HALF); - NDArray exp8('c', {2,2}, {2, 1, 0.666667, 0.5}, nd4j::DataType::HALF); + NDArray exp1('c', {2,2}, {0.5, 1, 1.5, 2}, 
sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {2, 1, 0.666667, 0.5}, sd::DataType::DOUBLE); + NDArray exp3('c', {2,2}, {0, -1, -1, -2}, sd::DataType::INT64); + NDArray exp4('c', {2,2}, {-2, -1, 0., 0.}, sd::DataType::INT64); + NDArray exp5('c', {2,2}, {-0.5, -1, -1.5, -2}, sd::DataType::FLOAT32); + NDArray exp6('c', {2,2}, {-2, -1, -0.666667, -0.5}, sd::DataType::FLOAT32); + NDArray exp7('c', {2,2}, {0.5, 1, 1.5, 2}, sd::DataType::HALF); + NDArray exp8('c', {2,2}, {2, 1, 0.666667, 0.5}, sd::DataType::HALF); ASSERT_EQ(x1/val1, exp1); ASSERT_EQ(val1/x1, exp2); @@ -475,21 +475,21 @@ TEST_F(MultiDataTypeTests, ndarray_operatorPlusEqual_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray scalar1('c', {0}, std::vector{4}, nd4j::DataType::INT32); - NDArray scalar2('c', {0}, std::vector{1.5}, nd4j::DataType::HALF); + NDArray scalar1('c', {0}, std::vector{4}, sd::DataType::INT32); + NDArray scalar2('c', {0}, std::vector{1.5}, sd::DataType::HALF); - NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, nd4j::DataType::FLOAT32); - NDArray x2('c', {3,2}, {10, 20, 30, 40, 50, 60}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x4('c', {2}, {0.4, 0.5}, nd4j::DataType::HALF); - NDArray x5('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::HALF); - NDArray x6('c', {2}, {0.4, 0.5}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, sd::DataType::FLOAT32); + NDArray x2('c', {3,2}, {10, 20, 30, 40, 50, 60}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x4('c', {2}, {0.4, 0.5}, sd::DataType::HALF); + NDArray x5('c', {2,2}, {0, 1, 2, 3}, sd::DataType::HALF); + NDArray x6('c', {2}, {0.4, 0.5}, sd::DataType::FLOAT32); - NDArray exp1('c', {0}, std::vector{5}, nd4j::DataType::INT32); - NDArray exp2('c', {0}, std::vector{6.5}, nd4j::DataType::HALF); - NDArray exp3('c', {3,2}, {11, 22, 33, 44, 55, 66}, nd4j::DataType::INT64); - NDArray 
exp4('c', {2,3}, {12.5, 24.5, 36.5, 48.5, 60.5, 72.5}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {0.4, 1.5, 2.4, 3.5}, nd4j::DataType::HALF); + NDArray exp1('c', {0}, std::vector{5}, sd::DataType::INT32); + NDArray exp2('c', {0}, std::vector{6.5}, sd::DataType::HALF); + NDArray exp3('c', {3,2}, {11, 22, 33, 44, 55, 66}, sd::DataType::INT64); + NDArray exp4('c', {2,3}, {12.5, 24.5, 36.5, 48.5, 60.5, 72.5}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {0.4, 1.5, 2.4, 3.5}, sd::DataType::HALF); scalar1 += scalar2; ASSERT_EQ(scalar1, exp1); @@ -515,19 +515,19 @@ TEST_F(MultiDataTypeTests, ndarray_operatorPlusEqual_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray x2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT32); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray x2('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT32); const Nd4jLong val1 = 1; const float16 val2 = 1.5; const double val3 = 2.2; - NDArray exp1('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT32); - NDArray exp3('c', {2,2}, {2.5, 3.5, 4.5, 5.5}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {2, 3, 4.5, 5}, nd4j::DataType::INT32); - NDArray exp5('c', {2,2}, {4.7, 5.7, 6.7, 7.7}, nd4j::DataType::FLOAT32); - NDArray exp6('c', {2,2}, {4, 5, 6, 7}, nd4j::DataType::INT32); + NDArray exp1('c', {2,2}, {1, 2, 3, 4}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,2}, {1, 2, 3, 4}, sd::DataType::INT32); + NDArray exp3('c', {2,2}, {2.5, 3.5, 4.5, 5.5}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {2, 3, 4.5, 5}, sd::DataType::INT32); + NDArray exp5('c', {2,2}, {4.7, 5.7, 6.7, 7.7}, sd::DataType::FLOAT32); + NDArray exp6('c', {2,2}, {4, 5, 6, 7}, sd::DataType::INT32); x1 += val1; ASSERT_EQ(x1, exp1); @@ -553,21 +553,21 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMinusEqual_test1) { if 
(!Environment::getInstance()->isExperimentalBuild()) return; - NDArray scalar1('c', {0}, std::vector{4}, nd4j::DataType::INT32); - NDArray scalar2('c', {0}, std::vector{1.5}, nd4j::DataType::HALF); + NDArray scalar1('c', {0}, std::vector{4}, sd::DataType::INT32); + NDArray scalar2('c', {0}, std::vector{1.5}, sd::DataType::HALF); - NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, nd4j::DataType::FLOAT32); - NDArray x2('c', {3,2}, {10, 20, 30, 40, 50, 60}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x4('c', {2}, {0.4, 0.5}, nd4j::DataType::HALF); - NDArray x5('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::HALF); - NDArray x6('c', {2}, {0.4, 0.5}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, sd::DataType::FLOAT32); + NDArray x2('c', {3,2}, {10, 20, 30, 40, 50, 60}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x4('c', {2}, {0.4, 0.5}, sd::DataType::HALF); + NDArray x5('c', {2,2}, {0, 1, 2, 3}, sd::DataType::HALF); + NDArray x6('c', {2}, {0.4, 0.5}, sd::DataType::FLOAT32); - NDArray exp1('c', {0}, std::vector{2}, nd4j::DataType::INT32); - NDArray exp2('c', {0}, std::vector{-0.5}, nd4j::DataType::HALF); - NDArray exp3('c', {3,2}, {8, 17, 26, 35, 44, 53}, nd4j::DataType::INT64); - NDArray exp4('c', {2,3}, {-6.5, -14.5, -22.5, -30.5, -38.5, -46.5}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {0.4, -0.5, -1.6, -2.5}, nd4j::DataType::HALF); + NDArray exp1('c', {0}, std::vector{2}, sd::DataType::INT32); + NDArray exp2('c', {0}, std::vector{-0.5}, sd::DataType::HALF); + NDArray exp3('c', {3,2}, {8, 17, 26, 35, 44, 53}, sd::DataType::INT64); + NDArray exp4('c', {2,3}, {-6.5, -14.5, -22.5, -30.5, -38.5, -46.5}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {0.4, -0.5, -1.6, -2.5}, sd::DataType::HALF); scalar1 -= scalar2; ASSERT_EQ(scalar1, exp1); @@ -593,19 +593,19 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMinusEqual_test2) 
{ if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray x2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT32); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray x2('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT32); const Nd4jLong val1 = 1; const float16 val2 = 1.5; const double val3 = 2.2; - NDArray exp1('c', {2,2}, {-1, 0, 1, 2}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,2}, {-1, 0, 1, 2}, nd4j::DataType::INT32); - NDArray exp3('c', {2,2}, {-2.5, -1.5, -0.5, 0.5}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {-2., -1., 0., 0.}, nd4j::DataType::INT32); - NDArray exp5('c', {2,2}, {-4.7, -3.7, -2.7, -1.7}, nd4j::DataType::FLOAT32); - NDArray exp6('c', {2,2}, {-4, -3, -2, -2}, nd4j::DataType::INT32); + NDArray exp1('c', {2,2}, {-1, 0, 1, 2}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,2}, {-1, 0, 1, 2}, sd::DataType::INT32); + NDArray exp3('c', {2,2}, {-2.5, -1.5, -0.5, 0.5}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {-2., -1., 0., 0.}, sd::DataType::INT32); + NDArray exp5('c', {2,2}, {-4.7, -3.7, -2.7, -1.7}, sd::DataType::FLOAT32); + NDArray exp6('c', {2,2}, {-4, -3, -2, -2}, sd::DataType::INT32); x1 -= val1; ASSERT_EQ(x1, exp1); @@ -631,21 +631,21 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMultiplyEqual_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray scalar1('c', {0}, std::vector{3}, nd4j::DataType::INT32); - NDArray scalar2('c', {0}, std::vector{2.5}, nd4j::DataType::HALF); + NDArray scalar1('c', {0}, std::vector{3}, sd::DataType::INT32); + NDArray scalar2('c', {0}, std::vector{2.5}, sd::DataType::HALF); - NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, nd4j::DataType::FLOAT32); - NDArray x2('c', {3,2}, {1, 2, 3, 4, 5, 6}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x4('c', {2}, {0.4, 0.5}, nd4j::DataType::HALF); - NDArray x5('c', {2,2}, 
{0, 1, 2, 3}, nd4j::DataType::HALF); - NDArray x6('c', {2}, {0.4, 0.5}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, sd::DataType::FLOAT32); + NDArray x2('c', {3,2}, {1, 2, 3, 4, 5, 6}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x4('c', {2}, {0.4, 0.5}, sd::DataType::HALF); + NDArray x5('c', {2,2}, {0, 1, 2, 3}, sd::DataType::HALF); + NDArray x6('c', {2}, {0.4, 0.5}, sd::DataType::FLOAT32); - NDArray exp1('c', {0}, std::vector{7}, nd4j::DataType::INT32); - NDArray exp2('c', {0}, std::vector{17.5}, nd4j::DataType::HALF); - NDArray exp3('c', {3,2}, {1, 5, 10, 18, 27, 39}, nd4j::DataType::INT64); - NDArray exp4('c', {2,3}, {1.5, 12.5, 35, 81, 148.5, 253.5}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {0., 0.5, 0.8, 1.5}, nd4j::DataType::HALF); + NDArray exp1('c', {0}, std::vector{7}, sd::DataType::INT32); + NDArray exp2('c', {0}, std::vector{17.5}, sd::DataType::HALF); + NDArray exp3('c', {3,2}, {1, 5, 10, 18, 27, 39}, sd::DataType::INT64); + NDArray exp4('c', {2,3}, {1.5, 12.5, 35, 81, 148.5, 253.5}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {0., 0.5, 0.8, 1.5}, sd::DataType::HALF); scalar1 *= scalar2; ASSERT_EQ(scalar1, exp1); @@ -671,19 +671,19 @@ TEST_F(MultiDataTypeTests, ndarray_operatorMultiplyEqual_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray x2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT32); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray x2('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT32); const Nd4jLong val1 = 1; const float16 val2 = 1.5; const double val3 = 2.2; - NDArray exp1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT32); - NDArray exp3('c', {2,2}, {0, 1.5, 3, 4.5}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {0, 1, 3, 4}, nd4j::DataType::INT32); - 
NDArray exp5('c', {2,2}, {0, 3.3, 6.6, 9.9}, nd4j::DataType::FLOAT32); - NDArray exp6('c', {2,2}, {0, 2, 6, 8}, nd4j::DataType::INT32); + NDArray exp1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT32); + NDArray exp3('c', {2,2}, {0, 1.5, 3, 4.5}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {0, 1, 3, 4}, sd::DataType::INT32); + NDArray exp5('c', {2,2}, {0, 3.3, 6.6, 9.9}, sd::DataType::FLOAT32); + NDArray exp6('c', {2,2}, {0, 2, 6, 8}, sd::DataType::INT32); x1 *= val1; ASSERT_EQ(x1, exp1); @@ -709,21 +709,21 @@ TEST_F(MultiDataTypeTests, ndarray_operatorDivideEqual_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray scalar1('c', {0}, std::vector{3}, nd4j::DataType::INT32); - NDArray scalar2('c', {0}, std::vector{2.5}, nd4j::DataType::HALF); + NDArray scalar1('c', {0}, std::vector{3}, sd::DataType::INT32); + NDArray scalar2('c', {0}, std::vector{2.5}, sd::DataType::HALF); - NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, nd4j::DataType::FLOAT32); - NDArray x2('c', {3,2}, {10, 20, 30, 40, 50, 60}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT64); - NDArray x4('c', {2}, {0.4, 0.5}, nd4j::DataType::HALF); - NDArray x5('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::HALF); - NDArray x6('c', {2}, {0.4, 0.5}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2,3}, {1.5, 2.5, 3.5, 4.5, 5.5, 6.5}, sd::DataType::FLOAT32); + NDArray x2('c', {3,2}, {10, 20, 30, 40, 50, 60}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {1, 2, 3, 4}, sd::DataType::INT64); + NDArray x4('c', {2}, {0.4, 0.5}, sd::DataType::HALF); + NDArray x5('c', {2,2}, {1, 2, 3, 4}, sd::DataType::HALF); + NDArray x6('c', {2}, {0.4, 0.5}, sd::DataType::FLOAT32); - NDArray exp1('c', {0}, std::vector{1}, nd4j::DataType::INT32); - NDArray exp2('c', {0}, std::vector{2.5}, nd4j::DataType::HALF); - NDArray exp3('c', {3,2}, {6, 8, 8, 8, 9, 9}, nd4j::DataType::INT64); - NDArray exp4('c', 
{2,3}, {0.25, 0.3125, 0.4375, 0.5625, 0.611111111, 0.722222222}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {0.4, 0.25, 0.1333333, 0.125}, nd4j::DataType::HALF); + NDArray exp1('c', {0}, std::vector{1}, sd::DataType::INT32); + NDArray exp2('c', {0}, std::vector{2.5}, sd::DataType::HALF); + NDArray exp3('c', {3,2}, {6, 8, 8, 8, 9, 9}, sd::DataType::INT64); + NDArray exp4('c', {2,3}, {0.25, 0.3125, 0.4375, 0.5625, 0.611111111, 0.722222222}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {0.4, 0.25, 0.1333333, 0.125}, sd::DataType::HALF); scalar1 /= scalar2; ASSERT_EQ(scalar1, exp1); @@ -749,19 +749,19 @@ TEST_F(MultiDataTypeTests, ndarray_operatorDivideEqual_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 2, 4, 6}, nd4j::DataType::FLOAT32); - NDArray x2('c', {2,2}, {0, 2, 4, 6}, nd4j::DataType::INT32); + NDArray x1('c', {2,2}, {0, 2, 4, 6}, sd::DataType::FLOAT32); + NDArray x2('c', {2,2}, {0, 2, 4, 6}, sd::DataType::INT32); const Nd4jLong val1 = 1; const float16 val2 = 2.; const double val3 = 2.2; - NDArray exp1('c', {2,2}, {0, 2, 4, 6}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,2}, {0, 2, 4, 6}, nd4j::DataType::INT32); - NDArray exp3('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT32); - NDArray exp5('c', {2,2}, {0, 0.45454545, 0.909090909, 1.363636364}, nd4j::DataType::FLOAT32); - NDArray exp6('c', {2,2}, {0, 0, 0, 1}, nd4j::DataType::INT32); + NDArray exp1('c', {2,2}, {0, 2, 4, 6}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,2}, {0, 2, 4, 6}, sd::DataType::INT32); + NDArray exp3('c', {2,2}, {0, 1, 2, 3}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT32); + NDArray exp5('c', {2,2}, {0, 0.45454545, 0.909090909, 1.363636364}, sd::DataType::FLOAT32); + NDArray exp6('c', {2,2}, {0, 0, 0, 1}, sd::DataType::INT32); x1 /= val1; ASSERT_EQ(x1, exp1); @@ -787,15 +787,15 @@ TEST_F(MultiDataTypeTests, 
ndarray_reduceNumberFloat_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray x3('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); - NDArray exp1('c', {0}, std::vector{1.5}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {0}, std::vector{2}, nd4j::DataType::HALF); - NDArray exp3('c', {0}, std::vector{2}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {0}, std::vector{0.25},nd4j::DataType::FLOAT32); + NDArray exp1('c', {0}, std::vector{1.5}, sd::DataType::FLOAT32); + NDArray exp2('c', {0}, std::vector{2}, sd::DataType::HALF); + NDArray exp3('c', {0}, std::vector{2}, sd::DataType::DOUBLE); + NDArray exp4('c', {0}, std::vector{0.25},sd::DataType::FLOAT32); NDArray scalar = x1.reduceNumber(reduce::Mean); @@ -824,15 +824,15 @@ TEST_F(MultiDataTypeTests, ndarray_reduceNumberSame_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray x3('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); - NDArray exp1('c', {0}, std::vector{6}, nd4j::DataType::INT64); - NDArray exp2('c', {0}, std::vector{8}, 
nd4j::DataType::HALF); - NDArray exp3('c', {0}, std::vector{8}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {0}, std::vector{1}, nd4j::DataType::BOOL); + NDArray exp1('c', {0}, std::vector{6}, sd::DataType::INT64); + NDArray exp2('c', {0}, std::vector{8}, sd::DataType::HALF); + NDArray exp3('c', {0}, std::vector{8}, sd::DataType::DOUBLE); + NDArray exp4('c', {0}, std::vector{1}, sd::DataType::BOOL); NDArray scalar = x1.reduceNumber(reduce::Sum); @@ -861,12 +861,12 @@ TEST_F(MultiDataTypeTests, ndarray_reduceNumberBool_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, -1, 2, -3}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0.5, -1.5, 2.5, -3.5}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {-2, -1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, -1, 2, -3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0.5, -1.5, 2.5, -3.5}, sd::DataType::HALF); + NDArray x3('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {-2, -1, 0, 1}, sd::DataType::BOOL); - NDArray exp1('c', {0}, std::vector{1}, nd4j::DataType::BOOL); + NDArray exp1('c', {0}, std::vector{1}, sd::DataType::BOOL); NDArray scalar = x1.reduceNumber(reduce::IsFinite); ASSERT_EQ(scalar, exp1); @@ -894,15 +894,15 @@ TEST_F(MultiDataTypeTests, ndarray_reduceNumberLong_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0.5, -1.5, 0, 3.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0.5, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray x3('c', {2,2}, {0.5, -1.5, 0, 3.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {0, 1, 0, 1}, 
sd::DataType::BOOL); - NDArray exp1('c', {0}, std::vector{3}, nd4j::DataType::INT64); - NDArray exp2('c', {0}, std::vector{4}, nd4j::DataType::INT64); - NDArray exp3('c', {0}, std::vector{3}, nd4j::DataType::INT64); - NDArray exp4('c', {0}, std::vector{2}, nd4j::DataType::INT64); + NDArray exp1('c', {0}, std::vector{3}, sd::DataType::INT64); + NDArray exp2('c', {0}, std::vector{4}, sd::DataType::INT64); + NDArray exp3('c', {0}, std::vector{3}, sd::DataType::INT64); + NDArray exp4('c', {0}, std::vector{2}, sd::DataType::INT64); NDArray scalar = x1.reduceNumber(reduce::CountNonZero); ASSERT_EQ(scalar, exp1); @@ -930,21 +930,21 @@ TEST_F(MultiDataTypeTests, ndarray_indexReduceNumber_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT32); - NDArray x2('c', {2,2}, {0.5, 1.5, -4.5, 3.5}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0, -1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT32); + NDArray x2('c', {2,2}, {0.5, 1.5, -4.5, 3.5}, sd::DataType::HALF); + NDArray x3('c', {2,2}, {0, -1, 0, 1}, sd::DataType::BOOL); - NDArray exp1('c', {0}, std::vector{3}, nd4j::DataType::INT64); - NDArray exp2('c', {0}, std::vector{2}, nd4j::DataType::INT64); - NDArray exp3('c', {0}, std::vector{1}, nd4j::DataType::INT64); + NDArray exp1('c', {0}, std::vector{3}, sd::DataType::INT64); + NDArray exp2('c', {0}, std::vector{2}, sd::DataType::INT64); + NDArray exp3('c', {0}, std::vector{1}, sd::DataType::INT64); - NDArray scalar = x1.indexReduceNumber(nd4j::indexreduce::IndexAbsoluteMax); + NDArray scalar = x1.indexReduceNumber(sd::indexreduce::IndexAbsoluteMax); ASSERT_EQ(scalar, exp1); - scalar = x2.indexReduceNumber(nd4j::indexreduce::IndexAbsoluteMax); + scalar = x2.indexReduceNumber(sd::indexreduce::IndexAbsoluteMax); ASSERT_EQ(scalar, exp2); - scalar = x3.indexReduceNumber(nd4j::indexreduce::IndexAbsoluteMax); + scalar = 
x3.indexReduceNumber(sd::indexreduce::IndexAbsoluteMax); ASSERT_EQ(scalar, exp3); } @@ -953,36 +953,36 @@ TEST_F(MultiDataTypeTests, ndarray_applyTransformFloat_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 4, 9, 16}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0, 2.25, 6.25, 12.25}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0, 2.25, 6.25, 12.25}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 4, 9, 16}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0, 2.25, 6.25, 12.25}, sd::DataType::HALF); + NDArray x3('c', {2,2}, {0, 2.25, 6.25, 12.25}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); - NDArray exp1('c', {2,2}, {0, 2, 3, 4}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray exp4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::HALF); + NDArray exp1('c', {2,2}, {0, 2, 3, 4}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::DOUBLE); + NDArray exp3('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray exp4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::HALF); - NDArray result1('c', {2,2}, nd4j::DataType::FLOAT32); - NDArray result2('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray result3('c', {2,2}, nd4j::DataType::HALF); + NDArray result1('c', {2,2}, sd::DataType::FLOAT32); + NDArray result2('c', {2,2}, sd::DataType::DOUBLE); + NDArray result3('c', {2,2}, sd::DataType::HALF); - x1.applyTransform(nd4j::transform::Sqrt, result1); + x1.applyTransform(sd::transform::Sqrt, result1); ASSERT_EQ(result1, exp1); - x2.applyTransform(nd4j::transform::Sqrt, result2); + x2.applyTransform(sd::transform::Sqrt, result2); ASSERT_EQ(result2, exp2); - x3.applyTransform(nd4j::transform::Sqrt, result3); + x3.applyTransform(sd::transform::Sqrt, result3); 
ASSERT_EQ(result3, exp3); - x4.applyTransform(nd4j::transform::Sqrt, result3); + x4.applyTransform(sd::transform::Sqrt, result3); ASSERT_EQ(result3, exp4); - x2.applyTransform(nd4j::transform::Sqrt, x2); + x2.applyTransform(sd::transform::Sqrt, x2); ASSERT_EQ(x2, exp3); - x3.applyTransform(nd4j::transform::Sqrt, x3); + x3.applyTransform(sd::transform::Sqrt, x3); ASSERT_EQ(x3, exp2); } @@ -991,43 +991,43 @@ TEST_F(MultiDataTypeTests, ndarray_applyTransformSame_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); - NDArray x5('c', {2,3}, {0, 1.5, 2.5, 3.5, 4.5, 5.5}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::HALF); + NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); + NDArray x5('c', {2,3}, {0, 1.5, 2.5, 3.5, 4.5, 5.5}, sd::DataType::DOUBLE); - NDArray exp1('c', {2,2}, {0, 1, 4, 9}, nd4j::DataType::INT64); - NDArray exp2('c', {2,2}, {0, 2.25, 6.25, 12.25}, nd4j::DataType::HALF); - NDArray exp3('c', {2,2}, {0, 2.25, 6.25, 12.25}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); - NDArray exp5('c', {3,2}, {0, 2.25, 6.25, 12.25, 20.25, 30.25}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {2,2}, {0, 1, 4, 9}, sd::DataType::INT64); + NDArray exp2('c', {2,2}, {0, 2.25, 6.25, 12.25}, sd::DataType::HALF); + NDArray exp3('c', {2,2}, {0, 2.25, 6.25, 12.25}, sd::DataType::DOUBLE); + NDArray exp4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); + NDArray exp5('c', {3,2}, {0, 2.25, 6.25, 12.25, 20.25, 30.25}, sd::DataType::DOUBLE); - NDArray result1('c', {2,2}, nd4j::DataType::INT64); - 
NDArray result2('c', {2,2}, nd4j::DataType::HALF); - NDArray result3('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray result4('c', {2,2}, nd4j::DataType::BOOL); - NDArray result5('c', {3,2}, nd4j::DataType::DOUBLE); + NDArray result1('c', {2,2}, sd::DataType::INT64); + NDArray result2('c', {2,2}, sd::DataType::HALF); + NDArray result3('c', {2,2}, sd::DataType::DOUBLE); + NDArray result4('c', {2,2}, sd::DataType::BOOL); + NDArray result5('c', {3,2}, sd::DataType::DOUBLE); - x1.applyTransform(nd4j::transform::Square, result1); + x1.applyTransform(sd::transform::Square, result1); ASSERT_EQ(result1, exp1); - x2.applyTransform(nd4j::transform::Square, result2); + x2.applyTransform(sd::transform::Square, result2); ASSERT_EQ(result2, exp2); - x3.applyTransform(nd4j::transform::Square, result3); + x3.applyTransform(sd::transform::Square, result3); ASSERT_EQ(result3, exp3); - x4.applyTransform(nd4j::transform::Square, result4); + x4.applyTransform(sd::transform::Square, result4); ASSERT_EQ(result4, exp4); - x2.applyTransform(nd4j::transform::Square, x2); + x2.applyTransform(sd::transform::Square, x2); ASSERT_EQ(x2, exp2); - x3.applyTransform(nd4j::transform::Square, x3); + x3.applyTransform(sd::transform::Square, x3); ASSERT_EQ(x3, exp3); - x5.applyTransform(nd4j::transform::Square, result5); + x5.applyTransform(sd::transform::Square, result5); ASSERT_EQ(result5, exp5); } @@ -1036,33 +1036,33 @@ TEST_F(MultiDataTypeTests, ndarray_applyTransformBool_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); - NDArray x5('c', {2,3}, {0, 1.5, 2.5, 3.5, 4.5, 5.5}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, 
sd::DataType::HALF); + NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); + NDArray x5('c', {2,3}, {0, 1.5, 2.5, 3.5, 4.5, 5.5}, sd::DataType::DOUBLE); - NDArray exp1('c', {2,2}, {0, 0, 0, 1}, nd4j::DataType::BOOL); - NDArray exp2('c', {2,2}, {0, 1, 0, 0}, nd4j::DataType::BOOL); - NDArray exp3('c', {3,2}, {0, 0, 0, 0, 0, 1}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {0, 0, 0, 1}, sd::DataType::BOOL); + NDArray exp2('c', {2,2}, {0, 1, 0, 0}, sd::DataType::BOOL); + NDArray exp3('c', {3,2}, {0, 0, 0, 0, 0, 1}, sd::DataType::BOOL); - NDArray result1('c', {2,2}, nd4j::DataType::BOOL); - NDArray result2('c', {3,2}, nd4j::DataType::BOOL); + NDArray result1('c', {2,2}, sd::DataType::BOOL); + NDArray result2('c', {3,2}, sd::DataType::BOOL); /* - x1.applyTransform(nd4j::transform::IsMax, result1); + x1.applyTransform(sd::transform::IsMax, result1); ASSERT_EQ(result1, exp1); - x2.applyTransform(nd4j::transform::IsMax, result1); + x2.applyTransform(sd::transform::IsMax, result1); ASSERT_EQ(result1, exp1); - x3.applyTransform(nd4j::transform::IsMax, result1); + x3.applyTransform(sd::transform::IsMax, result1); ASSERT_EQ(result1, exp1); - x4.applyTransform(nd4j::transform::IsMax, result1); + x4.applyTransform(sd::transform::IsMax, result1); ASSERT_EQ(result1, exp2); - x5.applyTransform(nd4j::transform::IsMax, result2); + x5.applyTransform(sd::transform::IsMax, result2); ASSERT_EQ(result2, exp3); */ } @@ -1072,44 +1072,44 @@ TEST_F(MultiDataTypeTests, ndarray_applyTransformStrict_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::HALF); - NDArray x2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,3}, {0, 1, 2, 3, 4, 5}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::HALF); + NDArray x2('c', {2,2}, {0, 1, 
2, 3}, sd::DataType::FLOAT32); + NDArray x3('c', {2,2}, {0, 1, 2, 3}, sd::DataType::DOUBLE); + NDArray x4('c', {2,3}, {0, 1, 2, 3, 4, 5}, sd::DataType::DOUBLE); - NDArray exp1('c', {2,2}, {0, 3, 12, 27}, nd4j::DataType::HALF); - NDArray exp2('c', {2,2}, {0, 3, 12, 27}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {2,2}, {0, 3, 12, 27}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {3,2}, {0, 3, 12, 27, 48, 75}, nd4j::DataType::DOUBLE); - NDArray exp5('c', {2,3}, {0, 3, 12, 27, 48, 75}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {2,2}, {0, 3, 12, 27}, sd::DataType::HALF); + NDArray exp2('c', {2,2}, {0, 3, 12, 27}, sd::DataType::FLOAT32); + NDArray exp3('c', {2,2}, {0, 3, 12, 27}, sd::DataType::DOUBLE); + NDArray exp4('c', {3,2}, {0, 3, 12, 27, 48, 75}, sd::DataType::DOUBLE); + NDArray exp5('c', {2,3}, {0, 3, 12, 27, 48, 75}, sd::DataType::DOUBLE); - NDArray result1('c', {2,2}, nd4j::DataType::HALF); - NDArray result2('c', {2,2}, nd4j::DataType::FLOAT32); - NDArray result3('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray result4('c', {3,2}, nd4j::DataType::DOUBLE); + NDArray result1('c', {2,2}, sd::DataType::HALF); + NDArray result2('c', {2,2}, sd::DataType::FLOAT32); + NDArray result3('c', {2,2}, sd::DataType::DOUBLE); + NDArray result4('c', {3,2}, sd::DataType::DOUBLE); - x1.applyTransform(nd4j::transform::CubeDerivative, result1); + x1.applyTransform(sd::transform::CubeDerivative, result1); ASSERT_EQ(result1, exp1); - x2.applyTransform(nd4j::transform::CubeDerivative, result2); + x2.applyTransform(sd::transform::CubeDerivative, result2); ASSERT_EQ(result2, exp2); - x3.applyTransform(nd4j::transform::CubeDerivative, result3); + x3.applyTransform(sd::transform::CubeDerivative, result3); ASSERT_EQ(result3, exp3); - x4.applyTransform(nd4j::transform::CubeDerivative, result4); + x4.applyTransform(sd::transform::CubeDerivative, result4); ASSERT_EQ(result4, exp4); - x1.applyTransform(nd4j::transform::CubeDerivative, x1); + x1.applyTransform(sd::transform::CubeDerivative, 
x1); ASSERT_EQ(x1, exp1); - x2.applyTransform(nd4j::transform::CubeDerivative, x2); + x2.applyTransform(sd::transform::CubeDerivative, x2); ASSERT_EQ(x2, exp2); - x3.applyTransform(nd4j::transform::CubeDerivative, x3); + x3.applyTransform(sd::transform::CubeDerivative, x3); ASSERT_EQ(x3, exp3); - x4.applyTransform(nd4j::transform::CubeDerivative, x4); + x4.applyTransform(sd::transform::CubeDerivative, x4); ASSERT_EQ(x4, exp5); } @@ -1118,32 +1118,32 @@ TEST_F(MultiDataTypeTests, ndarray_applyPairwiseTransform_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,3}, {0, 1, 2, 3, 4, 5}, nd4j::DataType::INT32); - NDArray x2('c', {2,3}, {0, 1, 2, 3, 4, 5}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2,3}, {0, 1, 0, 1, 0, 0}, nd4j::DataType::BOOL); - NDArray x4('c', {3,2}, {0.5, 1.5, 2.5, 3.5, 4.5, 0}, nd4j::DataType::DOUBLE); - NDArray x5('c', {3,2}, nd4j::DataType::INT32); - NDArray x6('c', {2,3}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,3}, {0, 1, 2, 3, 4, 5}, sd::DataType::INT32); + NDArray x2('c', {2,3}, {0, 1, 2, 3, 4, 5}, sd::DataType::FLOAT32); + NDArray x3('c', {2,3}, {0, 1, 0, 1, 0, 0}, sd::DataType::BOOL); + NDArray x4('c', {3,2}, {0.5, 1.5, 2.5, 3.5, 4.5, 0}, sd::DataType::DOUBLE); + NDArray x5('c', {3,2}, sd::DataType::INT32); + NDArray x6('c', {2,3}, sd::DataType::DOUBLE); - NDArray exp1('c', {2,3}, {0, 2, 4, 6, 8, 5}, nd4j::DataType::INT32); - NDArray exp2('c', {2,3}, {0.5, 2.5, 4.5, 6.5, 8.5, 5.}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {2,3}, {1, 1, 1, 1, 1, 0}, nd4j::DataType::BOOL); - NDArray exp4('c', {2,3}, {0.5, 2.5, 4.5, 6.5, 8.5, 5.}, nd4j::DataType::DOUBLE); - NDArray exp5('c', {3,2}, {0, 2, 4, 6, 8, 5}, nd4j::DataType::INT32); + NDArray exp1('c', {2,3}, {0, 2, 4, 6, 8, 5}, sd::DataType::INT32); + NDArray exp2('c', {2,3}, {0.5, 2.5, 4.5, 6.5, 8.5, 5.}, sd::DataType::FLOAT32); + NDArray exp3('c', {2,3}, {1, 1, 1, 1, 1, 0}, sd::DataType::BOOL); + NDArray exp4('c', {2,3}, {0.5, 2.5, 4.5, 6.5, 
8.5, 5.}, sd::DataType::DOUBLE); + NDArray exp5('c', {3,2}, {0, 2, 4, 6, 8, 5}, sd::DataType::INT32); - x1.applyPairwiseTransform(nd4j::pairwise::Add, x4, x5); + x1.applyPairwiseTransform(sd::pairwise::Add, x4, x5); ASSERT_EQ(x5, exp5); - x1.applyPairwiseTransform(nd4j::pairwise::Add, x4, x6); + x1.applyPairwiseTransform(sd::pairwise::Add, x4, x6); ASSERT_EQ(x6, exp4); - x1.applyPairwiseTransform(nd4j::pairwise::Add, x4); + x1.applyPairwiseTransform(sd::pairwise::Add, x4); ASSERT_EQ(x1, exp1); - x2.applyPairwiseTransform(nd4j::pairwise::Add, x4); + x2.applyPairwiseTransform(sd::pairwise::Add, x4); ASSERT_EQ(x2, exp2); - x3.applyPairwiseTransform(nd4j::pairwise::Add, x4); + x3.applyPairwiseTransform(sd::pairwise::Add, x4); ASSERT_EQ(x3, exp3); } @@ -1152,27 +1152,27 @@ TEST_F(MultiDataTypeTests, ndarray_applyPairwiseTransform_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,3}, {1, 1, 2, 3, 4, 5}, nd4j::DataType::INT32); - NDArray x2('c', {3,2}, {1, 0, 2, 0, 4, 0}, nd4j::DataType::INT32); - NDArray x3('c', {3,2}, {0.5, 1.5, 2.5, 3, 4.5, 0}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,3}, {0.5, 1., 2.5, 3, 4., 0}, nd4j::DataType::DOUBLE); - NDArray x5('c', {3,2}, {0, 1, 0, 1, 0, 1}, nd4j::DataType::BOOL); - NDArray x6('c', {2,3}, {1, 1, 1, 0, 1, 0}, nd4j::DataType::BOOL); + NDArray x1('c', {2,3}, {1, 1, 2, 3, 4, 5}, sd::DataType::INT32); + NDArray x2('c', {3,2}, {1, 0, 2, 0, 4, 0}, sd::DataType::INT32); + NDArray x3('c', {3,2}, {0.5, 1.5, 2.5, 3, 4.5, 0}, sd::DataType::DOUBLE); + NDArray x4('c', {2,3}, {0.5, 1., 2.5, 3, 4., 0}, sd::DataType::DOUBLE); + NDArray x5('c', {3,2}, {0, 1, 0, 1, 0, 1}, sd::DataType::BOOL); + NDArray x6('c', {2,3}, {1, 1, 1, 0, 1, 0}, sd::DataType::BOOL); - NDArray x7('c', {3,2}, nd4j::DataType::BOOL); - NDArray x8('c', {2,3}, nd4j::DataType::BOOL); + NDArray x7('c', {3,2}, sd::DataType::BOOL); + NDArray x8('c', {2,3}, sd::DataType::BOOL); - NDArray exp1('c', {3,2}, {1, 0, 1, 0, 1, 0}, 
nd4j::DataType::BOOL); - NDArray exp2('c', {2,3}, {1, 0, 1, 1, 0, 1}, nd4j::DataType::BOOL); - NDArray exp3('c', {2,3}, {0, 1, 0, 0, 0, 0}, nd4j::DataType::BOOL); + NDArray exp1('c', {3,2}, {1, 0, 1, 0, 1, 0}, sd::DataType::BOOL); + NDArray exp2('c', {2,3}, {1, 0, 1, 1, 0, 1}, sd::DataType::BOOL); + NDArray exp3('c', {2,3}, {0, 1, 0, 0, 0, 0}, sd::DataType::BOOL); - x1.applyPairwiseTransform(nd4j::pairwise::EqualTo, x2, x7); + x1.applyPairwiseTransform(sd::pairwise::EqualTo, x2, x7); ASSERT_EQ(x7, exp1); - x3.applyPairwiseTransform(nd4j::pairwise::EqualTo, x4, x8); + x3.applyPairwiseTransform(sd::pairwise::EqualTo, x4, x8); ASSERT_EQ(x8, exp2); - x5.applyPairwiseTransform(nd4j::pairwise::EqualTo, x6, x8); + x5.applyPairwiseTransform(sd::pairwise::EqualTo, x6, x8); ASSERT_EQ(x8, exp3); } @@ -1181,44 +1181,44 @@ TEST_F(MultiDataTypeTests, ndarray_applyBroadcast_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,3}, {10, 20, 30, 40, 50, 60}, nd4j::DataType::INT32); - NDArray x2('c', {2}, {1, 2}, nd4j::DataType::INT64); - NDArray x3('c', {2,3}, nd4j::DataType::INT32); - NDArray x4('c', {2}, {1, 2}, nd4j::DataType::FLOAT32); - NDArray x5('c', {2,3}, nd4j::DataType::FLOAT32); - NDArray x6('c', {2}, {1, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,3}, {10, 20, 30, 40, 50, 60}, sd::DataType::INT32); + NDArray x2('c', {2}, {1, 2}, sd::DataType::INT64); + NDArray x3('c', {2,3}, sd::DataType::INT32); + NDArray x4('c', {2}, {1, 2}, sd::DataType::FLOAT32); + NDArray x5('c', {2,3}, sd::DataType::FLOAT32); + NDArray x6('c', {2}, {1, 1}, sd::DataType::BOOL); - NDArray exp1('c', {2,3}, {11, 21, 31, 42, 52, 62}, nd4j::DataType::INT32); - NDArray exp2('c', {2,3}, {11, 21, 31, 42, 52, 62}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {2,3}, {11, 21, 31, 41, 51, 61}, nd4j::DataType::INT32); + NDArray exp1('c', {2,3}, {11, 21, 31, 42, 52, 62}, sd::DataType::INT32); + NDArray exp2('c', {2,3}, {11, 21, 31, 42, 52, 62}, sd::DataType::FLOAT32); 
+ NDArray exp3('c', {2,3}, {11, 21, 31, 41, 51, 61}, sd::DataType::INT32); - x1.applyBroadcast(nd4j::broadcast::Add, {0}, x2, x3); + x1.applyBroadcast(sd::broadcast::Add, {0}, x2, x3); ASSERT_EQ(x3, exp1); - x1.applyBroadcast(nd4j::broadcast::Add, {0}, x4, x5); + x1.applyBroadcast(sd::broadcast::Add, {0}, x4, x5); ASSERT_EQ(x5, exp2); - x1.applyBroadcast(nd4j::broadcast::Add, {0}, x6, x3); + x1.applyBroadcast(sd::broadcast::Add, {0}, x6, x3); ASSERT_EQ(x3, exp3); } ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyBroadcast_test2) { - NDArray x1('c', {2,3}, {10, 20, 30, 40, 50, 60}, nd4j::DataType::INT32); - NDArray x2('c', {2}, {10, 60}, nd4j::DataType::INT32); - NDArray x3('c', {2,3}, nd4j::DataType::BOOL); + NDArray x1('c', {2,3}, {10, 20, 30, 40, 50, 60}, sd::DataType::INT32); + NDArray x2('c', {2}, {10, 60}, sd::DataType::INT32); + NDArray x3('c', {2,3}, sd::DataType::BOOL); - NDArray x4('c', {2,3}, {0, 0, 0, 0, 0, 1}, nd4j::DataType::BOOL); - NDArray x5('c', {2}, {0, 1}, nd4j::DataType::BOOL); + NDArray x4('c', {2,3}, {0, 0, 0, 0, 0, 1}, sd::DataType::BOOL); + NDArray x5('c', {2}, {0, 1}, sd::DataType::BOOL); - NDArray exp1('c', {2,3}, {1, 0, 0, 0, 0, 1}, nd4j::DataType::BOOL); - NDArray exp2('c', {2,3}, {1, 1, 1, 0, 0, 1}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,3}, {1, 0, 0, 0, 0, 1}, sd::DataType::BOOL); + NDArray exp2('c', {2,3}, {1, 1, 1, 0, 0, 1}, sd::DataType::BOOL); - x1.applyBroadcast(nd4j::broadcast::EqualTo, {0}, x2, x3); + x1.applyBroadcast(sd::broadcast::EqualTo, {0}, x2, x3); ASSERT_EQ(x3, exp1); - x4.applyBroadcast(nd4j::broadcast::EqualTo, {0}, x5, x3); + x4.applyBroadcast(sd::broadcast::EqualTo, {0}, x5, x3); ASSERT_EQ(x3, exp2); } @@ -1227,56 +1227,56 @@ TEST_F(MultiDataTypeTests, ndarray_applyTrueBroadcast_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {10, 20, 30, 40}, nd4j::DataType::INT32); - NDArray x2('c', {2}, 
{1, 2}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, nd4j::DataType::HALF); + NDArray x1('c', {2,2}, {10, 20, 30, 40}, sd::DataType::INT32); + NDArray x2('c', {2}, {1, 2}, sd::DataType::HALF); + NDArray x3('c', {2,2}, sd::DataType::HALF); - NDArray x4('c', {2}, {1, 2}, nd4j::DataType::INT64); - NDArray x5('c', {2,2}, nd4j::DataType::INT32); + NDArray x4('c', {2}, {1, 2}, sd::DataType::INT64); + NDArray x5('c', {2,2}, sd::DataType::INT32); - NDArray x6('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); - NDArray x7('c', {2}, {1, 2}, nd4j::DataType::INT64); - NDArray x8('c', {2,2}, nd4j::DataType::BOOL); + NDArray x6('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); + NDArray x7('c', {2}, {1, 2}, sd::DataType::INT64); + NDArray x8('c', {2,2}, sd::DataType::BOOL); - NDArray x13('c', {0}, std::vector{3}, nd4j::DataType::INT64); - NDArray x14('c', {0}, std::vector{1.5}, nd4j::DataType::DOUBLE); - NDArray x15(nd4j::DataType::DOUBLE); - NDArray x16('c', {2,2}, nd4j::DataType::DOUBLE); + NDArray x13('c', {0}, std::vector{3}, sd::DataType::INT64); + NDArray x14('c', {0}, std::vector{1.5}, sd::DataType::DOUBLE); + NDArray x15(sd::DataType::DOUBLE); + NDArray x16('c', {2,2}, sd::DataType::DOUBLE); - NDArray exp1('c', {2,2}, {11, 22, 31, 42}, nd4j::DataType::HALF); - NDArray exp2('c', {2,2}, {11, 22, 31, 42}, nd4j::DataType::INT32); - NDArray exp3('c', {2,2}, {1, 1, 1, 1}, nd4j::DataType::BOOL); - NDArray exp4('c', {0}, std::vector{4.5}, nd4j::DataType::DOUBLE); - NDArray exp5('c', {2,2}, {11.5, 21.5, 31.5, 41.5}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {2,2}, {11, 22, 31, 42}, sd::DataType::HALF); + NDArray exp2('c', {2,2}, {11, 22, 31, 42}, sd::DataType::INT32); + NDArray exp3('c', {2,2}, {1, 1, 1, 1}, sd::DataType::BOOL); + NDArray exp4('c', {0}, std::vector{4.5}, sd::DataType::DOUBLE); + NDArray exp5('c', {2,2}, {11.5, 21.5, 31.5, 41.5}, sd::DataType::DOUBLE); - x1.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x2, x3); + 
x1.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x2, x3); ASSERT_EQ(x3, exp1); - x1.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x4, x5); + x1.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x4, x5); ASSERT_EQ(x5, exp2); - x6.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x7, x8); + x6.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x7, x8); ASSERT_EQ(x8, exp3); - auto x9 = x1.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x2); + auto x9 = x1.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x2); ASSERT_EQ(x9, exp1); - auto x10 = x1.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x4); + auto x10 = x1.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x4); ASSERT_EQ(x10, exp2); - auto x11 = x6.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x7); + auto x11 = x6.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x7); ASSERT_EQ(x11, exp3); - auto x12 = x1.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x2); + auto x12 = x1.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x2); ASSERT_EQ(x12, exp1); - x13.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x14, x15); + x13.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x14, x15); ASSERT_EQ(x15, exp4); - x1.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x14, x16); + x1.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x14, x16); ASSERT_EQ(x16, exp5); - x14.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), x1, x16); + x14.applyTrueBroadcast(sd::BroadcastOpsTuple::Add(), x1, x16); ASSERT_EQ(x16, exp5); } @@ -1286,27 +1286,27 @@ TEST_F(MultiDataTypeTests, ndarray_applyTrueBroadcast_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {10, 20, 30, 40}, nd4j::DataType::HALF); - NDArray x2('c', {2}, {10, 40}, nd4j::DataType::HALF); - NDArray x3('c', {2,2}, nd4j::DataType::BOOL); - NDArray x4('c', {0}, std::vector{10}, nd4j::DataType::HALF); - NDArray x5('c', {0}, std::vector{20}, nd4j::DataType::HALF); - NDArray x6(nd4j::DataType::BOOL); + NDArray x1('c', 
{2,2}, {10, 20, 30, 40}, sd::DataType::HALF); + NDArray x2('c', {2}, {10, 40}, sd::DataType::HALF); + NDArray x3('c', {2,2}, sd::DataType::BOOL); + NDArray x4('c', {0}, std::vector{10}, sd::DataType::HALF); + NDArray x5('c', {0}, std::vector{20}, sd::DataType::HALF); + NDArray x6(sd::DataType::BOOL); - NDArray exp1('c', {2,2}, {1, 0, 0, 1}, nd4j::DataType::BOOL); - NDArray exp2('c', {2,2}, {1, 0, 0, 0}, nd4j::DataType::BOOL); - NDArray exp3('c', {0}, std::vector{0}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {1, 0, 0, 1}, sd::DataType::BOOL); + NDArray exp2('c', {2,2}, {1, 0, 0, 0}, sd::DataType::BOOL); + NDArray exp3('c', {0}, std::vector{0}, sd::DataType::BOOL); - x1.applyTrueBroadcast(BroadcastBoolOpsTuple(nd4j::scalar::EqualTo, nd4j::pairwise::EqualTo, nd4j::broadcast::EqualTo), x2, x3); + x1.applyTrueBroadcast(BroadcastBoolOpsTuple(sd::scalar::EqualTo, sd::pairwise::EqualTo, sd::broadcast::EqualTo), x2, x3); ASSERT_EQ(x3, exp1); - x1.applyTrueBroadcast(BroadcastBoolOpsTuple(nd4j::scalar::EqualTo, nd4j::pairwise::EqualTo, nd4j::broadcast::EqualTo), x4, x3); + x1.applyTrueBroadcast(BroadcastBoolOpsTuple(sd::scalar::EqualTo, sd::pairwise::EqualTo, sd::broadcast::EqualTo), x4, x3); ASSERT_EQ(x3, exp2); - x4.applyTrueBroadcast(BroadcastBoolOpsTuple(nd4j::scalar::EqualTo, nd4j::pairwise::EqualTo, nd4j::broadcast::EqualTo), x1, x3); + x4.applyTrueBroadcast(BroadcastBoolOpsTuple(sd::scalar::EqualTo, sd::pairwise::EqualTo, sd::broadcast::EqualTo), x1, x3); ASSERT_EQ(x3, exp2); - x5.applyTrueBroadcast(BroadcastBoolOpsTuple(nd4j::scalar::EqualTo, nd4j::pairwise::EqualTo, nd4j::broadcast::EqualTo), x4, x6); + x5.applyTrueBroadcast(BroadcastBoolOpsTuple(sd::scalar::EqualTo, sd::pairwise::EqualTo, sd::broadcast::EqualTo), x4, x6); ASSERT_EQ(x6, exp3); } @@ -1315,52 +1315,52 @@ TEST_F(MultiDataTypeTests, ndarray_applyScalar_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray 
x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {0, 1, 0, 1}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x3('c', {2,2}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {0, 1, 0, 1}, sd::DataType::BOOL); - NDArray exp1('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT64); - NDArray exp2('c', {2,2}, {1.5, 2.5, 3.5, 4.5}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {2,2}, {0.1, 1.6, 2.6, 3.6}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {1.1, 2.1, 1.1, 2.1}, nd4j::DataType::DOUBLE); - NDArray exp5('c', {2,2}, {1, 1, 1, 1}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {1, 2, 3, 4}, sd::DataType::INT64); + NDArray exp2('c', {2,2}, {1.5, 2.5, 3.5, 4.5}, sd::DataType::DOUBLE); + NDArray exp3('c', {2,2}, {0.1, 1.6, 2.6, 3.6}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {1.1, 2.1, 1.1, 2.1}, sd::DataType::DOUBLE); + NDArray exp5('c', {2,2}, {1, 1, 1, 1}, sd::DataType::BOOL); - x1.applyScalar(nd4j::scalar::Add, 1, x1); + x1.applyScalar(sd::scalar::Add, 1, x1); ASSERT_EQ(x1, exp1); - x1.applyScalar(nd4j::scalar::Add, 0.5, x3); + x1.applyScalar(sd::scalar::Add, 0.5, x3); ASSERT_EQ(x3, exp2); - x2.applyScalar(nd4j::scalar::Add, 0.1, x2); + x2.applyScalar(sd::scalar::Add, 0.1, x2); ASSERT_EQ(x2, exp3); - x4.applyScalar(nd4j::scalar::Add, 1.1, x3); + x4.applyScalar(sd::scalar::Add, 1.1, x3); ASSERT_EQ(x3, exp4); - x4.applyScalar(nd4j::scalar::Add, 1, x4); + x4.applyScalar(sd::scalar::Add, 1, x4); ASSERT_EQ(x4, exp5); } ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyScalar_test2) { - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x3('c', {2,2}, {0, 1, 1, 0}, nd4j::DataType::BOOL); - NDArray 
x4('c', {2,2}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x2('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x3('c', {2,2}, {0, 1, 1, 0}, sd::DataType::BOOL); + NDArray x4('c', {2,2}, sd::DataType::BOOL); - NDArray exp1('c', {2,2}, {0, 1, 0, 0}, nd4j::DataType::BOOL); - NDArray exp2('c', {2,2}, {0, 1, 1, 0}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {0, 1, 0, 0}, sd::DataType::BOOL); + NDArray exp2('c', {2,2}, {0, 1, 1, 0}, sd::DataType::BOOL); - x1.applyScalar(nd4j::scalar::EqualTo, 1, x4); + x1.applyScalar(sd::scalar::EqualTo, 1, x4); ASSERT_EQ(x4, exp1); - x2.applyScalar(nd4j::scalar::EqualTo, 1.5, x4); + x2.applyScalar(sd::scalar::EqualTo, 1.5, x4); ASSERT_EQ(x4, exp1); - x3.applyScalar(nd4j::scalar::EqualTo, true, x4); + x3.applyScalar(sd::scalar::EqualTo, true, x4); ASSERT_EQ(x4, exp2); } @@ -1369,13 +1369,13 @@ TEST_F(MultiDataTypeTests, ndarray_applyScalar_test2) { ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyLambda_test1) { - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::DOUBLE); - NDArray x2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x4('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x6('c', {2,2}, {0, -1, -1, 0.1}, nd4j::DataType::BOOL); - NDArray x7('c', {2,2}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::DOUBLE); + NDArray x2('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x4('c', {2,2}, sd::DataType::DOUBLE); + NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x6('c', {2,2}, {0, -1, -1, 0.1}, sd::DataType::BOOL); + NDArray x7('c', {2,2}, sd::DataType::BOOL); const float item1 = 0.1; const double item2 = 
0.1; @@ -1385,11 +1385,11 @@ TEST_F(MultiDataTypeTests, ndarray_applyLambda_test1) { auto func4 = [=](double elem) { return elem + item1; }; auto func5 = [=](float elem) { return elem - (int)1; }; - NDArray exp1('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray exp3('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {0.1, 1.6, 2.6, 3.6}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {1, 0, 0, 0}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray exp3('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {0.1, 1.6, 2.6, 3.6}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {1, 0, 0, 0}, sd::DataType::BOOL); x1.applyLambda(func1, x4); ASSERT_EQ(x4, exp1); @@ -1414,13 +1414,13 @@ TEST_F(MultiDataTypeTests, ndarray_applyLambda_test1) { ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyIndexedLambda_test1) { - NDArray x1('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::DOUBLE); - NDArray x2('c', {2,2}, {0, 1, 2, 3}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x4('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x6('c', {2,2}, {1, -1, -1, 0.1}, nd4j::DataType::BOOL); - NDArray x7('c', {2,2}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0, 1, 2, 3}, sd::DataType::DOUBLE); + NDArray x2('c', {2,2}, {0, 1, 2, 3}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x4('c', {2,2}, sd::DataType::DOUBLE); + NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x6('c', {2,2}, {1, -1, -1, 0.1}, sd::DataType::BOOL); + NDArray x7('c', {2,2}, 
sd::DataType::BOOL); const float item1 = 0.1; const double item2 = 0.1; @@ -1430,12 +1430,12 @@ TEST_F(MultiDataTypeTests, ndarray_applyIndexedLambda_test1) { auto func4 = [=](Nd4jLong idx, double elem) { return idx + elem + item1; }; auto func5 = [=](Nd4jLong idx, float elem) { return idx + elem - (int)1; }; - NDArray exp1('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {0, 2, 4, 6}, nd4j::DataType::INT64); - NDArray exp3('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {0.1, 2.6, 4.6, 6.6}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {0, 1, 1, 1}, nd4j::DataType::BOOL); - NDArray exp6('c', {2,2}, {0, 3, 6, 9}, nd4j::DataType::INT64); + NDArray exp1('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {0, 2, 4, 6}, sd::DataType::INT64); + NDArray exp3('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {0.1, 2.6, 4.6, 6.6}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {0, 1, 1, 1}, sd::DataType::BOOL); + NDArray exp6('c', {2,2}, {0, 3, 6, 9}, sd::DataType::INT64); x1.applyIndexedLambda(func1, x4); ASSERT_EQ(x4, exp1); @@ -1459,17 +1459,17 @@ TEST_F(MultiDataTypeTests, ndarray_applyIndexedLambda_test1) { ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyPairwiseLambda_test1) { - NDArray x1('c', {2,2}, {0., 1, 2, 3}, nd4j::DataType::DOUBLE); - NDArray x2('c', {2,2}, {0., 1, 2, 3}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {0., 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x4('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x6('c', {2,2}, {0.1, -1, -1, 0.1}, nd4j::DataType::BOOL); - NDArray x7('c', {2,2}, nd4j::DataType::BOOL); - NDArray other1('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, nd4j::DataType::FLOAT32); - NDArray other2('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, 
nd4j::DataType::DOUBLE); - NDArray other3('c', {2,2}, {0., -1, -2, -3}, nd4j::DataType::INT64); - NDArray other4('c', {2,2}, {1, 0, 0.1, 0}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0., 1, 2, 3}, sd::DataType::DOUBLE); + NDArray x2('c', {2,2}, {0., 1, 2, 3}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {0., 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x4('c', {2,2}, sd::DataType::DOUBLE); + NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x6('c', {2,2}, {0.1, -1, -1, 0.1}, sd::DataType::BOOL); + NDArray x7('c', {2,2}, sd::DataType::BOOL); + NDArray other1('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, sd::DataType::FLOAT32); + NDArray other2('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, sd::DataType::DOUBLE); + NDArray other3('c', {2,2}, {0., -1, -2, -3}, sd::DataType::INT64); + NDArray other4('c', {2,2}, {1, 0, 0.1, 0}, sd::DataType::BOOL); auto func1 = [](float elem1, float elem2) { return elem1 + elem2; }; auto func2 = [](int elem1, float elem2) { return elem1 + elem2; }; @@ -1477,11 +1477,11 @@ TEST_F(MultiDataTypeTests, ndarray_applyPairwiseLambda_test1) { auto func4 = [](double elem1, float elem2) { return elem1 + elem2; }; auto func5 = [](float elem1, int elem2) { return elem1 - elem2; }; - NDArray exp1('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {0., 0, 0, 0}, nd4j::DataType::INT64); - NDArray exp3('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {0.1, 1.6, 2.6, 3.6}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {0., 1, 0, 1}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {0., 0, 0, 0}, sd::DataType::INT64); + NDArray exp3('c', {2,2}, {0.1, 1.1, 2.1, 3.1}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {0.1, 1.6, 2.6, 3.6}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {0., 1, 0, 1}, sd::DataType::BOOL); x1.applyPairwiseLambda(other2, func1, x4); ASSERT_EQ(x4, exp1); @@ 
-1505,17 +1505,17 @@ TEST_F(MultiDataTypeTests, ndarray_applyPairwiseLambda_test1) { ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyIndexedPairwiseLambda_test1) { - NDArray x1('c', {2,2}, {0., 1, 2, 3}, nd4j::DataType::DOUBLE); - NDArray x2('c', {2,2}, {0., 1, 2, 3}, nd4j::DataType::INT64); - NDArray x3('c', {2,2}, {0., 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x4('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, nd4j::DataType::FLOAT32); - NDArray x6('c', {2,2}, {0.1, -1, -1, 0.1}, nd4j::DataType::BOOL); - NDArray x7('c', {2,2}, nd4j::DataType::BOOL); - NDArray other1('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, nd4j::DataType::FLOAT32); - NDArray other2('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, nd4j::DataType::DOUBLE); - NDArray other3('c', {2,2}, {0., -1, -2, -3}, nd4j::DataType::INT64); - NDArray other4('c', {2,2}, {1, 0, 0.1, 0}, nd4j::DataType::BOOL); + NDArray x1('c', {2,2}, {0., 1, 2, 3}, sd::DataType::DOUBLE); + NDArray x2('c', {2,2}, {0., 1, 2, 3}, sd::DataType::INT64); + NDArray x3('c', {2,2}, {0., 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x4('c', {2,2}, sd::DataType::DOUBLE); + NDArray x5('c', {2,2}, {0, 1.5, 2.5, 3.5}, sd::DataType::FLOAT32); + NDArray x6('c', {2,2}, {0.1, -1, -1, 0.1}, sd::DataType::BOOL); + NDArray x7('c', {2,2}, sd::DataType::BOOL); + NDArray other1('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, sd::DataType::FLOAT32); + NDArray other2('c', {2,2}, {0.1, 0.1, 0.1, 0.1}, sd::DataType::DOUBLE); + NDArray other3('c', {2,2}, {0., -1, -2, -3}, sd::DataType::INT64); + NDArray other4('c', {2,2}, {1, 0, 0.1, 0}, sd::DataType::BOOL); auto func1 = [](Nd4jLong idx, float elem1, float elem2) { return elem1 + elem2 + idx; }; auto func2 = [](Nd4jLong idx, int elem1, float elem2) { return elem1 + elem2 + idx; }; @@ -1523,11 +1523,11 @@ TEST_F(MultiDataTypeTests, ndarray_applyIndexedPairwiseLambda_test1) { auto func4 = [](Nd4jLong idx, double elem1, float 
elem2) { return elem1 + elem2 + idx; }; auto func5 = [](Nd4jLong idx, float elem1, int elem2) { return elem1 - elem2 + idx; }; - NDArray exp1('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {0., 1, 2, 3}, nd4j::DataType::INT64); - NDArray exp3('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,2}, {0.1, 2.6, 4.6, 6.6}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2,2}, {0., 1, 1, 1}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {0., 1, 2, 3}, sd::DataType::INT64); + NDArray exp3('c', {2,2}, {0.1, 2.1, 4.1, 6.1}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,2}, {0.1, 2.6, 4.6, 6.6}, sd::DataType::FLOAT32); + NDArray exp5('c', {2,2}, {0., 1, 1, 1}, sd::DataType::BOOL); x1.applyIndexedPairwiseLambda(other2, func1, x4); ASSERT_EQ(x4, exp1); @@ -1551,25 +1551,25 @@ TEST_F(MultiDataTypeTests, ndarray_applyIndexedPairwiseLambda_test1) { ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyTriplewiseLambda_test1) { - NDArray x1('c', {2,2}, {0., 1, 2, 3}, nd4j::DataType::DOUBLE); - NDArray x2('c', {2,2}, {0., -1, -2, -3}, nd4j::DataType::DOUBLE); - NDArray x3('c', {2,2}, {0, -1.5, -2.5, -3.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {0., 1, 2, 3}, sd::DataType::DOUBLE); + NDArray x2('c', {2,2}, {0., -1, -2, -3}, sd::DataType::DOUBLE); + NDArray x3('c', {2,2}, {0, -1.5, -2.5, -3.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, sd::DataType::DOUBLE); - NDArray x5('c', {2,2}, {0., 1, 2, 3}, nd4j::DataType::INT32); - NDArray x6('c', {2,2}, {0., -1, -2, -3}, nd4j::DataType::INT32); - NDArray x7('c', {2,2}, {0., 10, 20, 30}, nd4j::DataType::INT32); + NDArray x5('c', {2,2}, {0., 1, 2, 3}, sd::DataType::INT32); + NDArray x6('c', {2,2}, {0., -1, -2, -3}, sd::DataType::INT32); + NDArray x7('c', {2,2}, {0., 
10, 20, 30}, sd::DataType::INT32); - NDArray x8('c', {2,2}, {0., 1, 0, 1}, nd4j::DataType::BOOL); - NDArray x9('c', {2,2}, {1., 1, 0, 1}, nd4j::DataType::BOOL); - NDArray x10('c', {2,2}, {0., 0, 0, 0}, nd4j::DataType::BOOL); + NDArray x8('c', {2,2}, {0., 1, 0, 1}, sd::DataType::BOOL); + NDArray x9('c', {2,2}, {1., 1, 0, 1}, sd::DataType::BOOL); + NDArray x10('c', {2,2}, {0., 0, 0, 0}, sd::DataType::BOOL); auto func1 = [](double elem1, float elem2, int elem3) { return elem1 + elem2 + elem3; }; auto func2 = [](float elem1, float elem2, float elem3) { return elem1 + elem2 + elem3; }; auto func3 = [](int elem1, int elem2, int elem3) { return elem1 + elem2 + elem3; }; auto func4 = [](bool elem1, bool elem2, bool elem3) { return elem1 + elem2 + elem3; }; - NDArray exp('c', {2,2}, {1., 1, 0, 1}, nd4j::DataType::BOOL); + NDArray exp('c', {2,2}, {1., 1, 0, 1}, sd::DataType::BOOL); x1.applyTriplewiseLambda(x2, x3, func1, x4); ASSERT_EQ(x4, x2); @@ -1589,51 +1589,51 @@ TEST_F(MultiDataTypeTests, ndarray_applyTriplewiseLambda_test1) { ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyIndexReduce_test1) { - NDArray x1('c', {2,3}, {0, 1, 2, 3, 4, 5}, nd4j::DataType::DOUBLE); - NDArray exp1('c', {}, std::vector{5}, nd4j::DataType::INT64); - NDArray exp2('c', {2}, {2,2}, nd4j::DataType::INT64); - NDArray exp3('c', {3}, {1,1,1}, nd4j::DataType::INT64); + NDArray x1('c', {2,3}, {0, 1, 2, 3, 4, 5}, sd::DataType::DOUBLE); + NDArray exp1('c', {}, std::vector{5}, sd::DataType::INT64); + NDArray exp2('c', {2}, {2,2}, sd::DataType::INT64); + NDArray exp3('c', {3}, {1,1,1}, sd::DataType::INT64); - NDArray scalar = x1.applyIndexReduce(nd4j::indexreduce::IndexMax, {0,1}); + NDArray scalar = x1.applyIndexReduce(sd::indexreduce::IndexMax, {0,1}); ASSERT_EQ(scalar, exp1); - NDArray vec1 = x1.applyIndexReduce(nd4j::indexreduce::IndexMax, {1}); + NDArray vec1 = x1.applyIndexReduce(sd::indexreduce::IndexMax, {1}); ASSERT_EQ(vec1, 
exp2); - NDArray vec2 = x1.applyIndexReduce(nd4j::indexreduce::IndexMax, {0}); + NDArray vec2 = x1.applyIndexReduce(sd::indexreduce::IndexMax, {0}); ASSERT_EQ(vec2, exp3); } ////////////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, ndarray_applyIndexReduce_test2) { - NDArray x1('c', {2,3}, {0, 1, 2, 3, 4, 5}, nd4j::DataType::DOUBLE); - NDArray scalar('c', {}, std::vector{5}, nd4j::DataType::INT64); - NDArray vec1('c', {2}, {2,2}, nd4j::DataType::INT64); - NDArray vec2('c', {3}, {1,1,1}, nd4j::DataType::INT64); - NDArray exp1('c', {}, std::vector{5}, nd4j::DataType::INT64); - NDArray exp2('c', {2}, {2,2}, nd4j::DataType::INT64); - NDArray exp3('c', {3}, {1,1,1}, nd4j::DataType::INT64); + NDArray x1('c', {2,3}, {0, 1, 2, 3, 4, 5}, sd::DataType::DOUBLE); + NDArray scalar('c', {}, std::vector{5}, sd::DataType::INT64); + NDArray vec1('c', {2}, {2,2}, sd::DataType::INT64); + NDArray vec2('c', {3}, {1,1,1}, sd::DataType::INT64); + NDArray exp1('c', {}, std::vector{5}, sd::DataType::INT64); + NDArray exp2('c', {2}, {2,2}, sd::DataType::INT64); + NDArray exp3('c', {3}, {1,1,1}, sd::DataType::INT64); - x1.applyIndexReduce(nd4j::indexreduce::IndexMax, scalar, {0,1}); + x1.applyIndexReduce(sd::indexreduce::IndexMax, scalar, {0,1}); ASSERT_EQ(scalar, exp1); - x1.applyIndexReduce(nd4j::indexreduce::IndexMax, vec1, {1}); + x1.applyIndexReduce(sd::indexreduce::IndexMax, vec1, {1}); ASSERT_EQ(vec1, exp2); - x1.applyIndexReduce(nd4j::indexreduce::IndexMax, vec2, {0}); + x1.applyIndexReduce(sd::indexreduce::IndexMax, vec2, {0}); ASSERT_EQ(vec2, exp3); } ////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, applyReduce3_test1) { - NDArray x1('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray x2('c', {2,2}, {-1,-2,-3,-4}, nd4j::DataType::INT32); - NDArray x3('c', {2,2}, {1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {1,2,3,4}, nd4j::DataType::DOUBLE); - NDArray 
exp1('c', {}, std::vector{-30}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {}, std::vector{15}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray x2('c', {2,2}, {-1,-2,-3,-4}, sd::DataType::INT32); + NDArray x3('c', {2,2}, {1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {1,2,3,4}, sd::DataType::DOUBLE); + NDArray exp1('c', {}, std::vector{-30}, sd::DataType::FLOAT32); + NDArray exp2('c', {}, std::vector{15}, sd::DataType::DOUBLE); auto result = x1.applyReduce3(reduce3::Dot, x2); ASSERT_EQ(result, exp1); @@ -1645,21 +1645,21 @@ TEST_F(MultiDataTypeTests, applyReduce3_test1) { ////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, applyReduce3_test2) { - NDArray x1('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray x2('c', {2,2}, {-1,-2,-3,-4}, nd4j::DataType::INT32); - NDArray x3('c', {2,2}, {1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {1,2,3,4}, nd4j::DataType::DOUBLE); - NDArray x5('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::INT32); - NDArray x6('c', {2,3}, {-6,-5,-4,-3,-2,-1}, nd4j::DataType::INT32); - NDArray x7('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x8('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray x2('c', {2,2}, {-1,-2,-3,-4}, sd::DataType::INT32); + NDArray x3('c', {2,2}, {1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {1,2,3,4}, sd::DataType::DOUBLE); + NDArray x5('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::INT32); + NDArray x6('c', {2,3}, {-6,-5,-4,-3,-2,-1}, sd::DataType::INT32); + NDArray x7('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray x8('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::DOUBLE); - NDArray exp1('c', {}, std::vector{-30}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {}, std::vector{15}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {3}, {-18,-20,-18}, 
nd4j::DataType::FLOAT32); - NDArray exp4('c', {2}, {-28,-28}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {3}, {7.5,10.5,13.5}, nd4j::DataType::DOUBLE); - NDArray exp6('c', {2}, {9,22.5}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {}, std::vector{-30}, sd::DataType::FLOAT32); + NDArray exp2('c', {}, std::vector{15}, sd::DataType::DOUBLE); + NDArray exp3('c', {3}, {-18,-20,-18}, sd::DataType::FLOAT32); + NDArray exp4('c', {2}, {-28,-28}, sd::DataType::FLOAT32); + NDArray exp5('c', {3}, {7.5,10.5,13.5}, sd::DataType::DOUBLE); + NDArray exp6('c', {2}, {9,22.5}, sd::DataType::DOUBLE); auto result = x1.applyReduce3(reduce3::Dot, x2, {0,1}); ASSERT_EQ(result, exp1); @@ -1683,12 +1683,12 @@ TEST_F(MultiDataTypeTests, applyReduce3_test2) { ////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, applyAllReduce3_test1) { - NDArray x1('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray x2('c', {2,3}, {-1,1,-1,1,-1,1}, nd4j::DataType::INT32); - NDArray x3('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {2,2}, {1,2,3,4}, nd4j::DataType::DOUBLE); - NDArray exp1('c', {2,3}, {2,-2,2,2,-2,2}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,3}, {6,6,6,9,9,9}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray x2('c', {2,3}, {-1,1,-1,1,-1,1}, sd::DataType::INT32); + NDArray x3('c', {2,3}, {1.5,1.5,1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray x4('c', {2,2}, {1,2,3,4}, sd::DataType::DOUBLE); + NDArray exp1('c', {2,3}, {2,-2,2,2,-2,2}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,3}, {6,6,6,9,9,9}, sd::DataType::DOUBLE); auto result = x1.applyAllReduce3(reduce3::Dot, x2, {0}); ASSERT_EQ(result, exp1); @@ -1702,16 +1702,16 @@ TEST_F(MultiDataTypeTests, RowCol_test1) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::INT32); - NDArray x2('c', {2}, {0.5,0.6}, nd4j::DataType::FLOAT32); - 
NDArray x3('c', {3}, {1.5,1.6,1.7}, nd4j::DataType::FLOAT32); - NDArray x4('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::DOUBLE); - NDArray x5('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::INT32); + NDArray x1('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::INT32); + NDArray x2('c', {2}, {0.5,0.6}, sd::DataType::FLOAT32); + NDArray x3('c', {3}, {1.5,1.6,1.7}, sd::DataType::FLOAT32); + NDArray x4('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::DOUBLE); + NDArray x5('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::INT32); - NDArray exp1('c', {2,3}, {2,3,4,5,6,7}, nd4j::DataType::INT32); - NDArray exp2('c', {2,3}, {0,1,2,3,4,5}, nd4j::DataType::INT32); - NDArray exp3('c', {2,3}, {1.5,2.5,3.5,4.6,5.6,6.6}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {2,3}, {0,1,1,2,3,3}, nd4j::DataType::INT32); + NDArray exp1('c', {2,3}, {2,3,4,5,6,7}, sd::DataType::INT32); + NDArray exp2('c', {2,3}, {0,1,2,3,4,5}, sd::DataType::INT32); + NDArray exp3('c', {2,3}, {1.5,2.5,3.5,4.6,5.6,6.6}, sd::DataType::DOUBLE); + NDArray exp4('c', {2,3}, {0,1,1,2,3,3}, sd::DataType::INT32); x1.addiRowVector(x3); ASSERT_EQ(x1, exp1); @@ -1731,23 +1731,23 @@ TEST_F(MultiDataTypeTests, RowCol_test2) { if (!Environment::getInstance()->isExperimentalBuild()) return; - NDArray x1('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::INT32); - NDArray x2('c', {2}, {0.5,0.6}, nd4j::DataType::FLOAT32); - NDArray x3('c', {3}, {1.5,1.6,1.7}, nd4j::DataType::FLOAT32); - NDArray x4('c', {2,3}, nd4j::DataType::FLOAT32); - NDArray x5('c', {3}, {1,2,3}, nd4j::DataType::INT64); - NDArray x6('c', {2,3}, nd4j::DataType::INT32); - NDArray x7('c', {3}, {1.5,1.6,1.7}, nd4j::DataType::DOUBLE); - NDArray x8('c', {2,3}, {1,2,3,4,5,6}, nd4j::DataType::FLOAT32); - NDArray x9('c', {3}, {1,2,3}, nd4j::DataType::DOUBLE); - NDArray x10('c', {2,3}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::INT32); + NDArray x2('c', {2}, {0.5,0.6}, sd::DataType::FLOAT32); + NDArray x3('c', {3}, {1.5,1.6,1.7}, sd::DataType::FLOAT32); + NDArray x4('c', 
{2,3}, sd::DataType::FLOAT32); + NDArray x5('c', {3}, {1,2,3}, sd::DataType::INT64); + NDArray x6('c', {2,3}, sd::DataType::INT32); + NDArray x7('c', {3}, {1.5,1.6,1.7}, sd::DataType::DOUBLE); + NDArray x8('c', {2,3}, {1,2,3,4,5,6}, sd::DataType::FLOAT32); + NDArray x9('c', {3}, {1,2,3}, sd::DataType::DOUBLE); + NDArray x10('c', {2,3}, sd::DataType::DOUBLE); - NDArray exp1('c', {2,3}, {2.5,3.6,4.7,5.5,6.6,7.7}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,3}, {2, 4, 6, 5, 7, 9}, nd4j::DataType::INT32); - NDArray exp3('c', {2,3}, {-0.5,0.4,1.3,2.5,3.4,4.3}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {2,3}, {1,4,9,4,10,18}, nd4j::DataType::DOUBLE); - NDArray exp5('c', {2,3}, {1,1,1,4,2.5,2}, nd4j::DataType::DOUBLE); - NDArray exp6('c', {2,3}, {1.5,2.5,3.5,4.6,5.6,6.6}, nd4j::DataType::FLOAT32); + NDArray exp1('c', {2,3}, {2.5,3.6,4.7,5.5,6.6,7.7}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,3}, {2, 4, 6, 5, 7, 9}, sd::DataType::INT32); + NDArray exp3('c', {2,3}, {-0.5,0.4,1.3,2.5,3.4,4.3}, sd::DataType::FLOAT32); + NDArray exp4('c', {2,3}, {1,4,9,4,10,18}, sd::DataType::DOUBLE); + NDArray exp5('c', {2,3}, {1,1,1,4,2.5,2}, sd::DataType::DOUBLE); + NDArray exp6('c', {2,3}, {1.5,2.5,3.5,4.6,5.6,6.6}, sd::DataType::FLOAT32); x1.addRowVector(x3, x4); ASSERT_EQ(x4, exp1); @@ -1772,18 +1772,18 @@ TEST_F(MultiDataTypeTests, RowCol_test2) { /* TEST_F(MultiDataTypeTests, tile_test1) { - NDArray x1('c', {2,1}, {0,1}, nd4j::DataType::INT32); - NDArray x2('c', {2,1}, {0.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x3('c', {2,2}, nd4j::DataType::INT32); - NDArray x4('c', {2,2}, nd4j::DataType::DOUBLE); - NDArray x5('c', {1,2}, {0.5,1.5}, nd4j::DataType::DOUBLE);; - NDArray x6('c', {2,2}, nd4j::DataType::FLOAT32); - NDArray x7('c', {2,2}, nd4j::DataType::BOOL); + NDArray x1('c', {2,1}, {0,1}, sd::DataType::INT32); + NDArray x2('c', {2,1}, {0.5,1.5}, sd::DataType::DOUBLE); + NDArray x3('c', {2,2}, sd::DataType::INT32); + NDArray x4('c', {2,2}, sd::DataType::DOUBLE); + 
NDArray x5('c', {1,2}, {0.5,1.5}, sd::DataType::DOUBLE);; + NDArray x6('c', {2,2}, sd::DataType::FLOAT32); + NDArray x7('c', {2,2}, sd::DataType::BOOL); - NDArray exp1('c', {2,2}, {0,0,1,1}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {0.5,1.5,0.5,1.5}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {2,2}, {0,0,1,1}, nd4j::DataType::INT32); - NDArray exp4('c', {2,2}, {0,0,1,1}, nd4j::DataType::BOOL); + NDArray exp1('c', {2,2}, {0,0,1,1}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {0.5,1.5,0.5,1.5}, sd::DataType::FLOAT32); + NDArray exp3('c', {2,2}, {0,0,1,1}, sd::DataType::INT32); + NDArray exp4('c', {2,2}, {0,0,1,1}, sd::DataType::BOOL); x1.tile({1,2}, x4); ASSERT_EQ(x4, exp1); @@ -1808,10 +1808,10 @@ TEST_F(MultiDataTypeTests, tile_test1) { ////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, asT_test1) { - NDArray x1('c', {2}, {1.5, 2.5}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2}, {1.5, 2.5}, sd::DataType::FLOAT32); - NDArray exp1('c', {2}, {1, 2}, nd4j::DataType::INT32); - NDArray exp2('c', {2}, {1.5, 2.5}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {2}, {1, 2}, sd::DataType::INT32); + NDArray exp2('c', {2}, {1.5, 2.5}, sd::DataType::DOUBLE); auto result = new NDArray(x1.asT()); ASSERT_EQ(*result, exp1); @@ -1821,11 +1821,11 @@ TEST_F(MultiDataTypeTests, asT_test1) { ASSERT_EQ(*result, exp2); delete result; - result = new NDArray(x1.asT(nd4j::DataType::INT32)); + result = new NDArray(x1.asT(sd::DataType::INT32)); ASSERT_EQ(*result, exp1); delete result; - result = new NDArray(x1.asT(nd4j::DataType::DOUBLE)); + result = new NDArray(x1.asT(sd::DataType::DOUBLE)); ASSERT_EQ(*result, exp2); delete result; } @@ -1833,15 +1833,15 @@ TEST_F(MultiDataTypeTests, asT_test1) { ////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, assign_test2) { - NDArray x1('c', {2,3}, {1.5,2.5,3.5,4.5,5.5,6.5}, nd4j::DataType::FLOAT32); - NDArray x2('c', {3,2}, 
nd4j::DataType::INT32); - NDArray x3('c', {3,2}, nd4j::DataType::DOUBLE); - NDArray x4('c', {3,2}, nd4j::DataType::BOOL); - NDArray x5('c', {2,3}, {1.5,2.5,0,4.5,5.5,6.5}, nd4j::DataType::FLOAT32); + NDArray x1('c', {2,3}, {1.5,2.5,3.5,4.5,5.5,6.5}, sd::DataType::FLOAT32); + NDArray x2('c', {3,2}, sd::DataType::INT32); + NDArray x3('c', {3,2}, sd::DataType::DOUBLE); + NDArray x4('c', {3,2}, sd::DataType::BOOL); + NDArray x5('c', {2,3}, {1.5,2.5,0,4.5,5.5,6.5}, sd::DataType::FLOAT32); - NDArray exp1('c', {3,2}, {1, 2,3,4,5,6}, nd4j::DataType::INT32); - NDArray exp2('c', {3,2}, {1.5,2.5,3.5,4.5,5.5,6.5}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {3,2}, {1,1,0,1,1,1}, nd4j::DataType::BOOL); + NDArray exp1('c', {3,2}, {1, 2,3,4,5,6}, sd::DataType::INT32); + NDArray exp2('c', {3,2}, {1.5,2.5,3.5,4.5,5.5,6.5}, sd::DataType::DOUBLE); + NDArray exp3('c', {3,2}, {1,1,0,1,1,1}, sd::DataType::BOOL); x2.assign(x1); ASSERT_EQ(x2, exp1); @@ -1898,10 +1898,10 @@ TEST_F(MultiDataTypeTests, Test_Cast_2) { ////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, divide_bool_test1) { - NDArray x1('c', {2,3}, {1.5,0,3.5,0,5.5,6.5}, nd4j::DataType::FLOAT32); - NDArray x2('c', {3,2}, {1,1,0,1,0,1}, nd4j::DataType::BOOL); - NDArray x3('c', {2,3}, nd4j::DataType::FLOAT32); - NDArray x4('c', {2}, nd4j::DataType::BOOL); + NDArray x1('c', {2,3}, {1.5,0,3.5,0,5.5,6.5}, sd::DataType::FLOAT32); + NDArray x2('c', {3,2}, {1,1,0,1,0,1}, sd::DataType::BOOL); + NDArray x3('c', {2,3}, sd::DataType::FLOAT32); + NDArray x4('c', {2}, sd::DataType::BOOL); try { NDArray x3 = x1 / x2; @@ -1936,7 +1936,7 @@ TEST_F(MultiDataTypeTests, divide_bool_test1) { } try { - x1.applyBroadcast(nd4j::broadcast::FloorDiv, {1}, x4, x3); + x1.applyBroadcast(sd::broadcast::FloorDiv, {1}, x4, x3); } catch (std::exception& message) { // printf("%s\n", message.what()); @@ -1956,13 +1956,13 @@ TEST_F(MultiDataTypeTests, divide_bool_test1) { 
////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, aaa) { - NDArray z('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, nd4j::DataType::DOUBLE); + NDArray z('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, sd::DataType::DOUBLE); z.permutei({1,0}); - nd4j::graph::RandomGenerator gen(119,5); + sd::graph::RandomGenerator gen(119,5); ExtraArguments extras({1.5, 2.5}); - NativeOpExecutioner::execRandom(LaunchContext::defaultContext(), nd4j::random::UniformDistribution, + NativeOpExecutioner::execRandom(LaunchContext::defaultContext(), sd::random::UniformDistribution, &gen, z.buffer(), z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), extras.argumentsAsT()); @@ -1973,9 +1973,9 @@ TEST_F(MultiDataTypeTests, aaa) { ////////////////////////////////////////////////////////////////////// TEST_F(MultiDataTypeTests, assign_2) { - NDArray x('c', {4}, {1.5,2.5,3.5,4.5}, nd4j::DataType::FLOAT32); - NDArray y('c', {4}, nd4j::DataType::INT32); - NDArray expected('c', {4}, {1,2,3,4}, nd4j::DataType::INT32); + NDArray x('c', {4}, {1.5,2.5,3.5,4.5}, sd::DataType::FLOAT32); + NDArray y('c', {4}, sd::DataType::INT32); + NDArray expected('c', {4}, {1,2,3,4}, sd::DataType::INT32); y.assign(x); // y.printBuffer(); diff --git a/libnd4j/tests_cpu/layers_tests/MultiDeviceTests.cpp b/libnd4j/tests_cpu/layers_tests/MultiDeviceTests.cpp index efd48311b..1c12f2d72 100644 --- a/libnd4j/tests_cpu/layers_tests/MultiDeviceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MultiDeviceTests.cpp @@ -20,15 +20,15 @@ #include "testlayers.h" #include -#include -#include -#include +#include +#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class MultiDeviceTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayConstructorsTests.cu b/libnd4j/tests_cpu/layers_tests/NDArrayConstructorsTests.cu index 0c0c102ac..48208d2ff 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayConstructorsTests.cu +++ 
b/libnd4j/tests_cpu/layers_tests/NDArrayConstructorsTests.cu @@ -19,20 +19,20 @@ // #include "testlayers.h" -#include -#include -#include -#include +#include +#include +#include +#include #include #include #include -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class NDArrayConstructorsTests : public testing::Test { public: @@ -84,7 +84,7 @@ TEST_F(NDArrayConstructorsTests, test_constructor_3) { } TEST_F(NDArrayConstructorsTests, test_constructor_4) { - auto x = NDArrayFactory::create(nd4j::DataType::FLOAT32, 1.0f); + auto x = NDArrayFactory::create(sd::DataType::FLOAT32, 1.0f); ASSERT_FALSE(x.buffer() == nullptr); ASSERT_FALSE(x.specialBuffer() == nullptr); @@ -183,7 +183,7 @@ TEST_F(NDArrayConstructorsTests, test_linspace_1) { TEST_F(NDArrayConstructorsTests, test_constructor_10) { - NDArray scalar1(nd4j::DataType::DOUBLE); // scalar1 = 0 + NDArray scalar1(sd::DataType::DOUBLE); // scalar1 = 0 NDArray scalar2('c', {}, std::vector{0}); ASSERT_TRUE(scalar1.isActualOnDeviceSide()); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu index 46f962dda..6c37e3145 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu +++ b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu @@ -19,21 +19,21 @@ // #include "testlayers.h" -#include -#include -#include -#include +#include +#include +#include +#include #include #include #include -#include -#include +#include +#include #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class NDArrayCudaBasicsTests : public testing::Test { public: @@ -197,7 +197,7 @@ TEST_F(NDArrayCudaBasicsTests, TestAdd_2) { // allocating host-side arrays NDArray x('c', { 5 }, { 1, 2, 3, 4, 5}); NDArray y('c', { 5 }, { 1, 2, 3, 4, 5}); - NDArray z('c', { 5 }, nd4j::DataType::DOUBLE); + 
NDArray z('c', { 5 }, sd::DataType::DOUBLE); NDArray exp('c', { 5 }, { 2, 4, 6, 8, 10 }); @@ -422,7 +422,7 @@ TEST_F(NDArrayCudaBasicsTests, TestMultiply_2) { // allocating host-side arrays auto x = NDArrayFactory::create('c', { 5 }, { 1, 2, 3, 4, 5}); auto y = NDArrayFactory::create('c', { 5 }, { 1, 2, 3, 4, 5}); - NDArray z('c', { 5 }, nd4j::DataType::DOUBLE); + NDArray z('c', { 5 }, sd::DataType::DOUBLE); auto exp = NDArrayFactory::create('c', { 5 }, { 1, 4, 9, 16, 25 }); @@ -449,8 +449,8 @@ TEST_F(NDArrayCudaBasicsTests, TestMultiply_2) { ////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, TestMultiply_3) { // allocating host-side arrays - NDArray x('c', { 5 }, { 1, 2, 3, 4, 5}, nd4j::DataType::DOUBLE); - NDArray y('c', { 5 }, { 1., 2., 3., 4., 5.}, nd4j::DataType::DOUBLE); + NDArray x('c', { 5 }, { 1, 2, 3, 4, 5}, sd::DataType::DOUBLE); + NDArray y('c', { 5 }, { 1., 2., 3., 4., 5.}, sd::DataType::DOUBLE); auto z = NDArrayFactory::create('c', { 5 }); auto exp = NDArrayFactory::create('c', { 5 }, { 1, 4, 9, 16, 25 }); @@ -481,8 +481,8 @@ TEST_F(NDArrayCudaBasicsTests, TestMultiply_3) { ////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, TestMultiply_4) { // allocating host-side arrays - NDArray x('c', { 5 }, { 1, 2, 3, 4, 5}, nd4j::DataType::DOUBLE); - NDArray y('c', { 5 }, { 1., 2., 3., 4., 5.}, nd4j::DataType::DOUBLE); + NDArray x('c', { 5 }, { 1, 2, 3, 4, 5}, sd::DataType::DOUBLE); + NDArray y('c', { 5 }, { 1., 2., 3., 4., 5.}, sd::DataType::DOUBLE); //auto z = NDArrayFactory::create('c', { 5 }); auto exp = NDArrayFactory::create('c', { 5 }, { 1, 4, 9, 16, 25 }); @@ -690,10 +690,10 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_2) { // return; NDArray x = NDArrayFactory::create('c', {2,3,4}); - NDArray y('c', {2,4}, {10,20,30,40,50,60,70,80}, nd4j::DataType::DOUBLE); - NDArray z('c', {2,3,4}, 
{100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::DOUBLE); -// NDArray exp('c', {2,3,4}, {10., 21., 32., 43., 14., 25., 36., 47., 18., 29., 40., 51., 62., 73., 84., 95., 66., 77., 88., 99., 70., 81., 92., 103}, nd4j::DataType::DOUBLE); - NDArray exp('c', {2,3,4}, {10., 40., 90., 160., 50., 120., 210., 320., 90., 200., 330., 480., 650., 840., 1050., 1280., 850., 1080., 1330., 1600., 1050., 1320., 1610., 1920.}, nd4j::DataType::DOUBLE); + NDArray y('c', {2,4}, {10,20,30,40,50,60,70,80}, sd::DataType::DOUBLE); + NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::DOUBLE); +// NDArray exp('c', {2,3,4}, {10., 21., 32., 43., 14., 25., 36., 47., 18., 29., 40., 51., 62., 73., 84., 95., 66., 77., 88., 99., 70., 81., 92., 103}, sd::DataType::DOUBLE); + NDArray exp('c', {2,3,4}, {10., 40., 90., 160., 50., 120., 210., 320., 90., 200., 330., 480., 650., 840., 1050., 1280., 850., 1080., 1330., 1600., 1050., 1320., 1610., 1920.}, sd::DataType::DOUBLE); x.linspace(1); x.syncToDevice(); std::vector dimensions = {0,2}; @@ -721,7 +721,7 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_2) { cudaResult = allocateDeviceMem(lc, devicePtrs, hostData); ASSERT_EQ(0, cudaResult); // call cuda kernel which calculates result - NativeOpExecutioner::execBroadcast(&lc, nd4j::broadcast::Multiply, + NativeOpExecutioner::execBroadcast(&lc, sd::broadcast::Multiply, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -749,11 +749,11 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_3) { //if (!Environment::getInstance()->isExperimentalBuild()) // return; - NDArray x('c', {2,3,4}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,4}, {10,20,30,40,50,60,70,80}, nd4j::DataType::DOUBLE); - NDArray z('c', 
{2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::DOUBLE); -// NDArray exp('c', {2,3,4}, {10., 21., 32., 43., 14., 25., 36., 47., 18., 29., 40., 51., 62., 73., 84., 95., 66., 77., 88., 99., 70., 81., 92., 103}, nd4j::DataType::DOUBLE); - NDArray exp('c', {2,3,4}, {10., 40., 90., 160., 50., 120., 210., 320., 90., 200., 330., 480., 650., 840., 1050., 1280., 850., 1080., 1330., 1600., 1050., 1320., 1610., 1920.}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3,4}, sd::DataType::DOUBLE); + NDArray y('c', {2,4}, {10,20,30,40,50,60,70,80}, sd::DataType::DOUBLE); + NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::DOUBLE); +// NDArray exp('c', {2,3,4}, {10., 21., 32., 43., 14., 25., 36., 47., 18., 29., 40., 51., 62., 73., 84., 95., 66., 77., 88., 99., 70., 81., 92., 103}, sd::DataType::DOUBLE); + NDArray exp('c', {2,3,4}, {10., 40., 90., 160., 50., 120., 210., 320., 90., 200., 330., 480., 650., 840., 1050., 1280., 850., 1080., 1330., 1600., 1050., 1320., 1610., 1920.}, sd::DataType::DOUBLE); x.linspace(1); x.syncToDevice(); std::vector dimensions = {0,2}; @@ -787,7 +787,7 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_3) { NDArray::registerSpecialUse({&z}, {&x, &y}); // call cuda kernel which calculates result - NativeOpExecutioner::execBroadcast(pLc, nd4j::broadcast::Multiply, + NativeOpExecutioner::execBroadcast(pLc, sd::broadcast::Multiply, nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -812,8 +812,8 @@ TEST_F(NDArrayCudaBasicsTests, TestRawBroadcast_3) { TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_1) { // allocating host-side arrays - NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, nd4j::DataType::DOUBLE); - NDArray y = 
NDArrayFactory::create(3.); //'c', { 3 }, { 2., 3., 4.}, nd4j::DataType::DOUBLE); + NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, sd::DataType::DOUBLE); + NDArray y = NDArrayFactory::create(3.); //'c', { 3 }, { 2., 3., 4.}, sd::DataType::DOUBLE); //auto z = NDArrayFactory::create('c', { 5 }); auto exp = NDArrayFactory::create('c', { 2, 3 }, { 3, 6, 9, 12, 15, 18 }); @@ -842,8 +842,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_1) { TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_01) { // allocating host-side arrays - NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, nd4j::DataType::DOUBLE); - NDArray y = NDArrayFactory::create(3.); //'c', { 3 }, { 2., 3., 4.}, nd4j::DataType::DOUBLE); + NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, sd::DataType::DOUBLE); + NDArray y = NDArrayFactory::create(3.); //'c', { 3 }, { 2., 3., 4.}, sd::DataType::DOUBLE); auto z = NDArrayFactory::create('c', { 2, 3 }); auto exp = NDArrayFactory::create('c', { 2, 3 }, { 3, 6, 9, 12, 15, 18 }); @@ -875,8 +875,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_01) { TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_02) { // allocating host-side arrays - auto x = NDArrayFactory::create('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}); //, nd4j::DataType::DOUBLE); - auto y = NDArrayFactory::create('c', {2,3}, {3, 3, 3, 3, 3, 3}); //'c', { 3 }, { 2., 3., 4.}, nd4j::DataType::DOUBLE); + auto x = NDArrayFactory::create('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}); //, sd::DataType::DOUBLE); + auto y = NDArrayFactory::create('c', {2,3}, {3, 3, 3, 3, 3, 3}); //'c', { 3 }, { 2., 3., 4.}, sd::DataType::DOUBLE); auto z = NDArrayFactory::create('c', { 2, 3 }); auto exp = NDArrayFactory::create('c', { 2, 3 }, { 3, 6, 9, 12, 15, 18 }); @@ -909,8 +909,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_02) { TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_002) { // allocating host-side arrays - auto x = NDArrayFactory::create('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}); //, nd4j::DataType::DOUBLE); - 
auto y = NDArrayFactory::create('c', {2, 3}, {2., 3., 3., 3., 3., 3.}); //'c', { 3 }, { 2., 3., 4.}, nd4j::DataType::DOUBLE); + auto x = NDArrayFactory::create('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}); //, sd::DataType::DOUBLE); + auto y = NDArrayFactory::create('c', {2, 3}, {2., 3., 3., 3., 3., 3.}); //'c', { 3 }, { 2., 3., 4.}, sd::DataType::DOUBLE); auto z = NDArrayFactory::create('c', { 2, 3 }); auto exp = NDArrayFactory::create('c', { 2, 3 }, { 2, 6, 9, 12, 15, 18 }); @@ -947,10 +947,10 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastRaw_1) { //if (!Environment::getInstance()->isExperimentalBuild()) // return; - NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray y('c', {3}, {10, 20, 30}, nd4j::DataType::INT64); - NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, nd4j::DataType::INT32); - NDArray exp('c', {2,3,4}, {10, 11, 12, 13,24, 25, 26, 27,38, 39, 40, 41,22, 23, 24, 25,36, 37, 38, 39,50, 51, 52, 53}, nd4j::DataType::INT32); + NDArray x('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::INT32); + NDArray y('c', {3}, {10, 20, 30}, sd::DataType::INT64); + NDArray z('c', {2,3,4}, {100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100}, sd::DataType::INT32); + NDArray exp('c', {2,3,4}, {10, 11, 12, 13,24, 25, 26, 27,38, 39, 40, 41,22, 23, 24, 25,36, 37, 38, 39,50, 51, 52, 53}, sd::DataType::INT32); //real output [10, 11, 12, 13, 4, 5, 6, 7, 28, 29, 30, 31, 22, 23, 24, 25, 16, 17, 18, 19, 40, 41, 42, 43] x.linspace(0); x.syncToDevice(); @@ -983,7 +983,7 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastRaw_1) { } // call cuda kernel which calculates result - NativeOpExecutioner::execBroadcast(pLc, nd4j::broadcast::Add, + NativeOpExecutioner::execBroadcast(pLc, sd::broadcast::Add, 
nullptr, x.getShapeInfo(), x.specialBuffer(), x.specialShapeInfo(), nullptr, y.getShapeInfo(), y.specialBuffer(), y.specialShapeInfo(), nullptr, z.getShapeInfo(), z.specialBuffer(), z.specialShapeInfo(), @@ -1010,8 +1010,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastRaw_1) { TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply) { // allocating host-side arrays - NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, nd4j::DataType::DOUBLE); - NDArray y('c', { 3 }, { 2., 3., 4.}, nd4j::DataType::DOUBLE); + NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, sd::DataType::DOUBLE); + NDArray y('c', { 3 }, { 2., 3., 4.}, sd::DataType::DOUBLE); //auto z = NDArrayFactory::create('c', { 5 }); auto exp = NDArrayFactory::create('c', { 2, 3 }, { 2, 6, 12, 8, 15, 24 }); @@ -1042,8 +1042,8 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply) { TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_2) { // allocating host-side arrays - NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, nd4j::DataType::DOUBLE); - NDArray y('c', { 3 }, { 2., 3., 4.}, nd4j::DataType::DOUBLE); + NDArray x('c', { 2, 3 }, { 1, 2, 3, 4, 5, 6}, sd::DataType::DOUBLE); + NDArray y('c', { 3 }, { 2., 3., 4.}, sd::DataType::DOUBLE); //auto z = NDArrayFactory::create('c', { 5 }); auto exp = NDArrayFactory::create('c', { 2, 3 }, { 11,12, 13,14, 15, 16 }); @@ -1060,7 +1060,7 @@ TEST_F(NDArrayCudaBasicsTests, TestBroadcastMultiply_2) { //x.applyPairwiseTransform(pairwise::Multiply, &y, &z, nullptr); //x.printBuffer("23X = "); //y.printBuffer("23Y = "); - //void NDArray::applyTrueBroadcast(nd4j::BroadcastOpsTuple op, const NDArray* other, NDArray* target, const bool checkTargetShape, ExtraArguments *extraArgs) + //void NDArray::applyTrueBroadcast(sd::BroadcastOpsTuple op, const NDArray* other, NDArray* target, const bool checkTargetShape, ExtraArguments *extraArgs) x.applyTrueBroadcast(BroadcastOpsTuple::Multiply(), y, exp); // @@ -1114,8 +1114,8 @@ TEST_F(NDArrayCudaBasicsTests, TestDup1) { 
////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, equalsTo_1) { - NDArray x('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, sd::DataType::DOUBLE); + NDArray y('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, sd::DataType::DOUBLE); ASSERT_TRUE(x.equalsTo(y)); @@ -1128,8 +1128,8 @@ TEST_F(NDArrayCudaBasicsTests, equalsTo_1) { ////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, equalsTo_2) { - NDArray x('c', {2,5}, {1,2,3,4,5,6,7,8,10,10}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,5}, {1,2,5,4,5,6,7,8,9,10}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,5}, {1,2,3,4,5,6,7,8,10,10}, sd::DataType::DOUBLE); + NDArray y('c', {2,5}, {1,2,5,4,5,6,7,8,9,10}, sd::DataType::DOUBLE); ASSERT_FALSE(x.equalsTo(y)); @@ -1142,8 +1142,8 @@ TEST_F(NDArrayCudaBasicsTests, equalsTo_2) { ////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, equalsTo_3) { - NDArray x('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,5}, {1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f,9.f,10.f}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,5}, {1,2,3,4,5,6,7,8,9,10}, sd::DataType::DOUBLE); + NDArray y('c', {2,5}, {1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f,9.f,10.f}, sd::DataType::FLOAT32); ASSERT_FALSE(x.equalsTo(y)); @@ -1156,78 +1156,78 @@ TEST_F(NDArrayCudaBasicsTests, equalsTo_3) { //////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, applyReduce3_1) { - NDArray x('c', {2,3,4}, {-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, nd4j::DataType::INT32); - NDArray x2('c', {2,3,4}, {-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, nd4j::DataType::INT32); - NDArray y('c', {2,3,4}, 
{-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2,3,-4,5}, nd4j::DataType::INT32); - NDArray k('c', {2,3}, {-2,3,-4,5,-2,3}, nd4j::DataType::INT32); - NDArray k2('c', {3,2}, {-2,3,-4,5,-2,3}, nd4j::DataType::INT32); + NDArray x('c', {2,3,4}, {-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, sd::DataType::INT32); + NDArray x2('c', {2,3,4}, {-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, sd::DataType::INT32); + NDArray y('c', {2,3,4}, {-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2,3,-4,5}, sd::DataType::INT32); + NDArray k('c', {2,3}, {-2,3,-4,5,-2,3}, sd::DataType::INT32); + NDArray k2('c', {3,2}, {-2,3,-4,5,-2,3}, sd::DataType::INT32); - NDArray exp1('c', {3}, {4.f, 20.f, 36.f}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,3}, {-10.f, -2.f, 6.f,14.f, 22.f, 30.f}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {4}, {38.f, 41.f, 44.f, 47.f}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {4}, {114.f, 117.f, 120.f, 123.f}, nd4j::DataType::FLOAT32); + NDArray exp1('c', {3}, {4.f, 20.f, 36.f}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,3}, {-10.f, -2.f, 6.f,14.f, 22.f, 30.f}, sd::DataType::FLOAT32); + NDArray exp3('c', {4}, {38.f, 41.f, 44.f, 47.f}, sd::DataType::FLOAT32); + NDArray exp4('c', {4}, {114.f, 117.f, 120.f, 123.f}, sd::DataType::FLOAT32); - NDArray z = x.applyReduce3(nd4j::reduce3::Dot, y, {0,2}); + NDArray z = x.applyReduce3(sd::reduce3::Dot, y, {0,2}); ASSERT_TRUE(z.equalsTo(&exp1)); - z = x.applyReduce3(nd4j::reduce3::Dot, k, {0,1}); + z = x.applyReduce3(sd::reduce3::Dot, k, {0,1}); ASSERT_TRUE(z.equalsTo(&exp3)); x.permutei({0,2,1}); y.permutei({0,2,1}); - z = y.applyReduce3(nd4j::reduce3::Dot, x, {1}); + z = y.applyReduce3(sd::reduce3::Dot, x, {1}); ASSERT_TRUE(z.equalsTo(&exp2)); x2.permutei({1,0,2}); - z = x2.applyReduce3(nd4j::reduce3::Dot, k2, {0,1}); + z = x2.applyReduce3(sd::reduce3::Dot, k2, {0,1}); ASSERT_TRUE(z.equalsTo(&exp4)); } 
//////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, applyReduce3_2) { - NDArray x('c', {2,3,4}, {-10,-9,-8.5,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, nd4j::DataType::DOUBLE); - NDArray x2('c', {2,3,4}, {-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0.5,1,2,3,4,5,6,7,8,9,10,11,12,13}, nd4j::DataType::DOUBLE); - NDArray y('c', {2,3,4}, {-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2.5,3,-4,5,-2,3,-4,5,-2,3,-4,5}, nd4j::DataType::DOUBLE); - NDArray k('c', {2,3}, {-2,3,-4,5.5,-2,3}, nd4j::DataType::DOUBLE); - NDArray k2('c', {3,2}, {-2,3,-4,5,-2,3.5}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3,4}, {-10,-9,-8.5,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13}, sd::DataType::DOUBLE); + NDArray x2('c', {2,3,4}, {-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0.5,1,2,3,4,5,6,7,8,9,10,11,12,13}, sd::DataType::DOUBLE); + NDArray y('c', {2,3,4}, {-2,3,-4,5,-2,3,-4,5,-2,3,-4,5,-2.5,3,-4,5,-2,3,-4,5,-2,3,-4,5}, sd::DataType::DOUBLE); + NDArray k('c', {2,3}, {-2,3,-4,5.5,-2,3}, sd::DataType::DOUBLE); + NDArray k2('c', {3,2}, {-2,3,-4,5,-2,3.5}, sd::DataType::DOUBLE); - NDArray exp1('c', {3}, {5., 20., 36.}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,3}, {-8., -2., 6., 13., 22., 30.}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {4}, {39., 42.5, 47., 49.5}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {4}, {119., 122.5, 125., 129.5}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {3}, {5., 20., 36.}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,3}, {-8., -2., 6., 13., 22., 30.}, sd::DataType::DOUBLE); + NDArray exp3('c', {4}, {39., 42.5, 47., 49.5}, sd::DataType::DOUBLE); + NDArray exp4('c', {4}, {119., 122.5, 125., 129.5}, sd::DataType::DOUBLE); - NDArray z = x.applyReduce3(nd4j::reduce3::Dot, y, {0,2}); + NDArray z = x.applyReduce3(sd::reduce3::Dot, y, {0,2}); ASSERT_TRUE(z.equalsTo(&exp1)); - z = x.applyReduce3(nd4j::reduce3::Dot, k, {0,1}); + z = x.applyReduce3(sd::reduce3::Dot, k, {0,1}); ASSERT_TRUE(z.equalsTo(&exp3)); 
x.permutei({0,2,1}); y.permutei({0,2,1}); - z = y.applyReduce3(nd4j::reduce3::Dot, x, {1}); + z = y.applyReduce3(sd::reduce3::Dot, x, {1}); ASSERT_TRUE(z.equalsTo(&exp2)); x2.permutei({1,0,2}); - z = x2.applyReduce3(nd4j::reduce3::Dot, k2, {0,1}); + z = x2.applyReduce3(sd::reduce3::Dot, k2, {0,1}); ASSERT_TRUE(z.equalsTo(&exp4)); } //////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, applyReduce3_3) { - NDArray x1('c', {2,2,2}, {1,2,3,4,5,6,7,8}, nd4j::DataType::INT32); - NDArray x2('c', {2,2,2}, {-1,-2,-3,-4,-5,-6,-7,-8}, nd4j::DataType::INT32); - NDArray x3('c', {3,2}, {1.5,1.5,1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {3,2}, {1,2,3,4,5,6}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,2,2}, {1,2,3,4,5,6,7,8}, sd::DataType::INT32); + NDArray x2('c', {2,2,2}, {-1,-2,-3,-4,-5,-6,-7,-8}, sd::DataType::INT32); + NDArray x3('c', {3,2}, {1.5,1.5,1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray x4('c', {3,2}, {1,2,3,4,5,6}, sd::DataType::DOUBLE); - NDArray exp1('c', {}, std::vector{-204}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {}, std::vector{31.5}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {}, std::vector{-204}, sd::DataType::FLOAT32); + NDArray exp2('c', {}, std::vector{31.5}, sd::DataType::DOUBLE); auto z = x1.applyReduce3(reduce3::Dot, x2); @@ -1251,17 +1251,17 @@ TEST_F(NDArrayCudaBasicsTests, applyReduce3_3) { //////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, applyAllReduce3_1) { - NDArray x1('c', {2,3,2}, {1,2,3,4,5,6,7,8,-1,-2,-3,-4,}, nd4j::DataType::INT32); - NDArray x2('c', {2,2,2}, {-1,-2,-3,-4,-5,-6,-7,-8}, nd4j::DataType::INT32); - NDArray x3('c', {3,2}, {1.5,1.5,1.5,1.5,1.5,1.5}, nd4j::DataType::DOUBLE); - NDArray x4('c', {3,2}, {1,2,3,4,5,6}, nd4j::DataType::DOUBLE); + NDArray x1('c', {2,3,2}, {1,2,3,4,5,6,7,8,-1,-2,-3,-4,}, sd::DataType::INT32); + NDArray x2('c', {2,2,2}, {-1,-2,-3,-4,-5,-6,-7,-8}, 
sd::DataType::INT32); + NDArray x3('c', {3,2}, {1.5,1.5,1.5,1.5,1.5,1.5}, sd::DataType::DOUBLE); + NDArray x4('c', {3,2}, {1,2,3,4,5,6}, sd::DataType::DOUBLE); - NDArray exp1('c', {3,2}, {-88.f, -124.f, 6.f, -2.f, 22.f, 14.f}, nd4j::DataType::FLOAT32); + NDArray exp1('c', {3,2}, {-88.f, -124.f, 6.f, -2.f, 22.f, 14.f}, sd::DataType::FLOAT32); NDArray exp2('c', {6,4}, {-36.f, -44.f, -52.f, -60.f,-42.f, -52.f, -62.f, -72.f, 2.f, 0.f, -2.f, -4.f, 6.f, 4.f, 2.f, 0.f, 10.f, 8.f, 6.f, 4.f, 14.f, 12.f, 10.f, 8.f}, - nd4j::DataType::FLOAT32); - NDArray exp3('c', {1,1}, std::vector{31.5}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {3,3}, {4.5, 10.5, 16.5,4.5, 10.5, 16.5,4.5, 10.5, 16.5}, nd4j::DataType::DOUBLE); + sd::DataType::FLOAT32); + NDArray exp3('c', {1,1}, std::vector{31.5}, sd::DataType::DOUBLE); + NDArray exp4('c', {3,3}, {4.5, 10.5, 16.5,4.5, 10.5, 16.5,4.5, 10.5, 16.5}, sd::DataType::DOUBLE); auto z = x1.applyAllReduce3(reduce3::Dot, x2, {0,2}); ASSERT_TRUE(z.equalsTo(&exp1)); @@ -1290,38 +1290,38 @@ TEST_F(NDArrayCudaBasicsTests, applyAllReduce3_1) { ////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, applyIndexReduce_test1) { - NDArray x('c', {2,3}, {0, 10, 1, 2, 2.5,-4}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3}, {0, 10, 1, 2, 2.5,-4}, sd::DataType::DOUBLE); - NDArray scalar('c', {}, std::vector{100}, nd4j::DataType::INT64); - NDArray vec1('c', {2}, {100,100}, nd4j::DataType::INT64); - NDArray vec2('c', {3}, {100,100,100}, nd4j::DataType::INT64); + NDArray scalar('c', {}, std::vector{100}, sd::DataType::INT64); + NDArray vec1('c', {2}, {100,100}, sd::DataType::INT64); + NDArray vec2('c', {3}, {100,100,100}, sd::DataType::INT64); - NDArray exp1('c', {}, std::vector{1}, nd4j::DataType::INT64); - NDArray exp2('c', {2}, {1,1}, nd4j::DataType::INT64); - NDArray exp3('c', {3}, {1,0,0}, nd4j::DataType::INT64); + NDArray exp1('c', {}, std::vector{1}, sd::DataType::INT64); + NDArray exp2('c', {2}, {1,1}, 
sd::DataType::INT64); + NDArray exp3('c', {3}, {1,0,0}, sd::DataType::INT64); - NDArray exp4('c', {}, std::vector{2}, nd4j::DataType::INT64); - NDArray exp5('c', {2}, {1,1}, nd4j::DataType::INT64); - NDArray exp6('c', {3}, {1,0,0}, nd4j::DataType::INT64); + NDArray exp4('c', {}, std::vector{2}, sd::DataType::INT64); + NDArray exp5('c', {2}, {1,1}, sd::DataType::INT64); + NDArray exp6('c', {3}, {1,0,0}, sd::DataType::INT64); - x.applyIndexReduce(nd4j::indexreduce::IndexMax, scalar, {0,1}); + x.applyIndexReduce(sd::indexreduce::IndexMax, scalar, {0,1}); ASSERT_TRUE(scalar.equalsTo(&exp1)); - x.applyIndexReduce(nd4j::indexreduce::IndexMax, vec1, {1}); + x.applyIndexReduce(sd::indexreduce::IndexMax, vec1, {1}); ASSERT_TRUE(vec1.equalsTo(&exp2)); - x.applyIndexReduce(nd4j::indexreduce::IndexMax, vec2, {0}); + x.applyIndexReduce(sd::indexreduce::IndexMax, vec2, {0}); ASSERT_TRUE(vec2.equalsTo(&exp3)); x.permutei({1,0}); - x.applyIndexReduce(nd4j::indexreduce::IndexMax, scalar, {0,1}); + x.applyIndexReduce(sd::indexreduce::IndexMax, scalar, {0,1}); ASSERT_TRUE(scalar.equalsTo(&exp4)); - x.applyIndexReduce(nd4j::indexreduce::IndexMax, vec1, {0}); + x.applyIndexReduce(sd::indexreduce::IndexMax, vec1, {0}); ASSERT_TRUE(vec1.equalsTo(&exp5)); - x.applyIndexReduce(nd4j::indexreduce::IndexMax, vec2, {1}); + x.applyIndexReduce(sd::indexreduce::IndexMax, vec2, {1}); ASSERT_TRUE(vec2.equalsTo(&exp6)); } @@ -1329,104 +1329,104 @@ TEST_F(NDArrayCudaBasicsTests, applyIndexReduce_test1) { ////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, applyIndexReduce_test2) { - NDArray x('c', {2,3}, {0, 10, 1, 2, 2.5,-4}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3}, {0, 10, 1, 2, 2.5,-4}, sd::DataType::DOUBLE); - NDArray exp1('c', {}, std::vector{1}, nd4j::DataType::INT64); - NDArray exp2('c', {2}, {1,1}, nd4j::DataType::INT64); - NDArray exp3('c', {3}, {1,0,0}, nd4j::DataType::INT64); + NDArray exp1('c', {}, std::vector{1}, 
sd::DataType::INT64); + NDArray exp2('c', {2}, {1,1}, sd::DataType::INT64); + NDArray exp3('c', {3}, {1,0,0}, sd::DataType::INT64); - NDArray exp4('c', {}, std::vector{2}, nd4j::DataType::INT64); - NDArray exp5('c', {2}, {1,1}, nd4j::DataType::INT64); - NDArray exp6('c', {3}, {1,0,0}, nd4j::DataType::INT64); + NDArray exp4('c', {}, std::vector{2}, sd::DataType::INT64); + NDArray exp5('c', {2}, {1,1}, sd::DataType::INT64); + NDArray exp6('c', {3}, {1,0,0}, sd::DataType::INT64); - auto z = x.applyIndexReduce(nd4j::indexreduce::IndexMax, {0,1}); + auto z = x.applyIndexReduce(sd::indexreduce::IndexMax, {0,1}); ASSERT_TRUE(z.equalsTo(&exp1)); - z = x.applyIndexReduce(nd4j::indexreduce::IndexMax, {1}); + z = x.applyIndexReduce(sd::indexreduce::IndexMax, {1}); ASSERT_TRUE(z.equalsTo(&exp2)); - z = x.applyIndexReduce(nd4j::indexreduce::IndexMax, {0}); + z = x.applyIndexReduce(sd::indexreduce::IndexMax, {0}); ASSERT_TRUE(z.equalsTo(&exp3)); x.permutei({1,0}); - z = x.applyIndexReduce(nd4j::indexreduce::IndexMax, {0,1}); + z = x.applyIndexReduce(sd::indexreduce::IndexMax, {0,1}); ASSERT_TRUE(z.equalsTo(&exp4)); - z = x.applyIndexReduce(nd4j::indexreduce::IndexMax, {0}); + z = x.applyIndexReduce(sd::indexreduce::IndexMax, {0}); ASSERT_TRUE(z.equalsTo(&exp5)); - z = x.applyIndexReduce(nd4j::indexreduce::IndexMax, {1}); + z = x.applyIndexReduce(sd::indexreduce::IndexMax, {1}); ASSERT_TRUE(z.equalsTo(&exp6)); } //////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_float_test1) { - NDArray x('c', {2,3,2}, {1,2,3,4,5,6,7,8,-1,-2,-3,-4,}, nd4j::DataType::INT32); + NDArray x('c', {2,3,2}, {1,2,3,4,5,6,7,8,-1,-2,-3,-4,}, sd::DataType::INT32); - NDArray z1('c', {}, std::vector{100}, nd4j::DataType::DOUBLE); - NDArray z2('c', {2,2}, {100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray z3('c', {3}, {100,100,100}, nd4j::DataType::DOUBLE); - NDArray z4('c', {3,2}, {100,100,100,100,100,100}, 
nd4j::DataType::FLOAT32); - NDArray z5('c', {2}, {100,100}, nd4j::DataType::FLOAT32); + NDArray z1('c', {}, std::vector{100}, sd::DataType::DOUBLE); + NDArray z2('c', {2,2}, {100,100,100,100}, sd::DataType::FLOAT32); + NDArray z3('c', {3}, {100,100,100}, sd::DataType::DOUBLE); + NDArray z4('c', {3,2}, {100,100,100,100,100,100}, sd::DataType::FLOAT32); + NDArray z5('c', {2}, {100,100}, sd::DataType::FLOAT32); - NDArray exp1('c', {}, std::vector{2.166667}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {3.f,4.f,1.f,0.666667f}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {3}, {4.5,1,1}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {3,2}, {4,5,1,1,1,1}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2}, {3.5f,0.833333f}, nd4j::DataType::FLOAT32); + NDArray exp1('c', {}, std::vector{2.166667}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {3.f,4.f,1.f,0.666667f}, sd::DataType::FLOAT32); + NDArray exp3('c', {3}, {4.5,1,1}, sd::DataType::DOUBLE); + NDArray exp4('c', {3,2}, {4,5,1,1,1,1}, sd::DataType::FLOAT32); + NDArray exp5('c', {2}, {3.5f,0.833333f}, sd::DataType::FLOAT32); - x.reduceAlongDimension(nd4j::reduce::Mean, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::Mean, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::Mean, z2, {1}); + x.reduceAlongDimension(sd::reduce::Mean, z2, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - x.reduceAlongDimension(nd4j::reduce::Mean, z3, {0,2}); + x.reduceAlongDimension(sd::reduce::Mean, z3, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - x.reduceAlongDimension(nd4j::reduce::Mean, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::Mean, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::Mean, z4, {1}); + x.reduceAlongDimension(sd::reduce::Mean, z4, {1}); ASSERT_TRUE(z4.equalsTo(&exp4)); - x.reduceAlongDimension(nd4j::reduce::Mean, z5, {0,2}); + x.reduceAlongDimension(sd::reduce::Mean, z5, {0,2}); ASSERT_TRUE(z5.equalsTo(&exp5)); } 
//////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_float_test2) { - NDArray x('c', {2,3,2}, {1,2,3,4,5,6,7,8,-1,-2,-3,-4,}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3,2}, {1,2,3,4,5,6,7,8,-1,-2,-3,-4,}, sd::DataType::DOUBLE); - NDArray exp1('c', {}, std::vector{2.166667}, nd4j::DataType::DOUBLE); - NDArray exp2('c', {2,2}, {3,4,1,0.666667}, nd4j::DataType::DOUBLE); - NDArray exp3('c', {3}, {4.5,1,1}, nd4j::DataType::DOUBLE); - NDArray exp4('c', {3,2}, {4,5,1,1,1,1}, nd4j::DataType::DOUBLE); - NDArray exp5('c', {2}, {3.5,0.833333}, nd4j::DataType::DOUBLE); + NDArray exp1('c', {}, std::vector{2.166667}, sd::DataType::DOUBLE); + NDArray exp2('c', {2,2}, {3,4,1,0.666667}, sd::DataType::DOUBLE); + NDArray exp3('c', {3}, {4.5,1,1}, sd::DataType::DOUBLE); + NDArray exp4('c', {3,2}, {4,5,1,1,1,1}, sd::DataType::DOUBLE); + NDArray exp5('c', {2}, {3.5,0.833333}, sd::DataType::DOUBLE); - NDArray z1 = x.reduceAlongDimension(nd4j::reduce::Mean, {0,1,2}); + NDArray z1 = x.reduceAlongDimension(sd::reduce::Mean, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - NDArray z2 = x.reduceAlongDimension(nd4j::reduce::Mean, {1}); + NDArray z2 = x.reduceAlongDimension(sd::reduce::Mean, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - NDArray z3 = x.reduceAlongDimension(nd4j::reduce::Mean, {0,2}); + NDArray z3 = x.reduceAlongDimension(sd::reduce::Mean, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - NDArray z4 = x.reduceAlongDimension(nd4j::reduce::Mean, {0,1,2}); + NDArray z4 = x.reduceAlongDimension(sd::reduce::Mean, {0,1,2}); ASSERT_TRUE(z4.equalsTo(&exp1)); - NDArray z5 = x.reduceAlongDimension(nd4j::reduce::Mean, {1}); + NDArray z5 = x.reduceAlongDimension(sd::reduce::Mean, {1}); ASSERT_TRUE(z5.equalsTo(&exp4)); - NDArray z6 = x.reduceAlongDimension(nd4j::reduce::Mean, {0,2}); + NDArray z6 = x.reduceAlongDimension(sd::reduce::Mean, {0,2}); ASSERT_TRUE(z6.equalsTo(&exp5)); } @@ -1475,210 
+1475,210 @@ TEST_F(NDArrayCudaBasicsTests, EqualityTest1) { //////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_same_test1) { - NDArray x('c', {2,3,2}, {1.5f,2.f,3.f,4.f,5.f,6.f,7.5f,8.f,-1.f,-2.f,-3.5f,-4.f}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,2}, {1.5f,2.f,3.f,4.f,5.f,6.f,7.5f,8.f,-1.f,-2.f,-3.5f,-4.f}, sd::DataType::FLOAT32); - NDArray z1('c', {}, std::vector{100}, nd4j::DataType::FLOAT32); - NDArray z2('c', {2,2}, {100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray z3('c', {3}, {100,100,100}, nd4j::DataType::FLOAT32); - NDArray z4('c', {3,2}, {100,100,100,100,100,100}, nd4j::DataType::FLOAT32); - NDArray z5('c', {2}, {100,100}, nd4j::DataType::FLOAT32); + NDArray z1('c', {}, std::vector{100}, sd::DataType::FLOAT32); + NDArray z2('c', {2,2}, {100,100,100,100}, sd::DataType::FLOAT32); + NDArray z3('c', {3}, {100,100,100}, sd::DataType::FLOAT32); + NDArray z4('c', {3,2}, {100,100,100,100,100,100}, sd::DataType::FLOAT32); + NDArray z5('c', {2}, {100,100}, sd::DataType::FLOAT32); - NDArray exp1('c', {}, std::vector{26.5f}, nd4j::DataType::FLOAT32); - NDArray exp2('c', {2,2}, {9.5f,12.f,3.f,2.f}, nd4j::DataType::FLOAT32); - NDArray exp3('c', {3}, {19.f,4.f,3.5f}, nd4j::DataType::FLOAT32); - NDArray exp4('c', {3,2}, {9.f,10.f,2.f,2.f,1.5f,2.f}, nd4j::DataType::FLOAT32); - NDArray exp5('c', {2}, {21.5f,5.f}, nd4j::DataType::FLOAT32); + NDArray exp1('c', {}, std::vector{26.5f}, sd::DataType::FLOAT32); + NDArray exp2('c', {2,2}, {9.5f,12.f,3.f,2.f}, sd::DataType::FLOAT32); + NDArray exp3('c', {3}, {19.f,4.f,3.5f}, sd::DataType::FLOAT32); + NDArray exp4('c', {3,2}, {9.f,10.f,2.f,2.f,1.5f,2.f}, sd::DataType::FLOAT32); + NDArray exp5('c', {2}, {21.5f,5.f}, sd::DataType::FLOAT32); - x.reduceAlongDimension(nd4j::reduce::Sum, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::Sum, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::Sum, z2, {1}); 
+ x.reduceAlongDimension(sd::reduce::Sum, z2, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - x.reduceAlongDimension(nd4j::reduce::Sum, z3, {0,2}); + x.reduceAlongDimension(sd::reduce::Sum, z3, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - x.reduceAlongDimension(nd4j::reduce::Sum, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::Sum, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::Sum, z4, {1}); + x.reduceAlongDimension(sd::reduce::Sum, z4, {1}); ASSERT_TRUE(z4.equalsTo(&exp4)); - x.reduceAlongDimension(nd4j::reduce::Sum, z5, {0,2}); + x.reduceAlongDimension(sd::reduce::Sum, z5, {0,2}); ASSERT_TRUE(z5.equalsTo(&exp5)); } //////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_same_test2) { - NDArray x('c', {2,3,2}, {1.5,2,3,4,5,6,7.5,8,-1,-2,-3.5,-4,}, nd4j::DataType::INT64); + NDArray x('c', {2,3,2}, {1.5,2,3,4,5,6,7.5,8,-1,-2,-3.5,-4,}, sd::DataType::INT64); - NDArray exp1('c', {}, std::vector{26}, nd4j::DataType::INT64); - NDArray exp2('c', {2,2}, {9,12,3,2}, nd4j::DataType::INT64); - NDArray exp3('c', {3}, {18,4,4}, nd4j::DataType::INT64); - NDArray exp4('c', {3,2}, {8,10,2,2,2,2}, nd4j::DataType::INT64); - NDArray exp5('c', {2}, {21,5}, nd4j::DataType::INT64); + NDArray exp1('c', {}, std::vector{26}, sd::DataType::INT64); + NDArray exp2('c', {2,2}, {9,12,3,2}, sd::DataType::INT64); + NDArray exp3('c', {3}, {18,4,4}, sd::DataType::INT64); + NDArray exp4('c', {3,2}, {8,10,2,2,2,2}, sd::DataType::INT64); + NDArray exp5('c', {2}, {21,5}, sd::DataType::INT64); - NDArray z1 = x.reduceAlongDimension(nd4j::reduce::Sum, {0,1,2}); + NDArray z1 = x.reduceAlongDimension(sd::reduce::Sum, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - NDArray z2 = x.reduceAlongDimension(nd4j::reduce::Sum, {1}); + NDArray z2 = x.reduceAlongDimension(sd::reduce::Sum, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - NDArray z3 = 
x.reduceAlongDimension(nd4j::reduce::Sum, {0,2}); + NDArray z3 = x.reduceAlongDimension(sd::reduce::Sum, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - NDArray z4 = x.reduceAlongDimension(nd4j::reduce::Sum, {0,1,2}); + NDArray z4 = x.reduceAlongDimension(sd::reduce::Sum, {0,1,2}); ASSERT_TRUE(z4.equalsTo(&exp1)); - NDArray z5 = x.reduceAlongDimension(nd4j::reduce::Sum, {1}); + NDArray z5 = x.reduceAlongDimension(sd::reduce::Sum, {1}); ASSERT_TRUE(z5.equalsTo(&exp4)); - NDArray z6 = x.reduceAlongDimension(nd4j::reduce::Sum, {0,2}); + NDArray z6 = x.reduceAlongDimension(sd::reduce::Sum, {0,2}); ASSERT_TRUE(z6.equalsTo(&exp5)); } //////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_bool_test1) { - NDArray x('c', {2,3,2}, {0.5,2,3,-4,5,6,-7.5,8,-1,-0.5,-3.5,4}, nd4j::DataType::DOUBLE); + NDArray x('c', {2,3,2}, {0.5,2,3,-4,5,6,-7.5,8,-1,-0.5,-3.5,4}, sd::DataType::DOUBLE); - NDArray z1('c', {}, std::vector{true}, nd4j::DataType::BOOL); - NDArray z2('c', {2,2}, {true,true,true,true}, nd4j::DataType::BOOL); - NDArray z3('c', {3}, {true,true,true}, nd4j::DataType::BOOL); - NDArray z4('c', {3,2}, {true,true,true,true,true,true}, nd4j::DataType::BOOL); - NDArray z5('c', {2}, {true,true}, nd4j::DataType::BOOL); + NDArray z1('c', {}, std::vector{true}, sd::DataType::BOOL); + NDArray z2('c', {2,2}, {true,true,true,true}, sd::DataType::BOOL); + NDArray z3('c', {3}, {true,true,true}, sd::DataType::BOOL); + NDArray z4('c', {3,2}, {true,true,true,true,true,true}, sd::DataType::BOOL); + NDArray z5('c', {2}, {true,true}, sd::DataType::BOOL); - NDArray exp1('c', {}, std::vector{true}, nd4j::DataType::BOOL); - NDArray exp2('c', {2,2}, {true,true,false,true}, nd4j::DataType::BOOL); - NDArray exp3('c', {3}, {true,true,true}, nd4j::DataType::BOOL); - NDArray exp4('c', {3,2}, {true,true,true,false,true,true}, nd4j::DataType::BOOL); - NDArray exp5('c', {2}, {true,true}, 
nd4j::DataType::BOOL); + NDArray exp1('c', {}, std::vector{true}, sd::DataType::BOOL); + NDArray exp2('c', {2,2}, {true,true,false,true}, sd::DataType::BOOL); + NDArray exp3('c', {3}, {true,true,true}, sd::DataType::BOOL); + NDArray exp4('c', {3,2}, {true,true,true,false,true,true}, sd::DataType::BOOL); + NDArray exp5('c', {2}, {true,true}, sd::DataType::BOOL); - x.reduceAlongDimension(nd4j::reduce::IsPositive, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::IsPositive, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::IsPositive, z2, {1}); + x.reduceAlongDimension(sd::reduce::IsPositive, z2, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - x.reduceAlongDimension(nd4j::reduce::IsPositive, z3, {0,2}); + x.reduceAlongDimension(sd::reduce::IsPositive, z3, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - x.reduceAlongDimension(nd4j::reduce::IsPositive, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::IsPositive, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::IsPositive, z4, {1}); + x.reduceAlongDimension(sd::reduce::IsPositive, z4, {1}); ASSERT_TRUE(z4.equalsTo(&exp4)); - x.reduceAlongDimension(nd4j::reduce::IsPositive, z5, {0,2}); + x.reduceAlongDimension(sd::reduce::IsPositive, z5, {0,2}); ASSERT_TRUE(z5.equalsTo(&exp5)); } //////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_bool_test2) { - NDArray x('c', {2,3,2}, {0.5,2,3,-4,5,6,-7.5,8,-1,-0.5,-3.5,4}, nd4j::DataType::INT32); + NDArray x('c', {2,3,2}, {0.5,2,3,-4,5,6,-7.5,8,-1,-0.5,-3.5,4}, sd::DataType::INT32); - NDArray exp1('c', {}, std::vector{1}, nd4j::DataType::BOOL); - NDArray exp2('c', {2,2}, {1,1,0,1}, nd4j::DataType::BOOL); - NDArray exp3('c', {3}, {1,1,1}, nd4j::DataType::BOOL); - NDArray exp4('c', {3,2}, {0,1,1,0,1,1}, nd4j::DataType::BOOL); - NDArray exp5('c', {2}, {1,1}, nd4j::DataType::BOOL); + NDArray exp1('c', {}, 
std::vector{1}, sd::DataType::BOOL); + NDArray exp2('c', {2,2}, {1,1,0,1}, sd::DataType::BOOL); + NDArray exp3('c', {3}, {1,1,1}, sd::DataType::BOOL); + NDArray exp4('c', {3,2}, {0,1,1,0,1,1}, sd::DataType::BOOL); + NDArray exp5('c', {2}, {1,1}, sd::DataType::BOOL); - NDArray z1 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {0,1,2}); + NDArray z1 = x.reduceAlongDimension(sd::reduce::IsPositive, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - NDArray z2 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {1}); + NDArray z2 = x.reduceAlongDimension(sd::reduce::IsPositive, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - NDArray z3 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {0,2}); + NDArray z3 = x.reduceAlongDimension(sd::reduce::IsPositive, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - NDArray z4 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {0,1,2}); + NDArray z4 = x.reduceAlongDimension(sd::reduce::IsPositive, {0,1,2}); ASSERT_TRUE(z4.equalsTo(&exp1)); - NDArray z5 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {1}); + NDArray z5 = x.reduceAlongDimension(sd::reduce::IsPositive, {1}); ASSERT_TRUE(z5.equalsTo(&exp4)); - NDArray z6 = x.reduceAlongDimension(nd4j::reduce::IsPositive, {0,2}); + NDArray z6 = x.reduceAlongDimension(sd::reduce::IsPositive, {0,2}); ASSERT_TRUE(z6.equalsTo(&exp5)); } //////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_long_test1) { - NDArray x('c', {2,3,2}, {0.5f,2.f,3.f,-0.f,5.f,6.f,-7.5f,0.f,-1.f,-0.5f,-3.5f,4.f}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,2}, {0.5f,2.f,3.f,-0.f,5.f,6.f,-7.5f,0.f,-1.f,-0.5f,-3.5f,4.f}, sd::DataType::FLOAT32); - NDArray z1('c', {}, std::vector{100}, nd4j::DataType::INT64); - NDArray z2('c', {2,2}, {100,100,100,100}, nd4j::DataType::INT64); - NDArray z3('c', {3}, {100,100,100}, nd4j::DataType::INT64); - NDArray z4('c', {3,2}, {100,100,100,100,100,100}, nd4j::DataType::INT64); - NDArray 
z5('c', {2}, {100,100}, nd4j::DataType::INT64); + NDArray z1('c', {}, std::vector{100}, sd::DataType::INT64); + NDArray z2('c', {2,2}, {100,100,100,100}, sd::DataType::INT64); + NDArray z3('c', {3}, {100,100,100}, sd::DataType::INT64); + NDArray z4('c', {3,2}, {100,100,100,100,100,100}, sd::DataType::INT64); + NDArray z5('c', {2}, {100,100}, sd::DataType::INT64); - NDArray exp1('c', {}, std::vector{2}, nd4j::DataType::INT64); - NDArray exp2('c', {2,2}, {0,1,0,1}, nd4j::DataType::INT64); - NDArray exp3('c', {3}, {1,1,0}, nd4j::DataType::INT64); - NDArray exp4('c', {3,2}, {0,1,0,1,0,0}, nd4j::DataType::INT64); - NDArray exp5('c', {2}, {1,1}, nd4j::DataType::INT64); + NDArray exp1('c', {}, std::vector{2}, sd::DataType::INT64); + NDArray exp2('c', {2,2}, {0,1,0,1}, sd::DataType::INT64); + NDArray exp3('c', {3}, {1,1,0}, sd::DataType::INT64); + NDArray exp4('c', {3,2}, {0,1,0,1,0,0}, sd::DataType::INT64); + NDArray exp5('c', {2}, {1,1}, sd::DataType::INT64); - x.reduceAlongDimension(nd4j::reduce::CountZero, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::CountZero, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::CountZero, z2, {1}); + x.reduceAlongDimension(sd::reduce::CountZero, z2, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - x.reduceAlongDimension(nd4j::reduce::CountZero, z3, {0,2}); + x.reduceAlongDimension(sd::reduce::CountZero, z3, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - x.reduceAlongDimension(nd4j::reduce::CountZero, z1, {0,1,2}); + x.reduceAlongDimension(sd::reduce::CountZero, z1, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - x.reduceAlongDimension(nd4j::reduce::CountZero, z4, {1}); + x.reduceAlongDimension(sd::reduce::CountZero, z4, {1}); ASSERT_TRUE(z4.equalsTo(&exp4)); - x.reduceAlongDimension(nd4j::reduce::CountZero, z5, {0,2}); + x.reduceAlongDimension(sd::reduce::CountZero, z5, {0,2}); ASSERT_TRUE(z5.equalsTo(&exp5)); } 
//////////////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, reduceAlongDimension_long_test2) { - NDArray x('c', {2,3,2}, {0.5,2,3,-0,5,6,-7.5,0,-1,-0.5,-3.5,4}, nd4j::DataType::INT32); + NDArray x('c', {2,3,2}, {0.5,2,3,-0,5,6,-7.5,0,-1,-0.5,-3.5,4}, sd::DataType::INT32); - NDArray exp1('c', {}, std::vector{4}, nd4j::DataType::INT64); - NDArray exp2('c', {2,2}, {1,1,0,2}, nd4j::DataType::INT64); - NDArray exp3('c', {3}, {2,2,0}, nd4j::DataType::INT64); - NDArray exp4('c', {3,2}, {1,1,0,2,0,0}, nd4j::DataType::INT64); - NDArray exp5('c', {2}, {2,2}, nd4j::DataType::INT64); + NDArray exp1('c', {}, std::vector{4}, sd::DataType::INT64); + NDArray exp2('c', {2,2}, {1,1,0,2}, sd::DataType::INT64); + NDArray exp3('c', {3}, {2,2,0}, sd::DataType::INT64); + NDArray exp4('c', {3,2}, {1,1,0,2,0,0}, sd::DataType::INT64); + NDArray exp5('c', {2}, {2,2}, sd::DataType::INT64); - NDArray z1 = x.reduceAlongDimension(nd4j::reduce::CountZero, {0,1,2}); + NDArray z1 = x.reduceAlongDimension(sd::reduce::CountZero, {0,1,2}); ASSERT_TRUE(z1.equalsTo(&exp1)); - NDArray z2 = x.reduceAlongDimension(nd4j::reduce::CountZero, {1}); + NDArray z2 = x.reduceAlongDimension(sd::reduce::CountZero, {1}); ASSERT_TRUE(z2.equalsTo(&exp2)); - NDArray z3 = x.reduceAlongDimension(nd4j::reduce::CountZero, {0,2}); + NDArray z3 = x.reduceAlongDimension(sd::reduce::CountZero, {0,2}); ASSERT_TRUE(z3.equalsTo(&exp3)); x.permutei({1,0,2}); // 3x2x2 - NDArray z4 = x.reduceAlongDimension(nd4j::reduce::CountZero, {0,1,2}); + NDArray z4 = x.reduceAlongDimension(sd::reduce::CountZero, {0,1,2}); ASSERT_TRUE(z4.equalsTo(&exp1)); - NDArray z5 = x.reduceAlongDimension(nd4j::reduce::CountZero, {1}); + NDArray z5 = x.reduceAlongDimension(sd::reduce::CountZero, {1}); ASSERT_TRUE(z5.equalsTo(&exp4)); - NDArray z6 = x.reduceAlongDimension(nd4j::reduce::CountZero, {0,2}); + NDArray z6 = x.reduceAlongDimension(sd::reduce::CountZero, {0,2}); ASSERT_TRUE(z6.equalsTo(&exp5)); } @@ 
-1687,8 +1687,8 @@ TEST_F(NDArrayCudaBasicsTests, BroadcastOpsTest1) { auto x = NDArrayFactory::create('c', {5, 5}); auto z = NDArrayFactory::create('c', {5, 5}); auto row = NDArrayFactory::linspace(1.0f, 5.0f, 5); - NDArray expRow('c', {1, 5,}, {1,2,3,4,5}, nd4j::DataType::FLOAT32); - NDArray exp('c', {5,5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, nd4j::DataType::FLOAT32); + NDArray expRow('c', {1, 5,}, {1,2,3,4,5}, sd::DataType::FLOAT32); + NDArray exp('c', {5,5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, sd::DataType::FLOAT32); ASSERT_TRUE(row->equalsTo(&expRow)); @@ -1706,8 +1706,8 @@ TEST_F(NDArrayCudaBasicsTests, BroadcastOpsTest2) { auto x = NDArrayFactory::create('c', {5, 5}); //auto z = NDArrayFactory::create('c', {5, 5}); auto row = NDArrayFactory::linspace(1.0f, 5.0f, 5); - NDArray expRow('c', {1, 5,}, {1,2,3,4,5}, nd4j::DataType::FLOAT32); - NDArray exp('c', {5,5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, nd4j::DataType::FLOAT32); + NDArray expRow('c', {1, 5,}, {1,2,3,4,5}, sd::DataType::FLOAT32); + NDArray exp('c', {5,5}, {1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}, sd::DataType::FLOAT32); ASSERT_TRUE(row->equalsTo(&expRow)); x.applyBroadcast(broadcast::Add, {1}, *row, x); @@ -1717,7 +1717,7 @@ TEST_F(NDArrayCudaBasicsTests, BroadcastOpsTest2) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, TestBroadcast_1) { - NDArray exp('c', {2, 3, 2, 2}, {1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3., 1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.}, nd4j::DataType::DOUBLE); + NDArray exp('c', {2, 3, 2, 2}, {1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3., 1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.}, sd::DataType::DOUBLE); auto input = NDArrayFactory::create('c',{ 2, 3, 2, 2}); auto bias = NDArrayFactory::create('c', {1, 3}); @@ -1884,7 +1884,7 @@ 
TEST_F(NDArrayCudaBasicsTests, Tile_Test_2_3) TEST_F(NDArrayCudaBasicsTests, Operator_Plus_Test_2) { double expBuff[] = {2., 3, 3., 4., 4., 5, 5., 6., 6., 7, 7., 8.}; - NDArray a('c', {4,4}, {1,2,3,4,5,6,7,8,9,2,3,2,1,0,4,7}, nd4j::DataType::FLOAT32); + NDArray a('c', {4,4}, {1,2,3,4,5,6,7,8,9,2,3,2,1,0,4,7}, sd::DataType::FLOAT32); auto x = NDArrayFactory::create('c', {3, 2, 1}); auto y = NDArrayFactory::create('c', {1, 2}); auto expected = NDArrayFactory::create(expBuff, 'c', {3, 2, 2}); @@ -1900,9 +1900,9 @@ TEST_F(NDArrayCudaBasicsTests, Operator_Plus_Test_2) ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, assign_2) { - NDArray x('c', {4}, {1.5f,2.5f,3.5f,4.5f}, nd4j::DataType::FLOAT32); - NDArray y('c', {4}, nd4j::DataType::INT32); - NDArray expected('c', {4}, {1,2,3,4}, nd4j::DataType::INT32); + NDArray x('c', {4}, {1.5f,2.5f,3.5f,4.5f}, sd::DataType::FLOAT32); + NDArray y('c', {4}, sd::DataType::INT32); + NDArray expected('c', {4}, {1,2,3,4}, sd::DataType::INT32); y.assign(x); // y.printBuffer("ASSIGN VECTOR"); @@ -1913,8 +1913,8 @@ TEST_F(NDArrayCudaBasicsTests, assign_2) ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayCudaBasicsTests, subarray_1) { - NDArray x('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, nd4j::DataType::FLOAT32); - NDArray y('f', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, sd::DataType::FLOAT32); + NDArray y('f', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, sd::DataType::FLOAT32); Nd4jLong shapeExpX0[] = {1, 2, 12, 8192, 1, 99}; float buffExpX0[] = {1.f, 13.f}; @@ -2074,11 +2074,11 @@ TEST_F(NDArrayCudaBasicsTests, Test_diagonal_1) { ASSERT_NEAR(diag.e(e), exp.e(e), 1.e-5); } double eps(1.e-5); - NDArray tmp(nd4j::DataType::FLOAT32, 
x.getContext()); // scalar = 0 + NDArray tmp(sd::DataType::FLOAT32, x.getContext()); // scalar = 0 ExtraArguments extras({eps}); NativeOpExecutioner::execReduce3Scalar(diag.getContext(), reduce3::EqualsWithEps, diag.getBuffer(), - diag.getShapeInfo(), diag.getSpecialBuffer(), diag.getSpecialShapeInfo(), extras.argumentsAsT(nd4j::DataType::FLOAT32), + diag.getShapeInfo(), diag.getSpecialBuffer(), diag.getSpecialShapeInfo(), extras.argumentsAsT(sd::DataType::FLOAT32), exp.getBuffer(), exp.getShapeInfo(), exp.getSpecialBuffer(), exp.getSpecialShapeInfo(), tmp.buffer(), tmp.shapeInfo(), tmp.specialBuffer(), tmp.specialShapeInfo()); cudaStream_t* stream = x.getContext()->getCudaStream(); @@ -2185,13 +2185,13 @@ TEST_F(NDArrayCudaBasicsTests, Test_Empty_2) { } TEST_F(NDArrayCudaBasicsTests, Test_Empty_3) { - auto x = NDArrayFactory::empty(nd4j::DataType::FLOAT32); + auto x = NDArrayFactory::empty(sd::DataType::FLOAT32); ASSERT_TRUE(x.isEmpty()); } TEST_F(NDArrayCudaBasicsTests, Test_Empty_4) { - auto x = NDArrayFactory::empty_(nd4j::DataType::FLOAT32); + auto x = NDArrayFactory::empty_(sd::DataType::FLOAT32); ASSERT_TRUE(x->isEmpty()); delete x; diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayListTests.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayListTests.cpp index e57c7e625..2de3e4651 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayListTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayListTests.cpp @@ -18,11 +18,11 @@ // @author raver119@gmail.com // -#include -#include +#include +#include #include "testlayers.h" -using namespace nd4j; +using namespace sd; class NDArrayListTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp index fb55b4484..85888dac1 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp @@ -20,10 +20,10 @@ #include "testlayers.h" #include -#include -#include +#include +#include -using 
namespace nd4j; +using namespace sd; ////////////////////////////////////////////////////////////////////// class NDArrayTest : public testing::Test { @@ -231,7 +231,7 @@ TEST_F(NDArrayTest, TestTad3) { TEST_F(NDArrayTest, TestPermuteReshape1) { - NDArray array('c', {2, 2, 5, 5}, nd4j::DataType::FLOAT32); + NDArray array('c', {2, 2, 5, 5}, sd::DataType::FLOAT32); int pShape[] = {4, 2, 5, 5, 2, 25, 5, 1, 50, 8192, 0, 99}; int rShape[] = {3, 2, 25, 2, 25, 1, 50, 8192, 0, 99}; @@ -275,7 +275,7 @@ TEST_F(NDArrayTest, TestRepeat1) { auto eBuffer = new float[8] {1.0,2.0,1.0,2.0,3.0,4.0,3.0,4.0}; auto eShape = new Nd4jLong[8]{2, 4, 2, 2, 1, 8192, 1, 99}; - NDArray array('c', {2, 2}, nd4j::DataType::FLOAT32); + NDArray array('c', {2, 2}, sd::DataType::FLOAT32); auto exp = new NDArray(eBuffer, eShape); for (int e = 0; e < array.lengthOf(); e++) array.p(e, e + 1); @@ -461,7 +461,7 @@ TEST_F(NDArrayTest, TestTranspose2) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayTest, TestSumAlongDimension1) { - NDArray array('c', {2,2}, {1,2,3,4}, nd4j::DataType::FLOAT32); + NDArray array('c', {2,2}, {1,2,3,4}, sd::DataType::FLOAT32); auto res = array.reduceAlongDimension(reduce::Sum, {0}); @@ -937,19 +937,19 @@ TEST_F(NDArrayTest, TestPermuteReshapeMmul4) { TEST_F(NDArrayTest, TestMmulHelper2) { auto xBuffer = new float[15]{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}; Nd4jLong xShape[8] = {2, 5, 3, 3, 1, 8192, 1, 99}; - auto x = new NDArray(xBuffer, xShape, nd4j::LaunchContext ::defaultContext(), true); + auto x = new NDArray(xBuffer, xShape, sd::LaunchContext ::defaultContext(), true); auto yBuffer = new float[3]{2.f, 4.f, 6.f}; Nd4jLong yShape[8] = {2, 3, 1, 1, 1, 8192, 1, 99}; - auto y = new NDArray(yBuffer, yShape, nd4j::LaunchContext ::defaultContext(), true); + auto y = new NDArray(yBuffer, yShape, sd::LaunchContext ::defaultContext(), true); auto z = NDArrayFactory::create_('f', {5, 1}); auto expBuffer = 
new float[5]{28.00f, 64.00f, 100.00f, 136.00f, 172.00f}; - auto exp = new NDArray(expBuffer, z->getShapeInfo(), nd4j::LaunchContext ::defaultContext(), true); + auto exp = new NDArray(expBuffer, z->getShapeInfo(), sd::LaunchContext ::defaultContext(), true); - //nd4j::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); + //sd::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); MmulHelper::mmul(x, y, z); @@ -978,7 +978,7 @@ TEST_F(NDArrayTest, TestMmulHelper3) { auto expBuffer = new float[5]{92.00f, 104.00f, 116.00f, 128.00f, 140.00f}; auto exp = new NDArray(expBuffer, z->getShapeInfo()); - //nd4j::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); + //sd::blas::GEMV::op('f', x->rows(), x->columns(), 1.0f, x->getBuffer(), y->rows(), y->getBuffer(), 1, 0.0, z->getBuffer(), 1); MmulHelper::mmul(x, y, z); @@ -1569,7 +1569,7 @@ TEST_F(NDArrayTest, TestStdDev4) { float y = 0; double M2 = 0; for (int e = 0; e < total; ++e) { - // y += nd4j::math::nd4j_abs(array(e) - x); + // y += sd::math::nd4j_abs(array(e) - x); M2 += (array.e(e) - x) * (array.e(e) - x); } //y /= total; @@ -1580,7 +1580,7 @@ TEST_F(NDArrayTest, TestStdDev4) { auto std = a.e(0); // float bY = array.varianceNumber(); float bY = 0.3333333f; - // nd4j_printf("Variance is %f, res is %f, internal is %f\n, deviance is %f(%f)\n", std, x, bY, y, nd4j::math::nd4j_sqrt(M2)); + // nd4j_printf("Variance is %f, res is %f, internal is %f\n, deviance is %f(%f)\n", std, x, bY, y, sd::math::nd4j_sqrt(M2)); ASSERT_NEAR(std, 0.3333333f, 1.0e-5f); } @@ -1802,7 +1802,7 @@ TEST_F(NDArrayTest, TestTensorDotAgain_1) { double _expB[] = {96.0, 116.0, 136.0, 156.0, 256.0, 276.0, 296.0, 316.0, 102.0, 124.0, 146.0, 168.0, 278.0, 300.0, 322.0, 344.0, 108.0, 132.0, 156.0, 180.0, 300.0, 324.0, 348.0, 372.0, 114.0, 140.0, 166.0, 
192.0, 322.0, 348.0, 374.0, 400.0, 120.0, 148.0, 176.0, 204.0, 344.0, 372.0, 400.0, 428.0, 126.0, 156.0, 186.0, 216.0, 366.0, 396.0, 426.0, 456.0, 132.0, 164.0, 196.0, 228.0, 388.0, 420.0, 452.0, 484.0, 138.0, 172.0, 206.0, 240.0, 410.0, 444.0, 478.0, 512.0, 144.0, 180.0, 216.0, 252.0, 432.0, 468.0, 504.0, 540.0, 150.0, 188.0, 226.0, 264.0, 454.0, 492.0, 530.0, 568.0, 156.0, 196.0, 236.0, 276.0, 476.0, 516.0, 556.0, 596.0, 162.0, 204.0, 246.0, 288.0, 498.0, 540.0, 582.0, 624.0, 168.0, 212.0, 256.0, 300.0, 520.0, 564.0, 608.0, 652.0, 174.0, 220.0, 266.0, 312.0, 542.0, 588.0, 634.0, 680.0, 180.0, 228.0, 276.0, 324.0, 564.0, 612.0, 660.0, 708.0, 186.0, 236.0, 286.0, 336.0, 586.0, 636.0, 686.0, 736.0, 192.0, 244.0, 296.0, 348.0, 608.0, 660.0, 712.0, 764.0, 198.0, 252.0, 306.0, 360.0, 630.0, 684.0, 738.0, 792.0}; Nd4jLong _expS[] = {6, 2, 3, 3, 2, 2, 2, 72, 24, 8, 4, 2, 1, 16384, 1, 99}; - NDArray exp(_expB, _expS, nd4j::LaunchContext ::defaultContext(), false); + NDArray exp(_expB, _expS, sd::LaunchContext ::defaultContext(), false); auto input = NDArrayFactory::create('c', {B, iC, iY, iX}); auto weights = NDArrayFactory::create('c', {iC, oC, kY, kX}); @@ -1826,7 +1826,7 @@ TEST_F(NDArrayTest, TestTensorDotAgain_1) { TEST_F(NDArrayTest, TestBroadcast_1) { double _expB[] = {1.000000, 1.000000, 1.000000, 1.000000, 2.000000, 2.000000, 2.000000, 2.000000, 3.000000, 3.000000, 3.000000, 3.000000, 1.000000, 1.000000, 1.000000, 1.000000, 2.000000, 2.000000, 2.000000, 2.000000, 3.000000, 3.000000, 3.000000, 3.000000}; Nd4jLong _expS[] = {4, 2, 3, 2, 2, 12, 4, 2, 1, 16384, 1, 99}; - NDArray exp(_expB, _expS, nd4j::LaunchContext ::defaultContext(), false); + NDArray exp(_expB, _expS, sd::LaunchContext ::defaultContext(), false); auto input = NDArrayFactory::create('c',{ 2, 3, 2, 2}); auto bias = NDArrayFactory::create('c', {1, 3}); @@ -1913,7 +1913,7 @@ TEST_F(NDArrayTest, TestMatmMul_Again_1) { float _expB[] = {1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 6.f, 8.f, 10.f, 3.f, 6.f, 9.f, 
12.f, 15.f, 4.f, 8.f, 12.f, 16.f, 20.f, 30.f, 35.f, 40.f, 45.f, 50.f, 36.f, 42.f, 48.f, 54.f, 60.f, 42.f, 49.f, 56.f, 63.f, 70.f, 48.f, 56.f, 64.f, 72.f, 80.f, 99.f, 108.f, 117.f, 126.f, 135.f, 110.f, 120.f, 130.f, 140.f, 150.f, 121.f, 132.f, 143.f, 154.f, 165.f, 132.f, 144.f, 156.f, 168.f, 180.f}; Nd4jLong _expS[] = {3, 3, 4, 5, 20, 5, 1, 8192, 1, 99}; - NDArray c(_expB, _expS, nd4j::LaunchContext ::defaultContext(), false); + NDArray c(_expB, _expS, sd::LaunchContext ::defaultContext(), false); auto c_ = MmulHelper::mmul(&a, &b); diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index 6d5366396..49f003809 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -20,11 +20,11 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -using namespace nd4j; +using namespace sd; ////////////////////////////////////////////////////////////////////// class NDArrayTest2 : public testing::Test { @@ -662,8 +662,8 @@ TEST_F(NDArrayTest2, permute_test4) { auto arr1Buffer = new float[786432]; auto arr2Buffer = new float[786432]; - NDArray arr1(arr1Buffer, arr1ShapeInfo, nd4j::LaunchContext ::defaultContext()); - NDArray arr2(arr2Buffer, arr2ShapeInfo, nd4j::LaunchContext ::defaultContext()); + NDArray arr1(arr1Buffer, arr1ShapeInfo, sd::LaunchContext ::defaultContext()); + NDArray arr2(arr2Buffer, arr2ShapeInfo, sd::LaunchContext ::defaultContext()); const std::vector perm = {0, 4, 5, 1, 2, 3}; auto arr1P = arr1.permute(perm); @@ -776,7 +776,7 @@ TEST_F(NDArrayTest2, scalar_get_test1) { auto scalar1 = NDArrayFactory::create(20.f); - NDArray arr('c', {2,2}, {0., 10., 20., 30.}, nd4j::DataType::FLOAT32); + NDArray arr('c', {2,2}, {0., 10., 20., 30.}, sd::DataType::FLOAT32); NDArray scalar2 = arr.e(2); @@ -790,7 +790,7 @@ TEST_F(NDArrayTest2, scalar_get_test2) { auto scalar1 = NDArrayFactory::create(20.f); - NDArray arr('f', 
{2,2}, {0., 10., 20., 30.}, nd4j::DataType::FLOAT32); + NDArray arr('f', {2,2}, {0., 10., 20., 30.}, sd::DataType::FLOAT32); NDArray scalar2 = arr.e(1); @@ -804,8 +804,8 @@ TEST_F(NDArrayTest2, scalar_set_test1) { NDArray scalar1 = NDArrayFactory::create(20.f); - NDArray arr('c', {2,2}, {0., 10., -20., 30.}, nd4j::DataType::FLOAT32); - NDArray exp('c', {2,2}, {0., 10., 20., 30.}, nd4j::DataType::FLOAT32); + NDArray arr('c', {2,2}, {0., 10., -20., 30.}, sd::DataType::FLOAT32); + NDArray exp('c', {2,2}, {0., 10., 20., 30.}, sd::DataType::FLOAT32); arr.p(2, scalar1); @@ -818,8 +818,8 @@ TEST_F(NDArrayTest2, scalar_set_test2) { NDArray scalar1 = NDArrayFactory::create(20.f); - NDArray arr('f', {2,2}, {0., 10., -20., 30.}, nd4j::DataType::FLOAT32); - NDArray exp('f', {2,2}, {0., 10., 20., 30.}, nd4j::DataType::FLOAT32); + NDArray arr('f', {2,2}, {0., 10., -20., 30.}, sd::DataType::FLOAT32); + NDArray exp('f', {2,2}, {0., 10., 20., 30.}, sd::DataType::FLOAT32); arr.p(1, scalar1); @@ -846,14 +846,14 @@ TEST_F(NDArrayTest2, debugInfoTest_1) { 91., -82., 37., 64., 55., 46., 73., 28., -119., 12., 112., 13., 14., 114., 16., 117., 51., 42., 67., 24., 15., 0., 93., 28., 109., 82., 12., 113., 114., 14., 116., 11., 31., 22., 87., 44., 55., 46., 73., 28., 119., 12., 112., 13., 14., 114., 16., 117., - 91., 82., 37., 64., -3, 0, 73., 28., 119., 12., 112., 13., 140., 110., 160., 107.}, nd4j::DataType::DOUBLE); - NDArray res(nd4j::DataType::DOUBLE); + 91., 82., 37., 64., -3, 0, 73., 28., 119., 12., 112., 13., 140., 110., 160., 107.}, sd::DataType::DOUBLE); + NDArray res(sd::DataType::DOUBLE); DebugInfo info = DebugHelper::debugStatistics(&testArray); DebugInfo exp; // = {} - nd4j::ops::reduce_min minOp; - nd4j::ops::reduce_mean meanOp; - nd4j::ops::reduce_max maxOp; - nd4j::ops::reduce_stdev stdevOp; + sd::ops::reduce_min minOp; + sd::ops::reduce_mean meanOp; + sd::ops::reduce_max maxOp; + sd::ops::reduce_stdev stdevOp; minOp.execute({&testArray}, {&res}, {}, {}, {}); exp._minValue = 
res.e(0); @@ -883,7 +883,7 @@ TEST_F(NDArrayTest2, debugInfoTest_2) { 91., -82., 37., 64., 55., 46., 73., 28., -119., 12., 112., 13., 14., 114., 16., 117., 51., 42., 67., 24., 15., 0., 93., 28., 109., 82., 12., 113., 114., 14., 116., 11., 31., 22., 87., 44., 55., 46., 73., 28., 119., 12., 112., 13., 14., 114., 16., 117., - 91., 82., 37., 64., -3, 0, 73., 28., 119., 12., 112., 13., 140., 110., 160., 107.}, nd4j::DataType::DOUBLE); + 91., 82., 37., 64., -3, 0, 73., 28., 119., 12., 112., 13., 140., 110., 160., 107.}, sd::DataType::DOUBLE); DebugInfo info; DebugInfo exp; // = {} @@ -908,7 +908,7 @@ TEST_F(NDArrayTest2, debugInfoTest_2) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayTest2, test_subarray_ews_1) { - NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); + NDArray x('c', {10, 5}, sd::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); ASSERT_EQ(5, subArr1.ews()); @@ -917,7 +917,7 @@ TEST_F(NDArrayTest2, test_subarray_ews_1) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayTest2, test_subarray_ews_2) { - NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); + NDArray x('f', {10, 5}, sd::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::point(2)}); ASSERT_EQ(1, subArr1.ews()); @@ -926,7 +926,7 @@ TEST_F(NDArrayTest2, test_subarray_ews_2) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayTest2, test_subarray_ews_3) { - NDArray x('c', {10, 5}, nd4j::DataType::FLOAT32); + NDArray x('c', {10, 5}, sd::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::point(2), NDIndex::all()}); ASSERT_EQ(1, subArr1.ews()); @@ -935,7 +935,7 @@ TEST_F(NDArrayTest2, test_subarray_ews_3) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayTest2, test_subarray_ews_4) { - NDArray x('f', {10, 5}, nd4j::DataType::FLOAT32); + NDArray x('f', {10, 5}, sd::DataType::FLOAT32); auto subArr1 = 
x.subarray({NDIndex::point(2), NDIndex::all()}); ASSERT_EQ(10, subArr1.ews()); @@ -944,8 +944,8 @@ TEST_F(NDArrayTest2, test_subarray_ews_4) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayTest2, subarray_1) { - NDArray x('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, nd4j::DataType::FLOAT32); - NDArray y('f', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, sd::DataType::FLOAT32); + NDArray y('f', {2,3,4}, {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}, sd::DataType::FLOAT32); Nd4jLong shapeExpX0[] = {1, 2, 12, 8192, 12, 99}; float buffExpX0[] = {1.000000, 13.000000}; @@ -1049,7 +1049,7 @@ TEST_F(NDArrayTest2, subarray_1) { TEST_F(NDArrayTest2, test_subarray_interval_1) { - NDArray x('f', {10, 10}, nd4j::DataType::FLOAT32); + NDArray x('f', {10, 10}, sd::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); ASSERT_EQ(10, subArr1.sizeAt(0)); @@ -1058,7 +1058,7 @@ TEST_F(NDArrayTest2, test_subarray_interval_1) { TEST_F(NDArrayTest2, test_subarray_interval_2) { - NDArray x('c', {10, 10}, nd4j::DataType::FLOAT32); + NDArray x('c', {10, 10}, sd::DataType::FLOAT32); auto subArr1 = x.subarray({NDIndex::all(), NDIndex::interval(0,9)}); ASSERT_EQ(10, subArr1.sizeAt(0)); @@ -1066,8 +1066,8 @@ TEST_F(NDArrayTest2, test_subarray_interval_2) { } TEST_F(NDArrayTest2, test_subarray_3d_cf) { - NDArray f('f', {10, 20, 30}, nd4j::DataType::FLOAT32); - NDArray c('c', {10, 20, 30}, nd4j::DataType::FLOAT32); + NDArray f('f', {10, 20, 30}, sd::DataType::FLOAT32); + NDArray c('c', {10, 20, 30}, sd::DataType::FLOAT32); auto subarrayF = f({0,0, 0,0, 2,3}, true); @@ -1198,10 +1198,10 @@ TEST_F(NDArrayTest2, trueBroadcast_1) { NDArray x('f', {2, 3}, {1., 2., 3., 4., 5., 6.}); NDArray y('f', {1, 3}, {5., 4., 3.}); - NDArray 
z('c', {2, 3}, nd4j::DataType::DOUBLE); + NDArray z('c', {2, 3}, sd::DataType::DOUBLE); auto exp = x - y; - x.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Subtract(), y, z); + x.applyTrueBroadcast(sd::BroadcastOpsTuple::Subtract(), y, z); // exp.printIndexedBuffer(); // z.printIndexedBuffer(); @@ -1212,12 +1212,12 @@ TEST_F(NDArrayTest2, trueBroadcast_1) { ////////////////////////////////////////////////////////////////////// TEST_F(NDArrayTest2, reduce_1) { - NDArray arr6('f', {1, 1, 4, 4, 4, 4}, nd4j::DataType::DOUBLE); - NDArray exp('f', {1, 1, 4, 4}, nd4j::DataType::DOUBLE); + NDArray arr6('f', {1, 1, 4, 4, 4, 4}, sd::DataType::DOUBLE); + NDArray exp('f', {1, 1, 4, 4}, sd::DataType::DOUBLE); arr6.linspace(1); - NDArray arr6s = arr6.reduceAlongDimension(nd4j::reduce::Sum, {2,3}); + NDArray arr6s = arr6.reduceAlongDimension(sd::reduce::Sum, {2,3}); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { @@ -1248,7 +1248,7 @@ TEST_F(NDArrayTest2, reduce3_1) { NDArray y('c', {1,4}, {2,3,4,5}); NDArray exp('c', {4}, {1,1,1,1}); - NDArray z = x.applyReduce3(nd4j::reduce3::EuclideanDistance, y, {0}, nullptr); + NDArray z = x.applyReduce3(sd::reduce3::EuclideanDistance, y, {0}, nullptr); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1281,8 +1281,8 @@ TEST_F(NDArrayTest2, test_trueBroadcast_empty_2) { TEST_F(NDArrayTest2, test_subarray_followed_by_reshape_1) { - NDArray x('c', {5, 1, 3}, nd4j::DataType::FLOAT32); - NDArray e('c', {1, 3}, {7.f, 8.f, 9.f}, nd4j::DataType::FLOAT32); + NDArray x('c', {5, 1, 3}, sd::DataType::FLOAT32); + NDArray e('c', {1, 3}, {7.f, 8.f, 9.f}, sd::DataType::FLOAT32); x.linspace(1.); diff --git a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp index 0306fb555..971fe452e 100644 --- a/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NativeOpsTests.cpp @@ -19,9 +19,9 @@ // #include "testlayers.h" -#include -#include -#include 
+#include +#include +#include #include #include #include @@ -31,10 +31,10 @@ #include #include #include -#include +#include #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class NativeOpsTests : public testing::Test { public: @@ -76,7 +76,7 @@ printf("Unsupported for cuda now.\n"); // auto exp = NDArrayFactory::create('c', {5, 5}); // exp.assign(-1.0); // -// nd4j::ops::LegacyTransformSameOp op(transform::Neg); // Neg +// sd::ops::LegacyTransformSameOp op(transform::Neg); // Neg // auto result = op.execute({&x}, {}, {}); // // ASSERT_EQ(1, result->size()); @@ -95,7 +95,7 @@ TEST_F(NativeOpsTests, ThresholdTests_1) { printf("Unsupported for cuda now.\n"); #else ::setElementThreshold(4); - ASSERT_TRUE(4 == nd4j::Environment::getInstance()->elementwiseThreshold()); + ASSERT_TRUE(4 == sd::Environment::getInstance()->elementwiseThreshold()); #endif } @@ -107,7 +107,7 @@ TEST_F(NativeOpsTests, ThresholdTests_2) { printf("Unsupported for cuda now.\n"); #else ::setTADThreshold(4); - ASSERT_TRUE(4 == nd4j::Environment::getInstance()->tadThreshold()); + ASSERT_TRUE(4 == sd::Environment::getInstance()->tadThreshold()); #endif } @@ -644,8 +644,8 @@ TEST_F(NativeOpsTests, Reduce3Test_4) { x.syncToDevice(); dimension.syncToHost(); int* dimensions = reinterpret_cast(dimension.buffer()); - auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, dimension.lengthOf()); - auto tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), dimensions, dimension.lengthOf()); + auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, dimension.lengthOf()); + auto tadPackY = sd::ConstantTadHelper::getInstance()->tadForDimensions(y.shapeInfo(), dimensions, dimension.lengthOf()); auto hTADShapeInfoX = tadPackX.primaryShapeInfo(); auto hTADOffsetsX = tadPackX.primaryOffsets(); @@ -963,8 +963,8 @@ TEST_F(NativeOpsTests, 
ScalarTadTest_1) { z.syncToDevice(); auto dimension = NDArrayFactory::create({0, 1}); auto dimensions = reinterpret_cast(dimension.buffer()); - auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, dimension.lengthOf()); - auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z.shapeInfo(), dimensions, dimension.lengthOf()); + auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, dimension.lengthOf()); + auto tadPackZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(z.shapeInfo(), dimensions, dimension.lengthOf()); OpaqueDataBuffer xBuf(x.dataBuffer()); OpaqueDataBuffer yBuf(y.dataBuffer()); @@ -1008,8 +1008,8 @@ TEST_F(NativeOpsTests, ScalarTadTest_2) { z.syncToDevice(); auto dimension = NDArrayFactory::create({0, 1}); auto dimensions = reinterpret_cast(dimension.buffer()); - auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, dimension.lengthOf()); - auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z.shapeInfo(), dimensions, dimension.lengthOf()); + auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, dimension.lengthOf()); + auto tadPackZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(z.shapeInfo(), dimensions, dimension.lengthOf()); z.assign(true); OpaqueDataBuffer xBuf(x.dataBuffer()); @@ -1057,8 +1057,8 @@ TEST_F(NativeOpsTests, ConcatTest_2) { int d = 0; auto dimension = NDArrayFactory::create('c', {1}, {d}); auto dimensions = reinterpret_cast(dimension.buffer()); - //auto tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, dimension.lengthOf()); - auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z.shapeInfo(), dimensions, dimension.lengthOf()); + //auto tadPackX = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), dimensions, 
dimension.lengthOf()); + auto tadPackZ = sd::ConstantTadHelper::getInstance()->tadForDimensions(z.shapeInfo(), dimensions, dimension.lengthOf()); exp.linspace(1); Nd4jPointer datas[] = {x.buffer(), y.buffer()}; Nd4jPointer shapes[] = {x.shapeInfo(), y.shapeInfo()}; @@ -1117,7 +1117,7 @@ TEST_F(NativeOpsTests, MemTest_1) { TEST_F(NativeOpsTests, PullRowsTest_1) { NDArray x('c', {5, 1}, {0,1,2,3,4}); - NDArray z('c', {4, 1}, nd4j::DataType::DOUBLE); + NDArray z('c', {4, 1}, sd::DataType::DOUBLE); NDArray exp('c', {4, 1}, {0,2,3,4}); Nd4jLong indexes[] = {0,2,3,4}; @@ -1126,8 +1126,8 @@ TEST_F(NativeOpsTests, PullRowsTest_1) { std::vector dims = {1}; - auto xTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), dims); - auto zTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(z.getShapeInfo(), dims); + auto xTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), dims); + auto zTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(z.getShapeInfo(), dims); Nd4jPointer nativeStart[2]; @@ -1151,7 +1151,7 @@ TEST_F(NativeOpsTests, TadPackTest_1) { int dimension[] = {1}; int const dimensionLength = 1; auto x = NDArrayFactory::create('c', {2,3,4}); - nd4j::TadPack* pack = ::tadOnlyShapeInfo(x.shapeInfo(), + sd::TadPack* pack = ::tadOnlyShapeInfo(x.shapeInfo(), dimension, dimensionLength); ASSERT_TRUE(pack != nullptr); @@ -1231,7 +1231,7 @@ TEST_F(NativeOpsTests, ShuffleTest_1) { Nd4jPointer zShapeList[] = {z.shapeInfo(), z.shapeInfo()}; Nd4jPointer dzShapeList[] = {z.specialShapeInfo(), z.specialShapeInfo()}; int shuffleMap[] = {1, 0, 4, 3, 2}; - auto zTadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); + auto zTadPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(x.getShapeInfo(), {1}); Nd4jPointer zListOffset[] = {zTadPack.platformOffsets(), zTadPack.platformOffsets()}; Nd4jPointer zListTADs[] = {zTadPack.platformShapeInfo(), 
zTadPack.platformShapeInfo()}; ::shuffle(nullptr, @@ -1268,7 +1268,7 @@ TEST_F(NativeOpsTests, ConvertTypesTest_1) { // auto y = NDArrayFactory::create('c', {5,5}); // // -// ops.execAggregate(nullptr, 0, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIndexArguments, maxRealArguments, pointer.data(), nd4j::DataType::FLOAT32); +// ops.execAggregate(nullptr, 0, maxArgs, maxShapes, maxIntArrays, maxIntArraySize, maxIndexArguments, maxRealArguments, pointer.data(), sd::DataType::FLOAT32); // void **arguments, // int numArguments, // Nd4jLong **shapeArguments, @@ -1279,7 +1279,7 @@ TEST_F(NativeOpsTests, ConvertTypesTest_1) { // int numIntArrays, // void *realArguments, // int numRealArguments, -// nd4j::DataType dtype +// sd::DataType dtype //} TEST_F(NativeOpsTests, RandomTest_1) { @@ -1492,7 +1492,7 @@ TEST_F(NativeOpsTests, CustomOpTest_1) { auto z = NDArrayFactory::create('c', {6}); auto e = NDArrayFactory::create('c', {6}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); - nd4j::ops::squeeze op; + sd::ops::squeeze op; Nd4jPointer ptrsInBuffer[] = {(Nd4jPointer) x.getBuffer(), x.getSpecialBuffer()}; Nd4jPointer ptrsInShapes[] = {(Nd4jPointer) x.getShapeInfo(), x.getSpecialShapeInfo()}; @@ -1522,7 +1522,7 @@ TEST_F(NativeOpsTests, CustomOpTests_2) { ASSERT_EQ(2, ctx.width()); - nd4j::ops::add op; + sd::ops::add op; ::execCustomOp2(nullptr, op.getOpHash(), &ctx); NDArray::registerSpecialUse({&z}, {&array0, &array1}); @@ -1534,7 +1534,7 @@ TEST_F(NativeOpsTests, CalculateOutputShapeTests_1) { auto weights = NDArrayFactory::create('c', {2, 2, 2, 3}); auto exp = NDArrayFactory::create('c', {1, 3, 5, 4}); - nd4j::ops::conv2d op; + sd::ops::conv2d op; std::vector tArgs({}); std::vector iArgs({2, 2, 1, 1, 0, 0, 1, 1, 1}); @@ -1566,7 +1566,7 @@ TEST_F(NativeOpsTests, CalculateOutputShapeTests_2) { auto weights = NDArrayFactory::create('c', {2, 2, 2, 3}); auto exp = NDArrayFactory::create('c', {1, 3, 5, 4}); - nd4j::ops::conv2d op; + sd::ops::conv2d op; std::vector tArgs({}); 
std::vector bArgsF({}); diff --git a/libnd4j/tests_cpu/layers_tests/NlpTests.cpp b/libnd4j/tests_cpu/layers_tests/NlpTests.cpp index 83eeee48b..e738a57dc 100644 --- a/libnd4j/tests_cpu/layers_tests/NlpTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NlpTests.cpp @@ -20,13 +20,13 @@ #include "testlayers.h" #include -#include +#include #include -#include +#include #include -using namespace nd4j; +using namespace sd; class NlpTests : public testing::Test { @@ -64,7 +64,7 @@ TEST_F(NlpTests, basic_sg_hs_test_1) { auto randomValue = NDArrayFactory::create(1L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::skipgram op; + sd::ops::skipgram op; auto result = op.evaluate({&target, &ngStarter, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {}, {false}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -105,7 +105,7 @@ TEST_F(NlpTests, basic_sg_hs_test_2) { auto randomValue = NDArrayFactory::create(1L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::skipgram op; + sd::ops::skipgram op; auto result = op.evaluate({&target, &ngStarter, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {}, {false}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -156,7 +156,7 @@ TEST_F(NlpTests, basic_sg_hs_test_3) { auto randomValue = NDArrayFactory::create(1L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::skipgram op; + sd::ops::skipgram op; auto result0 = op.evaluate({&target, &ngStarter, &indices0, &codes00, &syn00, &syn10, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {}, {false}, {}, true); auto result1 = op.evaluate({&target, &ngStarter, &indices1, &codes01, &syn01, &syn11, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {}, {false}, {}, true); ASSERT_EQ(Status::OK(), result0->status()); @@ -190,7 +190,7 @@ 
TEST_F(NlpTests, basic_sg_hs_ns_test_1) { auto randomValue = NDArrayFactory::create(119L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::skipgram op; + sd::ops::skipgram op; auto result = op.evaluate({&target, &ngStarter, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {3}, {false}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -225,7 +225,7 @@ TEST_F(NlpTests, basic_sg_ns_test_1) { auto randomValue = NDArrayFactory::create(2L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::skipgram op; + sd::ops::skipgram op; auto result = op.evaluate({&target, &ngStarter, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {1, 1}, {false}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -267,7 +267,7 @@ TEST_F(NlpTests, basic_cb_hs_test_1) { auto randomValue = NDArrayFactory::create(2L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::cbow op; + sd::ops::cbow op; auto result = op.evaluate({&target, &ngStarter, &context, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &numWords, &locked, &inferenceVector}, {}, {}, {true}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -321,7 +321,7 @@ TEST_F(NlpTests, basic_cb_ns_test_1) { auto randomValue = NDArrayFactory::create(2L); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::cbow op; + sd::ops::cbow op; auto result = op.evaluate({&target, &ngStarter, &context, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &numWords, &locked, &inferenceVector}, {}, {1, 2, 0}, {true}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -370,7 +370,7 @@ TEST_F(NlpTests, test_sg_hs_batch_1) { syn1.assign(0.02); expTable.assign(0.5); - nd4j::ops::skipgram op; + sd::ops::skipgram op; auto result = op.evaluate({&target, &ngStarter, &indices, &codes, &syn0, &syn1, 
&syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {}, {false, true}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -414,7 +414,7 @@ TEST_F(NlpTests, test_sg_ns_batch_1) { expTable.assign(0.5); negTable.linspace(0.0); - nd4j::ops::skipgram op; + sd::ops::skipgram op; auto result = op.evaluate({&target, &ngStarter, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &inferenceVector, &neu1e}, {}, {4, 5}, {false, true}, {}, true); ASSERT_EQ(Status::OK(), result->status()); @@ -451,7 +451,7 @@ TEST_F(NlpTests, test_cbow_hs_batch_1) { auto randomValue = NDArrayFactory::create('c', {2}, {2L, 2L}); auto inferenceVector = NDArrayFactory::empty(); - nd4j::ops::cbow op; + sd::ops::cbow op; auto result = op.evaluate({&target, &ngStarter, &context, &indices, &codes, &syn0, &syn1, &syn1Neg, &expTable, &negTable, &alpha, &randomValue, &numWords, &locked, &inferenceVector}, {}, {}, {true}, {}, true); ASSERT_EQ(Status::OK(), result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/NodeTests.cpp b/libnd4j/tests_cpu/layers_tests/NodeTests.cpp index d4d8c9a1f..8f4a8ae70 100644 --- a/libnd4j/tests_cpu/layers_tests/NodeTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NodeTests.cpp @@ -19,13 +19,13 @@ // #include "testlayers.h" -#include +#include #include #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class NodeTests : public testing::Test { public: @@ -51,7 +51,7 @@ TEST_F(NodeTests, Test_Dtype_Conversion_1) { TEST_F(NodeTests, Test_Dtype_Conversion_2) { - nd4j::ops::add opA; + sd::ops::add opA; //auto nodeA = new Node(OpType_CUSTOM, 0, 1, {-1}, {2}); auto nodeA = new Node(&opA, 1, {-1}, {2}); diff --git a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp index 0d879748d..a7c7eae24 100644 --- a/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/OmpLaunchHelperTests.cpp @@ -19,12 +19,12 @@ // #include "testlayers.h" -#include -#include +#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class OmpLaunchHelperTests : public testing::Test { private: diff --git a/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp b/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp index 05f823e4a..e1cf4ec52 100644 --- a/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp @@ -23,13 +23,13 @@ #include #include #include -#include +#include #include #include -#include +#include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class OneOffTests : public testing::Test { public: @@ -62,9 +62,9 @@ TEST_F(OneOffTests, test_non2d_0A_1) { /* TEST_F(OneOffTests, test_assert_scalar_float32_1) { - nd4j::ops::Assert op; - nd4j::ops::identity op1; - nd4j::ops::noop op2; + sd::ops::Assert op; + sd::ops::identity op1; + sd::ops::noop op2; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/scalar_float32.fb"); ASSERT_TRUE(graph != nullptr); @@ -77,9 +77,9 @@ TEST_F(OneOffTests, test_assert_scalar_float32_1) { }*/ TEST_F(OneOffTests, test_assert_scalar_float32_2) { - nd4j::ops::Assert op; - nd4j::ops::identity op1; - nd4j::ops::noop op2; + sd::ops::Assert op; + sd::ops::identity op1; + sd::ops::noop op2; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/assertsomething.fb"); ASSERT_TRUE(graph != nullptr); @@ -318,7 +318,7 @@ TEST_F(OneOffTests, test_cond_false_1) { TEST_F(OneOffTests, test_identity_n_2) { auto e = NDArrayFactory::create('c', {2, 3}, {0.77878559f, 0.80119777f, 0.72437465f, 0.23089433f, 0.72714126f, 0.18039072f}); - nd4j::ops::identity_n op; + sd::ops::identity_n op; auto graph = GraphExecutioner::importFromFlatBuffers("./resources/identity_n_2.fb"); ASSERT_TRUE(graph != nullptr); diff --git 
a/libnd4j/tests_cpu/layers_tests/OpTrackerTests.cpp b/libnd4j/tests_cpu/layers_tests/OpTrackerTests.cpp index 36828a807..fe581e09e 100644 --- a/libnd4j/tests_cpu/layers_tests/OpTrackerTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/OpTrackerTests.cpp @@ -18,15 +18,15 @@ // Created by raver119 on 15.12.17. // #include "testlayers.h" -#include +#include #include -#include +#include #include #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class OpTrackerTests : public testing::Test { public: @@ -40,7 +40,7 @@ public: }; TEST_F(OpTrackerTests, Test_Existence_1) { - nd4j::_loader loader; + sd::_loader loader; // nd4j_printf("Groups: %i; Operations: %i\n", OpTracker::getInstance()->totalGroups(), OpTracker::getInstance()->totalOperations()); @@ -51,7 +51,7 @@ TEST_F(OpTrackerTests, Test_Existence_1) { } TEST_F(OpTrackerTests, Test_Ops_List_1) { - nd4j::ops::less op; + sd::ops::less op; auto vec = OpRegistrator::getInstance()->getAllHashes(); // nd4j_printf("Total ops: %lld\n", vec.size()); diff --git a/libnd4j/tests_cpu/layers_tests/OpTupleTests.cpp b/libnd4j/tests_cpu/layers_tests/OpTupleTests.cpp index a7aafb3f7..bec75f056 100644 --- a/libnd4j/tests_cpu/layers_tests/OpTupleTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/OpTupleTests.cpp @@ -19,11 +19,11 @@ // #include "testlayers.h" -#include +#include #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class OpTupleTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/PairwiseTests.cpp b/libnd4j/tests_cpu/layers_tests/PairwiseTests.cpp index ec0388a0b..9cb2589c1 100644 --- a/libnd4j/tests_cpu/layers_tests/PairwiseTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PairwiseTests.cpp @@ -18,7 +18,7 @@ // Created by agibsonccc on 1/17/17. 
// #include "testinclude.h" -#include +#include class EqualsTest : public testing::Test { public: @@ -35,7 +35,7 @@ public: #ifndef __CUDABLAS__ TEST_F(EqualsTest,Eps) { - auto val = nd4j::NDArrayFactory::create(0.0f); + auto val = sd::NDArrayFactory::create(0.0f); functions::reduce3::Reduce3::execScalar(opNum, data, firstShapeBuffer, diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp index a53d71a65..63dd236c3 100644 --- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp @@ -19,12 +19,12 @@ // #include "testlayers.h" -#include +#include #include -using namespace nd4j; -using namespace nd4j::ops; +using namespace sd; +using namespace sd::ops; class ParityOpsTests : public testing::Test { public: @@ -39,7 +39,7 @@ TEST_F(ParityOpsTests, TestZeroAs1) { auto exp = NDArrayFactory::create('c', {10, 10}); exp.assign(0.0f); - nd4j::ops::zeros_as op; + sd::ops::zeros_as op; auto result = op.evaluate({&x}, {}, {}); @@ -58,7 +58,7 @@ TEST_F(ParityOpsTests, TestMaximum1) { auto y = NDArrayFactory::create('c', {10, 10}); y.assign(2.0); - nd4j::ops::maximum op; + sd::ops::maximum op; auto result = op.evaluate({&x, &y}, {}, {}); @@ -78,7 +78,7 @@ TEST_F(ParityOpsTests, TestMinimum1) { y.assign(-2.0f); - nd4j::ops::minimum op; + sd::ops::minimum op; auto result = op.evaluate({&x, &y}, {}, {}); @@ -97,7 +97,7 @@ TEST_F(ParityOpsTests, TestTear1) { tads.at(e)->assign((float) e + 1); } - nd4j::ops::tear op; + sd::ops::tear op; auto result = op.evaluate({&input}, {}, {1}); @@ -117,7 +117,7 @@ TEST_F(ParityOpsTests, TestUnstack1) { tads.at(e)->assign((float) e + 1); } - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {0}); @@ -139,7 +139,7 @@ TEST_F(ParityOpsTests, TestUnstack2) { tads.at(e)->assign((float) e + 1); } - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {2}); @@ -156,7 +156,7 @@ 
TEST_F(ParityOpsTests, TestUnstack3) { auto exp = NDArrayFactory::create('c', {3, 2}, {1.f, 4., 7., 10.f, 13.f, 16.f}); input.linspace(1); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -175,7 +175,7 @@ TEST_F(ParityOpsTests, TestUnstack4) { auto exp = NDArrayFactory::create('c', {3, 3}, { 1, 2, 3, 7, 8, 9, 13, 14, 15.}); input.linspace(1); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -193,7 +193,7 @@ TEST_F(ParityOpsTests, TestUnstack5) { auto exp = NDArrayFactory::create('c', {2, 3}, { 1, 2, 3, 4, 5, 6}); input.linspace(1); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -211,7 +211,7 @@ TEST_F(ParityOpsTests, TestUnstack6) { auto exp = NDArrayFactory::create('c', {1, 1}, {1}); input.linspace(1); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -229,7 +229,7 @@ TEST_F(ParityOpsTests, TestUnstack7) { auto exp = NDArrayFactory::create('c', {1, 1}, {1}); input.linspace(1); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -247,7 +247,7 @@ TEST_F(ParityOpsTests, TestUnstack8) { auto exp = NDArrayFactory::create('c', {1}, {1}); input.linspace(1); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -265,7 +265,7 @@ TEST_F(ParityOpsTests, TestUnstack9) { auto exp = NDArrayFactory::create('c', {1}, {1}); input.linspace(1); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -284,7 +284,7 @@ TEST_F(ParityOpsTests, TestUnstack10) { 
auto input = NDArrayFactory::create('c', {3, 0, 2}); auto exp = NDArrayFactory::create('c', {0,2}); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -302,7 +302,7 @@ TEST_F(ParityOpsTests, TestUnstack11) { auto input = NDArrayFactory::create('c', {3, 0, 2}); auto exp = NDArrayFactory::create('c', {3,0}); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -318,7 +318,7 @@ TEST_F(ParityOpsTests, TestUnstack12) { auto input = NDArrayFactory::create('c', {3, 0, 2}); - nd4j::ops::unstack op; + sd::ops::unstack op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -333,7 +333,7 @@ TEST_F(ParityOpsTests, ExpandDimsTest1) { input.linspace(1); auto reshaped = input.reshape('c', {5, 1, 5}); - nd4j::ops::expand_dims op; + sd::ops::expand_dims op; auto result = op.evaluate({&input}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -352,7 +352,7 @@ TEST_F(ParityOpsTests, ExpandDimsTest2) { input.linspace(1); auto reshaped = input.reshape('c', {1, 3, 4}); - nd4j::ops::expand_dims op; + sd::ops::expand_dims op; auto result = op.evaluate({&input}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -371,7 +371,7 @@ TEST_F(ParityOpsTests, ExpandDimsTest3) { input.linspace(1); auto reshaped = input.reshape('c', {3, 1, 4}); - nd4j::ops::expand_dims op; + sd::ops::expand_dims op; auto result = op.evaluate({&input}, {}, {-2}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -389,7 +389,7 @@ TEST_F(ParityOpsTests, ExpandDimsTest4) { input.linspace(1); auto reshaped = input.reshape('c', {1, 3, 4}); - nd4j::ops::expand_dims op; + sd::ops::expand_dims op; auto result = op.evaluate({&input}, {}, {-3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -407,7 +407,7 @@ TEST_F(ParityOpsTests, Test_Shape_1) { auto x = NDArrayFactory::create('c', {3, 4, 5, 
6}); auto exp = NDArrayFactory::create('c', {4}, {3, 4, 5, 6}); - nd4j::ops::shape_of op; + sd::ops::shape_of op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -425,7 +425,7 @@ TEST_F(ParityOpsTests, Test_Equals_1) { auto y = NDArrayFactory::create('c', {1, 5}, {1, 0, 3, 0, 5}); auto exp = NDArrayFactory::create('c', {1, 5}, {1, 0, 1, 0, 1}); - nd4j::ops::equals op; + sd::ops::equals op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -443,7 +443,7 @@ TEST_F(ParityOpsTests, Test_NotEquals_1) { auto y = NDArrayFactory::create('c', {1, 5}, {1, 0, 3, 0, 5}); auto exp = NDArrayFactory::create('c', {1, 5}, {0, 1, 0, 1, 0}); - nd4j::ops::not_equals op; + sd::ops::not_equals op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -460,7 +460,7 @@ TEST_F(ParityOpsTests, Test_Less_1) { auto y = NDArrayFactory::create('c', {1, 5}, {5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {1, 5}, {1, 1, 0, 0, 0}); - nd4j::ops::less op; + sd::ops::less op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -477,7 +477,7 @@ TEST_F(ParityOpsTests, Test_LessEquals_1) { auto y = NDArrayFactory::create('c', {1, 5}, {5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {1, 5}, {1, 1, 1, 0, 0}); - nd4j::ops::less_equal op; + sd::ops::less_equal op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -494,7 +494,7 @@ TEST_F(ParityOpsTests, Test_GreaterEquals_1) { auto y = NDArrayFactory::create('c', {1, 5}, {5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {1, 5}, {0, 0, 1, 1, 1}); - nd4j::ops::greater_equal op; + sd::ops::greater_equal op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -511,7 +511,7 @@ TEST_F(ParityOpsTests, Test_GreaterEquals_2) { auto y = NDArrayFactory::create('c', {1, 5}, {5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {1, 5}, {0, 
0, 1, 1, 1}); - nd4j::ops::greater_equal op; + sd::ops::greater_equal op; auto result = op.evaluate({&x, &y}, {}, {}, {}, {}, false); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -528,7 +528,7 @@ TEST_F(ParityOpsTests, Test_Greater_1) { auto y = NDArrayFactory::create('c', {1, 5}, {5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {1, 5}, {0, 0, 0, 1, 1}); - nd4j::ops::greater op; + sd::ops::greater op; auto result = op.evaluate({&x, &y}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -546,7 +546,7 @@ TEST_F(ParityOpsTests, Test_Where_1) { auto y = NDArrayFactory::create('c', {3, 3}, {9, 8, 7, 6, 5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 6, 5, 4, 7, 8, 9}); - nd4j::ops::Where op; + sd::ops::Where op; auto result = op.evaluate({&mask, &x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -566,7 +566,7 @@ TEST_F(ParityOpsTests, Test_Where_2) { auto y = NDArrayFactory::create('c', {3, 3}, {9, 8, 7, 6, 5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 6, 5, 4, 3, 2, 1}); - nd4j::ops::Where op; + sd::ops::Where op; auto result = op.evaluate({&mask, &x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -583,7 +583,7 @@ TEST_F(ParityOpsTests, Test_Where_3) { auto mask = NDArrayFactory::create('c', {2, 2, 3}, {0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1}); auto exp = NDArrayFactory::create('c', {5, 3}, {0, 0, 1, 0, 0, 2, 0, 1, 1, 1, 0, 0, 1, 1, 2}); - nd4j::ops::Where op; + sd::ops::Where op; auto result = op.evaluate({&mask}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -603,7 +603,7 @@ TEST_F(ParityOpsTests, Test_Select_1) { auto y = NDArrayFactory::create('c', {3, 3}, {9, 8, 7, 6, 5, 4, 3, 2, 1}); auto exp = NDArrayFactory::create('c', {3, 3}, {1, 2, 3, 6, 5, 4, 3, 2, 1}); - nd4j::ops::select op; + sd::ops::select op; auto result = op.evaluate({&mask, &x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -621,7 +621,7 @@ TEST_F(ParityOpsTests, Test_Select_2) { auto 
y = NDArrayFactory::create('c', {2, 2}, {9, 8, 7, 6}); auto exp = NDArrayFactory::create('c', {2, 2}, {1, 8, 3, 6}); - nd4j::ops::select op; + sd::ops::select op; auto result = op.evaluate({&mask, &x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -640,7 +640,7 @@ TEST_F(ParityOpsTests, Test_Select_3) { auto y = NDArrayFactory::create('c', {1, 1}, {2}); auto exp = NDArrayFactory::create('c', {1, 1}, {2}); - nd4j::ops::select op; + sd::ops::select op; auto result = op.evaluate({&mask, &x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -658,7 +658,7 @@ TEST_F(ParityOpsTests, Test_Reshape_TF_1) { auto exp = NDArrayFactory::create('c', {1, 2, 2}, {1, 2, 3, 4}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x, &shape}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -675,7 +675,7 @@ TEST_F(ParityOpsTests, Test_Bias_Add_1) { auto x = NDArrayFactory::create('c', {10, 5}); x.assign(0.0); auto bias = NDArrayFactory::create('c', {5}, {1, 2, 3, 4, 5}); - nd4j::ops::biasadd op; + sd::ops::biasadd op; auto result = op.evaluate({&x, &bias}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -692,11 +692,11 @@ TEST_F(ParityOpsTests, Test_Bias_Add_1) { TEST_F(ParityOpsTests, Test_Scatter_Add_1) { auto matrix = NDArrayFactory::create('c', {2, 2}, {1, 2, 3, 4}); - NDArray idc('c', {1}, std::vector({0}), nd4j::DataType::INT64); + NDArray idc('c', {1}, std::vector({0}), sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2}, {1, 1}); auto exp = NDArrayFactory::create('c', {2, 2}, {2, 3, 3, 4}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -710,11 +710,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_1) { TEST_F(ParityOpsTests, Test_Scatter_Add_2) { auto vec = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); - NDArray idc('c', {1, 4}, {0., 1, 2, 3}, nd4j::DataType::INT64); + 
NDArray idc('c', {1, 4}, {0., 1, 2, 3}, sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 4}, {1, 1, 1, 1}); auto exp = NDArrayFactory::create('c', {1, 4}, {2, 3, 4, 5}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; auto result = op.evaluate({&vec, &idc, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -727,11 +727,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_2) { TEST_F(ParityOpsTests, Test_Scatter_Add_3) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray idc('c', {1}, std::vector({0}), nd4j::DataType::INT64); + NDArray idc('c', {1}, std::vector({0}), sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2, 2}, {1, 1, 1, 1}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {2, 3, 4, 5, 5, 6, 7, 8}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -744,11 +744,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_3) { TEST_F(ParityOpsTests, Test_Scatter_Add_4) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray idc('c', {1, 2}, std::vector{0, 0}, nd4j::DataType::INT64); + NDArray idc('c', {1, 2}, std::vector{0, 0}, sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2, 2, 2}, {1, 1, 1, 1, 1, 1, 1, 1}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {3, 4, 5, 6, 5, 6, 7, 8}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true, true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -761,11 +761,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_4) { TEST_F(ParityOpsTests, Test_Scatter_Add_5) { auto matrix = NDArrayFactory::create('c', {2, 2, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - NDArray idc('c', {2, 2}, {1., 1, 0, 0}, nd4j::DataType::INT64); + NDArray idc('c', {2, 2}, {1., 1, 0, 0}, sd::DataType::INT64); auto updates = 
NDArrayFactory::create('c', {2, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto exp = NDArrayFactory::create('c', {2, 2, 3}, {9., 11., 13.,15., 17., 19., 9., 11., 13.,15., 17., 19.}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -779,11 +779,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_5) { TEST_F(ParityOpsTests, Test_Scatter_Add_6) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 1, 1, 1, 1, 1, 1, 1}); - NDArray idc('c', {2, 2}, {1, 1, 0, 0}, nd4j::DataType::INT64); + NDArray idc('c', {2, 2}, {1, 1, 0, 0}, sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {2, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {7, 9, 11, 13, 7, 9, 11, 13}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true, true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -796,11 +796,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_6) { TEST_F(ParityOpsTests, Test_Scatter_Add_7) { auto matrix = NDArrayFactory::create('c', {10, 3}, {1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f,9.f,10.f,11.f,12.f,13.f,14.f,15.f,16.f,17.f,18.f,19.f,20.f,21.f,22.f,23.f,24.f,25.f,26.f,27.f,28.f,29.f,30.f}); - NDArray idc('c', {}, std::vector{5}, nd4j::DataType::INT64); + NDArray idc('c', {}, std::vector{5}, sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {3}, {10.f, 20.f, 30.f}); auto exp = NDArrayFactory::create('c', {10, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f,11.f,12.f, 13.f,14.f,15.f, 26.f,37.f,48.f, 19.f,20.f,21.f, 22.f,23.f,24.f, 25.f,26.f,27.f, 28.f,29.f,30.f}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -814,14 +814,14 @@ 
TEST_F(ParityOpsTests, Test_Scatter_Add_7) { //////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, Test_Scatter_Add_8) { - NDArray input('c', {8}, {1,1,1,1,1,1,1,1}, nd4j::DataType::FLOAT32); - NDArray indices('c', {4}, {1, 1, 1, 1}, nd4j::DataType::INT32); - NDArray updates('c', {4}, {1,2,3,4}, nd4j::DataType::FLOAT32); - NDArray expected('c', {8}, {1.f, 11.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}, nd4j::DataType::FLOAT32); + NDArray input('c', {8}, {1,1,1,1,1,1,1,1}, sd::DataType::FLOAT32); + NDArray indices('c', {4}, {1, 1, 1, 1}, sd::DataType::INT32); + NDArray updates('c', {4}, {1,2,3,4}, sd::DataType::FLOAT32); + NDArray expected('c', {8}, {1.f, 11.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}, sd::DataType::FLOAT32); - NDArray z('c', {8}, nd4j::DataType::FLOAT32); + NDArray z('c', {8}, sd::DataType::FLOAT32); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; Nd4jStatus status = op.execute({&input, &indices, &updates}, {&z}, {}, {}, {true}); // z.printBuffer(); @@ -833,11 +833,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_8) { //////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, Test_Scatter_Add_9) { auto matrix = NDArrayFactory::create('c', {2, 2, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - NDArray idc('c', {2, 2}, {1, 10, 0, 0}, nd4j::DataType::INT64); + NDArray idc('c', {2, 2}, {1, 10, 0, 0}, sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {2, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); auto output = NDArrayFactory::create('c', {2, 2, 3}); - nd4j::ops::scatter_add op; + sd::ops::scatter_add op; ASSERT_ANY_THROW(op.execute({&matrix, &idc, &updates}, {&output}, {}, {}, {true, true})); } @@ -845,11 +845,11 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_9) { //////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterMax_test1) { auto matrix = NDArrayFactory::create('c', {2, 2}, {1, 2, 3, 4}); - 
NDArray idc('c', {1}, std::vector{0.}, nd4j::DataType::INT64); + NDArray idc('c', {1}, std::vector{0.}, sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2}, {10, 1}); auto exp = NDArrayFactory::create('c', {2, 2}, {10, 2, 3, 4}); - nd4j::ops::scatter_max op; + sd::ops::scatter_max op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -862,11 +862,11 @@ TEST_F(ParityOpsTests, scatterMax_test1) { TEST_F(ParityOpsTests, scatterMax_test2) { auto vec = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); - NDArray idc('c', {1, 4}, {0, 1, 2, 3}, nd4j::DataType::INT64); + NDArray idc('c', {1, 4}, {0, 1, 2, 3}, sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 4}, {10, 1, 30, 1}); auto exp = NDArrayFactory::create('c', {1, 4}, {10, 2, 30, 4}); - nd4j::ops::scatter_max op; + sd::ops::scatter_max op; auto result = op.evaluate({&vec, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -879,11 +879,11 @@ TEST_F(ParityOpsTests, scatterMax_test2) { TEST_F(ParityOpsTests, scatterMax_test3) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray idc('c', {1}, std::vector({0}), nd4j::DataType::INT64); + NDArray idc('c', {1}, std::vector({0}), sd::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 2, 2}, {10, 1, 30, 1}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {10, 2, 30, 4, 5, 6, 7, 8}); - nd4j::ops::scatter_max op; + sd::ops::scatter_max op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -896,11 +896,11 @@ TEST_F(ParityOpsTests, scatterMax_test3) { TEST_F(ParityOpsTests, scatterMax_test4) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray idc('c', {1,2}, std::vector{0.,0}, nd4j::DataType::INT32); + NDArray idc('c', {1,2}, std::vector{0.,0}, sd::DataType::INT32); auto updates = 
NDArrayFactory::create('c', {1, 2, 2, 2}, {1,10,1,10, 1,1,10,1.}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {1, 10, 10, 10, 5, 6, 7, 8}); - nd4j::ops::scatter_max op; + sd::ops::scatter_max op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {true}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -913,11 +913,11 @@ TEST_F(ParityOpsTests, scatterMax_test4) { TEST_F(ParityOpsTests, scatterMax_test5) { auto matrix = NDArrayFactory::create('c', {2, 2, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - NDArray idc('c', {2, 2}, {1, 1, 0, 0}, nd4j::DataType::INT32); + NDArray idc('c', {2, 2}, {1, 1, 0, 0}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2, 2, 2, 3}, {2,10,1,10, 2,10,1,10, 2,10,1,10, 10,2,10,1, 10,2,10,1, 10,2,10,1.}); auto exp = NDArrayFactory::create('c', {2, 2, 3}, {10, 2, 10, 2, 10, 2, 2, 10, 2, 10, 2, 10}); - nd4j::ops::scatter_max op; + sd::ops::scatter_max op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -930,11 +930,11 @@ TEST_F(ParityOpsTests, scatterMax_test5) { TEST_F(ParityOpsTests, scatterMax_test6) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 1, 1, 1, 1, 1, 1, 1}); - NDArray idc('c', {2, 2}, {1, 1, 0, 0}, nd4j::DataType::INT32); + NDArray idc('c', {2, 2}, {1, 1, 0, 0}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2, 2, 2, 2}, {0,2,0,2, 0,2,0,2, 2,0,2,0., 2,0,2,0}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {2, 1, 2, 1, 1, 2, 1, 2}); - nd4j::ops::scatter_max op; + sd::ops::scatter_max op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -948,11 +948,11 @@ TEST_F(ParityOpsTests, scatterMax_test6) { TEST_F(ParityOpsTests, scatterMin_test1) { auto matrix = NDArrayFactory::create('c', {2, 2}, {1, 2, 3, 4}); - NDArray idc('c', {1}, std::vector({0}), nd4j::DataType::INT32); + NDArray idc('c', {1}, std::vector({0}), 
sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {1, 2}, {-1, 1}); auto exp = NDArrayFactory::create('c', {2, 2}, {-1, 1, 3, 4}); - nd4j::ops::scatter_min op; + sd::ops::scatter_min op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -965,11 +965,11 @@ TEST_F(ParityOpsTests, scatterMin_test1) { TEST_F(ParityOpsTests, scatterMin_test2) { auto vec = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); - NDArray idc('c', {1, 4}, {0, 1, 2, 3}, nd4j::DataType::INT32); + NDArray idc('c', {1, 4}, {0, 1, 2, 3}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {1, 4}, {10, 1, 30, 1}); auto exp = NDArrayFactory::create('c', {1, 4}, {1, 1, 3, 1}); - nd4j::ops::scatter_min op; + sd::ops::scatter_min op; auto result = op.evaluate({&vec, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -982,11 +982,11 @@ TEST_F(ParityOpsTests, scatterMin_test2) { TEST_F(ParityOpsTests, scatterMin_test3) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray idc('c', {1}, std::vector({0}), nd4j::DataType::INT32); + NDArray idc('c', {1}, std::vector({0}), sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {1, 2, 2}, {10, 1, 30, 2}); auto exp = NDArrayFactory::create('c', {2, 2, 2}, {1, 1, 3, 2, 5, 6, 7, 8}); - nd4j::ops::scatter_min op; + sd::ops::scatter_min op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -999,11 +999,11 @@ TEST_F(ParityOpsTests, scatterMin_test3) { TEST_F(ParityOpsTests, scatterMin_test4) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray idc('c', {1,2}, std::vector{0.,0}, nd4j::DataType::INT32); + NDArray idc('c', {1,2}, std::vector{0.,0}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {1, 2, 2, 2}, {1,10,1,10, 1,1,10,1.}); auto exp = NDArrayFactory::create('c', {2, 2, 
2}, {1, 1, 1, 1, 5, 6, 7, 8}); - nd4j::ops::scatter_min op; + sd::ops::scatter_min op; auto result = op.evaluate({&matrix, &idc, &updates}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1018,11 +1018,11 @@ TEST_F(ParityOpsTests, scatterMin_test4) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterMin_test5) { auto matrix = NDArrayFactory::create('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray idc('c', {1,2}, {10,10}, nd4j::DataType::INT32); + NDArray idc('c', {1,2}, {10,10}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {1, 2, 2, 2}, {1,10,1,10, 1,1,10,1.}); auto output = NDArrayFactory::create('c', {2, 2, 2}); - nd4j::ops::scatter_min op; + sd::ops::scatter_min op; ASSERT_ANY_THROW(op.execute({&matrix, &idc, &updates}, {&output}, {}, {}, {true, true})); } @@ -1030,12 +1030,12 @@ TEST_F(ParityOpsTests, scatterMin_test5) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test1) { - NDArray indices('c', {2, 1}, {1., 0.}, nd4j::DataType::INT32); + NDArray indices('c', {2, 1}, {1., 0.}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2, 4}, {10.f, 20.f, 30.f, 40.f, 50.f, 60.f, 70.f, 80.f}); auto shape = NDArrayFactory::create('c', {2}, {3, 4}); auto exp = NDArrayFactory::create('c', {3, 4}, {50.f, 60.f, 70.f, 80.f, 10.f, 20.f, 30.f, 40.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; auto result = op.evaluate({&indices, &updates, &shape}, {}, {false, true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1051,13 +1051,13 @@ TEST_F(ParityOpsTests, scatterND_test1) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test2) { - NDArray indices('c', {3, 1}, {4., 2., 0.}, nd4j::DataType::INT32); + NDArray indices('c', {3, 1}, {4., 2., 0.}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3, 4}); auto 
shape = NDArrayFactory::create('c', {2}, {5, 4}); auto exp = NDArrayFactory::create('c', {5, 4}, {9.f,10.f,11.f,12.f, 0.f, 0.f, 0.f, 0.f, 5.f, 6.f, 7.f, 8.f, 0.f, 0.f, 0.f, 0.f, 1.f, 2.f, 3.f, 4.f}); updates.linspace(1.f); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; auto result = op.evaluate({&indices, &updates, &shape}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1072,7 +1072,7 @@ TEST_F(ParityOpsTests, scatterND_test2) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test3) { - NDArray indices('c', {2, 3, 1}, {0., 2., 7., 3., 6., 9.}, nd4j::DataType::INT32); + NDArray indices('c', {2, 3, 1}, {0., 2., 7., 3., 6., 9.}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,3, 3,4}); auto shape = NDArrayFactory::create('c', {3}, {10, 3, 4}); auto exp = NDArrayFactory::create('c', {10, 3, 4}, {1.f, 2.f, 3.f, 4., 5.f, 6.f, 7.f, 8., 9.f, 10.f, 11.f, 12., 0.f, 0.f, 0.f, 0., 0.f, 0.f, 0.f, 0., 0.f, 0.f, 0.f, 0., @@ -1082,7 +1082,7 @@ TEST_F(ParityOpsTests, scatterND_test3) { 0.f, 0.f, 0.f, 0., 0.f, 0.f, 0.f, 0., 0.f, 0.f, 0.f, 0.,61.f, 62.f, 63.f, 64.,65.f, 66.f, 67.f, 68.,69.f, 70.f, 71.f, 72.,}); updates.linspace(1.f); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; auto result = op.evaluate({&indices, &updates, &shape}, {}, {false, true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1097,12 +1097,12 @@ TEST_F(ParityOpsTests, scatterND_test3) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test4) { - NDArray indices('c', {4, 1}, {4., 3., 1., 7.}, nd4j::DataType::INT32); + NDArray indices('c', {4, 1}, {4., 3., 1., 7.}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {4}, {9.f, 10.f, 11.f, 12.f}); auto shape = NDArrayFactory::create('c', {1}, {8}); auto exp = NDArrayFactory::create('c', {8}, {0.f, 11.f, 0.f, 10.f, 9.f, 0.f, 0.f, 12.f}); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; 
auto result = op.evaluate({&indices, &updates, &shape}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1117,12 +1117,12 @@ TEST_F(ParityOpsTests, scatterND_test4) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test5) { - NDArray indices('c', {4, 1}, {1, 1, 1, 1}, nd4j::DataType::INT32); + NDArray indices('c', {4, 1}, {1, 1, 1, 1}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {4}, {1.f, 2.f, 3.f, 4.f}); auto shape = NDArrayFactory::create('c', {1}, {8}); auto exp = NDArrayFactory::create('c', {8}, {0.f, 10.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; auto result = op.evaluate({&indices, &updates, &shape}, {}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1138,18 +1138,18 @@ TEST_F(ParityOpsTests, scatterND_test5) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test6) { - NDArray indices('c', {3, 2}, {0,1,1,0,3,2}, nd4j::DataType::INT32); - NDArray updates('c', {3, 2, 3}, nd4j::DataType::FLOAT32); - NDArray shape('c', {4}, {5,4,2,3}, nd4j::DataType::INT32); + NDArray indices('c', {3, 2}, {0,1,1,0,3,2}, sd::DataType::INT32); + NDArray updates('c', {3, 2, 3}, sd::DataType::FLOAT32); + NDArray shape('c', {4}, {5,4,2,3}, sd::DataType::INT32); NDArray exp('c', {5,4,2,3}, {0., 0., 0.,0., 0., 0.,1., 2., 3.,4., 5., 6.,0., 0., 0.,0., 0., 0., 0., 0., 0.,0., 0., 0., 7., 8., 9., 10., 11., 12., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 13., 14., 15., 16., 17., 18., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, nd4j::DataType::FLOAT32); + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 
0., 0.}, sd::DataType::FLOAT32); updates.linspace(1); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; auto result = op.evaluate({&indices, &updates, &shape}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1165,18 +1165,18 @@ TEST_F(ParityOpsTests, scatterND_test6) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test7) { - NDArray indices('c', {4,3,2}, {0,1,1,0,3,2,1,0,0,1,1,0,3,2,1,0,0,1,1,0,3,2,1,0}, nd4j::DataType::INT32); - NDArray updates('c', {4,3,2,3}, nd4j::DataType::FLOAT32); - NDArray shape('c', {4}, {5,4,2,3}, nd4j::DataType::INT32); + NDArray indices('c', {4,3,2}, {0,1,1,0,3,2,1,0,0,1,1,0,3,2,1,0,0,1,1,0,3,2,1,0}, sd::DataType::INT32); + NDArray updates('c', {4,3,2,3}, sd::DataType::FLOAT32); + NDArray shape('c', {4}, {5,4,2,3}, sd::DataType::INT32); NDArray exp('c', {5,4,2,3}, {0., 0., 0., 0., 0., 0., 75., 78., 81., 84., 87., 90., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 222., 228., 234., 240., 246., 252., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 111., 114., 117., 120., 123., 126., 0., 0., 0., 0., 0., 0., - 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, nd4j::DataType::FLOAT32); + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, sd::DataType::FLOAT32); updates.linspace(1); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; auto result = op.evaluate({&indices, &updates, &shape}, {}, {}, {true, true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1192,12 +1192,12 @@ TEST_F(ParityOpsTests, scatterND_test7) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test8) { - NDArray indices('c', {3, 2}, {0,0, 1,1, 2,2}, nd4j::DataType::INT32); + 
NDArray indices('c', {3, 2}, {0,0, 1,1, 2,2}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3}, {1.f, 2.f, 3.f}); auto shape = NDArrayFactory::create('c', {2}, {6,4}); auto exp = NDArrayFactory::create('c', {6,4}, {1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; auto result = op.evaluate({&indices, &updates, &shape}, {}, {true}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1213,12 +1213,12 @@ TEST_F(ParityOpsTests, scatterND_test8) { //////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatterND_test9) { - NDArray indices('c', {2, 3, 1}, {0., 20., 7., 30., 6., 90.}, nd4j::DataType::INT32); + NDArray indices('c', {2, 3, 1}, {0., 20., 7., 30., 6., 90.}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,3, 3,4}); auto shape = NDArrayFactory::create('c', {3}, {10, 3, 4}); auto output = NDArrayFactory::create('c', {10, 3, 4}); - nd4j::ops::scatter_nd op; + sd::ops::scatter_nd op; ASSERT_ANY_THROW(auto result = op.execute({&indices, &updates, &shape}, {&output}, {}, {}, {false, true})); } @@ -1228,11 +1228,11 @@ TEST_F(ParityOpsTests, scatterND_test9) { TEST_F(ParityOpsTests, scatterND_add_test1) { auto input = NDArrayFactory::create('c', {8}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); - NDArray indices('c', {4, 1}, {4., 3., 1., 7.}, nd4j::DataType::INT32); + NDArray indices('c', {4, 1}, {4., 3., 1., 7.}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {4}, {9.f, 10.f, 11.f, 12.f}); auto exp = NDArrayFactory::create('c', {8}, {1.f, 13.f, 3.f, 14.f, 14.f, 6.f, 7.f, 20.f}); - nd4j::ops::scatter_nd_add op; + sd::ops::scatter_nd_add op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1248,14 +1248,14 @@ TEST_F(ParityOpsTests, scatterND_add_test1) { TEST_F(ParityOpsTests, scatterND_add_test2) { auto input = 
NDArrayFactory::create('c', {6, 4}); - NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32); + NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3,3}); auto exp = NDArrayFactory::create('c', {6,4}, {1.f,0.f,7.f,0.f, 0.f,2.f,0.f,8.f, 9.f,0.f,3.f,0.f, 0.f,0.f,0.f,4.f, 5.f,0.f,0.f,0.f, 0.f,6.f,0.f,0.f}); input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_add op; + sd::ops::scatter_nd_add op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1272,14 +1272,14 @@ TEST_F(ParityOpsTests, scatterND_add_test2) { TEST_F(ParityOpsTests, scatterND_add_test3) { auto input = NDArrayFactory::create('c', {6, 4}); - NDArray indices('c', {2, 3, 1}, {5.f, 1.f, 2.f, 3.f, 4.f, 0.f}, nd4j::DataType::INT32); + NDArray indices('c', {2, 3, 1}, {5.f, 1.f, 2.f, 3.f, 4.f, 0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {6,4}, {21.f, 22.f, 23.f, 24.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f,13.f, 14.f, 15.f, 16.f,17.f, 18.f, 19.f, 20.f, 1.f, 2.f, 3.f, 4.f}); input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_add op; + sd::ops::scatter_nd_add op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1295,7 +1295,7 @@ TEST_F(ParityOpsTests, scatterND_add_test3) { TEST_F(ParityOpsTests, scatterND_add_test4) { auto input = NDArrayFactory::create('c', {6, 4, 5}); - NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32); + NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, sd::DataType::INT32); auto updates = 
NDArrayFactory::create('c', {3,3,5}); auto exp = NDArrayFactory::create('c', {6,4,5}, {1.f, 2.f, 3.f, 4.f, 5.f, 0.f, 0.f, 0.f, 0.f, 0.f,31.f, 32.f, 33.f, 34.f, 35.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 6.f, 7.f, 8.f, 9.f, 10.f, 0.f, 0.f, 0.f, 0.f, 0.f,36.f, 37.f, 38.f, 39.f, 40.f, @@ -1306,7 +1306,7 @@ TEST_F(ParityOpsTests, scatterND_add_test4) { input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_add op; + sd::ops::scatter_nd_add op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1322,7 +1322,7 @@ TEST_F(ParityOpsTests, scatterND_add_test4) { TEST_F(ParityOpsTests, scatterND_add_test5) { auto input = NDArrayFactory::create('c', {6,5,4,3,2}); - NDArray indices('c', {2,2,3}, {0.f,0.f,0.f, 1.f,1.f,1.f, 2.f,2.f,2.f, 3.f,3.f,3.f}, nd4j::DataType::INT32); + NDArray indices('c', {2,2,3}, {0.f,0.f,0.f, 1.f,1.f,1.f, 2.f,2.f,2.f, 3.f,3.f,3.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,2,3,2}); auto exp = NDArrayFactory::create('c', {6,5,4,3,2}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, @@ -1342,7 +1342,7 @@ TEST_F(ParityOpsTests, scatterND_add_test5) { input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_add op; + sd::ops::scatter_nd_add op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1358,11 +1358,11 @@ TEST_F(ParityOpsTests, scatterND_add_test5) { TEST_F(ParityOpsTests, scatterND_add_test6) { auto input = NDArrayFactory::create('c', {6, 4}); - NDArray 
indices('c', {2, 3, 1}, {50.f, 1.f, 2.f, 3.f, 40.f, 0.f}, nd4j::DataType::INT32); + NDArray indices('c', {2, 3, 1}, {50.f, 1.f, 2.f, 3.f, 40.f, 0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,3,4}); auto output = NDArrayFactory::create('c', {6,4}); - nd4j::ops::scatter_nd_add op; + sd::ops::scatter_nd_add op; ASSERT_ANY_THROW(op.execute({&input, &indices, &updates}, {&output}, {}, {}, {false, true})); } @@ -1371,11 +1371,11 @@ TEST_F(ParityOpsTests, scatterND_add_test6) { TEST_F(ParityOpsTests, scatterND_sub_test1) { auto input = NDArrayFactory::create('c', {8}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); - NDArray indices('c', {4, 1}, {4.f, 3.f, 1.f, 7.f}, nd4j::DataType::INT32); + NDArray indices('c', {4, 1}, {4.f, 3.f, 1.f, 7.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {4}, {9.f, 10.f, 11.f, 12.f}); auto exp = NDArrayFactory::create('c', {8}, {1.f, -9.f, 3.f, -6.f, -4.f, 6.f, 7.f, -4.f}); - nd4j::ops::scatter_nd_sub op; + sd::ops::scatter_nd_sub op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1391,14 +1391,14 @@ TEST_F(ParityOpsTests, scatterND_sub_test1) { TEST_F(ParityOpsTests, scatterND_sub_test2) { auto input = NDArrayFactory::create('c', {6, 4}); - NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32); + NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3,3}); auto exp = NDArrayFactory::create('c', {6,4}, {-1.f,0.f,-7.f,0.f, 0.f,-2.f,0.f,-8.f, -9.f,0.f,-3.f,0.f, 0.f,0.f,0.f,-4.f, -5.f,0.f,0.f,0.f, 0.f,-6.f,0.f,0.f}); input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_sub op; + sd::ops::scatter_nd_sub op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ 
-1416,14 +1416,14 @@ TEST_F(ParityOpsTests, scatterND_sub_test2) { TEST_F(ParityOpsTests, scatterND_sub_test3) { auto input = NDArrayFactory::create('c', {6, 4}); - NDArray indices('c', {2, 3, 1}, {5.f, 1.f, 2.f, 3.f,4.f, 0.f}, nd4j::DataType::INT32); + NDArray indices('c', {2, 3, 1}, {5.f, 1.f, 2.f, 3.f,4.f, 0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {6,4}, {-21.f,-22.f,-23.f,-24., -5.f, -6.f, -7.f, -8., -9.f,-10.f,-11.f,-12., -13.f,-14.f,-15.f,-16., -17.f,-18.f,-19.f,-20., -1.f, -2.f, -3.f, -4.f}); input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_sub op; + sd::ops::scatter_nd_sub op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1439,7 +1439,7 @@ TEST_F(ParityOpsTests, scatterND_sub_test3) { TEST_F(ParityOpsTests, scatterND_sub_test4) { auto input = NDArrayFactory::create('c', {6, 4, 5}); - NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32); + NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3,3,5}); auto exp = NDArrayFactory::create('c', {6,4,5}, {-1.f, -2.f, -3.f, -4.f, -5.f, 0.f, 0.f, 0.f, 0.f, 0.f,-31.f, -32.f, -33.f, -34.f, -35.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, -6.f, -7.f, -8.f, -9.f, -10.f, 0.f, 0.f, 0.f, 0.f, 0.f,-36.f, -37.f, -38.f, -39.f, -40.f, @@ -1450,7 +1450,7 @@ TEST_F(ParityOpsTests, scatterND_sub_test4) { input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_sub op; + sd::ops::scatter_nd_sub op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1466,7 +1466,7 @@ TEST_F(ParityOpsTests, scatterND_sub_test4) { TEST_F(ParityOpsTests, scatterND_sub_test5) { auto input = NDArrayFactory::create('c', 
{6,5,4,3,2}); - NDArray indices('c', {2,2,3}, {0.f,0.f,0.f, 1.f,1.f,1.f, 2.f,2.f,2.f, 3.f,3.f,3.f}, nd4j::DataType::INT32); + NDArray indices('c', {2,2,3}, {0.f,0.f,0.f, 1.f,1.f,1.f, 2.f,2.f,2.f, 3.f,3.f,3.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,2,3,2}); auto exp = NDArrayFactory::create('c', {6,5,4,3,2}, { -1.f, -2.f, -3.f, -4.f, -5.f, -6.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, @@ -1486,7 +1486,7 @@ TEST_F(ParityOpsTests, scatterND_sub_test5) { input = 0.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_sub op; + sd::ops::scatter_nd_sub op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1503,11 +1503,11 @@ TEST_F(ParityOpsTests, scatterND_sub_test5) { TEST_F(ParityOpsTests, scatterND_update_test1) { auto input = NDArrayFactory::create('c', {8}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}); - NDArray indices('c', {4, 1}, {4.f, 3.f, 1.f, 7.f}, nd4j::DataType::INT32); + NDArray indices('c', {4, 1}, {4.f, 3.f, 1.f, 7.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {4}, {9.f, 10.f, 11.f, 12.f}); auto exp = NDArrayFactory::create('c', {8}, {1.f, 11.f, 3.f, 10.f, 9.f, 6.f, 7.f, 12.f}); - nd4j::ops::scatter_nd_update op; + sd::ops::scatter_nd_update op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1523,14 +1523,14 @@ TEST_F(ParityOpsTests, scatterND_update_test1) { TEST_F(ParityOpsTests, scatterND_update_test2) { auto input = NDArrayFactory::create('c', {6, 4}); - NDArray indices('c', 
{3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32); + NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3,3}); auto exp = NDArrayFactory::create('c', {6,4}, {1.f,-1.f,7.f,-1.f, -1.f,2.f,-1.f,8.f, 9.f,-1.f,3.f,-1.f, -1.f,-1.f,-1.f,4.f, 5.f,-1.f,-1.f,-1.f, -1.f,6.f,-1.f,-1.f}); input = -1.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_update op; + sd::ops::scatter_nd_update op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1547,14 +1547,14 @@ TEST_F(ParityOpsTests, scatterND_update_test2) { TEST_F(ParityOpsTests, scatterND_update_test3) { auto input = NDArrayFactory::create('c', {6, 4}); - NDArray indices('c', {2, 3, 1}, {5.f, 1.f, 2.f, 3.f, 4.f, 0.f}, nd4j::DataType::INT32); + NDArray indices('c', {2, 3, 1}, {5.f, 1.f, 2.f, 3.f, 4.f, 0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,3,4}); auto exp = NDArrayFactory::create('c', {6,4}, {21.f, 22.f, 23.f, 24.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f,13.f, 14.f, 15.f, 16.f,17.f, 18.f, 19.f, 20.f, 1.f, 2.f, 3.f, 4.f,}); input = -1.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_update op; + sd::ops::scatter_nd_update op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1571,7 +1571,7 @@ TEST_F(ParityOpsTests, scatterND_update_test3) { TEST_F(ParityOpsTests, scatterND_update_test4) { auto input = NDArrayFactory::create('c', {6, 4, 5}); - NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32); + NDArray indices('c', {3, 3, 2}, {0.f,0.f, 1.f,1.f, 2.f,2.f, 3.f,3.f, 4.f,0.f, 5.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3,3,5}); 
auto exp = NDArrayFactory::create('c', {6,4,5}, {1.f, 2.f, 3.f, 4.f, 5.f, -1.f, -1.f, -1.f, -1.f, -1.f,31.f, 32.f, 33.f, 34.f, 35.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, 6.f, 7.f, 8.f, 9.f, 10.f, -1.f, -1.f, -1.f, -1.f, -1.f,36.f, 37.f, 38.f, 39.f, 40.f, @@ -1582,7 +1582,7 @@ TEST_F(ParityOpsTests, scatterND_update_test4) { input = -1.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_update op; + sd::ops::scatter_nd_update op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1598,7 +1598,7 @@ TEST_F(ParityOpsTests, scatterND_update_test4) { TEST_F(ParityOpsTests, scatterND_update_test5) { auto input = NDArrayFactory::create('c', {6,5,4,3,2}); - NDArray indices('c', {2,2,3}, {0.f,0.f,0.f, 1.f,1.f,1.f, 2.f,2.f,2.f, 3.f,3.f,3.f}, nd4j::DataType::INT32); + NDArray indices('c', {2,2,3}, {0.f,0.f,0.f, 1.f,1.f,1.f, 2.f,2.f,2.f, 3.f,3.f,3.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {2,2,3,2}); auto exp = NDArrayFactory::create('c', {6,5,4,3,2}, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, @@ -1618,7 +1618,7 @@ TEST_F(ParityOpsTests, scatterND_update_test5) { input = -1.f; updates.linspace(1.f); - nd4j::ops::scatter_nd_update op; + sd::ops::scatter_nd_update op; auto result = op.evaluate({&input, &indices, &updates}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -1634,11 +1634,11 @@ TEST_F(ParityOpsTests, scatterND_update_test5) { 
TEST_F(ParityOpsTests, scatterND_update_test6) { auto input = NDArrayFactory::create('c', {6, 4}); - NDArray indices('c', {3, 3, 2}, {0.f,0.f, 10.f,1.f, 20.f,2.f, 30.f,3.f, 40.f,0.f, 50.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32); + NDArray indices('c', {3, 3, 2}, {0.f,0.f, 10.f,1.f, 20.f,2.f, 30.f,3.f, 40.f,0.f, 50.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, sd::DataType::INT32); auto updates = NDArrayFactory::create('c', {3,3}); auto output = NDArrayFactory::create('c', {6,4}); - nd4j::ops::scatter_nd_update op; + sd::ops::scatter_nd_update op; ASSERT_ANY_THROW(op.execute({&input, &indices, &updates}, {&output}, {}, {}, {true, true})); } @@ -1646,12 +1646,12 @@ TEST_F(ParityOpsTests, scatterND_update_test6) { ////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatter_update_1) { - NDArray x('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray updates('c', {2,2}, {10,20,30,40}, nd4j::DataType::INT32); + NDArray x('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray updates('c', {2,2}, {10,20,30,40}, sd::DataType::INT32); - NDArray exp('c', {2,2}, {30,40,10,20}, nd4j::DataType::INT32); + NDArray exp('c', {2,2}, {30,40,10,20}, sd::DataType::INT32); - nd4j::ops::scatter_update op; + sd::ops::scatter_update op; auto results = op.evaluate({&x, &updates}, {}, {6, 1,1, 2,1,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1666,12 +1666,12 @@ TEST_F(ParityOpsTests, scatter_update_1) { ////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatter_update_2) { - NDArray x('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); - NDArray updates('c', {2,2}, {10,20,30,40}, nd4j::DataType::INT32); + NDArray x('c', {2,2}, {1,2,3,4}, sd::DataType::INT32); + NDArray updates('c', {2,2}, {10,20,30,40}, sd::DataType::INT32); - NDArray exp('c', {2,2}, {20,10,40,30}, nd4j::DataType::INT32); + NDArray exp('c', {2,2}, {20,10,40,30}, sd::DataType::INT32); - nd4j::ops::scatter_update op; + 
sd::ops::scatter_update op; auto results = op.evaluate({&x, &updates}, {}, {6, 1,0, 2,1,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1685,12 +1685,12 @@ TEST_F(ParityOpsTests, scatter_update_2) { ////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatter_update_3) { - NDArray x('c', {2,2,2}, {1,2,3,4,5,6,7,8}, nd4j::DataType::INT32); - NDArray updates('c', {2,2,2}, {10,20,30,40,50,60,70,80}, nd4j::DataType::INT32); + NDArray x('c', {2,2,2}, {1,2,3,4,5,6,7,8}, sd::DataType::INT32); + NDArray updates('c', {2,2,2}, {10,20,30,40,50,60,70,80}, sd::DataType::INT32); - NDArray exp('c', {2,2,2}, {50,60,70,80,10,20,30,40}, nd4j::DataType::INT32); + NDArray exp('c', {2,2,2}, {50,60,70,80,10,20,30,40}, sd::DataType::INT32); - nd4j::ops::scatter_update op; + sd::ops::scatter_update op; auto results = op.evaluate({&x, &updates}, {}, {6, 2,1,2, 2,1,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); @@ -1704,12 +1704,12 @@ TEST_F(ParityOpsTests, scatter_update_3) { ////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatter_update_4) { - NDArray x('c', {2,2,2}, {1,2,3,4,5,6,7,8}, nd4j::DataType::INT32); - NDArray updates('c', {2,2,2}, {10,20,30,40,50,60,70,80}, nd4j::DataType::INT32); + NDArray x('c', {2,2,2}, {1,2,3,4,5,6,7,8}, sd::DataType::INT32); + NDArray updates('c', {2,2,2}, {10,20,30,40,50,60,70,80}, sd::DataType::INT32); - NDArray exp('c', {2,2,2}, {20,2,3,10,60,6,7,50}, nd4j::DataType::INT32); + NDArray exp('c', {2,2,2}, {20,2,3,10,60,6,7,50}, sd::DataType::INT32); - nd4j::ops::scatter_update op; + sd::ops::scatter_update op; auto results = op.evaluate({&x, &updates}, {}, {6, 1,0, 2,3,0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); diff --git a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp index 68c68aafb..52c4bd33e 100644 --- a/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/PerformanceTests.cpp @@ -19,20 +19,20 @@ // #include "testlayers.h" -#include +#include #include -#include +#include #include #include -#include +#include #include #include #include -#include -#include +#include +#include #include -#include -#include +#include +#include #include #include @@ -45,8 +45,8 @@ #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class PerformanceTests : public testing::Test { public: @@ -75,7 +75,7 @@ TEST_F(PerformanceTests, test_maxpooling2d_1) { ctx.setOutputArray(0, &z); ctx.setIArguments(iArgs, 9); - nd4j::ops::maxpool2d op; + sd::ops::maxpool2d op; for (int i = 0; i < numIterations; i++) { auto timeStart = std::chrono::system_clock::now(); diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp index 24cbe1d5c..779717d5f 100644 --- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp @@ -20,20 +20,20 @@ // #include "testlayers.h" -#include +#include #include -#include +#include #include #include -#include +#include #include #include #include -#include -#include +#include +#include #include -#include -#include +#include +#include #include #include @@ -47,8 +47,8 @@ #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class PlaygroundTests : public testing::Test { public: @@ -72,7 +72,7 @@ TEST_F(PlaygroundTests, test_biasAdd_1) { std::vector values; - nd4j::ops::biasadd op; + sd::ops::biasadd op; for (int e = 0; e < 100; e++) { auto timeStart = std::chrono::system_clock::now(); @@ -93,7 +93,7 @@ TEST_F(PlaygroundTests, test_biasAdd_1) { TEST_F(PlaygroundTests, test_bert_1) { // this test will run ONLY if this model exists - if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0) + if 
(sd::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb") < 0) return; auto graph = GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_minimal_model.fb"); @@ -121,12 +121,12 @@ TEST_F(PlaygroundTests, test_bert_1) { ASSERT_EQ(z, *array); /* - nd4j::Environment::getInstance()->setProfiling(true); + sd::Environment::getInstance()->setProfiling(true); auto profile = GraphProfilingHelper::profile(graph, 1); profile->printOut(); - nd4j::Environment::getInstance()->setProfiling(false); + sd::Environment::getInstance()->setProfiling(false); delete profile; */ /* @@ -151,7 +151,7 @@ TEST_F(PlaygroundTests, test_bert_1) { TEST_F(PlaygroundTests, test_bert_2) { // this test will run ONLY if this model exists - if (nd4j::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb") < 0) + if (sd::graph::getFileSize("/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb") < 0) return; auto graph = GraphExecutioner::importFromFlatBuffers("/home/raver119/Downloads/Bert_minimal_model/bert_like_ops.fb"); @@ -171,12 +171,12 @@ TEST_F(PlaygroundTests, test_bert_2) { ASSERT_EQ(z, *array); */ - nd4j::Environment::getInstance()->setProfiling(true); + sd::Environment::getInstance()->setProfiling(true); auto profile = GraphProfilingHelper::profile(graph, 1); profile->printOut(); - nd4j::Environment::getInstance()->setProfiling(false); + sd::Environment::getInstance()->setProfiling(false); delete profile; /* @@ -204,7 +204,7 @@ TEST_F(PlaygroundTests, test_one_off_ops_1) { auto y = NDArrayFactory::create('c', {4, 128, 1}); auto z = x.ulike(); - nd4j::ops::squaredsubtract op; + sd::ops::squaredsubtract op; op.execute({&x, &y}, {&z}); } @@ -229,7 +229,7 @@ TEST_F(PlaygroundTests, test_broadcast_1) { std::vector values; Context ctx(1); - nd4j::ops::biasadd op; + sd::ops::biasadd op; for (int e = 0; e < 1000; e++) { auto x = aX[e < pool ? 
e : e % pool]; @@ -239,7 +239,7 @@ TEST_F(PlaygroundTests, test_broadcast_1) { auto timeStart = std::chrono::system_clock::now(); //op.execute({x, y}, {z}); - nd4j::ops::helpers::addBias(ctx, *x, *y, *z, false); + sd::ops::helpers::addBias(ctx, *x, *y, *z, false); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); @@ -313,7 +313,7 @@ TEST_F(PlaygroundTests, test_s_0) { for (auto shape: shapes) { for (auto t: threads) { - nd4j::Environment::getInstance()->setMaxMasterThreads(t); + sd::Environment::getInstance()->setMaxMasterThreads(t); auto x = NDArrayFactory::create('c', shape); auto y = NDArrayFactory::create('c', {shape[3]}); @@ -325,14 +325,14 @@ TEST_F(PlaygroundTests, test_s_0) { ctx.setInputArray(1, &y); ctx.setOutputArray(0, &z); - nd4j::ops::biasadd op; + sd::ops::biasadd op; for (int e = 0; e < 10000; e++) { auto timeStart = std::chrono::system_clock::now(); op.execute(&ctx); - nd4j::ops::helpers::addBias(ctx, x, y, z, false); + sd::ops::helpers::addBias(ctx, x, y, z, false); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - timeStart).count(); @@ -352,7 +352,7 @@ TEST_F(PlaygroundTests, test_s_1) { for (auto shape: shapes) { for (auto t: threads) { - nd4j::Environment::getInstance()->setMaxMasterThreads(t); + sd::Environment::getInstance()->setMaxMasterThreads(t); auto x = NDArrayFactory::create('c', shape); auto y = NDArrayFactory::create('c', {shape[1]}); @@ -364,14 +364,14 @@ TEST_F(PlaygroundTests, test_s_1) { ctx.setInputArray(1, &y); ctx.setOutputArray(0, &z); - nd4j::ops::biasadd op; + sd::ops::biasadd op; for (int e = 0; e < 10000; e++) { auto timeStart = std::chrono::system_clock::now(); //op.execute({&x, &y}, {&z}, {true}); - nd4j::ops::helpers::addBias(ctx, x, y, z, true); + sd::ops::helpers::addBias(ctx, x, y, z, true); auto timeEnd = std::chrono::system_clock::now(); auto outerTime = std::chrono::duration_cast(timeEnd - 
timeStart).count(); @@ -398,7 +398,7 @@ TEST_F(PlaygroundTests, test_s_0) { ctx.setInputArray(1, &y); ctx.setOutputArray(0, &z); - nd4j::ops::biasadd op; + sd::ops::biasadd op; for (int e = 0; e < 10000; e++) { @@ -442,7 +442,7 @@ TEST_F(PlaygroundTests, test_s_1) { std::vector values; - nd4j::ops::concat op; + sd::ops::concat op; op.execute(&ctx); for (int e = 0; e < 1000; e++) { @@ -686,12 +686,12 @@ TEST_F(PlaygroundTests, my) { int bS=8, iD=32,iH=32,iW=32, iC=128, kD=2,kH=2,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=2,dH=2,dW=2; int oD,oH,oW; - nd4j::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0); + sd::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0); printf("!!%i, %i, %i\n", oD,oH,oW); - NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, nd4j::DataType::DOUBLE); - NDArray vol('c', {bS, iC, oD, oH, oW}, nd4j::DataType::DOUBLE); + NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, sd::DataType::DOUBLE); + NDArray vol('c', {bS, iC, oD, oH, oW}, sd::DataType::DOUBLE); col = 3.77; vol = -10.33; @@ -700,7 +700,7 @@ TEST_F(PlaygroundTests, my) { auto block = new Context(1, variableSpace, false); // not-in-place auto timeStart = std::chrono::system_clock::now(); - nd4j::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW); + sd::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW); auto timeEnd = std::chrono::system_clock::now(); auto time = std::chrono::duration_cast (timeEnd - timeStart).count(); @@ -715,15 +715,15 @@ TEST_F(PlaygroundTests, my) { int bS=32, iD=32,iH=64,iW=64, iC=128, kD=2,kH=2,kW=2, sD=1,sH=1,sW=1, pD=0,pH=0,pW=0, dD=2,dH=2,dW=2; int oD,oH,oW; - // nd4j::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0); - nd4j::ops::ConvolutionUtils::calcOutSizeDeconv2D(oH, oW, kH, kW, sH, sW, pH, 
pW,dH, dW, iH, iW, 0); + // sd::ops::ConvolutionUtils::calcOutSizeDeconv3D(oD, oH, oW, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, iD, iH, iW, 0); + sd::ops::ConvolutionUtils::calcOutSizeDeconv2D(oH, oW, kH, kW, sH, sW, pH, pW,dH, dW, iH, iW, 0); printf("!!%i, %i, %i\n", oD,oH,oW); - // NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, nd4j::DataType::DOUBLE); - // NDArray vol('c', {bS, iC, oD, oH, oW}, nd4j::DataType::DOUBLE); - NDArray col('c', {bS, iC, kH, kW, iH, iW}, nd4j::DataType::DOUBLE); - NDArray im('c', {bS, iC, oH, oW}, nd4j::DataType::DOUBLE); + // NDArray col('c', {bS, iC, kD, kH, kW, iD, iH, iW}, sd::DataType::DOUBLE); + // NDArray vol('c', {bS, iC, oD, oH, oW}, sd::DataType::DOUBLE); + NDArray col('c', {bS, iC, kH, kW, iH, iW}, sd::DataType::DOUBLE); + NDArray im('c', {bS, iC, oH, oW}, sd::DataType::DOUBLE); col = 3.77; // vol = -10.33; @@ -733,8 +733,8 @@ TEST_F(PlaygroundTests, my) { auto block = new Context(1, variableSpace, false); // not-in-place auto timeStart = std::chrono::system_clock::now(); - // nd4j::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW); - nd4j::ops::helpers::col2im(*col.getContext(), col, im, sH, sW, pH, pW, iH, iW, dH, dW); + // sd::ops::ConvolutionUtils::col2vol(*block, col, vol, sD, sH, sW, pD, pH, pW, dD, dH, dW); + sd::ops::helpers::col2im(*col.getContext(), col, im, sH, sW, pH, pW, iH, iW, dH, dW); auto timeEnd = std::chrono::system_clock::now(); auto time = std::chrono::duration_cast (timeEnd - timeStart).count(); @@ -753,19 +753,19 @@ TEST_F(PlaygroundTests, my) { int paddingMode = 1; // 1-SAME, 0-VALID; int dataFormat = 1; // 1-NHWC, 0-NCHW - // NDArray input('c', {bS, iC, iH, iW}, nd4j::DataType::FLOAT32); - // NDArray output('c', {bS, oC, oH, oW}, nd4j::DataType::FLOAT32); - NDArray input('c', {bS, iH, iW, iC}, nd4j::DataType::FLOAT32); - NDArray output('c', {bS, oH, oW, oC}, nd4j::DataType::FLOAT32); - // NDArray weights('c', {kH, kW, iC, oC}, nd4j::DataType::FLOAT32); // 
permute [kH, kW, iC, oC] -> [oC, iC, kH, kW] - NDArray weights('c', {oC, iC, kH, kW}, nd4j::DataType::FLOAT32); - NDArray bias('c', {oC}, nd4j::DataType::FLOAT32); + // NDArray input('c', {bS, iC, iH, iW}, sd::DataType::FLOAT32); + // NDArray output('c', {bS, oC, oH, oW}, sd::DataType::FLOAT32); + NDArray input('c', {bS, iH, iW, iC}, sd::DataType::FLOAT32); + NDArray output('c', {bS, oH, oW, oC}, sd::DataType::FLOAT32); + // NDArray weights('c', {kH, kW, iC, oC}, sd::DataType::FLOAT32); // permute [kH, kW, iC, oC] -> [oC, iC, kH, kW] + NDArray weights('c', {oC, iC, kH, kW}, sd::DataType::FLOAT32); + NDArray bias('c', {oC}, sd::DataType::FLOAT32); input = 5.; weights = 3.; bias = 1.; - nd4j::ops::conv2d op; + sd::ops::conv2d op; auto err = op.execute({&input, &weights, &bias}, {&output}, {kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, dataFormat}); auto timeStart = std::chrono::system_clock::now(); diff --git a/libnd4j/tests_cpu/layers_tests/ProtoBufTests.cpp b/libnd4j/tests_cpu/layers_tests/ProtoBufTests.cpp index bc9ed5881..fe2f97bb6 100644 --- a/libnd4j/tests_cpu/layers_tests/ProtoBufTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ProtoBufTests.cpp @@ -20,11 +20,11 @@ #include "testlayers.h" -#include +#include /* -using namespace nd4j::graph; +using namespace sd::graph; class ProtoBufTests : public testing::Test { diff --git a/libnd4j/tests_cpu/layers_tests/QuantizationTests.cpp b/libnd4j/tests_cpu/layers_tests/QuantizationTests.cpp index 608ee443f..97f6cd8cd 100644 --- a/libnd4j/tests_cpu/layers_tests/QuantizationTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/QuantizationTests.cpp @@ -20,11 +20,11 @@ #include "testlayers.h" -#include -#include +#include +#include -using namespace nd4j; +using namespace sd; class QuantizationTests : public testing::Test { diff --git a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp index b40b74939..5ba874ae7 100644 --- a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp @@ -21,12 +21,12 @@ #include "testlayers.h" #include -#include +#include #include #include #include -using namespace nd4j; +using namespace sd; class RNGTests : public testing::Test { private: @@ -35,10 +35,10 @@ private: public: long _seed = 119L; - //nd4j::random::RandomBuffer *_rngA; - //nd4j::random::RandomBuffer *_rngB; - nd4j::graph::RandomGenerator _rngA; - nd4j::graph::RandomGenerator _rngB; + //sd::random::RandomBuffer *_rngA; + //sd::random::RandomBuffer *_rngB; + sd::graph::RandomGenerator _rngA; + sd::graph::RandomGenerator _rngB; NDArray* nexp0 = NDArrayFactory::create_('c', {10, 10}); NDArray* nexp1 = NDArrayFactory::create_('c', {10, 10}); @@ -47,8 +47,8 @@ public: RNGTests() { //_bufferA = new Nd4jLong[100000]; //_bufferB = new Nd4jLong[100000]; - //_rngA = (nd4j::random::RandomBuffer *) initRandom(nullptr, _seed, 100000, (Nd4jPointer) _bufferA); - //_rngB = (nd4j::random::RandomBuffer *) initRandom(nullptr, _seed, 100000, (Nd4jPointer) _bufferB); + //_rngA = (sd::random::RandomBuffer *) initRandom(nullptr, _seed, 100000, (Nd4jPointer) _bufferA); + //_rngB = (sd::random::RandomBuffer *) initRandom(nullptr, _seed, 100000, (Nd4jPointer) _bufferB); _rngA.setStates(_seed, _seed); _rngB.setStates(_seed, _seed); nexp0->assign(-1.0f); @@ -256,7 +256,7 @@ TEST_F(RNGTests, Test_Gaussian_21) { ASSERT_FALSE(x0.equalsTo(nexp0)); ASSERT_FALSE(x0.equalsTo(nexp1)); ASSERT_FALSE(x0.equalsTo(nexp2)); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x0}, {}, {}); //x0.printIndexedBuffer("X0 Normal"); //x1.printIndexedBuffer("X1 Normal"); @@ -267,7 +267,7 @@ TEST_F(RNGTests, Test_Gaussian_21) { // mean->printIndexedBuffer("Mean"); // variance->printIndexedBuffer("Variance"); - ASSERT_NEAR(nd4j::math::nd4j_abs(mean->e(0)), 0.f, 0.2f); + ASSERT_NEAR(sd::math::nd4j_abs(mean->e(0)), 0.f, 0.2f); ASSERT_NEAR(variance->e(0), 1.0f, 0.2f); delete result; @@ -278,7 +278,7 @@ TEST_F(RNGTests, 
Test_Gaussian_22) { auto x0 = NDArrayFactory::create('c', {1000, 800}); auto x1 = NDArrayFactory::create('c', {1000, 800}); - RandomLauncher::fillGaussian(nd4j::LaunchContext::defaultContext(), _rngA, &x0, 0.0f, 1.0f); + RandomLauncher::fillGaussian(sd::LaunchContext::defaultContext(), _rngA, &x0, 0.0f, 1.0f); RandomLauncher::fillGaussian(LaunchContext::defaultContext(), _rngB, &x1, 0.0f, 1.0f); //x0.printIndexedBuffer("x0"); @@ -288,7 +288,7 @@ TEST_F(RNGTests, Test_Gaussian_22) { ASSERT_FALSE(x0.equalsTo(nexp0)); ASSERT_FALSE(x0.equalsTo(nexp1)); ASSERT_FALSE(x0.equalsTo(nexp2)); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x0}, {}, {}); //x0.printIndexedBuffer("X0 Normal"); //x1.printIndexedBuffer("X1 Normal"); @@ -298,7 +298,7 @@ TEST_F(RNGTests, Test_Gaussian_22) { //mean0->printIndexedBuffer("Mean"); //variance0->printIndexedBuffer("Variance"); - ASSERT_NEAR(nd4j::math::nd4j_abs(mean0->e(0)), 0.f, 1.0e-3f); + ASSERT_NEAR(sd::math::nd4j_abs(mean0->e(0)), 0.f, 1.0e-3f); ASSERT_NEAR(variance0->e(0), 1.0f, 1.e-3f); delete result; } @@ -309,7 +309,7 @@ TEST_F(RNGTests, Test_Gaussian_3) { RandomLauncher::fillGaussian(LaunchContext::defaultContext(), _rngA, &x0, 0.0, 1.0); auto mean = x0.meanNumber(); //.e(0); - auto stdev = x0.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false);//.e(0); + auto stdev = x0.varianceNumber(sd::variance::SummaryStatsStandardDeviation, false);//.e(0); auto meanExp = NDArrayFactory::create(0.); auto devExp = NDArrayFactory::create(1.); ASSERT_TRUE(meanExp.equalsTo(mean, 1.e-3)); @@ -411,13 +411,13 @@ TEST_F(RNGTests, Test_Truncated_21) { //x1.printIndexedBuffer("Distribution TN"); ASSERT_NEAR(mean.e(0), 1.f, 0.002); ASSERT_NEAR(deviation.e(0), 2.f, 0.5); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x0}, {}, {}, {}, {}, false); // result->at(0)->printBuffer("MEAN"); // result->at(1)->printBuffer("VARIANCE"); delete result; - nd4j::ops::reduce_min minOp; - 
nd4j::ops::reduce_max maxOp; + sd::ops::reduce_min minOp; + sd::ops::reduce_max maxOp; auto minRes = minOp.evaluate({&x1}, {}, {}, {}); auto maxRes = maxOp.evaluate({&x0}, {}, {}, {}); // minRes->at(0)->printBuffer("MIN for Truncated"); @@ -458,13 +458,13 @@ TEST_F(RNGTests, Test_Truncated_22) { //x1.printIndexedBuffer("Distribution TN"); ASSERT_NEAR(mean.e(0), 2.f, 0.01); ASSERT_NEAR(deviation.e(0), 4.f, 0.52); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x0}, {}, {}, {}, {}, false); // result->at(0)->printBuffer("MEAN"); // result->at(1)->printBuffer("VARIANCE"); delete result; - nd4j::ops::reduce_min minOp; - nd4j::ops::reduce_max maxOp; + sd::ops::reduce_min minOp; + sd::ops::reduce_max maxOp; auto minRes = minOp.evaluate({&x1}, {}, {}, {}); auto maxRes = maxOp.evaluate({&x0}, {}, {}, {}); // minRes->at(0)->printBuffer("MIN for Truncated2"); @@ -505,13 +505,13 @@ TEST_F(RNGTests, Test_Truncated_23) { //x1.printIndexedBuffer("Distribution TN"); ASSERT_NEAR(mean.e(0), 0.f, 0.01); ASSERT_NEAR(deviation.e(0), 1.f, 0.5); - nd4j::ops::moments op; + sd::ops::moments op; auto result = op.evaluate({&x0}); // result->at(0)->printBuffer("MEAN"); // result->at(1)->printBuffer("VARIANCE"); delete result; - nd4j::ops::reduce_min minOp; - nd4j::ops::reduce_max maxOp; + sd::ops::reduce_min minOp; + sd::ops::reduce_max maxOp; auto minRes = minOp.evaluate({&x1}, {}, {}, {}); auto maxRes = maxOp.evaluate({&x0}, {}, {}, {}); // minRes->at(0)->printBuffer("MIN for Truncated3"); @@ -565,7 +565,7 @@ TEST_F(RNGTests, Test_Uniform_2) { RandomLauncher::fillUniform(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); - auto op = new nd4j::ops::LegacyRandomOp(0); + auto op = new sd::ops::LegacyRandomOp(0); auto result = op->execute(_rngA, {&input}, {1.0f, 2.0f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -585,7 +585,7 @@ TEST_F(RNGTests, Test_Gaussian_2) { RandomLauncher::fillGaussian(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); - 
auto op = new nd4j::ops::LegacyRandomOp(random::GaussianDistribution); + auto op = new sd::ops::LegacyRandomOp(random::GaussianDistribution); auto result = op->execute(_rngA, {&input}, {1.0f, 2.0f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -605,7 +605,7 @@ TEST_F(RNGTests, Test_LogNorm_2) { RandomLauncher::fillLogNormal(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); - auto op = new nd4j::ops::LegacyRandomOp(random::LogNormalDistribution); + auto op = new sd::ops::LegacyRandomOp(random::LogNormalDistribution); auto result = op->execute(_rngA, {&input}, {1.0f, 2.0f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -625,7 +625,7 @@ TEST_F(RNGTests, Test_TruncatedNorm_2) { RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); - auto op = new nd4j::ops::LegacyRandomOp(random::TruncatedNormalDistribution); + auto op = new sd::ops::LegacyRandomOp(random::TruncatedNormalDistribution); auto result = op->execute(_rngA, {&input}, {1.0f, 2.0f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -645,7 +645,7 @@ TEST_F(RNGTests, Test_Binomial_2) { RandomLauncher::fillBinomial(LaunchContext::defaultContext(), _rngB, &x1, 3, 0.5f); - auto op = new nd4j::ops::LegacyRandomOp(random::BinomialDistributionEx); + auto op = new sd::ops::LegacyRandomOp(random::BinomialDistributionEx); auto result = op->execute(_rngA, {&input}, {0.5f}, {3}); ASSERT_EQ(Status::OK(), result->status()); @@ -666,7 +666,7 @@ TEST_F(RNGTests, Test_Bernoulli_2) { RandomLauncher::fillBernoulli(LaunchContext::defaultContext(), _rngB, &x1, 0.5f); - auto op = new nd4j::ops::LegacyRandomOp(random::BernoulliDistribution); + auto op = new sd::ops::LegacyRandomOp(random::BernoulliDistribution); auto result = op->execute(_rngA, {&input}, {0.5f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -685,7 +685,7 @@ TEST_F(RNGTests, Test_GaussianDistribution_1) { auto exp0 = NDArrayFactory::create('c', {10, 10}); - nd4j::ops::random_normal op; + 
sd::ops::random_normal op; auto result = op.evaluate({&x}, {0.0, 1.0f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -706,7 +706,7 @@ TEST_F(RNGTests, Test_BernoulliDistribution_1) { auto exp0 = NDArrayFactory::create('c', {10, 10}); - nd4j::ops::random_bernoulli op; + sd::ops::random_bernoulli op; auto result = op.evaluate({&x}, {0.5f}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -727,7 +727,7 @@ TEST_F(RNGTests, Test_ExponentialDistribution_1) { auto exp0 = NDArrayFactory::create('c', {10, 10}); - nd4j::ops::random_exponential op; + sd::ops::random_exponential op; auto result = op.evaluate({&x}, {0.25f}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -751,7 +751,7 @@ TEST_F(RNGTests, Test_ExponentialDistribution_2) { y.assign(1.0); - nd4j::ops::random_exponential op; + sd::ops::random_exponential op; auto result = op.evaluate({&x, &y}, {0.25f}, {0}); ASSERT_EQ(Status::OK(), result->status()); @@ -775,7 +775,7 @@ TEST_F(RNGTests, Test_PoissonDistribution_1) { la.linspace(1.0); - nd4j::ops::random_poisson op; + sd::ops::random_poisson op; auto result = op.evaluate({&x, &la}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -795,7 +795,7 @@ TEST_F(RNGTests, Test_GammaDistribution_1) { al.linspace(1.0); - nd4j::ops::random_gamma op; + sd::ops::random_gamma op; auto result = op.evaluate({&x, &al}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -816,7 +816,7 @@ TEST_F(RNGTests, Test_GammaDistribution_2) { al.linspace(1.0); be.assign(1.0); - nd4j::ops::random_gamma op; + sd::ops::random_gamma op; auto result = op.evaluate({&x, &al, &be}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -837,7 +837,7 @@ TEST_F(RNGTests, Test_GammaDistribution_3) { al.linspace(1.0); be.assign(2.0); - nd4j::ops::random_gamma op; + sd::ops::random_gamma op; auto result = op.evaluate({&x, &al, &be}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); @@ -856,7 +856,7 @@ TEST_F(RNGTests, Test_UniformDistribution_04) { auto exp0 = 
NDArrayFactory::create('c', {10}); - nd4j::ops::randomuniform op; + sd::ops::randomuniform op; auto result = op.evaluate({&x, &al, &be}, {}, {DataType::INT32}); ASSERT_EQ(Status::OK(), result->status()); @@ -867,9 +867,9 @@ TEST_F(RNGTests, Test_UniformDistribution_04) { delete result; } -namespace nd4j { +namespace sd { namespace tests { - static void fillList(Nd4jLong seed, int numberOfArrays, std::vector &shape, std::vector &list, nd4j::graph::RandomGenerator *rng) { + static void fillList(Nd4jLong seed, int numberOfArrays, std::vector &shape, std::vector &list, sd::graph::RandomGenerator *rng) { rng->setSeed((int) seed); for (int i = 0; i < numberOfArrays; i++) { @@ -877,7 +877,7 @@ namespace nd4j { auto arrayR = NDArrayFactory::create_('c', shape); auto min = NDArrayFactory::create(0.0); auto max = NDArrayFactory::create(1.0); - nd4j::ops::randomuniform op; + sd::ops::randomuniform op; op.execute(*rng, {&arrayI, &min, &max}, {arrayR}, {}, {DataType::DOUBLE}, {}, {}, false); list.emplace_back(arrayR); @@ -890,14 +890,14 @@ TEST_F(RNGTests, Test_Reproducibility_1) { Nd4jLong seed = 123; std::vector shape = {32, 3, 28, 28}; - nd4j::graph::RandomGenerator rng; + sd::graph::RandomGenerator rng; std::vector expList; - nd4j::tests::fillList(seed, 10, shape, expList, &rng); + sd::tests::fillList(seed, 10, shape, expList, &rng); for (int e = 0; e < 2; e++) { std::vector trialList; - nd4j::tests::fillList(seed, 10, shape, trialList, &rng); + sd::tests::fillList(seed, 10, shape, trialList, &rng); for (int a = 0; a < expList.size(); a++) { auto arrayE = expList[a]; @@ -922,14 +922,14 @@ TEST_F(RNGTests, Test_Reproducibility_2) { Nd4jLong seed = 123; std::vector shape = {32, 3, 64, 64}; - nd4j::graph::RandomGenerator rng; + sd::graph::RandomGenerator rng; std::vector expList; - nd4j::tests::fillList(seed, 10, shape, expList, &rng); + sd::tests::fillList(seed, 10, shape, expList, &rng); for (int e = 0; e < 2; e++) { std::vector trialList; - nd4j::tests::fillList(seed, 10, 
shape, trialList, &rng); + sd::tests::fillList(seed, 10, shape, trialList, &rng); for (int a = 0; a < expList.size(); a++) { auto arrayE = expList[a]; @@ -943,7 +943,7 @@ TEST_F(RNGTests, Test_Reproducibility_2) { double x = arrayE->e(f); double y = arrayT->e(f); - if (nd4j::math::nd4j_re(x, y) > 0.1) { + if (sd::math::nd4j_re(x, y) > 0.1) { // nd4j_printf("E[%lld] %f != T[%lld] %f\n", (long long) f, (float) x, (long long) f, (float) y); throw std::runtime_error("boom"); } @@ -986,7 +986,7 @@ TEST_F(RNGTests, test_choice_1) { auto z = NDArrayFactory::create('c', {1000}); RandomGenerator rng(119, 256); - NativeOpExecutioner::execRandom(nd4j::LaunchContext ::defaultContext(), random::Choice, &rng, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), prob->buffer(), prob->shapeInfo(), prob->specialBuffer(), prob->specialShapeInfo(), z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr); + NativeOpExecutioner::execRandom(sd::LaunchContext ::defaultContext(), random::Choice, &rng, x->buffer(), x->shapeInfo(), x->specialBuffer(), x->specialShapeInfo(), prob->buffer(), prob->shapeInfo(), prob->specialBuffer(), prob->specialShapeInfo(), z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr); // z.printIndexedBuffer("z"); @@ -999,26 +999,26 @@ TEST_F(RNGTests, test_uniform_119) { auto z = NDArrayFactory::create('c', {1, 5}); - nd4j::ops::randomuniform op; + sd::ops::randomuniform op; auto status = op.execute({&x}, {&z}, {1.0, 2.0}, {}, {}); ASSERT_EQ(Status::OK(), status); } TEST_F(RNGTests, test_multinomial_1) { - NDArray probs('f', { 3, 3 }, { 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3 }, nd4j::DataType::FLOAT32); - NDArray expected('f', { 3, 3 }, { 0., 1, 2, 2, 0, 0, 1, 2, 1 }, nd4j::DataType::INT64); - NDArray output('f', { 3, 3 }, nd4j::DataType::INT64); - NDArray samples('f', { 1 }, std::vector({3}), nd4j::DataType::INT32); + NDArray probs('f', { 3, 3 }, { 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3 }, 
sd::DataType::FLOAT32); + NDArray expected('f', { 3, 3 }, { 0., 1, 2, 2, 0, 0, 1, 2, 1 }, sd::DataType::INT64); + NDArray output('f', { 3, 3 }, sd::DataType::INT64); + NDArray samples('f', { 1 }, std::vector({3}), sd::DataType::INT32); - nd4j::ops::random_multinomial op; + sd::ops::random_multinomial op; RandomGenerator rng(1234, 1234); ASSERT_EQ(Status::OK(), op.execute(rng, { &probs, &samples }, { &output }, {}, { 0, INT64}, {}, {}, false) ); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); - NDArray probsZ('c', { 1, 3 }, { 0.3, 0.3, 0.3 }, nd4j::DataType::FLOAT32); - NDArray expectedZ('c', { 3, 3 }, { 0., 0, 0, 0, 0, 0, 0, 0, 0 }, nd4j::DataType::INT64); + NDArray probsZ('c', { 1, 3 }, { 0.3, 0.3, 0.3 }, sd::DataType::FLOAT32); + NDArray expectedZ('c', { 3, 3 }, { 0., 0, 0, 0, 0, 0, 0, 0, 0 }, sd::DataType::INT64); auto result = op.evaluate({ &probsZ, &samples }, { }, { 1, INT64 }); auto outputZ = result->at(0); @@ -1031,20 +1031,20 @@ TEST_F(RNGTests, test_multinomial_1) { TEST_F(RNGTests, test_multinomial_2) { - NDArray samples('c', { 1 }, std::vector{ 20 }, nd4j::DataType::INT32); - NDArray probs('c', { 3, 5 }, { 0.2, 0.3, 0.5, 0.3, 0.5, 0.2, 0.5, 0.2, 0.3, 0.35, 0.25, 0.3, 0.25, 0.25, 0.5 }, nd4j::DataType::FLOAT32); - NDArray expected('c', { 3, 20 }, { 0, 2, 0, 2, 0, 4, 2, 0, 1, 2, 0, 2, 3, 0, 0, 2, 4, 4, 1, 0, 2, 3, 2, 3, 0, 1, 3, 1, 1, 1, 2, 4, 3, 3, 1, 4, 4, 2, 0, 0, 3, 3, 3, 0, 0, 2, 2, 3, 3, 0, 0, 2, 3, 4, 2, 2, 3, 2, 1, 2 }, nd4j::DataType::INT64); - NDArray output('c', { 3, 20 }, nd4j::DataType::INT64); + NDArray samples('c', { 1 }, std::vector{ 20 }, sd::DataType::INT32); + NDArray probs('c', { 3, 5 }, { 0.2, 0.3, 0.5, 0.3, 0.5, 0.2, 0.5, 0.2, 0.3, 0.35, 0.25, 0.3, 0.25, 0.25, 0.5 }, sd::DataType::FLOAT32); + NDArray expected('c', { 3, 20 }, { 0, 2, 0, 2, 0, 4, 2, 0, 1, 2, 0, 2, 3, 0, 0, 2, 4, 4, 1, 0, 2, 3, 2, 3, 0, 1, 3, 1, 1, 1, 2, 4, 3, 3, 1, 4, 4, 2, 0, 0, 3, 3, 3, 0, 0, 2, 2, 3, 3, 0, 0, 2, 3, 4, 2, 2, 3, 2, 
1, 2 }, sd::DataType::INT64); + NDArray output('c', { 3, 20 }, sd::DataType::INT64); - nd4j::ops::random_multinomial op; + sd::ops::random_multinomial op; RandomGenerator rng(1234, 1234); ASSERT_EQ(Status::OK(), op.execute(rng, { &probs, &samples }, { &output }, {}, { 0, INT64 }, {}, {}, false)); ASSERT_TRUE(expected.isSameShape(output)); ASSERT_TRUE(expected.equalsTo(output)); - NDArray probs2('c', { 5, 3 }, { 0.2, 0.3, 0.5, 0.3, 0.5, 0.2, 0.5, 0.2, 0.3, 0.35, 0.25, 0.3, 0.25, 0.25, 0.5 }, nd4j::DataType::FLOAT32); - NDArray expected2('c', { 20, 3 }, { 0, 2, 3, 2, 3, 3, 0, 2, 3, 2, 3, 0, 0, 0, 0, 4, 1, 2, 2, 3, 2, 3, 1, 3, 1, 1, 3, 2, 1, 0, 0, 2, 0, 2, 4, 2, 3, 3, 3, 0, 3, 4, 0, 1, 2, 2, 0, 2, 4, 4, 0, 4, 2, 2, 1, 0, 1, 0, 0, 2 }, nd4j::DataType::INT64); - NDArray output2('c', { 20, 3 }, nd4j::DataType::INT64); + NDArray probs2('c', { 5, 3 }, { 0.2, 0.3, 0.5, 0.3, 0.5, 0.2, 0.5, 0.2, 0.3, 0.35, 0.25, 0.3, 0.25, 0.25, 0.5 }, sd::DataType::FLOAT32); + NDArray expected2('c', { 20, 3 }, { 0, 2, 3, 2, 3, 3, 0, 2, 3, 2, 3, 0, 0, 0, 0, 4, 1, 2, 2, 3, 2, 3, 1, 3, 1, 1, 3, 2, 1, 0, 0, 2, 0, 2, 4, 2, 3, 3, 3, 0, 3, 4, 0, 1, 2, 2, 0, 2, 4, 4, 0, 4, 2, 2, 1, 0, 1, 0, 0, 2 }, sd::DataType::INT64); + NDArray output2('c', { 20, 3 }, sd::DataType::INT64); rng.setStates(1234, 1234); ASSERT_EQ(Status::OK(), op.execute(rng, { &probs2, &samples }, { &output2 }, {}, { 1, INT64 }, {}, {}, false)); @@ -1054,13 +1054,13 @@ TEST_F(RNGTests, test_multinomial_2) { TEST_F(RNGTests, test_multinomial_3) { - NDArray probs('c', { 4, 3 }, { 0.3, 0.3, 0.4, 0.3, 0.4, 0.3, 0.3, 0.3, 0.4, 0.4, 0.3, 0.3 }, nd4j::DataType::FLOAT32); - NDArray expected('c', { 4, 5 }, nd4j::DataType::INT64); - NDArray output('c', { 4, 5 }, nd4j::DataType::INT64); - NDArray samples('c', { 1 }, std::vector{ 5 }, nd4j::DataType::INT32); + NDArray probs('c', { 4, 3 }, { 0.3, 0.3, 0.4, 0.3, 0.4, 0.3, 0.3, 0.3, 0.4, 0.4, 0.3, 0.3 }, sd::DataType::FLOAT32); + NDArray expected('c', { 4, 5 }, sd::DataType::INT64); + NDArray 
output('c', { 4, 5 }, sd::DataType::INT64); + NDArray samples('c', { 1 }, std::vector{ 5 }, sd::DataType::INT32); RandomGenerator rng(1234, 1234); - nd4j::ops::random_multinomial op; + sd::ops::random_multinomial op; ASSERT_EQ(Status::OK(), op.execute(rng, { &probs, &samples }, { &expected }, {}, { 0, INT64 }, {}, {}, false)); @@ -1072,13 +1072,13 @@ TEST_F(RNGTests, test_multinomial_3) { TEST_F(RNGTests, test_multinomial_4) { - NDArray probs('c', { 3, 4 }, { 0.3, 0.3, 0.4, 0.3, 0.4, 0.3, 0.3, 0.3, 0.4, 0.4, 0.3, 0.3 }, nd4j::DataType::FLOAT32); - NDArray expected('c', { 5, 4 }, nd4j::DataType::INT64); - NDArray output('c', { 5, 4 }, nd4j::DataType::INT64); - NDArray samples('c', { 1 }, std::vector{ 5 }, nd4j::DataType::INT32); + NDArray probs('c', { 3, 4 }, { 0.3, 0.3, 0.4, 0.3, 0.4, 0.3, 0.3, 0.3, 0.4, 0.4, 0.3, 0.3 }, sd::DataType::FLOAT32); + NDArray expected('c', { 5, 4 }, sd::DataType::INT64); + NDArray output('c', { 5, 4 }, sd::DataType::INT64); + NDArray samples('c', { 1 }, std::vector{ 5 }, sd::DataType::INT32); RandomGenerator rng(1234, 1234); - nd4j::ops::random_multinomial op; + sd::ops::random_multinomial op; ASSERT_EQ(Status::OK(), op.execute(rng, { &probs, &samples }, { &expected }, {}, { 1, INT64 }, {}, {}, false)); rng.setStates(1234, 1234); @@ -1093,13 +1093,13 @@ TEST_F(RNGTests, test_multinomial_5) { int ClassValue = 2; int Samples = 100000; - NDArray samples('c', { 1 }, std::vector{ 1.*Samples }, nd4j::DataType::INT32); + NDArray samples('c', { 1 }, std::vector{ 1.*Samples }, sd::DataType::INT32); - NDArray probs('c', { ClassValue, batchValue }, { 1.0, 1.0 }, nd4j::DataType::FLOAT32); + NDArray probs('c', { ClassValue, batchValue }, { 1.0, 1.0 }, sd::DataType::FLOAT32); - nd4j::ops::random_multinomial op; + sd::ops::random_multinomial op; - NDArray output('c', { Samples, batchValue }, nd4j::DataType::INT64); + NDArray output('c', { Samples, batchValue }, sd::DataType::INT64); RandomGenerator rng(1234, 1234); ASSERT_EQ(Status::OK(), 
op.execute(rng, { &probs, &samples }, { &output }, {}, { 1 }, {}, {}, false)); @@ -1141,19 +1141,19 @@ TEST_F(RNGTests, test_multinomial_6) { int ClassValue = 5; int Samples = 100000; - NDArray samples('c', { 1 }, std::vector{ 1. * Samples }, nd4j::DataType::INT32); + NDArray samples('c', { 1 }, std::vector{ 1. * Samples }, sd::DataType::INT32); - nd4j::ops::random_multinomial op; - NDArray probExpect('c', { ClassValue }, { 0.058, 0.096, 0.1576, 0.2598, 0.4287 }, nd4j::DataType::DOUBLE); + sd::ops::random_multinomial op; + NDArray probExpect('c', { ClassValue }, { 0.058, 0.096, 0.1576, 0.2598, 0.4287 }, sd::DataType::DOUBLE); // without seed - NDArray probsR('c', { batchValue, ClassValue }, { 1., 1.5, 2., 2.5, 3. }, nd4j::DataType::FLOAT32); + NDArray probsR('c', { batchValue, ClassValue }, { 1., 1.5, 2., 2.5, 3. }, sd::DataType::FLOAT32); auto resultR = op.evaluate({ &probsR, &samples }, { }, { 0 }); auto outputR = resultR->at(0); ASSERT_EQ(Status::OK(), resultR->status()); - NDArray countsR('c', { ClassValue }, { 0., 0, 0, 0, 0 }, nd4j::DataType::DOUBLE); + NDArray countsR('c', { ClassValue }, { 0., 0, 0, 0, 0 }, sd::DataType::DOUBLE); for (int i = 0; i < outputR->lengthOf(); i++) { auto value = outputR->e(i); @@ -1178,12 +1178,12 @@ TEST_F(RNGTests, test_multinomial_6) { delete resultR; RandomGenerator rng(1234, 1234); - NDArray probs('c', { batchValue, ClassValue }, { 1., 1.5, 2., 2.5, 3. }, nd4j::DataType::FLOAT32); - NDArray output('c', { batchValue, Samples }, nd4j::DataType::INT64); + NDArray probs('c', { batchValue, ClassValue }, { 1., 1.5, 2., 2.5, 3. 
}, sd::DataType::FLOAT32); + NDArray output('c', { batchValue, Samples }, sd::DataType::INT64); ASSERT_EQ(Status::OK(), op.execute(rng, { &probs, &samples }, { &output }, {}, { 0, INT64 }, {}, {}, false)); - NDArray counts('c', { ClassValue }, { 0., 0, 0, 0, 0 }, nd4j::DataType::DOUBLE); + NDArray counts('c', { ClassValue }, { 0., 0, 0, 0, 0 }, sd::DataType::DOUBLE); for (int i = 0; i < output.lengthOf(); i++) { auto value = output.e(i); diff --git a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp index b91730954..adbe28a41 100644 --- a/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ReduceTests.cpp @@ -21,7 +21,7 @@ #include "testinclude.h" #include #include -#include +#include class ReduceTest : public testing::Test { public: @@ -65,8 +65,8 @@ public: #ifndef __CUDABLAS__ TEST_F(EuclideanDistanceTest,Test1) { //int *tadShapeBuffer = shape::computeResultShape(shapeBuffer,dimension,dimensionLength); - nd4j::ArrayOptions::setDataType(shapeBuffer, nd4j::DataType::FLOAT32); - auto tadShapeBuffer = nd4j::ShapeUtils::evalReduceShapeInfo('c', dim, shapeBuffer, false, true, nullptr); + sd::ArrayOptions::setDataType(shapeBuffer, sd::DataType::FLOAT32); + auto tadShapeBuffer = sd::ShapeUtils::evalReduceShapeInfo('c', dim, shapeBuffer, false, true, nullptr); //shape::printShapeInfoLinear("tadShape", tadShapeBuffer); functions::reduce3::Reduce3::exec(opNum, x, @@ -84,9 +84,9 @@ TEST_F(EuclideanDistanceTest,Test1) { TEST_F(StdTest,MultiDimTest) { - auto xShapeInfo = shape::shapeBuffer(4, nd4j::DataType::FLOAT32, examplesShape); + auto xShapeInfo = shape::shapeBuffer(4, sd::DataType::FLOAT32, examplesShape); //int *resultShapeInfo = shape::computeResultShape(xShapeInfo,dimensionsForStd,dimensionLength); - auto resultShapeInfo = nd4j::ShapeUtils::evalReduceShapeInfo('c', dimsForStd, xShapeInfo, false, true, nullptr); + auto resultShapeInfo = sd::ShapeUtils::evalReduceShapeInfo('c', dimsForStd, 
xShapeInfo, false, true, nullptr); int resultLengthAssertion = 5; ASSERT_EQ(resultLengthAssertion,shape::length(resultShapeInfo)); shape::TAD *tad = new shape::TAD; @@ -123,9 +123,9 @@ TEST_F(StdTest,MultiDimTest) { TEST_F(ReduceTest,MatrixTest) { int opNum = 4; - auto xShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto xShapeInfo = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, shape); //int *resultShapeInfo = shape::computeResultShape(xShapeInfo,dimension,dimensionLength); - auto resultShapeInfo = nd4j::ShapeUtils::evalReduceShapeInfo('c', dim, xShapeInfo, false, true, nullptr); + auto resultShapeInfo = sd::ShapeUtils::evalReduceShapeInfo('c', dim, xShapeInfo, false, true, nullptr); int resultLengthAssertion = 3; ASSERT_EQ(resultLengthAssertion,shape::length(resultShapeInfo)); shape::TAD *tad = new shape::TAD; diff --git a/libnd4j/tests_cpu/layers_tests/ResultSetTests.cpp b/libnd4j/tests_cpu/layers_tests/ResultSetTests.cpp index 404b95013..4ca8a3806 100644 --- a/libnd4j/tests_cpu/layers_tests/ResultSetTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ResultSetTests.cpp @@ -19,12 +19,12 @@ // #include "testlayers.h" -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ResultSetTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/SanityTests.cpp b/libnd4j/tests_cpu/layers_tests/SanityTests.cpp index dbfcca381..7ca6732fe 100644 --- a/libnd4j/tests_cpu/layers_tests/SanityTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/SanityTests.cpp @@ -19,12 +19,12 @@ // #include "testlayers.h" -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class SanityTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/ScalarTests.cpp 
b/libnd4j/tests_cpu/layers_tests/ScalarTests.cpp index 881a33c2e..b15e3d484 100644 --- a/libnd4j/tests_cpu/layers_tests/ScalarTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ScalarTests.cpp @@ -20,12 +20,12 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ScalarTests : public testing::Test { public: @@ -93,7 +93,7 @@ TEST_F(ScalarTests, Test_Concat_1) { auto v = NDArrayFactory::create(3.0f); auto exp = NDArrayFactory::create('c', {3}, {1, 2, 3}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&t, &u, &v}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -113,7 +113,7 @@ TEST_F(ScalarTests, Test_Concat_2) { auto v = NDArrayFactory::create(5.0f); auto exp = NDArrayFactory::create('c', {5}, {1, 2, 3, 4, 5}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&t, &u, &v}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -134,7 +134,7 @@ TEST_F(ScalarTests, Test_Concat_3) { auto v = NDArrayFactory::create(5.0f); auto exp = NDArrayFactory::create('c', {5}, {1, 2, 3, 4, 5}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&t, &u, &v}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -153,7 +153,7 @@ TEST_F(ScalarTests, Test_ExpandDims_1) { auto x = NDArrayFactory::create(2.0f); auto exp = NDArrayFactory::create('c', {1}, {2.0f}); - nd4j::ops::expand_dims op; + sd::ops::expand_dims op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -170,7 +170,7 @@ TEST_F(ScalarTests, Test_Squeeze_1) { auto x = NDArrayFactory::create(2.0f); auto exp = NDArrayFactory::create(2.0f); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -187,7 +187,7 @@ TEST_F(ScalarTests, Test_Reshape_1) { auto x = NDArrayFactory::create(2.0f); 
auto exp = NDArrayFactory::create('c', {1, 1, 1}, {2.0f}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {-99, 1, 1, 1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -204,7 +204,7 @@ TEST_F(ScalarTests, Test_Permute_1) { auto x = NDArrayFactory::create(3.0f); auto exp = NDArrayFactory::create(3.0f); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -223,7 +223,7 @@ TEST_F(ScalarTests, Test_Stack_1) { auto v = NDArrayFactory::create(3.0f); auto exp = NDArrayFactory::create('c', {3}, {1, 2, 3}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&t, &u, &v}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -242,7 +242,7 @@ TEST_F(ScalarTests, Test_Stack_2) { auto w = NDArrayFactory::create('c', {1, 1}, {4.0f}); auto exp = NDArrayFactory::create('c', {4, 1, 1}, {1, 2, 3, 4}); - nd4j::ops::stack op; + sd::ops::stack op; auto result = op.evaluate({&t, &u, &v, &w}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -264,7 +264,7 @@ TEST_F(ScalarTests, Test_Concat_Scalar_1) { auto w = NDArrayFactory::create('c', {1, 1}, {4.0f}); auto exp = NDArrayFactory::create('c', {4, 1}, {1, 2, 3, 4}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&t, &u, &v, &w}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -284,7 +284,7 @@ TEST_F(ScalarTests, Test_Concat_Scalar_2) { auto w = NDArrayFactory::create('c', {1, 1}, {4.0f}); auto exp = NDArrayFactory::create('c', {1, 4}, {1, 2, 3, 4}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&t, &u, &v, &w}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/ScopeTests.cpp b/libnd4j/tests_cpu/layers_tests/ScopeTests.cpp index ad30f1c42..6c83e869e 100644 --- a/libnd4j/tests_cpu/layers_tests/ScopeTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ScopeTests.cpp @@ -19,12 +19,12 
@@ // #include "testlayers.h" -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ScopeTests : public testing::Test { public: @@ -40,7 +40,7 @@ TEST_F(ScopeTests, BasicTests_1) { auto variableSpace = graph.getVariableSpace(); variableSpace->putVariable(-1, x); - nd4j::ops::Scope opScope; + sd::ops::Scope opScope; auto scopeBody = new Node(OpType_LOGIC, 10, 1); scopeBody->setName("scopeBody"); @@ -86,7 +86,7 @@ TEST_F(ScopeTests, RealTests_1) { // auto scopeCondition = new Node(OpType_LOGIC, logic::Scope, 3); scopeCondition->setName("scopeCondition"); - nd4j::ops::Scope opScope; + sd::ops::Scope opScope; scopeCondition->setCustomOp(&opScope); // this is scope of the body, it'll be executed multiple times @@ -102,7 +102,7 @@ TEST_F(ScopeTests, RealTests_1) { scopedA0->setScopeInfo(3, "scopeCondition"); // this op compares LT A0 result with variable `scalar` which is 10; - nd4j::ops::lt_scalar op; + sd::ops::lt_scalar op; auto scopedA1 = new Node(&op, 5, {4, -3}); scopedA1->setScopeInfo(3, "scopeCondition"); @@ -115,13 +115,13 @@ TEST_F(ScopeTests, RealTests_1) { scopedB0->setScopeInfo(10, "scopeBody"); auto nodeReturn = new Node(OpType_LOGIC, logic::Return, 7, {6}, {12}); - nd4j::ops::Return opReturn; + sd::ops::Return opReturn; nodeReturn->setCustomOp(&opReturn); nodeReturn->setScopeInfo(10, "scopeBody"); // WHILE operations takes 2 scopes - :0 is condition scope, and :1 is loop body scope auto nodeWhile = new Node(OpType_LOGIC, logic::While, 12, {-2, 3, 10}); - nd4j::ops::While opWhile; + sd::ops::While opWhile; nodeWhile->setCustomOp(&opWhile); // adding root nodes first, nothing unusual expected here diff --git a/libnd4j/tests_cpu/layers_tests/ServerRelatedTests.cpp b/libnd4j/tests_cpu/layers_tests/ServerRelatedTests.cpp index cedbf0d2f..e0d03731b 100644 --- a/libnd4j/tests_cpu/layers_tests/ServerRelatedTests.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/ServerRelatedTests.cpp @@ -19,12 +19,12 @@ // #include "testlayers.h" -#include +#include #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ServerRelatedTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/SessionLocalTests.cpp b/libnd4j/tests_cpu/layers_tests/SessionLocalTests.cpp index 41f8ed2d0..8481dfde5 100644 --- a/libnd4j/tests_cpu/layers_tests/SessionLocalTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/SessionLocalTests.cpp @@ -22,10 +22,10 @@ #define LIBND4J_SESSIONLOCALTESTS_H #include "testlayers.h" -#include +#include #include -using namespace nd4j::graph; +using namespace sd::graph; class SessionLocalTests : public testing::Test { public: @@ -62,7 +62,7 @@ TEST_F(SessionLocalTests, BasicTests_2) { if (omp_get_max_threads() <= 1) return; - auto alpha = nd4j::NDArrayFactory::create_('c',{5,5}); + auto alpha = sd::NDArrayFactory::create_('c',{5,5}); alpha->assign(0.0); variableSpace.putVariable(-1, alpha); @@ -74,7 +74,7 @@ TEST_F(SessionLocalTests, BasicTests_2) { auto varSpace = storage.localVariableSpace(); auto arr = varSpace->getVariable(-1)->getNDArray(); - arr->applyScalar(nd4j::scalar::Add, (float) e+1, *arr); + arr->applyScalar(sd::scalar::Add, (float) e+1, *arr); } float lastValue = 0.0f; diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp index 003474fab..c6ab40b76 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp @@ -22,8 +22,8 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ShapeTests : public testing::Test { public: @@ -292,7 +292,7 @@ TEST_F(ShapeTests, Tests_Transpose_119_1) { auto e = x.permute({1, 0}); e.streamline('c'); - nd4j::ops::transpose op; + sd::ops::transpose op; auto result = 
op.execute({&x, &y}, {&z}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); @@ -306,7 +306,7 @@ TEST_F(ShapeTests, Tests_Transpose_119_2) { auto exp = x.transpose(); - nd4j::ops::transpose op; + sd::ops::transpose op; auto result = op.evaluate({&x}); ASSERT_EQ(Status::OK(), result->status()); @@ -326,7 +326,7 @@ TEST_F(ShapeTests, Tests_Transpose_119_3) { auto exp = x.transpose(); - nd4j::ops::transpose op; + sd::ops::transpose op; auto result = op.execute({&x}, {&z}, {}, {}, {}); ASSERT_EQ(Status::OK(), result); diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index fb0d7991a..591956357 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -20,7 +20,7 @@ #include #include "testinclude.h" #include -#include +#include class OnesTest : public testing::Test { public: @@ -43,7 +43,7 @@ public: Nd4jLong shape[3] = {3,4,5}; Nd4jLong *shapeBuffer; ThreeDTest() { - shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 3, shape); } ~ThreeDTest() { delete[] shapeBuffer; @@ -244,7 +244,7 @@ public: int dimensionLength = 2; int dimensions[2] = {0,1}; Nd4jLong shape[3] = {1,5,1}; - Nd4jLong *shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + Nd4jLong *shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 3, shape); ~DimensionWarning() { delete[] shapeBuffer; @@ -324,8 +324,8 @@ public: int dimensionFour = 0; int dimensionLength = 1; FourDTest() { - threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); - fourDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 4, fourDShape); + threeDShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'f', 3, threeDShape); + fourDShapeBuffer = 
sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'f', 4, fourDShape); } ~FourDTest() { if(threeDShapeBuffer != nullptr) @@ -492,7 +492,7 @@ TEST_F(LabelTest,LabelTad) { } TEST_F(ExpectedValuesTest,TadTest) { - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, mainShape); + auto shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 4, mainShape); shape::TAD *tad = new shape::TAD; tad->init(shapeBuffer,testDimensions,3); tad->createTadOnlyShapeInfo(); @@ -529,7 +529,7 @@ TEST_F(ThreeDTest,TensorAlongDimensionTest) { TEST_F(NumTadTests,TadTest) { - auto shape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, this->shape); + auto shape = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 3, this->shape); shape::TAD *tad = new shape::TAD; tad->init(shape,&dimension,1); int numTads = shape::tensorsAlongDimension(shape,&dimension,1); @@ -539,7 +539,7 @@ TEST_F(NumTadTests,TadTest) { } TEST_F(TADStall,TestStall) { - auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto shapeInfo = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 4, shape); shape::TAD *tad = new shape::TAD; tad->init(0,shapeInfo,this->dimensions,3); tad->createTadOnlyShapeInfo(); @@ -563,12 +563,12 @@ TEST_F(PermuteTest,PermuteShapeBufferTest) { int normalOrder[4] = {0,1,2,3}; Nd4jLong shapeToPermute[4] = {5,3,2,6}; Nd4jLong permutedOrder[4] = {6,2,3,5}; - auto shapeBufferOriginal = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); - auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); + auto shapeBufferOriginal = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 4, shapeToPermute); + auto assertionShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 4, shapeToPermute); 
shape::permuteShapeBufferInPlace(shapeBufferOriginal,normalOrder,shapeBufferOriginal); EXPECT_TRUE(arrsEquals(4,assertionShapeBuffer,shapeBufferOriginal)); - auto backwardsAssertion = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, permutedOrder); + auto backwardsAssertion = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 4, permutedOrder); auto permuted = shape::permuteShapeBuffer(assertionShapeBuffer, permuteOrder); EXPECT_TRUE(arrsEquals(4, backwardsAssertion, permuted)); @@ -585,9 +585,9 @@ TEST_F(ElementWiseStrideTest,ElementWiseStrideTest) { TEST_F(SliceVectorTest,RowColumnVectorTest) { Nd4jLong rowVectorShape[2] = {1,5}; - auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); + auto rowVectorShapeInfo = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, rowVectorShape); Nd4jLong colVectorShape[2] = {5,1}; - auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, colVectorShape); + auto colVectorShapeInfo = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, colVectorShape); Nd4jLong *sliceRow = shape::sliceOfShapeBuffer(0,rowVectorShapeInfo); EXPECT_TRUE(arrsEquals(2,rowVectorShapeInfo,sliceRow)); Nd4jLong *scalarSliceInfo = shape::createScalarShapeInfo(); @@ -606,9 +606,9 @@ TEST_F(SliceVectorTest,RowColumnVectorTest) { TEST_F(SliceTensorTest,TestSlice) { Nd4jLong shape[3] = {3,3,2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + auto shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 3, shape); Nd4jLong sliceShape[2] = {3,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); 
EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -619,9 +619,9 @@ TEST_F(SliceTensorTest,TestSlice) { TEST_F(SliceMatrixTest,TestSlice) { Nd4jLong shape[2] = {3,2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, shape); Nd4jLong sliceShape[2] = {1,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -664,13 +664,13 @@ TEST_F(TensorTwoFromFourDDimTest,TadTwoFromFourDimTest) { //Along dimension 1,2: expect matrix with shape [cols,dim2] //Along dimension 1,3: expect matrix with shape [cols,dim3] //Along dimension 2,3: expect matrix with shape [dim2,dim3] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto baseShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 4, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; EXPECT_TRUE(arrsEquals(shape::rank(expectedShapeBuffer),expectedShape,shape::shapeOf(testShapeBuffer))); @@ -687,14 +687,14 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { //Along dimension 0,1: expect matrix with shape [rows,cols] //Along dimension 
0,2: expect matrix with shape [rows,dim2] //Along dimension 1,2: expect matrix with shape [cols,dim2] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + auto baseShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 3, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; Nd4jLong *expectedStride = expectedStrides[i]; @@ -715,7 +715,7 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { TEST_F(TensorOneDimTest,TadDimensionsForTensor) { Nd4jLong shape[3] = {rows,cols,dim2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', rank, shape); for(int i = 0; i < rank; i++) { //Along dimension 0: expect row vector with length 'dims[i]' @@ -737,7 +737,7 @@ TEST_F(TensorOneDimTest,TadDimensionsForTensor) { TEST_F(MatrixTest,TadDimensionsForMatrix) { Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', rank, shape); shape::TAD *dimZero = new shape::TAD; dimZero->init(shapeBuffer,&dims[0],1); @@ -745,7 +745,7 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { dimOne->init(shapeBuffer,&dims[1],1); //Along dimension 0: expect row vector with length 'rows' Nd4jLong rowVectorShape[2] = {1,rows}; - auto expectedDimZeroShape = 
nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); + auto expectedDimZeroShape = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, rowVectorShape); dimZero->createTadOnlyShapeInfo(); Nd4jLong *testDimZero = dimZero->tadOnlyShapeInfo; EXPECT_TRUE(arrsEquals(2,expectedShapes[0],shape::shapeOf(testDimZero))); @@ -754,7 +754,7 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { delete[] expectedDimZeroShape; //Along dimension 1: expect row vector with length 'cols' Nd4jLong rowVectorColShape[2] {1,cols}; - auto expectedDimOneShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorColShape); + auto expectedDimOneShape = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, rowVectorColShape); dimOne->createTadOnlyShapeInfo(); Nd4jLong *testDimOneShape = dimOne->tadOnlyShapeInfo; EXPECT_TRUE(arrsEquals(2,expectedShapes[1],shape::shapeOf(testDimOneShape))); @@ -768,11 +768,11 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { TEST_F(VectorTest,VectorTadShape) { Nd4jLong rowVector[2] = {2,2}; - auto rowBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVector); + auto rowBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, rowVector); int rowDimension = 1; Nd4jLong columnVector[2] = {2,2}; - auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); + auto colShapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, columnVector); int colDimension = 0; @@ -811,7 +811,7 @@ TEST_F(VectorTest,LinspaceCombinationTest) { int len = rows * cols; double *linspaced = linspace(1,rows * cols,len); Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = sd::ShapeBuilders::createShapeInfo(sd::DataType::FLOAT32, 'c', 2, shape); delete[] shapeBuffer; delete[] linspaced; diff --git 
a/libnd4j/tests_cpu/layers_tests/ShapeUtilsTests.cpp b/libnd4j/tests_cpu/layers_tests/ShapeUtilsTests.cpp index f07d3d68a..36fce0dd9 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeUtilsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeUtilsTests.cpp @@ -20,11 +20,11 @@ #include "testlayers.h" #include -#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class ShapeUtilsTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/SingleDimTests.cpp b/libnd4j/tests_cpu/layers_tests/SingleDimTests.cpp index c2d8bce04..31104d7b5 100644 --- a/libnd4j/tests_cpu/layers_tests/SingleDimTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/SingleDimTests.cpp @@ -20,12 +20,12 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class SingleDimTests : public testing::Test { public: @@ -67,7 +67,7 @@ TEST_F(SingleDimTests, Test_Concat_1) { auto y = NDArrayFactory::create('c', {3}, {4, 5, 6}); auto exp = NDArrayFactory::create('c', {6}, {1, 2, 3, 4, 5, 6}); - nd4j::ops::concat op; + sd::ops::concat op; auto result = op.evaluate({&x, &y}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -101,7 +101,7 @@ TEST_F(SingleDimTests, Test_ExpandDims_1) { auto x = NDArrayFactory::create('c', {3}, {1, 2, 3}); auto exp = NDArrayFactory::create('c', {1, 3}, {1, 2, 3}); - nd4j::ops::expand_dims op; + sd::ops::expand_dims op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -119,7 +119,7 @@ TEST_F(SingleDimTests, Test_ExpandDims_2) { auto x = NDArrayFactory::create('c', {3}, {1, 2, 3}); auto exp = NDArrayFactory::create('c', {3, 1}, {1, 2, 3}); - nd4j::ops::expand_dims op; + sd::ops::expand_dims op; auto result = op.evaluate({&x}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -139,7 +139,7 @@ TEST_F(SingleDimTests, 
Test_Squeeze_1) { auto x = NDArrayFactory::create('c', vecS, vecB); auto exp = NDArrayFactory::create(3.0f); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -156,7 +156,7 @@ TEST_F(SingleDimTests, Test_Squeeze_2) { auto x = NDArrayFactory::create('c', {3}, {1, 2, 3}); auto exp = NDArrayFactory::create('c', {3}, {1, 2, 3}); - nd4j::ops::squeeze op; + sd::ops::squeeze op; auto result = op.evaluate({&x}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -172,7 +172,7 @@ TEST_F(SingleDimTests, Test_Reshape_1) { auto x = NDArrayFactory::create('c', {1, 3}, {1, 2, 3}); auto exp = NDArrayFactory::create('c', {3}, {1, 2, 3}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {-99, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -188,7 +188,7 @@ TEST_F(SingleDimTests, Test_Reshape_2) { auto x = NDArrayFactory::create('c', {3}, {1, 2, 3}); auto exp = NDArrayFactory::create('c', {1, 3}, {1, 2, 3}); - nd4j::ops::reshape op; + sd::ops::reshape op; auto result = op.evaluate({&x}, {}, {-99, 1, 3}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); @@ -205,7 +205,7 @@ TEST_F(SingleDimTests, Test_Permute_1) { auto x = NDArrayFactory::create('c', {3}, {1, 2, 3}); auto exp = NDArrayFactory::create('c', {3}, {1, 2, 3}); - nd4j::ops::permute op; + sd::ops::permute op; auto result = op.evaluate({&x}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, result->status()); diff --git a/libnd4j/tests_cpu/layers_tests/SortCpuTests.cpp b/libnd4j/tests_cpu/layers_tests/SortCpuTests.cpp index 122a16e45..4dcedf035 100644 --- a/libnd4j/tests_cpu/layers_tests/SortCpuTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/SortCpuTests.cpp @@ -20,12 +20,12 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class SortCpuTests : public testing::Test { public: diff 
--git a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu index 6913722be..5a5e75b1b 100644 --- a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu @@ -20,12 +20,12 @@ #include "testlayers.h" #include -#include -#include +#include +#include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class SortCudaTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/SparseUtilsTest.cpp b/libnd4j/tests_cpu/layers_tests/SparseUtilsTest.cpp index fdeb3a884..becd5a21f 100644 --- a/libnd4j/tests_cpu/layers_tests/SparseUtilsTest.cpp +++ b/libnd4j/tests_cpu/layers_tests/SparseUtilsTest.cpp @@ -20,9 +20,9 @@ #include "testlayers.h" #include -#include +#include #include "ops/specials_sparse.h" -using namespace nd4j; +using namespace sd; ////////////////////////////////////////////////////////////////////// class SparseUtilsTest : public testing::Test { @@ -131,7 +131,7 @@ TEST_F(SparseUtilsTest, SortCOOindices_Test) { 34, 35, 36, 37, 38, 39 }); - nd4j::sparse::SparseUtils::sortCooIndicesGeneric(indicesArr, reinterpret_cast(values.getBuffer()), nnz, rank); + sd::sparse::SparseUtils::sortCooIndicesGeneric(indicesArr, reinterpret_cast(values.getBuffer()), nnz, rank); for ( int i = 0; i < rank * nnz; ++i){ ASSERT_EQ(expIndicesArr[i], indicesArr[i]); diff --git a/libnd4j/tests_cpu/layers_tests/StashTests.cpp b/libnd4j/tests_cpu/layers_tests/StashTests.cpp index bfa1a6ac6..2cba6682d 100644 --- a/libnd4j/tests_cpu/layers_tests/StashTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/StashTests.cpp @@ -21,12 +21,12 @@ #ifndef LIBND4J_STASHTESTS_H #define LIBND4J_STASHTESTS_H -#include +#include #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class StashTests : public testing::Test { public: diff --git 
a/libnd4j/tests_cpu/layers_tests/StringTests.cpp b/libnd4j/tests_cpu/layers_tests/StringTests.cpp index 8b9d92f2f..272c410c7 100644 --- a/libnd4j/tests_cpu/layers_tests/StringTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/StringTests.cpp @@ -21,12 +21,12 @@ // -#include -#include +#include +#include #include "testlayers.h" #include -using namespace nd4j; +using namespace sd; class StringTests : public testing::Test { public: @@ -36,7 +36,7 @@ public: TEST_F(StringTests, Basic_Test_1) { std::string f("alpha"); auto array = NDArrayFactory::string(f); - ASSERT_EQ(nd4j::DataType::UTF8, array.dataType()); + ASSERT_EQ(sd::DataType::UTF8, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -49,7 +49,7 @@ TEST_F(StringTests, Basic_Test_1) { TEST_F(StringTests, Basic_Test_2) { std::string f("alpha"); auto array = NDArrayFactory::string(f.c_str()); - ASSERT_EQ(nd4j::DataType::UTF8, array.dataType()); + ASSERT_EQ(sd::DataType::UTF8, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -237,7 +237,7 @@ TEST_F(StringTests, Basic_Test_21) { TEST_F(StringTests, Basic_Test_22) { std::u16string f(u"ß水𝄋ÿ€한𐍈®кею90ощъ]ї"); auto array = NDArrayFactory::string(f.c_str()); - ASSERT_EQ(nd4j::DataType::UTF16, array.dataType()); + ASSERT_EQ(sd::DataType::UTF16, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -250,7 +250,7 @@ TEST_F(StringTests, Basic_Test_22) { TEST_F(StringTests, Basic_Test_23) { std::u32string f(U"ß水𝄋ÿ€한𐍈®кею90ощъ]ї"); auto array = NDArrayFactory::string(f.c_str()); - ASSERT_EQ(nd4j::DataType::UTF32, array.dataType()); + ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -268,7 +268,7 @@ TEST_F(StringTests, Export_Test_1) { TEST_F(StringTests, Basic_dup_1) { std::string f("alpha"); auto array = NDArrayFactory::string(f); - ASSERT_EQ(nd4j::DataType::UTF8, array.dataType()); + ASSERT_EQ(sd::DataType::UTF8, 
array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -410,17 +410,17 @@ TEST_F(StringTests, byte_length_test_Default) { ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, byte_length_test_UTF16) { std::string f(u8"alpha"); - auto array = NDArrayFactory::string(f, nd4j::DataType::UTF16); + auto array = NDArrayFactory::string(f, sd::DataType::UTF16); ASSERT_EQ(sizeof(char16_t) * f.length(), StringUtils::byteLength(array)); std::u16string f16(u"alpha"); - auto array16 = NDArrayFactory::string(f16, nd4j::DataType::UTF16); + auto array16 = NDArrayFactory::string(f16, sd::DataType::UTF16); ASSERT_EQ(sizeof(char16_t) * f16.length(), StringUtils::byteLength(array16)); std::u32string f32(U"alpha"); - auto array32 = NDArrayFactory::string(f32, nd4j::DataType::UTF16); + auto array32 = NDArrayFactory::string(f32, sd::DataType::UTF16); ASSERT_EQ(sizeof(char16_t) * f32.length(), StringUtils::byteLength(array32)); } @@ -428,8 +428,8 @@ TEST_F(StringTests, byte_length_test_UTF16) { TEST_F(StringTests, Basic_Test_UTF16toU8) { std::u16string f16(u"alpha水𝄋ÿ€한𐍈®кею"); - auto array = NDArrayFactory::string(f16, nd4j::DataType::UTF8); - ASSERT_EQ(nd4j::DataType::UTF8, array.dataType()); + auto array = NDArrayFactory::string(f16, sd::DataType::UTF8); + ASSERT_EQ(sd::DataType::UTF8, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -442,8 +442,8 @@ TEST_F(StringTests, Basic_Test_UTF16toU8) { ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_UTF32toU8) { std::u32string f32(U"alpha水𝄋ÿ€한𐍈®кею"); - auto array = NDArrayFactory::string(f32.c_str(), nd4j::DataType::UTF8); - ASSERT_EQ(nd4j::DataType::UTF8, array.dataType()); + auto array = NDArrayFactory::string(f32.c_str(), sd::DataType::UTF8); + ASSERT_EQ(sd::DataType::UTF8, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -456,8 +456,8 @@ 
TEST_F(StringTests, Basic_Test_UTF32toU8) { TEST_F(StringTests, Basic_Test_UTF16toU16) { std::u16string f16(u"€alpha水𝄋ÿ€한𐍈®кею"); - auto array = NDArrayFactory::string(f16, nd4j::DataType::UTF16); - ASSERT_EQ(nd4j::DataType::UTF16, array.dataType()); + auto array = NDArrayFactory::string(f16, sd::DataType::UTF16); + ASSERT_EQ(sd::DataType::UTF16, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -469,8 +469,8 @@ TEST_F(StringTests, Basic_Test_UTF16toU16) { TEST_F(StringTests, Basic_Test_UTF32toU16) { std::u32string f32(U"€alpha水𝄋ÿ€한𐍈®кею"); - auto array = NDArrayFactory::string(f32, nd4j::DataType::UTF16); - ASSERT_EQ(nd4j::DataType::UTF16, array.dataType()); + auto array = NDArrayFactory::string(f32, sd::DataType::UTF16); + ASSERT_EQ(sd::DataType::UTF16, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -482,8 +482,8 @@ TEST_F(StringTests, Basic_Test_UTF32toU16) { TEST_F(StringTests, Basic_Test_UTF16toU32) { std::u16string f16(u"€alpha水𝄋ÿ€한𐍈®кею"); - auto array = NDArrayFactory::string(f16, nd4j::DataType::UTF32); - ASSERT_EQ(nd4j::DataType::UTF32, array.dataType()); + auto array = NDArrayFactory::string(f16, sd::DataType::UTF32); + ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -497,7 +497,7 @@ TEST_F(StringTests, Basic_Test_UTF32toU32) { std::u32string f32(U"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(f32); - ASSERT_EQ(nd4j::DataType::UTF32, array.dataType()); + ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -508,8 +508,8 @@ TEST_F(StringTests, Basic_Test_UTF32toU32) { TEST_F(StringTests, Basic_Test_UTF8toU32) { std::string f(u8"€alpha水𝄋ÿ€한𐍈®кею"); - auto array = NDArrayFactory::string(f, nd4j::DataType::UTF32); - ASSERT_EQ(nd4j::DataType::UTF32, array.dataType()); + auto array = NDArrayFactory::string(f, sd::DataType::UTF32); + 
ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -519,7 +519,7 @@ TEST_F(StringTests, Basic_Test_UTF8toU32) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU8toUTF16) { - auto array = NDArrayFactory::string({ 3, 2 }, { "alpha€", "beta", "gamma水", "phi", "theta", "omega水" }, nd4j::DataType::UTF16); + auto array = NDArrayFactory::string({ 3, 2 }, { "alpha€", "beta", "gamma水", "phi", "theta", "omega水" }, sd::DataType::UTF16); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -528,7 +528,7 @@ TEST_F(StringTests, Basic_Test_StringVecU8toUTF16) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU8toUTF32) { - auto array = NDArrayFactory::string( { 3, 2 }, { "alpha€", "beta水", "gamma", "phi", "theta", "omega" }, nd4j::DataType::UTF32); + auto array = NDArrayFactory::string( { 3, 2 }, { "alpha€", "beta水", "gamma", "phi", "theta", "omega" }, sd::DataType::UTF32); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -537,19 +537,19 @@ TEST_F(StringTests, Basic_Test_StringVecU8toUTF32) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U8toUTF16) { - auto array = NDArrayFactory::string({ 3 }, { "alpha", "beta", "gamma" }, nd4j::DataType::UTF16); + auto array = NDArrayFactory::string({ 3 }, { "alpha", "beta", "gamma" }, sd::DataType::UTF16); auto vector = array.asByteVector(); } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U8toUTF32) { - auto array = NDArrayFactory::string({ 3 }, { "alpha", "beta", "gamma" }, nd4j::DataType::UTF32); + auto array = NDArrayFactory::string({ 3 }, { "alpha", "beta", "gamma" }, sd::DataType::UTF32); auto vector = array.asByteVector(); } 
///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU16toUTF16) { - auto array = NDArrayFactory::string({ 3, 2 }, { u"alpha水", u"beta", u"gamma", u"phi", u"theta水", u"omega" }, nd4j::DataType::UTF16); + auto array = NDArrayFactory::string({ 3, 2 }, { u"alpha水", u"beta", u"gamma", u"phi", u"theta水", u"omega" }, sd::DataType::UTF16); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -558,7 +558,7 @@ TEST_F(StringTests, Basic_Test_StringVecU16toUTF16) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU16toUTF32) { - auto array = NDArrayFactory::string( { 3, 2 }, { u"alpha水", u"beta", u"gamma水", u"phi", u"theta", u"omega" }, nd4j::DataType::UTF32); + auto array = NDArrayFactory::string( { 3, 2 }, { u"alpha水", u"beta", u"gamma水", u"phi", u"theta", u"omega" }, sd::DataType::UTF32); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -567,7 +567,7 @@ TEST_F(StringTests, Basic_Test_StringVecU16toUTF32) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU16toUTF8) { - auto array = NDArrayFactory::string( { 3, 2 }, { u"alpha€", u"beta水", u"gamma", u"phi水", u"theta", u"omega" }, nd4j::DataType::UTF8); + auto array = NDArrayFactory::string( { 3, 2 }, { u"alpha€", u"beta水", u"gamma", u"phi水", u"theta", u"omega" }, sd::DataType::UTF8); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -576,25 +576,25 @@ TEST_F(StringTests, Basic_Test_StringVecU16toUTF8) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U16toUTF8) { - auto array = NDArrayFactory::string( { 3 }, { u"alpha", u"beta", u"gamma" }, nd4j::DataType::UTF8); + auto array = NDArrayFactory::string( { 3 }, { u"alpha", u"beta", u"gamma" }, sd::DataType::UTF8); auto vector = array.asByteVector(); } 
///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U16toUTF16) { - auto array = NDArrayFactory::string( { 3 }, { u"alpha", u"beta", u"gamma" }, nd4j::DataType::UTF16); + auto array = NDArrayFactory::string( { 3 }, { u"alpha", u"beta", u"gamma" }, sd::DataType::UTF16); auto vector = array.asByteVector(); } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U16toUTF32) { - auto array = NDArrayFactory::string( { 3 }, { u"alpha水", u"beta", u"gamma水" }, nd4j::DataType::UTF32); + auto array = NDArrayFactory::string( { 3 }, { u"alpha水", u"beta", u"gamma水" }, sd::DataType::UTF32); auto vector = array.asByteVector(); } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU32toUTF32) { - auto array = NDArrayFactory::string( { 3, 2 }, { U"alpha€", U"beta水", U"gamma", U"phi", U"theta", U"omega水" }, nd4j::DataType::UTF32); + auto array = NDArrayFactory::string( { 3, 2 }, { U"alpha€", U"beta水", U"gamma", U"phi", U"theta", U"omega水" }, sd::DataType::UTF32); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -603,7 +603,7 @@ TEST_F(StringTests, Basic_Test_StringVecU32toUTF32) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU32toUTF16) { - auto array = NDArrayFactory::string({ 3, 2 }, { U"alpha水", U"水beta", U"gamma", U"phi水", U"theta", U"omega" }, nd4j::DataType::UTF16); + auto array = NDArrayFactory::string({ 3, 2 }, { U"alpha水", U"水beta", U"gamma", U"phi水", U"theta", U"omega" }, sd::DataType::UTF16); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -617,7 +617,7 @@ TEST_F(StringTests, Basic_Test_StringVecU32toUTF16) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Basic_Test_StringVecU32toUTF8) { - auto array = NDArrayFactory::string( { 3, 2 }, { U"alpha水", U"beta", 
U"gamma水", U"phi", U"theta", U"omega" }, nd4j::DataType::UTF8); + auto array = NDArrayFactory::string( { 3, 2 }, { U"alpha水", U"beta", U"gamma水", U"phi", U"theta", U"omega" }, sd::DataType::UTF8); ASSERT_EQ(6, array.lengthOf()); ASSERT_EQ(2, array.rankOf()); @@ -626,19 +626,19 @@ TEST_F(StringTests, Basic_Test_StringVecU32toUTF8) { } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U32toUTF32) { - auto array = NDArrayFactory::string( { 3 }, { U"alpha", U"beta", U"gamma" }, nd4j::DataType::UTF32); + auto array = NDArrayFactory::string( { 3 }, { U"alpha", U"beta", U"gamma" }, sd::DataType::UTF32); auto vector = array.asByteVector(); } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U32toUTF16) { - auto array = NDArrayFactory::string( { 3 }, { U"alpha", U"beta水", U"gamma水" }, nd4j::DataType::UTF16); + auto array = NDArrayFactory::string( { 3 }, { U"alpha", U"beta水", U"gamma水" }, sd::DataType::UTF16); auto vector = array.asByteVector(); } ///////////////////////////////////////////////////////////////////////// TEST_F(StringTests, Export_Test_U32toUTF8) { - auto array = NDArrayFactory::string( { 3 }, { U"alpha", U"beta", U"gamma水" }, nd4j::DataType::UTF8); + auto array = NDArrayFactory::string( { 3 }, { U"alpha", U"beta", U"gamma水" }, sd::DataType::UTF8); auto vector = array.asByteVector(); } @@ -646,7 +646,7 @@ TEST_F(StringTests, Export_Test_U32toUTF8) { TEST_F(StringTests, Basic_dup_UTF16) { std::u16string f(u"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(f); - ASSERT_EQ(nd4j::DataType::UTF16, array.dataType()); + ASSERT_EQ(sd::DataType::UTF16, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -665,7 +665,7 @@ TEST_F(StringTests, Basic_dup_UTF16) { TEST_F(StringTests, Basic_dup_UTF32) { std::u32string f(U"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(f); - ASSERT_EQ(nd4j::DataType::UTF32, 
array.dataType()); + ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); @@ -688,12 +688,12 @@ TEST_F(StringTests, Basic_cast_UTF32toUTF8) { std::string u8(u8"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u32); - ASSERT_EQ(nd4j::DataType::UTF32, array.dataType()); + ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF8); + auto aCast = array.cast(sd::DataType::UTF8); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -709,12 +709,12 @@ TEST_F(StringTests, Basic_cast_UTF32toUTF16) { std::u16string u16(u"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u32); - ASSERT_EQ(nd4j::DataType::UTF32, array.dataType()); + ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF16); + auto aCast = array.cast(sd::DataType::UTF16); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -728,12 +728,12 @@ TEST_F(StringTests, Basic_cast_UTF32toUTF32) { std::u32string u32(U"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u32); - ASSERT_EQ(nd4j::DataType::UTF32, array.dataType()); + ASSERT_EQ(sd::DataType::UTF32, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF32); + auto aCast = array.cast(sd::DataType::UTF32); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -747,12 +747,12 @@ TEST_F(StringTests, Basic_cast_UTF16toUTF16) { std::u16string u16(u"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u16); - ASSERT_EQ(nd4j::DataType::UTF16, array.dataType()); + ASSERT_EQ(sd::DataType::UTF16, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF16); + auto aCast = array.cast(sd::DataType::UTF16); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -768,12 +768,12 @@ 
TEST_F(StringTests, Basic_cast_UTF16toUTF32) { std::u16string u16(u"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u16); - ASSERT_EQ(nd4j::DataType::UTF16, array.dataType()); + ASSERT_EQ(sd::DataType::UTF16, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF32); + auto aCast = array.cast(sd::DataType::UTF32); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -789,12 +789,12 @@ TEST_F(StringTests, Basic_cast_UTF16toUTF8) { std::u16string u16(u"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u16); - ASSERT_EQ(nd4j::DataType::UTF16, array.dataType()); + ASSERT_EQ(sd::DataType::UTF16, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF8); + auto aCast = array.cast(sd::DataType::UTF8); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -808,12 +808,12 @@ TEST_F(StringTests, Basic_cast_UTF8toUTF8) { std::string u8("€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u8); - ASSERT_EQ(nd4j::DataType::UTF8, array.dataType()); + ASSERT_EQ(sd::DataType::UTF8, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF8); + auto aCast = array.cast(sd::DataType::UTF8); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -829,12 +829,12 @@ TEST_F(StringTests, Basic_cast_UTF8toUTF16) { std::u16string u16(u"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u8); - ASSERT_EQ(nd4j::DataType::UTF8, array.dataType()); + ASSERT_EQ(sd::DataType::UTF8, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF16); + auto aCast = array.cast(sd::DataType::UTF16); auto z0 = array.e(0); auto z1 = aCast.e(0); @@ -850,12 +850,12 @@ TEST_F(StringTests, Basic_cast_UTF8toUTF32) { std::u32string u32(U"€alpha水𝄋ÿ€한𐍈®кею"); auto array = NDArrayFactory::string(u8); - ASSERT_EQ(nd4j::DataType::UTF8, 
array.dataType()); + ASSERT_EQ(sd::DataType::UTF8, array.dataType()); ASSERT_EQ(1, array.lengthOf()); ASSERT_EQ(0, array.rankOf()); - auto aCast = array.cast(nd4j::DataType::UTF32); + auto aCast = array.cast(sd::DataType::UTF32); auto z0 = array.e(0); auto z1 = aCast.e(0); diff --git a/libnd4j/tests_cpu/layers_tests/SwitchTests.cpp b/libnd4j/tests_cpu/layers_tests/SwitchTests.cpp index ee68c2c5d..8d6a8d180 100644 --- a/libnd4j/tests_cpu/layers_tests/SwitchTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/SwitchTests.cpp @@ -21,9 +21,9 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class SwitchTests : public testing::Test { public: @@ -56,14 +56,14 @@ TEST_F(SwitchTests, SwitchTest1) { // this is our condition op, we'll be using Equals condition, on variables conditionX and conditionY (ids -2 and -3 respectively) // we're creating this op manually in tests, as always. - nd4j::ops::eq_scalar eqOp; + sd::ops::eq_scalar eqOp; auto nodeCondition = new Node(&eqOp, 119, {-2, -3}); //nodeCondition->setOpType(OpType_BOOLEAN); // now, this is Switch operation. It takes BooleanOperation operation in, // and based on evaluation result (true/false) - it'll pass data via :0 or :1 output // other idx will be considered disabled, and that graph branch won't be executed - nd4j::ops::Switch switchOp; + sd::ops::Switch switchOp; auto nodeSwitch = new Node(&switchOp, 3, {2, 119}, {4, 5}); // these 2 ops are connected to FALSE and TRUE outputs. 
output :0 considered FALSE, and output :1 considered TRUE @@ -148,12 +148,12 @@ TEST_F(SwitchTests, SwitchTest2) { auto nodeCondition = new Node(OpType_LOGIC, logic::Scope, 119, {-2, -3}); nodeCondition->setScopeInfo(3, "scopeCondition"); - nd4j::ops::eq_scalar eqOp; + sd::ops::eq_scalar eqOp; nodeCondition->setCustomOp(&eqOp); auto nodeSwitch = new Node(OpType_LOGIC, logic::Switch, 5, {3, 2}); - nd4j::ops::Switch switchOp; + sd::ops::Switch switchOp; nodeSwitch->setCustomOp(&switchOp); @@ -214,12 +214,12 @@ TEST_F(SwitchTests, SwitchTest3) { auto nodeCondition = new Node(OpType_LOGIC, logic::Scope, 119, {-2, -3}); nodeCondition->setScopeInfo(3, "scopeCondition"); - nd4j::ops::eq_scalar eqOp; + sd::ops::eq_scalar eqOp; nodeCondition->setCustomOp(&eqOp); auto nodeSwitch = new Node(OpType_LOGIC, logic::Switch, 5, {3, 2}); - nd4j::ops::Switch switchOp; + sd::ops::Switch switchOp; nodeSwitch->setCustomOp(&switchOp); diff --git a/libnd4j/tests_cpu/layers_tests/TadTests.cpp b/libnd4j/tests_cpu/layers_tests/TadTests.cpp index 86e7264e8..69339a3ac 100644 --- a/libnd4j/tests_cpu/layers_tests/TadTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/TadTests.cpp @@ -22,12 +22,12 @@ #define LIBND4J_TADTESTS_H #include "testlayers.h" -#include +#include #include #include #include -using namespace nd4j; +using namespace sd; class TadTests : public testing::Test { public: @@ -39,7 +39,7 @@ public: TEST_F(TadTests, Test4DTad1) { - NDArray* arraySource = nd4j::NDArrayFactory::linspace(1.0f, 10000.0f, 10000); + NDArray* arraySource = sd::NDArrayFactory::linspace(1.0f, 10000.0f, 10000); Nd4jLong badShape[] = {4, 2, 1, 4, 4, 80, 16, 4, 1, 8192, -1, 99}; Nd4jLong goodShape[] = {4, 2, 1, 4, 4, 16, 16, 4, 1, 8192, 1, 99}; @@ -245,13 +245,13 @@ TEST_F(TadTests, test_tad_order_4) { TEST_F(TadTests, test_column_1) { auto x = NDArrayFactory::create('c', {5, 2}); - auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), 0); + auto tadPack = 
sd::ConstantTadHelper::getInstance()->tadForDimensions(x.shapeInfo(), 0); ASSERT_EQ(1, shape::rank(tadPack.primaryShapeInfo())); ASSERT_EQ(5, shape::length(tadPack.primaryShapeInfo())); ASSERT_TRUE(shape::isVector(tadPack.primaryShapeInfo())); - auto scalarViewPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tadPack.primaryShapeInfo(), 0); + auto scalarViewPack = sd::ConstantTadHelper::getInstance()->tadForDimensions(tadPack.primaryShapeInfo(), 0); ASSERT_TRUE(shape::equalsStrict(tadPack.primaryShapeInfo(), scalarViewPack.primaryShapeInfo())); } @@ -288,10 +288,10 @@ TEST_F(TadTests, calcOffsets_1) { ///////////////////////////////////////////////////////////////// TEST_F(TadTests, outerArrayIndexes_1) { - NDArray x('c', {2,3,4,5}, nd4j::DataType::FLOAT32); + NDArray x('c', {2,3,4,5}, sd::DataType::FLOAT32); Nd4jLong maxIdxs[120]; - NDArray y1('c', {3,5}, nd4j::DataType::FLOAT32); + NDArray y1('c', {3,5}, sd::DataType::FLOAT32); const std::vector dimsToExclude1 = {0,2}; const int n1[] = {20,25,30,35, 80,85,90,95}; int minIdx = 5; @@ -301,7 +301,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n1[i] == maxIdxs[i]); - NDArray y2('c', {4,5}, nd4j::DataType::FLOAT32); + NDArray y2('c', {4,5}, sd::DataType::FLOAT32); const std::vector dimsToExclude2 = {0,1}; const int n2[] = {12,32,52, 72,92,112}; minIdx = 12; @@ -311,7 +311,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n2[i] == maxIdxs[i]); - NDArray y3('c', {2,5}, nd4j::DataType::FLOAT32); + NDArray y3('c', {2,5}, sd::DataType::FLOAT32); const std::vector dimsToExclude3 = {1,2}; const int n3[] = {64,69,74,79,84,89,94,99,104,109,114,119}; minIdx = 9; @@ -321,7 +321,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n3[i] == maxIdxs[i]); - NDArray y4('c', {2,3}, nd4j::DataType::FLOAT32); + NDArray y4('c', {2,3}, sd::DataType::FLOAT32); const std::vector dimsToExclude4 = {2,3}; const int n4[] = 
{20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39}; minIdx = 1; @@ -331,7 +331,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n4[i] == maxIdxs[i]); - NDArray y5('c', {2,4}, nd4j::DataType::FLOAT32); + NDArray y5('c', {2,4}, sd::DataType::FLOAT32); const std::vector dimsToExclude5 = {1,3}; const int n5[] = {65,66,67,68,69, 85,86,87,88,89, 105,106,107,108,109}; minIdx = 5; @@ -341,7 +341,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n5[i] == maxIdxs[i]); - NDArray y6('c', {2,3,4}, nd4j::DataType::FLOAT32); + NDArray y6('c', {2,3,4}, sd::DataType::FLOAT32); const std::vector dimsToExclude6 = {3}; const int n6[] = {65,66,67,68,69}; minIdx = 13; @@ -351,7 +351,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n6[i] == maxIdxs[i]); - NDArray y7('c', {4}, nd4j::DataType::FLOAT32); + NDArray y7('c', {4}, sd::DataType::FLOAT32); const std::vector dimsToExclude7 = {0,1,3}; const int n7[] = {15,16,17,18,19, 35,36,37,38,39, 55,56,57,58,59, 75,76,77,78,79, 95,96,97,98,99, 115,116,117,118,119}; minIdx = 3; @@ -361,7 +361,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n7[i] == maxIdxs[i]); - NDArray y8('c', {5}, nd4j::DataType::FLOAT32); + NDArray y8('c', {5}, sd::DataType::FLOAT32); const std::vector dimsToExclude8 = {0,1,2}; const int n8[] = {0,5,10,15, 20,25,30,35, 40,45,50,55, 60,65,70,75, 80,85,90,95, 100,105,110,115}; minIdx = 0; @@ -371,7 +371,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n8[i] == maxIdxs[i]); - NDArray y9('c', {2}, nd4j::DataType::FLOAT32); + NDArray y9('c', {2}, sd::DataType::FLOAT32); const std::vector dimsToExclude9 = {1,2,3}; const int n9[] = {60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119}; minIdx = 1; @@ 
-381,7 +381,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n9[i] == maxIdxs[i]); - NDArray y10('c', {3,4,5}, nd4j::DataType::FLOAT32); + NDArray y10('c', {3,4,5}, sd::DataType::FLOAT32); const std::vector dimsToExclude10 = {0}; const int n10[] = {11, 71}; minIdx = 11; @@ -391,7 +391,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n10[i] == maxIdxs[i]); - NDArray y11('c', {2,4,5}, nd4j::DataType::FLOAT32); + NDArray y11('c', {2,4,5}, sd::DataType::FLOAT32); const std::vector dimsToExclude11 = {1}; const int n11[] = {66, 86, 106}; minIdx = 26; @@ -401,7 +401,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n11[i] == maxIdxs[i]); - NDArray y12('c', {3,2}, nd4j::DataType::FLOAT32); + NDArray y12('c', {3,2}, sd::DataType::FLOAT32); const std::vector dimsToExclude12 = {0,2}; const int n12[] = {0,2,4,5,7,9,10,12,14,15,17,19,60,62,64,65,67,69,70,72,74,75,77,79}; minIdx = 0; @@ -410,7 +410,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n12[i] == maxIdxs[i]); - NDArray y13('c', {3,2}, nd4j::DataType::FLOAT32); + NDArray y13('c', {3,2}, sd::DataType::FLOAT32); const std::vector dimsToExclude13 = {0,2}; const int n13[] = {1,3,6,8,11,13,16,18,61,63,66,68,71,73,76,78}; minIdx = 1; @@ -419,7 +419,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n13[i] == maxIdxs[i]); - NDArray y14('c', {4,5}, nd4j::DataType::FLOAT32); + NDArray y14('c', {4,5}, sd::DataType::FLOAT32); const int n14[] = {12,32,52, 72,92,112}; minIdx = 12; @@ -428,7 +428,7 @@ TEST_F(TadTests, outerArrayIndexes_1) { for(int i = 0; i < N; ++i) ASSERT_TRUE(n14[i] == maxIdxs[i]); - NDArray y15('c', {3,4,5}, nd4j::DataType::FLOAT32); + NDArray y15('c', {3,4,5}, sd::DataType::FLOAT32); const int n15[] = {11, 71}; minIdx = 11; diff --git a/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp index 
fa89fbcaa..a9450e9d0 100644 --- a/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ThreadsTests.cpp @@ -26,9 +26,9 @@ #include using namespace samediff; -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class ThreadsTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/TypeCastTests.cpp b/libnd4j/tests_cpu/layers_tests/TypeCastTests.cpp index 1f352dd2e..2c27f95f9 100644 --- a/libnd4j/tests_cpu/layers_tests/TypeCastTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/TypeCastTests.cpp @@ -22,9 +22,9 @@ #include #include -using namespace nd4j; -using namespace nd4j::ops; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::ops; +using namespace sd::graph; class TypeCastTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/VariableProxyTests.cpp b/libnd4j/tests_cpu/layers_tests/VariableProxyTests.cpp index fc1bde841..16e7cf7ac 100644 --- a/libnd4j/tests_cpu/layers_tests/VariableProxyTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/VariableProxyTests.cpp @@ -21,8 +21,8 @@ #include "testlayers.h" #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class VariableProxyTests : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/VariableSpaceTests.cpp b/libnd4j/tests_cpu/layers_tests/VariableSpaceTests.cpp index 8f28c4907..ec10f3db0 100644 --- a/libnd4j/tests_cpu/layers_tests/VariableSpaceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/VariableSpaceTests.cpp @@ -24,10 +24,10 @@ #include #include #include -#include +#include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class VariableSpaceTest : public testing::Test { public: diff --git a/libnd4j/tests_cpu/layers_tests/VariableTests.cpp 
b/libnd4j/tests_cpu/layers_tests/VariableTests.cpp index 1a1915fdc..49b9b02d6 100644 --- a/libnd4j/tests_cpu/layers_tests/VariableTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/VariableTests.cpp @@ -22,12 +22,12 @@ #define LIBND4J_VARIABLETESTS_H #include "testlayers.h" -#include +#include #include #include -using namespace nd4j; -using namespace nd4j::graph; +using namespace sd; +using namespace sd::graph; class VariableTests : public testing::Test { public: @@ -73,9 +73,9 @@ TEST_F(VariableTests, Test_FlatVariableDataType_1) { auto fBuffer = builder.CreateVector(vec); auto fVid = CreateIntPair(builder, 1, 12); - auto fArray = CreateFlatArray(builder, fShape, fBuffer, nd4j::graph::DType::DType_FLOAT); + auto fArray = CreateFlatArray(builder, fShape, fBuffer, sd::graph::DType::DType_FLOAT); - auto flatVar = CreateFlatVariable(builder, fVid, 0, nd4j::graph::DType::DType_FLOAT, 0, fArray); + auto flatVar = CreateFlatVariable(builder, fVid, 0, sd::graph::DType::DType_FLOAT, 0, fArray); builder.Finish(flatVar); @@ -107,9 +107,9 @@ TEST_F(VariableTests, Test_FlatVariableDataType_2) { auto fBuffer = builder.CreateVector(vec); auto fVid = CreateIntPair(builder, 1, 12); - auto fArray = CreateFlatArray(builder, fShape, fBuffer, nd4j::graph::DType::DType_DOUBLE); + auto fArray = CreateFlatArray(builder, fShape, fBuffer, sd::graph::DType::DType_DOUBLE); - auto flatVar = CreateFlatVariable(builder, fVid, 0, nd4j::graph::DType::DType_DOUBLE, 0, fArray); + auto flatVar = CreateFlatVariable(builder, fVid, 0, sd::graph::DType::DType_DOUBLE, 0, fArray); builder.Finish(flatVar); @@ -144,9 +144,9 @@ TEST_F(VariableTests, Test_FlatVariableDataType_3) { auto fBuffer = builder.CreateVector(vec); auto fVid = CreateIntPair(builder, 1, 12); - auto fArray = CreateFlatArray(builder, fShape, fBuffer, nd4j::graph::DType::DType_DOUBLE); + auto fArray = CreateFlatArray(builder, fShape, fBuffer, sd::graph::DType::DType_DOUBLE); - auto flatVar = CreateFlatVariable(builder, fVid, 0, 
nd4j::graph::DType::DType_DOUBLE, 0, fArray); + auto flatVar = CreateFlatVariable(builder, fVid, 0, sd::graph::DType::DType_DOUBLE, 0, fArray); builder.Finish(flatVar); @@ -179,7 +179,7 @@ TEST_F(VariableTests, Test_FlatVariableDataType_4) { auto fShape = builder.CreateVector(original.getShapeAsFlatVector()); auto fVid = CreateIntPair(builder, 37, 12); - auto flatVar = CreateFlatVariable(builder, fVid, 0, nd4j::graph::DType::DType_FLOAT, fShape, 0, 0, VarType_PLACEHOLDER); + auto flatVar = CreateFlatVariable(builder, fVid, 0, sd::graph::DType::DType_FLOAT, fShape, 0, 0, VarType_PLACEHOLDER); builder.Finish(flatVar); diff --git a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp index fd277b971..571db71f3 100644 --- a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cpp @@ -22,13 +22,13 @@ #define LIBND4J_WORKSPACETESTS_H #include "testlayers.h" -#include -#include -#include -#include +#include +#include +#include +#include -using namespace nd4j; -using namespace nd4j::memory; +using namespace sd; +using namespace sd::memory; class WorkspaceTests : public testing::Test { diff --git a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu index 6aafe4ea2..6fe157ac8 100644 --- a/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu +++ b/libnd4j/tests_cpu/layers_tests/WorkspaceTests.cu @@ -19,13 +19,13 @@ // #include "testlayers.h" -#include -#include -#include -#include +#include +#include +#include +#include -using namespace nd4j; -using namespace nd4j::memory; +using namespace sd; +using namespace sd::memory; class CudaWorkspaceTests : public testing::Test { diff --git a/libnd4j/tests_cpu/layers_tests/testinclude.h b/libnd4j/tests_cpu/layers_tests/testinclude.h index 79607cdc9..b266019a9 100644 --- a/libnd4j/tests_cpu/layers_tests/testinclude.h +++ b/libnd4j/tests_cpu/layers_tests/testinclude.h @@ -22,7 +22,7 @@ #define 
LIBND4J_TESTINCLUDE_H #include "testlayers.h" #include -#include +#include //https://stackoverflow.com/questions/228005/alternative-to-itoa-for-converting-integer-to-string-c FORCEINLINE std::string int_array_to_string(Nd4jLong int_array[], Nd4jLong size_of_array) { diff --git a/libnd4j/tests_cpu/layers_tests/testlayers.h b/libnd4j/tests_cpu/layers_tests/testlayers.h index 95814faae..9106223d8 100644 --- a/libnd4j/tests_cpu/layers_tests/testlayers.h +++ b/libnd4j/tests_cpu/layers_tests/testlayers.h @@ -22,20 +22,20 @@ #define LIBND4J_TESTLAYERS_H #include -#include -#include -#include +#include +#include +#include #include #include #include -#include -#include +#include +#include #include #include #include -#include +#include #include -#include +#include #include #endif //LIBND4J_TESTLAYERS_H diff --git a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt index fbba329e3..7d3073b58 100644 --- a/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt +++ b/libnd4j/tests_cpu/libnd4j_tests/CMakeLists.txt @@ -84,7 +84,7 @@ set(gtest_SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/googletest-src) add_definitions(-D__STANDALONE_BUILD__=true) include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR}) -include_directories(../../include ../../layers ../../include/helpers ../../include/array ../../include/execution ../../include/exceptions ../../include/memory ../../include/loops ../../include/graph ../../include/ops ../../include/types ../../include/cnpy ../../blas) +include_directories(../../include) if(LINUX) link_directories(/usr/local/lib) link_directories(/usr/lib) @@ -109,7 +109,7 @@ endif() # -fsanitize=address # -fsanitize=leak if (APPLE) - set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 -D__APPLE_OS__=true -DAPPLE_BUILD=true") + set(CMAKE_CXX_FLAGS " -O0 -g -fPIC -std=c++11 -D__APPLE_OS__=true -DSD_APPLE_BUILD=true") elseif(WIN32) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(CMAKE_CXX_FLAGS " -g -fPIC -std=c++11 
-Wa,-mbig-obj") @@ -127,7 +127,7 @@ else() endif() else() set(CMAKE_CXX_FLAGS " -g -O0 -fPIC -std=c++11 ") - if (NOT CUDA_BLAS) + if (NOT SD_CUDA) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") endif() endif() @@ -143,14 +143,14 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG_BUILD=true") endif() -if ("${EXPERIMENTAL}" STREQUAL "yes") +if ("${SD_EXPERIMENTAL}" STREQUAL "yes") message("Experimental mode ENABLED") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__ND4J_EXPERIMENTAL__=true") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__ND4J_EXPERIMENTAL__=true") endif() # tests are always compiled with all ops included -SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLIBND4J_ALL_OPS=true -DDEFAULT_ENGINE=samediff::ENGINE_CPU") +SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSD_ALL_OPS=true -DDEFAULT_ENGINE=samediff::ENGINE_CPU") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # using Clang @@ -209,6 +209,7 @@ file(GLOB_RECURSE CUSTOMOPS_GENERIC_SOURCES false ../../include/ops/declarable/h file(GLOB_RECURSE OPS_SOURCES false ../../include/ops/impl/*.cpp ../../include/ops/declarable/impl/*.cpp ../../include/ops/*.h) file(GLOB_RECURSE INDEXING_SOURCES false ../../include/indexing/*.cpp ../../include/indexing/*.h) file(GLOB_RECURSE HELPERS_SOURCES false ../../include/helpers/*.cpp) +file(GLOB_RECURSE LEGACY_SOURCES false ../../include/legacy/impl/*.cpp ../../include/legacy/cpu/*.cpp ../../include/legacy/*.h) file(GLOB_RECURSE LOOPS_SOURCES false ../../include/loops/*.cpp ../../include/loops/*.h) # optionally build mkldnn @@ -268,10 +269,7 @@ foreach (TMP_PATH ${TEST_SOURCES}) endforeach(TMP_PATH) -add_executable(runtests ${LOOPS_SOURCES} ../../blas/cpu/NativeOps.cpp ../../blas/cpu/GraphExecutioner.cpp - ../../blas/cpu/NativeOpExecutioner.cpp ../../blas/cpu/NDArray.cpp ../../blas/cpu/NDArrayFactory.cpp - ../../include/cnpy/cnpy.cpp ../../include/nd4jmemset.h ../../include/nd4jmalloc.h - ../../blas/Environment.cpp ../../blas/Environment.h ${EXEC_SOURCES} 
${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} +add_executable(runtests ${LOOPS_SOURCES} ${LEGACY_SOURCES} ${EXEC_SOURCES} ${HELPERS_SOURCES} ${ARRAY_SOURCES} ${TYPES_SOURCES} ${MEMORY_SOURCES} ${GRAPH_SOURCES} ${CUSTOMOPS_SOURCES} ${EXCEPTIONS_SOURCES} ${INDEXING_SOURCES} ${CUSTOMOPS_PLATFORM_SOURCES} ${CUSTOMOPS_GENERIC_SOURCES} ${OPS_SOURCES} ${TEST_SOURCES} ${PERF_SOURCES}) diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/pom.xml b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/pom.xml index 36f25d636..46566f50b 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/pom.xml +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/pom.xml @@ -198,7 +198,7 @@ - ${libnd4jhome}/blas/NativeOps.h + ${libnd4jhome}/include/legacy/NativeOps.h ${libnd4jhome}/blasbuild/cuda/blas !!! You have to compile libnd4j with cuda support first! diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index db2c941e9..56c9e4069 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -107,7 +107,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { } } -@Name("std::vector") public static class NDArrayVector extends Pointer { +@Name("std::vector") public static class NDArrayVector extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDArrayVector(Pointer p) { super(p); } @@ -175,7 +175,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { } } -@Name("std::vector") public static class ConstNDArrayVector extends Pointer { +@Name("std::vector") public static class ConstNDArrayVector extends Pointer { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public ConstNDArrayVector(Pointer p) { super(p); } @@ -287,7 +287,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #ifndef ND4J_DATATYPE_H // #define ND4J_DATATYPE_H - /** enum nd4j::DataType */ + /** enum sd::DataType */ public static final int INHERIT = 0, BOOL = 1, @@ -343,14 +343,14 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define DEV_TESTS_DATABUFFER_H // #include -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include // #include -@Namespace("nd4j") @NoOffset public static class DataBuffer extends Pointer { +@Namespace("sd") @NoOffset public static class DataBuffer extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public DataBuffer(Pointer p) { super(p); } @@ -363,46 +363,46 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { public DataBuffer(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, @Cast("const bool") boolean isOwnerSpecial/*=false*/, Workspace workspace/*=nullptr*/) { super((Pointer)null); allocate(primary, special, lenInBytes, dataType, isOwnerPrimary, isOwnerSpecial, workspace); } private native void allocate(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, @Cast("const bool") boolean isOwnerSpecial/*=false*/, Workspace workspace/*=nullptr*/); public DataBuffer(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType) { super((Pointer)null); 
allocate(primary, special, lenInBytes, dataType); } + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType) { super((Pointer)null); allocate(primary, special, lenInBytes, dataType); } private native void allocate(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType); + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType); public DataBuffer(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, Workspace workspace/*=nullptr*/) { super((Pointer)null); allocate(primary, lenInBytes, dataType, isOwnerPrimary, workspace); } private native void allocate(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, Workspace workspace/*=nullptr*/); public DataBuffer(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType) { super((Pointer)null); allocate(primary, lenInBytes, dataType); } + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType) { super((Pointer)null); allocate(primary, lenInBytes, dataType); } private native void allocate(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType); + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType); public DataBuffer(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes, + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes, Workspace workspace/*=nullptr*/) { super((Pointer)null); allocate(hostBuffer, dataType, lenInBytes, 
workspace); } private native void allocate(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes, + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes, Workspace workspace/*=nullptr*/); public DataBuffer(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes) { super((Pointer)null); allocate(hostBuffer, dataType, lenInBytes); } + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes) { super((Pointer)null); allocate(hostBuffer, dataType, lenInBytes); } private native void allocate(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes); + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes); - public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, Workspace workspace/*=nullptr*/, @Cast("const bool") boolean allocBoth/*=false*/) { super((Pointer)null); allocate(lenInBytes, dataType, workspace, allocBoth); } - private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, Workspace workspace/*=nullptr*/, @Cast("const bool") boolean allocBoth/*=false*/); - public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType) { super((Pointer)null); allocate(lenInBytes, dataType); } - private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType); + public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, Workspace workspace/*=nullptr*/, @Cast("const bool") boolean allocBoth/*=false*/) { super((Pointer)null); allocate(lenInBytes, dataType, workspace, allocBoth); } + private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, Workspace workspace/*=nullptr*/, 
@Cast("const bool") boolean allocBoth/*=false*/); + public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType) { super((Pointer)null); allocate(lenInBytes, dataType); } + private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType); public DataBuffer(@Const @ByRef DataBuffer other) { super((Pointer)null); allocate(other); } private native void allocate(@Const @ByRef DataBuffer other); @@ -411,8 +411,8 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { public native @ByRef @Name("operator =") DataBuffer put(@Const @ByRef DataBuffer other); - public native @Cast("nd4j::DataType") int getDataType(); - public native void setDataType(@Cast("nd4j::DataType") int dataType); + public native @Cast("sd::DataType") int getDataType(); + public native void setDataType(@Cast("sd::DataType") int dataType); public native @Cast("size_t") long getLenInBytes(); public native Pointer primary(); @@ -497,10 +497,10 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #include // #include // #include -// #include -// #include +// #include +// #include // #include - @Namespace("nd4j") @NoOffset public static class ConstantDescriptor extends Pointer { + @Namespace("sd") @NoOffset public static class ConstantDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public ConstantDescriptor(Pointer p) { super(p); } @@ -547,6 +547,10 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { } +// #ifndef __JAVACPP_HACK__ + +// #endif + // #endif //DEV_TESTS_CONSTANTDESCRIPTOR_H @@ -575,9 +579,9 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #ifndef LIBND4J_CONSTANTDATABUFFER_H // #define LIBND4J_CONSTANTDATABUFFER_H -// #include -// #include - @Namespace("nd4j") @NoOffset public static class ConstantDataBuffer extends Pointer { +// #include +// #include + @Namespace("sd") @NoOffset public static class ConstantDataBuffer extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ConstantDataBuffer(Pointer p) { super(p); } @@ -634,7 +638,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define DEV_TESTS_TADPACK_H // #include "ConstantDataBuffer.h" - @Namespace("nd4j") @NoOffset public static class TadPack extends Pointer { + @Namespace("sd") @NoOffset public static class TadPack extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public TadPack(Pointer p) { super(p); } @@ -698,7 +702,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define DEV_TESTS_ERRORREFERENCE_H // #include -// #include +// #include @Namespace("sd") @NoOffset public static class ErrorReference extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ @@ -801,7 +805,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #ifndef DEV_TESTS_MEMORYTYPE_H // #define DEV_TESTS_MEMORYTYPE_H - /** enum nd4j::memory::MemoryType */ + /** enum sd::memory::MemoryType */ public static final int HOST = 0, DEVICE = 10; @@ -811,7 +815,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #endif //DEV_TESTS_MEMORYTYPE_H -// Parsed from Environment.h +// Parsed from system/Environment.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. @@ -838,12 +842,12 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #include // #include -// #include +// #include // #include // #include // #include -// #include - @Namespace("nd4j") @NoOffset public static class Environment extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class Environment extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Environment(Pointer p) { super(p); } @@ -908,8 +912,8 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { public native @Cast("bool") boolean isUseMKLDNN(); public native void setUseMKLDNN(@Cast("bool") boolean useMKLDNN); - public native @Cast("nd4j::DataType") int defaultFloatDataType(); - public native void setDefaultFloatDataType(@Cast("nd4j::DataType") int dtype); + public native @Cast("sd::DataType") int defaultFloatDataType(); + public native void setDefaultFloatDataType(@Cast("sd::DataType") int dtype); public native @Cast("bool") boolean precisionBoostAllowed(); public native void allowPrecisionBoost(@Cast("bool") boolean reallyAllow); @@ -956,8 +960,8 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #define DEV_TESTS_UTF8STRING_H // #include -// #include - @Namespace("nd4j") @NoOffset public static class utf8string extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class utf8string extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public utf8string(Pointer p) { super(p); } @@ -992,7 +996,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #endif //DEV_TESTS_UTF8STRING_H -// Parsed from NativeOps.h +// Parsed from legacy/NativeOps.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. 
@@ -1038,9 +1042,9 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { #endif */ -// #include +// #include // #include -// #include +// #include //DO NOT REMOVE: THIS IS AN EDITOR SEMANTICS THING FOR CLION //IT DEFINES THE EXPORT MACRO FOR THE EDITOR AND THEN @@ -1050,7 +1054,7 @@ public class Nd4jCuda extends org.nd4j.nativeblas.Nd4jCudaHelper { // #else // #define ND4J_EXPORT // #endif -// #include +// #include /* int tad_threshold = 1; @@ -1071,7 +1075,7 @@ bool verbose = false; // #include // #include // #include -// #include +// #include // #include /** @@ -2421,7 +2425,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPointers, int opNum, @Cast("void**") @ByPtrPtr Pointer arguments, @@ -2434,7 +2438,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPointers, int opNum, @Cast("void**") @ByPtrPtr Pointer arguments, @@ -2447,7 +2451,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPointers, int opNum, @Cast("void**") @ByPtrPtr Pointer arguments, @@ -2460,7 +2464,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void 
batchExecutor(@Cast("Nd4jPointer*") PointerPointer extraPointers, @@ -2473,7 +2477,7 @@ public native void batchExecutor(@Cast("Nd4jPointer*") PointerPointer extraPoint int maxIdx, int maxReals, Pointer ptrToArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregateBatch(@Cast("Nd4jPointer*") PointerPointer extraPointers, int numAggregates, @@ -2485,7 +2489,7 @@ public native void execAggregateBatch(@Cast("Nd4jPointer*") PointerPointer extra int maxIdx, int maxReals, Pointer ptrToArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); /** * Random operations @@ -3066,17 +3070,17 @@ public native void inspectArray(@Cast("Nd4jPointer*") PointerPointer extraPointe public native void inspectArray(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jPointer") Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jPointer") Pointer specialBuffer, @Cast("Nd4jLong*") LongBuffer specialShapeInfo, @Cast("Nd4jPointer") Pointer debugInfo); public native void inspectArray(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jPointer") Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jPointer") Pointer specialBuffer, @Cast("Nd4jLong*") long[] specialShapeInfo, @Cast("Nd4jPointer") Pointer debugInfo); -public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer strides, @Cast("nd4j::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); -public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer strides, @Cast("nd4j::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); -public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] strides, @Cast("nd4j::DataType") int dtype, char 
order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); +public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer strides, @Cast("sd::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); +public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer strides, @Cast("sd::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); +public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] strides, @Cast("sd::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); -public native OpaqueConstantDataBuffer constantBufferLong(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer data, int length); -public native OpaqueConstantDataBuffer constantBufferLong(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer data, int length); -public native OpaqueConstantDataBuffer constantBufferLong(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] data, int length); -public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("nd4j::DataType") int dtype, DoublePointer data, int length); -public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length); -public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("nd4j::DataType") int dtype, double[] data, int length); -public native OpaqueConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor); +public native OpaqueConstantDataBuffer constantBufferLong(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer data, int length); +public native OpaqueConstantDataBuffer constantBufferLong(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer data, int length); 
+public native OpaqueConstantDataBuffer constantBufferLong(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] data, int length); +public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("sd::DataType") int dtype, DoublePointer data, int length); +public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("sd::DataType") int dtype, DoubleBuffer data, int length); +public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("sd::DataType") int dtype, double[] data, int length); +public native OpaqueConstantDataBuffer constantBuffer(@Cast("sd::DataType") int dtype, ConstantDescriptor descriptor); public native @Cast("Nd4jPointer") Pointer getConstantDataBufferPrimary(OpaqueConstantDataBuffer dbf); public native @Cast("Nd4jPointer") Pointer getConstantDataBufferSpecial(OpaqueConstantDataBuffer dbf); @@ -3189,9 +3193,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_EXTERNALWORKSPACE_H // #define LIBND4J_EXTERNALWORKSPACE_H -// #include -// #include - @Namespace("nd4j::memory") @NoOffset public static class ExternalWorkspace extends Pointer { +// #include +// #include + @Namespace("sd::memory") @NoOffset public static class ExternalWorkspace extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ExternalWorkspace(Pointer p) { super(p); } @@ -3250,13 +3254,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include -// #include +// #include +// #include // #include // #include // #include - @Namespace("nd4j::memory") @NoOffset public static class Workspace extends Pointer { + @Namespace("sd::memory") @NoOffset public static class Workspace extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Workspace(Pointer p) { super(p); } @@ -3294,7 +3298,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // bool resizeSupported(); public native Pointer allocateBytes(@Cast("Nd4jLong") long numBytes); - public native Pointer allocateBytes(@Cast("nd4j::memory::MemoryType") int type, @Cast("Nd4jLong") long numBytes); + public native Pointer allocateBytes(@Cast("sd::memory::MemoryType") int type, @Cast("Nd4jLong") long numBytes); public native void scopeIn(); public native void scopeOut(); @@ -3335,10 +3339,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_NDINDEX_H // #define LIBND4J_NDINDEX_H -// #include +// #include // #include -// #include - @Namespace("nd4j") @NoOffset public static class NDIndex extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class NDIndex extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDIndex(Pointer p) { super(p); } @@ -3365,7 +3369,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public static native NDIndex interval(@Cast("Nd4jLong") long start, @Cast("Nd4jLong") long end); } - @Namespace("nd4j") public static class NDIndexAll extends NDIndex { + @Namespace("sd") public static class NDIndexAll extends NDIndex { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDIndexAll(Pointer p) { super(p); } @@ -3382,7 +3386,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); } - @Namespace("nd4j") public static class NDIndexPoint extends NDIndex { + @Namespace("sd") public static class NDIndexPoint extends NDIndex { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public NDIndexPoint(Pointer p) { super(p); } @@ -3392,7 +3396,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @Cast("bool") boolean isInterval(); } - @Namespace("nd4j") public static class NDIndexInterval extends NDIndex { + @Namespace("sd") public static class NDIndexInterval extends NDIndex { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDIndexInterval(Pointer p) { super(p); } @@ -3437,7 +3441,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include "NDIndex.h" - @Namespace("nd4j") @NoOffset public static class IndicesList extends Pointer { + @Namespace("sd") @NoOffset public static class IndicesList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public IndicesList(Pointer p) { super(p); } @@ -3476,7 +3480,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef ND4J_VARIABLE_TYPE_H // #define ND4J_VARIABLE_TYPE_H - /** enum nd4j::graph::VariableType */ + /** enum sd::graph::VariableType */ public static final int NDARRAY = 0, ARRAY_LIST = 1, @@ -3513,12 +3517,12 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_INPUTLIST_H // #define LIBND4J_INPUTLIST_H -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class ArgumentsList extends Pointer { + @Namespace("sd::graph") @NoOffset public static class ArgumentsList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public ArgumentsList(Pointer p) { super(p); } @@ -3578,8 +3582,8 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_PAIR_H // #define LIBND4J_PAIR_H -// #include - @Namespace("nd4j") @NoOffset public static class Pair extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class Pair extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Pair(Pointer p) { super(p); } @@ -3604,7 +3608,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #endif //LIBND4J_PAIR_H -// Parsed from NDArray.h +// Parsed from array/NDArray.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. @@ -3625,11 +3629,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef NDARRAY_H // #define NDARRAY_H -// #include +// #include // #include // #include -// #include -// #include "NativeOpExecutioner.h" +// #include +// #include "legacy/NativeOpExecutioner.h" // #include // #include // #include @@ -3640,13 +3644,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include // #include -// #include -// #include +// #include +// #include // #include // #include // #include @@ -3657,9 +3661,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); - @Namespace("nd4j") public static native @ByVal NDArray mmul(@Const @ByRef NDArray arg0, @Const @ByRef NDArray arg1); + @Namespace("sd") public static native @ByVal NDArray mmul(@Const @ByRef NDArray arg0, @Const @ByRef NDArray arg1); - @Namespace("nd4j") @NoOffset public static class NDArray extends Pointer { + @Namespace("sd") @NoOffset public static class NDArray extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public NDArray(Pointer p) { super(p); } @@ -3677,16 +3681,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * do not allocate memory, memory for array is passed from outside */ - public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } - private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } + private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo) { super((Pointer)null); allocate(buffer, shapeInfo); } private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo); - public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } - private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") 
boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } + private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo) { super((Pointer)null); allocate(buffer, shapeInfo); } private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo); - public NDArray(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } - private native void allocate(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } + private native void allocate(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); public NDArray(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(buffer, shapeInfo); } private native void allocate(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo); @@ -3694,16 +3698,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * do not allocate memory, memory for array is passed from outside * we suppose the content of both (device and host) buffers is identical */ - public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext 
context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } - private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); + public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } + private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo); } private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo); - public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } - private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, 
@Cast("const bool") boolean isBuffDAlloc/*=false*/); + public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } + private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo); } private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo); - public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } - private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); + public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } + private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") 
long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo); } private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo); @@ -3727,16 +3731,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently */ - public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo); - public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); 
allocate(shapeInfo, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo); - public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(@Cast("Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo); @@ -3744,66 +3748,66 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * constructor creates new NDArray using shape information from 
"shapeInfo", set all elements in new array to be zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently * set dtype as array type */ - public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); - public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } - private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); - public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } - private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { 
super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); - public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } - private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype); + public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } + private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype); + public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, 
dtype); } + private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype); + public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } + private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype); /** * this constructor creates new array using shape information contained in vector argument */ - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape) { 
super((Pointer)null); allocate(order, shape); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape) { super((Pointer)null); allocate(order, shape); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } + 
private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape) { super((Pointer)null); allocate(order, shape); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** * This constructor creates new array with elements copied from data and using shape information stored in shape, elements from data will be casted to dtype */ - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data) { super((Pointer)null); allocate(order, shape, data); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, 
@StdVector DoubleBuffer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data) { super((Pointer)null); allocate(order, shape, data); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { 
super((Pointer)null); allocate(order, shape, data, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data) { super((Pointer)null); allocate(order, shape, data); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data); /** * this constructor creates new array using given buffer (without memory allocation) and shape information stored in shape */ - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } - private native void allocate(Pointer buffer, 
byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, 
@Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype) { 
super((Pointer)null); allocate(buffer, order, shape, dtype); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype); /** * This method returns new array with the same shape & data type @@ -3822,16 +3826,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * this constructor creates new NDArray with shape matching "other" array, * doesn't copy "other" elements into new array !!! */ - public NDArray(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(other, copyStrides, context); } - private native void allocate(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(other, copyStrides, context); } + private native void allocate(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); /** * this constructor creates scalar(and set its value = 0) or empty array depending on bool argument isScalar */ - public NDArray(@Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/) { super((Pointer)null); allocate(dtype, context, isScalar); } - private native void allocate(@Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/); - public NDArray(@Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(dtype); } - private native void allocate(@Cast("nd4j::DataType") int dtype); + public NDArray(@Cast("sd::DataType") int dtype, 
LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/) { super((Pointer)null); allocate(dtype, context, isScalar); } + private native void allocate(@Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/); + public NDArray(@Cast("sd::DataType") int dtype) { super((Pointer)null); allocate(dtype); } + private native void allocate(@Cast("sd::DataType") int dtype); /** * This method blocks until asynchronous operation finishes @@ -3944,9 +3948,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * cast array elements to given dtype */ - public native @ByVal NDArray cast(@Cast("nd4j::DataType") int dtype); + public native @ByVal NDArray cast(@Cast("sd::DataType") int dtype); - public native void cast(@ByRef NDArray target, @Cast("nd4j::DataType") int dtype); + public native void cast(@ByRef NDArray target, @Cast("sd::DataType") int dtype); /** * returns _context @@ -4036,6 +4040,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByVal NDArray permute(@Const IntPointer dimensions, int rank); public native @ByVal NDArray permute(@Const IntBuffer dimensions, int rank); public native @ByVal NDArray permute(@Const int[] dimensions, int rank); + + + public native void permute(@Const IntPointer dimensions, int rank, @ByRef NDArray target); public native void permute(@Const IntBuffer dimensions, int rank, @ByRef NDArray target); @@ -4049,6 +4056,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByVal NDArray permute(@Cast("const Nd4jLong*") LongPointer dimensions, int rank); public native @ByVal NDArray permute(@Cast("const Nd4jLong*") LongBuffer dimensions, int rank); public native @ByVal NDArray permute(@Cast("const Nd4jLong*") long[] dimensions, int rank); + + + public native void permute(@Cast("const Nd4jLong*") LongPointer dimensions, int rank, 
@ByRef NDArray target); public native void permute(@Cast("const Nd4jLong*") LongBuffer dimensions, int rank, @ByRef NDArray target); @@ -4142,6 +4152,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * apply transpose operation to the copy of this array, that is this array remains unaffected */ public native @ByVal NDArray transpose(); + /** * perform transpose operation and store result in target, this array remains unaffected @@ -4273,12 +4284,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] 
shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + /** * calculate strides and set given order @@ -4388,6 +4400,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * negative operator, it changes sign of all array elements on opposite */ public native @ByVal @Name("operator -") NDArray subtract(); + /** * pairwise multiplication unary operator array *= other @@ -4467,7 +4480,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef __JAVACPP_HACK__ // #endif - public native @ByVal NDArray asT(@Cast("nd4j::DataType") int dtype); + public native @ByVal NDArray asT(@Cast("sd::DataType") int dtype); public native void linspace(double start); @@ -4495,9 +4508,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setShapeInfo(@Cast("const Nd4jLong*") LongPointer shapeInfo); public native void setShapeInfo(@Cast("const Nd4jLong*") LongBuffer shapeInfo); public native void setShapeInfo(@Cast("const Nd4jLong*") long[] shapeInfo); - public native void setShapeInfo(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public native void setShapeInfo(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public native void setShapeInfo(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype); + public native void setShapeInfo(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype); + public native void setShapeInfo(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype); + public native void setShapeInfo(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype); public native void setShapeInfo(@Const @ByRef ShapeDescriptor descriptor); public native void setShapeInfo(@Const @ByRef ConstantDataBuffer shapeBuffer); @@ 
-4692,7 +4705,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * Returns data type of this array * @return */ - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); /** * This method returns true if value is from Integer space @@ -4889,7 +4902,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #if defined(__CUDACC__) //&& defined(BUILD_TESTS) // for CUDA we need stil stuff inline -// #include "cuda/NDArrayLambda.hpp" +// #include // #endif @@ -4927,10 +4940,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include -// #include - @Namespace("nd4j") @NoOffset public static class NDArrayList extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class NDArrayList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDArrayList(Pointer p) { super(p); } @@ -4940,7 +4953,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public NDArrayList(int height) { super((Pointer)null); allocate(height); } private native void allocate(int height); - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); public native NDArray read(int idx); public native NDArray readRaw(int idx); @@ -4957,7 +4970,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByRef IntIntPair id(); public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); - //nd4j::memory::Workspace* workspace(); + //sd::memory::Workspace* workspace(); public native LaunchContext context(); public native NDArrayList clone(); @@ -4995,7 +5008,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // // PLESE NOTE: It will delete all stored NDArrays upon destructor call // -// Created by raver119 on 07.09.17. 
+// @author raver119@gmail.com // // #ifndef LIBND4J_RESULTSET_H @@ -5003,10 +5016,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include -// #include // forward declaration of template class NDArray +// #include +// #include // forward declaration of template class NDArray - @Namespace("nd4j") @NoOffset public static class ResultSet extends Pointer { + @Namespace("sd") @NoOffset public static class ResultSet extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ResultSet(Pointer p) { super(p); } @@ -5017,10 +5030,12 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); return (ResultSet)super.position(position); } - // default constructor public ResultSet() { super((Pointer)null); allocate(); } private native void allocate(); +// #ifndef __JAVACPP_HACK__ +// #endif + public ResultSet(@Const @ByRef ResultSet other) { super((Pointer)null); allocate(other); } @NoException private native void allocate(@Const @ByRef ResultSet other); @@ -5071,9 +5086,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_GRAPH_RNG_H // #include -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include // #include @@ -5083,7 +5098,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #endif // #ifdef __CUDACC__ // #else - @Namespace("nd4j::graph") @NoOffset public static class RandomGenerator extends Pointer { + @Namespace("sd::graph") @NoOffset public static class RandomGenerator extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public RandomGenerator(Pointer p) { super(p); } @@ -5167,9 +5182,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); ////// - @Namespace("nd4j::graph") public static native @Cast("uint32_t") int rotl(@Cast("const uint32_t") int x, int k); + @Namespace("sd::graph") public static native @Cast("uint32_t") int rotl(@Cast("const uint32_t") int x, int k); - @Namespace("nd4j::graph") public static native @Cast("uint64_t") long rotl(@Cast("const uint64_t") long x, int k); + @Namespace("sd::graph") public static native @Cast("uint64_t") long rotl(@Cast("const uint64_t") long x, int k); @@ -5208,13 +5223,17 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_VARIABLE_H // #include -// #include +// #include // #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class Variable extends Pointer { + +// #ifndef __JAVACPP_HACK__ + +// #endif + @Namespace("sd::graph") @NoOffset public static class Variable extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Variable(Pointer p) { super(p); } @@ -5242,6 +5261,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public Variable(NDArray array/*=nullptr*/, @Cast("char*") BytePointer name/*=nullptr*/) { super((Pointer)null); allocate(array, name); } private native void allocate(NDArray array/*=nullptr*/, @Cast("char*") BytePointer name/*=nullptr*/); +// #ifndef __JAVACPP_HACK__ +// #endif + public native Variable clone(); public native @Cast("bool") boolean hasNDArray(); @@ -5259,8 +5281,8 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @Cast("bool") boolean isPlaceholder(); - public native @Cast("nd4j::graph::VariableType") int variableType(); - public native void setVariableType(@Cast("nd4j::graph::VariableType") int variableType); + public native @Cast("sd::graph::VariableType") int variableType(); + public native void setVariableType(@Cast("sd::graph::VariableType") int variableType); /** * This method returns InputType of this variable @@ -5321,10 +5343,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include -// #include +// #include +// #include // #include - @Namespace("nd4j::graph") @NoOffset public static class VariablesSet extends Pointer { + @Namespace("sd::graph") @NoOffset public static class VariablesSet extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public VariablesSet(Pointer p) { super(p); } @@ -5376,13 +5398,15 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_FLOWPATH_H // #define LIBND4J_FLOWPATH_H +// #include +// #include // #include -// #include +// #include // #include // #include // #include -// #include - @Namespace("nd4j::graph") @NoOffset public static class FlowPath extends Pointer { +// #include + @Namespace("sd::graph") @NoOffset public static class FlowPath extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public FlowPath(Pointer p) { super(p); } @@ -5463,12 +5487,12 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_INTERVALS_H // #define LIBND4J_INTERVALS_H -// #include +// #include // #include // #include -// #include +// #include - @Namespace("nd4j") @NoOffset public static class Intervals extends Pointer { + @Namespace("sd") @NoOffset public static class Intervals extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Intervals(Pointer p) { super(p); } @@ -5528,12 +5552,14 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_STASH_H //#include -// #include -// #include +// #include +// #include +// #include // #include // #include -// #include - @Namespace("nd4j::graph") @NoOffset public static class KeyPair extends Pointer { +// #include +// #include + @Namespace("sd::graph") @NoOffset public static class KeyPair extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public KeyPair(Pointer p) { super(p); } @@ -5552,9 +5578,19 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); private native void allocate(int node/*=0*/, @Cast("char*") BytePointer name/*=nullptr*/); public native @Cast("bool") @Name("operator <") boolean lessThan(@Const @ByRef KeyPair other); - } - @Namespace("nd4j::graph") @NoOffset public static class Stash extends Pointer { + public native @Cast("bool") @Name("operator ==") boolean equals(@Const @ByRef KeyPair other); + + public native int key(); + public native @StdString BytePointer name(); + } + + + +// #ifndef __JAVACPP_HACK__ + +// #endif + @Namespace("sd::graph") @NoOffset public static class Stash extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Stash(Pointer p) { super(p); } @@ -5568,15 +5604,15 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public Stash() { super((Pointer)null); allocate(); } private native void allocate(); - //void storeArray(nd4j::graph::Block& block, const char *name, nd4j::NDArray *array); + //void storeArray(sd::graph::Block& block, const char *name, sd::NDArray *array); public native void storeArray(int nodeId, @Cast("char*") String name, NDArray array); public native void storeArray(int nodeId, @Cast("char*") BytePointer name, NDArray array); - //bool checkStash(nd4j::graph::Block& block, const char *name); + //bool checkStash(sd::graph::Block& block, const char *name); public native @Cast("bool") boolean checkStash(int nodeId, @Cast("char*") String name); public native @Cast("bool") boolean checkStash(int nodeId, @Cast("char*") BytePointer name); - //nd4j::NDArray* extractArray(nd4j::graph::Block& block, const char *name); + //sd::NDArray* extractArray(sd::graph::Block& block, const char *name); public native NDArray extractArray(int nodeId, @Cast("char*") String name); public native NDArray extractArray(int nodeId, @Cast("char*") BytePointer name); @@ -5588,6 
+5624,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); + // #endif //LIBND4J_STASH_H @@ -5616,20 +5653,21 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_GRAPHSTATE_H // #define LIBND4J_GRAPHSTATE_H -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include +// #include // #include -// #include +// #include // #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class GraphState extends Pointer { + @Namespace("sd::graph") @NoOffset public static class GraphState extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public GraphState(Pointer p) { super(p); } @@ -5738,13 +5776,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class VariableSpace extends Pointer { + @Namespace("sd::graph") @NoOffset public static class VariableSpace extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public VariableSpace(Pointer p) { super(p); } @@ -5761,7 +5799,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByRef @Name("operator =") VariableSpace put(@Const @ByRef VariableSpace other); public native int numberOfPlaceholders(); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer getPlaceholders(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer getPlaceholders(); public native void setWorkspace(Workspace workspace); public native LaunchContext launchContext(); @@ -5780,13 +5818,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native Variable getVariable(@ByRef IntIntPair pair); public native Variable getVariable(@StdString @Cast({"char*", "std::string*"}) BytePointer symbol); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer getVariables(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer getVariables(); - public native void putVariable(@ByRef IntIntPair pair, NDArray array); + public native Variable putVariable(@ByRef IntIntPair pair, NDArray array); public native void putVariable(@ByRef IntIntPair pair, Variable variable); public native void putVariable(int id, Variable variable); public native void putVariable(int id, NDArray array); - public native void putVariable(int id, int idx, NDArray array); + public native Variable putVariable(int id, int idx, NDArray array); public native void putVariable(int id, int idx, Variable array); public native void dropVariable(@ByRef IntIntPair pair); @@ -5809,7 +5847,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native VariableSpace clone(); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer handles(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer handles(); public native VariableSpace asT(); @@ -5817,7 +5855,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); 
public native Stash getStash(); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer getExternalVariables(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer getExternalVariables(); public native void setFlowPath(FlowPath timers); public native FlowPath flowPath(); @@ -5854,10 +5892,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_HELPER_GENERATOR_H // #define LIBND4J_HELPER_GENERATOR_H -// #include -// #include +// #include +// #include // #include -// #include +// #include // #ifdef _MSC_VER // include for uint64_t on MSVC @@ -5882,7 +5920,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifdef __CUDACC__ // #else - @Namespace("nd4j::random") @NoOffset public static class RandomBuffer extends Pointer { + @Namespace("sd::random") @NoOffset public static class RandomBuffer extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public RandomBuffer(Pointer p) { super(p); } @@ -6043,7 +6081,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); } - @Namespace("nd4j::random") @NoOffset public static class IGenerator extends Pointer { + @Namespace("sd::random") @NoOffset public static class IGenerator extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public IGenerator(Pointer p) { super(p); } @@ -6063,7 +6101,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); - @Namespace("nd4j::random") @NoOffset public static class Xoroshiro128 extends IGenerator { + @Namespace("sd::random") @NoOffset public static class Xoroshiro128 extends IGenerator { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Xoroshiro128(Pointer p) { super(p); } @@ -6104,13 +6142,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define ND4J_GRAPH_PROFILE_H // #include "NodeProfile.h" -// #include -// #include +// #include +// #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class GraphProfile extends Pointer { + @Namespace("sd::graph") @NoOffset public static class GraphProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public GraphProfile(Pointer p) { super(p); } @@ -6209,11 +6247,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_NODE_PROFILE_H // #define LIBND4J_NODE_PROFILE_H -// #include -// #include +// #include +// #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class NodeProfile extends Pointer { + @Namespace("sd::graph") @NoOffset public static class NodeProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NodeProfile(Pointer p) { super(p); } @@ -6298,7 +6336,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_CONTEXT_H // #include -// #include +// #include // #include // #include // #include @@ -6311,7 +6349,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * This class defines input desired for any given node/operation within graph */ - @Namespace("nd4j::graph") @NoOffset public static class Context extends ContextPrototype { + @Namespace("sd::graph") @NoOffset public static class Context extends ContextPrototype { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Context(Pointer p) { super(p); } @@ -6334,10 +6372,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @Cast("Nd4jLong") long getOuterTime(); public native @Cast("Nd4jLong") long getInnerTime(); - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); - public native @Cast("nd4j::DataType") int dataType(int index); - public native void setDataType(int index, @Cast("nd4j::DataType") int type); + public native @Cast("sd::DataType") int dataType(int index); + public native void setDataType(int index, @Cast("sd::DataType") int type); // these methods are related to Workspace abstraction public native @Cast("bool") boolean hasWorkspaceProvided(); public native void attachWorkspace(Workspace workspace); @@ -6431,11 +6469,17 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // methods used in java interop /** - * This method checks, if Context uses fastpath variable access + * This method checks if Context uses fastpath variable access * @return */ public native @Cast("bool") boolean isFastPath(); + /** + * Method allows to forbid FastPath execution + * @param reallyForbid + */ + public native void forbidFastPath(@Cast("bool") boolean reallyForbid); + // #ifndef __JAVACPP_HACK__ // #endif @@ -6457,9 +6501,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setIArguments(@Cast("Nd4jLong*") long[] arguments, int numberOfArguments); public native void setBArguments(@Cast("bool*") BooleanPointer arguments, int numberOfArguments); public native void setBArguments(@Cast("bool*") boolean[] arguments, int numberOfArguments); - public native void setDArguments(@Cast("nd4j::DataType*") IntPointer arguments, int numberOfArguments); - public native void setDArguments(@Cast("nd4j::DataType*") IntBuffer arguments, int numberOfArguments); - public native void setDArguments(@Cast("nd4j::DataType*") int[] arguments, int numberOfArguments); + 
public native void setDArguments(@Cast("sd::DataType*") IntPointer arguments, int numberOfArguments); + public native void setDArguments(@Cast("sd::DataType*") IntBuffer arguments, int numberOfArguments); + public native void setDArguments(@Cast("sd::DataType*") int[] arguments, int numberOfArguments); public native void setTArguments(@StdVector DoublePointer tArgs); public native void setTArguments(@StdVector DoubleBuffer tArgs); @@ -6469,9 +6513,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setIArguments(@Cast("Nd4jLong*") @StdVector long[] tArgs); public native void setBArguments(@Cast("bool*") @StdVector BooleanPointer tArgs); public native void setBArguments(@Cast("bool*") @StdVector boolean[] tArgs); - public native void setDArguments(@Cast("nd4j::DataType*") @StdVector IntPointer dArgs); - public native void setDArguments(@Cast("nd4j::DataType*") @StdVector IntBuffer dArgs); - public native void setDArguments(@Cast("nd4j::DataType*") @StdVector int[] dArgs); + public native void setDArguments(@Cast("sd::DataType*") @StdVector IntPointer dArgs); + public native void setDArguments(@Cast("sd::DataType*") @StdVector IntBuffer dArgs); + public native void setDArguments(@Cast("sd::DataType*") @StdVector int[] dArgs); /** * This method purges fastpath in/out contents and releases all the handles. @@ -6528,10 +6572,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define ND4J_CONTEXT_PROTOTYPE_H // #include -// #include +// #include // #include -// #include -// #include +// #include +// #include // #include // #include // #include @@ -6540,7 +6584,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #endif - @Namespace("nd4j::graph") @NoOffset public static class ContextPrototype extends Pointer { + @Namespace("sd::graph") @NoOffset public static class ContextPrototype extends Pointer { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public ContextPrototype(Pointer p) { super(p); } @@ -6564,9 +6608,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setOpDescriptor(OpDescriptor opDescriptor); - public native @Cast("nd4j::DataType") int dataType(); - public native @Cast("nd4j::DataType") int dataType(int index); - public native void setDataType(int index, @Cast("nd4j::DataType") int type); + public native @Cast("sd::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(int index); + public native void setDataType(int index, @Cast("sd::DataType") int type); public native @Cast("bool") boolean isInplace(); public native void markInplace(@Cast("bool") boolean reallyInplace); @@ -6582,7 +6626,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @StdVector DoublePointer getTArguments(); public native @StdVector IntPointer getIArguments(); public native @Cast("bool*") @StdVector BooleanPointer getBArguments(); - public native @Cast("nd4j::DataType*") @StdVector IntPointer getDArguments(); + public native @Cast("sd::DataType*") @StdVector IntPointer getDArguments(); public native @StdVector IntPointer getAxis(); public native @Cast("samediff::Engine") int engine(); @@ -6647,10 +6691,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_RESULTWRAPPER_H // #define LIBND4J_RESULTWRAPPER_H -// #include -// #include -// #include - @Namespace("nd4j::graph") @NoOffset public static class ResultWrapper extends org.nd4j.nativeblas.ResultWrapperAbstraction { +// #include +// #include +// #include + @Namespace("sd::graph") @NoOffset public static class ResultWrapper extends org.nd4j.nativeblas.ResultWrapperAbstraction { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public ResultWrapper(Pointer p) { super(p); } @@ -6699,13 +6743,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include "../dll.h" -// #include "../nd4jmalloc.h" -// #include "../templatemath.h" +// #include "system/dll.h" +// #include "system/nd4jmalloc.h" +// #include "math/templatemath.h" // #include "../helpers/logger.h" -// #include "../pointercast.h" +// #include "system/pointercast.h" // #include "../cnpy/cnpy.h" -// #include +// #include public static final int MAX_DIMENSION = 0x7fffffff; public static final int MAX_NUM_THREADS = 1024; @@ -6722,7 +6766,7 @@ public static final int PREALLOC_SIZE = 33554432; // #define INLINEDEF inline // #endif -// #include "../pairwise_util.h" +// #include "system/pairwise_util.h" // #include // #include @@ -6868,25 +6912,25 @@ public static final int PREALLOC_SIZE = 33554432; * Get the shape info buffer * for the given rank and shape. */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") 
LongPointer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] buffer); /** * Get the shape info buffer * for the given rank and shape. 
*/ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer output); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer output); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] output); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer output); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, 
@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer output); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] output); // #ifdef __CUDACC__ // #endif @@ -7939,20 +7983,20 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); - @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); - @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); - @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); + @Namespace("shape") public static native void shapeOldScalar(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); + @Namespace("shape") public static native void shapeOldScalar(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); + @Namespace("shape") public static native void shapeOldScalar(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of 
non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const 
Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7985,7 +8029,7 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); /** - * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2 * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} */ @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") LongPointer outShapeInfo); @@ -8762,7 +8806,7 @@ public static final int PREALLOC_SIZE = 33554432; // target[shape::shapeInfoLength(newRank) - 3] = 0; // target[shape::shapeInfoLength(newRank) - 2] = 0; // target[shape::shapeInfoLength(newRank) - 1] = isFOrder ? 
102 : 99; -// nd4j::ArrayOptions::setDataType(target, nd4j::ArrayOptions::dataType(oldShape)); +// sd::ArrayOptions::setDataType(target, sd::ArrayOptions::dataType(oldShape)); // delete[] olddims; // delete[] oldstrides; @@ -9090,9 +9134,9 @@ public static final int PREALLOC_SIZE = 33554432; // #define LIBND4J_SHAPELIST_H // #include -// #include -// #include - @Namespace("nd4j") @NoOffset public static class ShapeList extends Pointer { +// #include +// #include + @Namespace("sd") @NoOffset public static class ShapeList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ShapeList(Pointer p) { super(p); } @@ -9158,7 +9202,7 @@ public static final int PREALLOC_SIZE = 33554432; // #ifndef ND4J_INPUTTYPE_H // #define ND4J_INPUTTYPE_H - /** enum nd4j::ops::InputType */ + /** enum sd::ops::InputType */ public static final int InputType_BOOLEAN = 0, InputType_NUMERIC = 1, @@ -9197,7 +9241,6 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include -// #include // #include // #include // #include @@ -9208,7 +9251,7 @@ public static final int PREALLOC_SIZE = 33554432; * This class is very basic info holder for ops. bean/pojo pretty much. * */ - @Namespace("nd4j::ops") @NoOffset public static class OpDescriptor extends Pointer { + @Namespace("sd::ops") @NoOffset public static class OpDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public OpDescriptor(Pointer p) { super(p); } @@ -9292,30 +9335,30 @@ public static final int PREALLOC_SIZE = 33554432; public native void setHash(@Cast("Nd4jLong") long hash); - public native @Cast("nd4j::ops::InputType") int inputType(); + public native @Cast("sd::ops::InputType") int inputType(); - public native OpDescriptor setInputType(@Cast("nd4j::ops::InputType") int type); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntPointer dtype); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntBuffer dtype); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType*") @StdVector int[] dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntPointer dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntBuffer dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType*") @StdVector int[] dtype); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType") int dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType") int dtype); - public native OpDescriptor setAllowedInputTypes(@Cast("nd4j::DataType") int dtype); - public native OpDescriptor setAllowedOutputTypes(@Cast("nd4j::DataType") int dtype); + public native OpDescriptor setInputType(@Cast("sd::ops::InputType") int type); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType*") @StdVector IntPointer dtype); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType*") @StdVector IntBuffer dtype); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType*") @StdVector int[] dtype); + public native OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType*") @StdVector IntPointer dtype); + public native 
OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType*") @StdVector IntBuffer dtype); + public native OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType*") @StdVector int[] dtype); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType") int dtype); + public native OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType") int dtype); + public native OpDescriptor setAllowedInputTypes(@Cast("sd::DataType") int dtype); + public native OpDescriptor setAllowedOutputTypes(@Cast("sd::DataType") int dtype); public native OpDescriptor allowOverride(@Cast("bool") boolean reallyAllow); public native OpDescriptor setSameMode(@Cast("bool") boolean reallySame); - public native OpDescriptor setInputType(int idx, @Cast("nd4j::DataType") int dtype); - public native OpDescriptor setOutputType(int idx, @Cast("nd4j::DataType") int dtype); + public native OpDescriptor setInputType(int idx, @Cast("sd::DataType") int dtype); + public native OpDescriptor setOutputType(int idx, @Cast("sd::DataType") int dtype); - public native @Cast("nd4j::DataType*") @StdVector IntPointer getOutputTypesForOutput(int index); + public native @Cast("sd::DataType*") @StdVector IntPointer getOutputTypesForOutput(int index); - public native @Cast("bool") boolean checkInputMatch(int index, @Cast("nd4j::DataType") int dataType); - public native @Cast("bool") boolean checkOutputMatch(int index, @Cast("nd4j::DataType") int dataType); + public native @Cast("bool") boolean checkInputMatch(int index, @Cast("sd::DataType") int dataType); + public native @Cast("bool") boolean checkOutputMatch(int index, @Cast("sd::DataType") int dataType); public native @Cast("bool") boolean isSameMode(); public native @Cast("bool") boolean isInherit(int index); @@ -9351,16 +9394,16 @@ public static final int PREALLOC_SIZE = 33554432; // #ifndef SD_PLATFORMHELPER_H // #define SD_PLATFORMHELPER_H -// #include +// #include // #include // #include // #include -// #include -// 
#include +// #include +// #include /** * This abstract class defines methods used by platform-specific helpers implementations */ - @Namespace("nd4j::ops::platforms") @NoOffset public static class PlatformHelper extends Pointer { + @Namespace("sd::ops::platforms") @NoOffset public static class PlatformHelper extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public PlatformHelper(Pointer p) { super(p); } @@ -9433,7 +9476,7 @@ public static final int PREALLOC_SIZE = 33554432; // #include "OpDescriptor.h" // #include "DeclarableOp.h" // #include "DeclarableCustomOp.h" - @Namespace("nd4j::ops") public static class BroadcastableOp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class BroadcastableOp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public BroadcastableOp(Pointer p) { super(p); } @@ -9474,10 +9517,10 @@ public static final int PREALLOC_SIZE = 33554432; // #define LIBND4J_OPARGSHOLDER_H -// #include -// #include +// #include +// #include -@Namespace("nd4j") @NoOffset public static class OpArgsHolder extends Pointer { +@Namespace("sd") @NoOffset public static class OpArgsHolder extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public OpArgsHolder(Pointer p) { super(p); } @@ -9579,15 +9622,15 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include -// #include -// #include +// #include +// #include // #include // #include "OpDescriptor.h" // #include // #include // #include // #include -// #include +// #include // #include //#include @@ -9595,14 +9638,14 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include - @Namespace("nd4j::ops") public static native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") String file, int line, int condition, int argNumber, @Cast("char*") String format); - @Namespace("nd4j::ops") public static native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") BytePointer file, int line, int condition, int argNumber, @Cast("char*") BytePointer format); + @Namespace("sd::ops") public static native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") String file, int line, int condition, int argNumber, @Cast("char*") String format); + @Namespace("sd::ops") public static native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") BytePointer file, int line, int condition, int argNumber, @Cast("char*") BytePointer format); /** * This class is the basic building block of Graph Operations. Any CustomOp out there is built on top of this "abstract" class. * */ - @Namespace("nd4j::ops") @NoOffset public static class DeclarableOp extends Pointer { + @Namespace("sd::ops") @NoOffset public static class DeclarableOp extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public DeclarableOp(Pointer p) { super(p); } @@ -9657,40 +9700,40 @@ public static final int PREALLOC_SIZE = 33554432; public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native @Cast("Nd4jStatus") int 
execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, 
@Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector 
IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] 
bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") 
@StdVector BooleanPointer bArgs, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer 
tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("sd::DataType*") @StdVector 
IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int 
type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs); public native ResultSet execute(@Const @ByRef OpArgsHolder holder, @Cast("bool") boolean isInplace/*=false*/); @@ -9758,7 +9801,7 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include // #include - @Namespace("nd4j::ops") public static class DeclarableListOp extends DeclarableOp { + @Namespace("sd::ops") public static class DeclarableListOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public DeclarableListOp(Pointer p) { super(p); } @@ -9803,7 +9846,7 @@ public static final int PREALLOC_SIZE = 33554432; // #define LIBND4J_DECLARABLE_REDUCTION_OP_H // #include - @Namespace("nd4j::ops") public static class DeclarableReductionOp extends DeclarableOp { + @Namespace("sd::ops") public static class DeclarableReductionOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public DeclarableReductionOp(Pointer p) { super(p); } @@ -9843,7 +9886,7 @@ public static final int PREALLOC_SIZE = 33554432; // #define LIBND4J_DECLARABLECUSTOMOP_H // #include - @Namespace("nd4j::ops") public static class DeclarableCustomOp extends DeclarableOp { + @Namespace("sd::ops") public static class DeclarableCustomOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public DeclarableCustomOp(Pointer p) { super(p); } @@ -9885,7 +9928,7 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include "OpDescriptor.h" // #include "DeclarableOp.h" - @Namespace("nd4j::ops") @NoOffset public static class BooleanOp extends DeclarableOp { + @Namespace("sd::ops") @NoOffset public static class BooleanOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public BooleanOp(Pointer p) { super(p); } @@ -9938,7 +9981,7 @@ public static final int PREALLOC_SIZE = 33554432; * Their code is the part of GraphExecutioner logic. But we still want them to be expressed via Graph * \tparam T */ - @Namespace("nd4j::ops") public static class LogicOp extends DeclarableOp { + @Namespace("sd::ops") public static class LogicOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public LogicOp(Pointer p) { super(p); } @@ -9982,7 +10025,7 @@ public static final int PREALLOC_SIZE = 33554432; // #ifndef LIBND4J_OPREGISTRATOR_H // #define LIBND4J_OPREGISTRATOR_H -// #include +// #include // #include // #include // #include @@ -9993,6 +10036,10 @@ public static final int PREALLOC_SIZE = 33554432; // handlers part // #include // #include + +// #ifndef __JAVACPP_HACK__ + +// #endif /** * This class provides runtime ops lookup, based on opName or opHash. 
* To build lookup directory we use *_OP_IMPL macro, which puts static structs at compile time in .cpp files, @@ -10000,7 +10047,7 @@ public static final int PREALLOC_SIZE = 33554432; * available at runtime via this singleton. * */ - @Namespace("nd4j::ops") @NoOffset public static class OpRegistrator extends Pointer { + @Namespace("sd::ops") @NoOffset public static class OpRegistrator extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public OpRegistrator(Pointer p) { super(p); } @@ -10075,10 +10122,10 @@ public static final int PREALLOC_SIZE = 33554432; // #ifndef LIBND4J_CONTEXTBUFFERS_H // #define LIBND4J_CONTEXTBUFFERS_H -// #include -// #include +// #include +// #include // #include - @Namespace("nd4j") @NoOffset public static class ContextBuffers extends Pointer { + @Namespace("sd") @NoOffset public static class ContextBuffers extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ContextBuffers(Pointer p) { super(p); } @@ -10166,16 +10213,16 @@ public static final int PREALLOC_SIZE = 33554432; // #include "config.h" // #endif -// #include +// #include // #include -// #include +// #include // #include // #include // #include // #include // #include -@Namespace("nd4j") @NoOffset public static class LaunchContext extends Pointer { +@Namespace("sd") @NoOffset public static class LaunchContext extends Pointer { static { Loader.load(); } /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ public LaunchContext(long size) { super((Pointer)null); allocateArray(size); } @@ -10251,12 +10298,12 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include -// #include -// #include -// #include +// #include +// #include +// #include // #include -@Namespace("nd4j") @NoOffset public static class ShapeDescriptor extends Pointer { +@Namespace("sd") @NoOffset public static class ShapeDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ShapeDescriptor(Pointer p) { super(p); } @@ -10281,12 +10328,12 @@ public static final int PREALLOC_SIZE = 33554432; private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("bool") boolean inheritDtype/*=true*/); public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo); - public ShapeDescriptor(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } - private native void allocate(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride); - public ShapeDescriptor(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } - private native void allocate(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride); - public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } - private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride); + public ShapeDescriptor(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int 
dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } + private native void allocate(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtypeOverride); + public ShapeDescriptor(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } + private native void allocate(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtypeOverride); + public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } + private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtypeOverride); public ShapeDescriptor(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } private native void allocate(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer dtypeOverride); public ShapeDescriptor(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } @@ -10299,38 +10346,38 @@ public static final int PREALLOC_SIZE = 33554432; private native void allocate(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer dtypeOverride, @Cast("const Nd4jLong*") LongBuffer orderOverride); public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] dtypeOverride, @Cast("const Nd4jLong*") long[] orderOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride, orderOverride); } private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] dtypeOverride, @Cast("const Nd4jLong*") long[] orderOverride); - public 
ShapeDescriptor(@Cast("const nd4j::DataType") int type, @Cast("const Nd4jLong") long length) { super((Pointer)null); allocate(type, length); } - private native void allocate(@Cast("const nd4j::DataType") int type, @Cast("const Nd4jLong") long length); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, @Cast("Nd4jLong") 
long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape) { super((Pointer)null); allocate(type, order, shape); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape) { super((Pointer)null); allocate(type, order, shape); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape) { super((Pointer)null); allocate(type, order, shape); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, 
@Cast("Nd4jLong*") @StdVector LongPointer strides) { super((Pointer)null); allocate(type, order, shape, strides); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides) { super((Pointer)null); allocate(type, order, shape, strides); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides) { super((Pointer)null); allocate(type, order, shape, strides); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, 
@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews); + public ShapeDescriptor(@Cast("const sd::DataType") int type, @Cast("const Nd4jLong") long length) { super((Pointer)null); allocate(type, length); } + private native void allocate(@Cast("const sd::DataType") int type, @Cast("const Nd4jLong") long length); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer 
strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape) { super((Pointer)null); allocate(type, order, shape); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape) { super((Pointer)null); allocate(type, order, 
shape); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape) { super((Pointer)null); allocate(type, order, shape); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides) { super((Pointer)null); allocate(type, order, shape, strides); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides) { super((Pointer)null); allocate(type, order, shape, strides); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides) { super((Pointer)null); allocate(type, order, shape, strides); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } + private native void 
allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews); public ShapeDescriptor() { super((Pointer)null); allocate(); } private native void allocate(); @@ -10338,7 +10385,7 @@ public static final int PREALLOC_SIZE = 33554432; public native @Cast("Nd4jLong") long ews(); public native @Cast("Nd4jLong") long arrLength(); public native char order(); - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); public native @Cast("bool") boolean isEmpty(); public native @Cast("Nd4jLong*") @StdVector LongPointer shape(); public native @Cast("Nd4jLong*") @StdVector LongPointer strides(); @@ -10357,13 +10404,15 @@ public static final int PREALLOC_SIZE = 33554432; public native @Cast("Nd4jLong*") LongPointer toShapeInfo(); - public static native @ByVal ShapeDescriptor emptyDescriptor(@Cast("const nd4j::DataType") int type); - public static 
native @ByVal ShapeDescriptor scalarDescriptor(@Cast("const nd4j::DataType") int type); - public static native @ByVal ShapeDescriptor vectorDescriptor(@Cast("const Nd4jLong") long length, @Cast("const nd4j::DataType") int type); + public static native @ByVal ShapeDescriptor emptyDescriptor(@Cast("const sd::DataType") int type); + public static native @ByVal ShapeDescriptor scalarDescriptor(@Cast("const sd::DataType") int type); + public static native @ByVal ShapeDescriptor vectorDescriptor(@Cast("const Nd4jLong") long length, @Cast("const sd::DataType") int type); } +// #ifndef __JAVACPP_HACK__ +// #endif // #endif //DEV_TESTS_SHAPEDESCRIPTOR_H @@ -10395,8 +10444,8 @@ public static final int PREALLOC_SIZE = 33554432; // #define DEV_TESTS_TADDESCRIPTOR_H // #include "ShapeDescriptor.h" -// #include - @Namespace("nd4j") @NoOffset public static class TadDescriptor extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class TadDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public TadDescriptor(Pointer p) { super(p); } @@ -10441,10 +10490,15 @@ public static final int PREALLOC_SIZE = 33554432; public native @StdVector IntPointer axis(); public native @ByRef ShapeDescriptor originalShape(); + public native @Const @ByRef ShapeDescriptor originalShapeConst(); public native @Cast("bool") boolean areUnitiesinShape(); } +// #ifndef __JAVACPP_HACK__ + +// #endif + // #endif //DEV_TESTS_TADDESCRIPTOR_H @@ -10474,18 +10528,18 @@ public static final int PREALLOC_SIZE = 33554432; // #ifndef LIBND4J__DEBUG_INFO_HELPER__H // #define LIBND4J__DEBUG_INFO_HELPER__H -// #include -// #include -// #include -// #include +// #include +// #include +// #include +// #include // #include -// #include -// #include +// #include +// #include // #ifdef __CUDACC__ // #endif - @Namespace("nd4j") public static class DebugInfo extends Pointer { + @Namespace("sd") public static class DebugInfo extends Pointer { static { Loader.load(); } /** Default native constructor. */ public DebugInfo() { super((Pointer)null); allocate(); } @@ -10510,7 +10564,7 @@ public static final int PREALLOC_SIZE = 33554432; public native @Cast("Nd4jLong") long _nanCount(); public native DebugInfo _nanCount(long setter); } - @Namespace("nd4j") public static native @Cast("bool") @Name("operator ==") boolean equals(@Const @ByRef DebugInfo first, @Const @ByRef DebugInfo second); + @Namespace("sd") public static native @Cast("bool") @Name("operator ==") boolean equals(@Const @ByRef DebugInfo first, @Const @ByRef DebugInfo second); @@ -10567,17 +10621,17 @@ public static final int PREALLOC_SIZE = 33554432; // #include // #include // #include -// #include +// #include // #include // #include -// #include +// #include // #include // #include -// #include +// #include // #include -// #include -// #include - @Namespace("nd4j") public static class _loader extends Pointer { +// #include +// #include + @Namespace("sd") public static class _loader extends Pointer { static { Loader.load(); } /** Pointer 
cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public _loader(Pointer p) { super(p); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java index cdfefce31..1f3f7bde4 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCudaPresets.java @@ -41,9 +41,9 @@ import org.bytedeco.javacpp.tools.InfoMapper; "execution/Engine.h", "execution/ExecutionMode.h", "memory/MemoryType.h", - "Environment.h", + "system/Environment.h", "types/utf8string.h", - "NativeOps.h", + "legacy/NativeOps.h", "memory/ExternalWorkspace.h", "memory/Workspace.h", "indexing/NDIndex.h", @@ -53,7 +53,7 @@ import org.bytedeco.javacpp.tools.InfoMapper; "graph/ArgumentsList.h", "types/pair.h", "types/pair.h", - "NDArray.h", + "array/NDArray.h", "array/NDArrayList.h", "array/ResultSet.h", "graph/RandomGenerator.h", @@ -72,7 +72,7 @@ import org.bytedeco.javacpp.tools.InfoMapper; "graph/ResultWrapper.h", "helpers/shape.h", "array/ShapeList.h", - //"op_boilerplate.h", + //"system/op_boilerplate.h", "ops/InputType.h", "ops/declarable/OpDescriptor.h", "ops/declarable/PlatformHelper.h", @@ -185,38 +185,38 @@ public class Nd4jCudaPresets implements LoadEnabled, InfoMapper { infoMap.put(new Info("__CUDACC__", "MAX_UINT", "HAVE_MKLDNN").define(false)) .put(new Info("__JAVACPP_HACK__", "LIBND4J_ALL_OPS","__CUDABLAS__").define(true)) - .put(new Info("std::initializer_list", "cnpy::NpyArray", "nd4j::NDArray::applyLambda", "nd4j::NDArray::applyPairwiseLambda", - "nd4j::graph::FlatResult", "nd4j::graph::FlatVariable", "nd4j::NDArray::subarray").skip()) + .put(new Info("std::initializer_list", "cnpy::NpyArray", "sd::NDArray::applyLambda", "sd::NDArray::applyPairwiseLambda", + 
"sd::graph::FlatResult", "sd::graph::FlatVariable", "sd::NDArray::subarray").skip()) .put(new Info("std::string").annotations("@StdString").valueTypes("BytePointer", "String") .pointerTypes("@Cast({\"char*\", \"std::string*\"}) BytePointer")) .put(new Info("std::pair").pointerTypes("IntIntPair").define()) .put(new Info("std::vector >").pointerTypes("IntVectorVector").define()) .put(new Info("std::vector >").pointerTypes("LongVectorVector").define()) - .put(new Info("std::vector").pointerTypes("NDArrayVector").define()) - .put(new Info("std::vector").pointerTypes("ConstNDArrayVector").define()) + .put(new Info("std::vector").pointerTypes("NDArrayVector").define()) + .put(new Info("std::vector").pointerTypes("ConstNDArrayVector").define()) .put(new Info("bool").cast().valueTypes("boolean").pointerTypes("BooleanPointer", "boolean[]")) - .put(new Info("nd4j::graph::ResultWrapper").base("org.nd4j.nativeblas.ResultWrapperAbstraction").define()) - .put(new Info("nd4j::IndicesList").purify()); + .put(new Info("sd::graph::ResultWrapper").base("org.nd4j.nativeblas.ResultWrapperAbstraction").define()) + .put(new Info("sd::IndicesList").purify()); /* String classTemplates[] = { - "nd4j::NDArray", - "nd4j::NDArrayList", - "nd4j::ResultSet", - "nd4j::OpArgsHolder", - "nd4j::graph::GraphState", - "nd4j::graph::Variable", - "nd4j::graph::VariablesSet", - "nd4j::graph::Stash", - "nd4j::graph::VariableSpace", - "nd4j::graph::Context", - "nd4j::graph::ContextPrototype", - "nd4j::ops::DeclarableOp", - "nd4j::ops::DeclarableListOp", - "nd4j::ops::DeclarableReductionOp", - "nd4j::ops::DeclarableCustomOp", - "nd4j::ops::BooleanOp", - "nd4j::ops::BroadcastableOp", - "nd4j::ops::LogicOp"}; + "sd::NDArray", + "sd::NDArrayList", + "sd::ResultSet", + "sd::OpArgsHolder", + "sd::graph::GraphState", + "sd::graph::Variable", + "sd::graph::VariablesSet", + "sd::graph::Stash", + "sd::graph::VariableSpace", + "sd::graph::Context", + "sd::graph::ContextPrototype", + "sd::ops::DeclarableOp", + 
"sd::ops::DeclarableListOp", + "sd::ops::DeclarableReductionOp", + "sd::ops::DeclarableCustomOp", + "sd::ops::BooleanOp", + "sd::ops::BroadcastableOp", + "sd::ops::LogicOp"}; for (String t : classTemplates) { String s = t.substring(t.lastIndexOf(':') + 1); infoMap.put(new Info(t + "").pointerTypes("Float" + s)) @@ -225,6 +225,6 @@ public class Nd4jCudaPresets implements LoadEnabled, InfoMapper { } */ - infoMap.put(new Info("nd4j::ops::OpRegistrator::updateMSVC").skip()); + infoMap.put(new Info("sd::ops::OpRegistrator::updateMSVC").skip()); } } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml index 48cdc3e03..f3418657e 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/pom.xml @@ -235,7 +235,7 @@ - ${libnd4jhome}/blas/NativeOps.h + ${libnd4jhome}/include/legacy/NativeOps.h ${libnd4jhome}/blasbuild/cpu/blas !!! You have to compile libnd4j with cpu support first! diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java index 71614c20f..c8fbb724d 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpu.java @@ -110,7 +110,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { } } -@Name("std::vector") public static class ConstNDArrayVector extends Pointer { +@Name("std::vector") public static class ConstNDArrayVector extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public ConstNDArrayVector(Pointer p) { super(p); } @@ -178,7 +178,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { } } -@Name("std::vector") public static class NDArrayVector extends Pointer { +@Name("std::vector") public static class NDArrayVector extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDArrayVector(Pointer p) { super(p); } @@ -274,7 +274,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #ifndef DEV_TESTS_MEMORYTYPE_H // #define DEV_TESTS_MEMORYTYPE_H - /** enum nd4j::memory::MemoryType */ + /** enum sd::memory::MemoryType */ public static final int HOST = 0, DEVICE = 10; @@ -308,7 +308,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #ifndef ND4J_DATATYPE_H // #define ND4J_DATATYPE_H - /** enum nd4j::DataType */ + /** enum sd::DataType */ public static final int INHERIT = 0, BOOL = 1, @@ -364,14 +364,14 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define DEV_TESTS_DATABUFFER_H // #include -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include // #include -@Namespace("nd4j") @NoOffset public static class DataBuffer extends Pointer { +@Namespace("sd") @NoOffset public static class DataBuffer extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public DataBuffer(Pointer p) { super(p); } @@ -384,46 +384,46 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { public DataBuffer(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, @Cast("const bool") boolean isOwnerSpecial/*=false*/, Workspace workspace/*=nullptr*/) { super((Pointer)null); allocate(primary, special, lenInBytes, dataType, isOwnerPrimary, isOwnerSpecial, workspace); } private native void allocate(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, @Cast("const bool") boolean isOwnerSpecial/*=false*/, Workspace workspace/*=nullptr*/); public DataBuffer(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType) { super((Pointer)null); allocate(primary, special, lenInBytes, dataType); } + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType) { super((Pointer)null); allocate(primary, special, lenInBytes, dataType); } private native void allocate(Pointer primary, Pointer special, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType); + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType); public DataBuffer(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, Workspace workspace/*=nullptr*/) { super((Pointer)null); allocate(primary, lenInBytes, dataType, isOwnerPrimary, workspace); } private native void 
allocate(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, @Cast("const bool") boolean isOwnerPrimary/*=false*/, Workspace workspace/*=nullptr*/); public DataBuffer(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType) { super((Pointer)null); allocate(primary, lenInBytes, dataType); } + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType) { super((Pointer)null); allocate(primary, lenInBytes, dataType); } private native void allocate(Pointer primary, - @Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType); + @Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType); public DataBuffer(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes, + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes, Workspace workspace/*=nullptr*/) { super((Pointer)null); allocate(hostBuffer, dataType, lenInBytes, workspace); } private native void allocate(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes, + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes, Workspace workspace/*=nullptr*/); public DataBuffer(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes) { super((Pointer)null); allocate(hostBuffer, dataType, lenInBytes); } + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes) { super((Pointer)null); allocate(hostBuffer, dataType, lenInBytes); } private native void allocate(@Const Pointer hostBuffer, - @Cast("const nd4j::DataType") int dataType, @Cast("const size_t") long lenInBytes); + @Cast("const sd::DataType") int dataType, @Cast("const size_t") long lenInBytes); 
- public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, Workspace workspace/*=nullptr*/, @Cast("const bool") boolean allocBoth/*=false*/) { super((Pointer)null); allocate(lenInBytes, dataType, workspace, allocBoth); } - private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType, Workspace workspace/*=nullptr*/, @Cast("const bool") boolean allocBoth/*=false*/); - public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType) { super((Pointer)null); allocate(lenInBytes, dataType); } - private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const nd4j::DataType") int dataType); + public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, Workspace workspace/*=nullptr*/, @Cast("const bool") boolean allocBoth/*=false*/) { super((Pointer)null); allocate(lenInBytes, dataType, workspace, allocBoth); } + private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType, Workspace workspace/*=nullptr*/, @Cast("const bool") boolean allocBoth/*=false*/); + public DataBuffer(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType) { super((Pointer)null); allocate(lenInBytes, dataType); } + private native void allocate(@Cast("const size_t") long lenInBytes, @Cast("const sd::DataType") int dataType); public DataBuffer(@Const @ByRef DataBuffer other) { super((Pointer)null); allocate(other); } private native void allocate(@Const @ByRef DataBuffer other); @@ -432,8 +432,8 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { public native @ByRef @Name("operator =") DataBuffer put(@Const @ByRef DataBuffer other); - public native @Cast("nd4j::DataType") int getDataType(); - public native void setDataType(@Cast("nd4j::DataType") int dataType); + public native @Cast("sd::DataType") int getDataType(); + public 
native void setDataType(@Cast("sd::DataType") int dataType); public native @Cast("size_t") long getLenInBytes(); public native Pointer primary(); @@ -514,9 +514,9 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #ifndef LIBND4J_CONSTANTDATABUFFER_H // #define LIBND4J_CONSTANTDATABUFFER_H -// #include -// #include - @Namespace("nd4j") @NoOffset public static class ConstantDataBuffer extends Pointer { +// #include +// #include + @Namespace("sd") @NoOffset public static class ConstantDataBuffer extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ConstantDataBuffer(Pointer p) { super(p); } @@ -575,10 +575,10 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #include // #include // #include -// #include -// #include +// #include +// #include // #include - @Namespace("nd4j") @NoOffset public static class ConstantDescriptor extends Pointer { + @Namespace("sd") @NoOffset public static class ConstantDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ConstantDescriptor(Pointer p) { super(p); } @@ -625,6 +625,10 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { } +// #ifndef __JAVACPP_HACK__ + +// #endif + // #endif //DEV_TESTS_CONSTANTDESCRIPTOR_H @@ -655,7 +659,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define DEV_TESTS_TADPACK_H // #include "ConstantDataBuffer.h" - @Namespace("nd4j") @NoOffset public static class TadPack extends Pointer { + @Namespace("sd") @NoOffset public static class TadPack extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public TadPack(Pointer p) { super(p); } @@ -719,7 +723,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define DEV_TESTS_ERRORREFERENCE_H // #include -// #include +// #include @Namespace("sd") @NoOffset public static class ErrorReference extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ @@ -814,7 +818,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #endif //SD_EXECUTIONMODE_H -// Parsed from Environment.h +// Parsed from system/Environment.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. @@ -841,12 +845,12 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #include // #include -// #include +// #include // #include // #include // #include -// #include - @Namespace("nd4j") @NoOffset public static class Environment extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class Environment extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Environment(Pointer p) { super(p); } @@ -911,8 +915,8 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { public native @Cast("bool") boolean isUseMKLDNN(); public native void setUseMKLDNN(@Cast("bool") boolean useMKLDNN); - public native @Cast("nd4j::DataType") int defaultFloatDataType(); - public native void setDefaultFloatDataType(@Cast("nd4j::DataType") int dtype); + public native @Cast("sd::DataType") int defaultFloatDataType(); + public native void setDefaultFloatDataType(@Cast("sd::DataType") int dtype); public native @Cast("bool") boolean precisionBoostAllowed(); public native void allowPrecisionBoost(@Cast("bool") boolean reallyAllow); @@ -959,8 +963,8 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #define DEV_TESTS_UTF8STRING_H // #include -// #include - @Namespace("nd4j") @NoOffset public static class utf8string extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class utf8string extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public utf8string(Pointer p) { super(p); } @@ -995,7 +999,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #endif //DEV_TESTS_UTF8STRING_H -// Parsed from NativeOps.h +// Parsed from legacy/NativeOps.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. 
@@ -1041,9 +1045,9 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { #endif */ -// #include +// #include // #include -// #include +// #include //DO NOT REMOVE: THIS IS AN EDITOR SEMANTICS THING FOR CLION //IT DEFINES THE EXPORT MACRO FOR THE EDITOR AND THEN @@ -1053,7 +1057,7 @@ public class Nd4jCpu extends org.nd4j.nativeblas.Nd4jCpuHelper { // #else // #define ND4J_EXPORT // #endif -// #include +// #include /* int tad_threshold = 1; @@ -1074,7 +1078,7 @@ bool verbose = false; // #include // #include // #include -// #include +// #include // #include /** @@ -2424,7 +2428,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPointers, int opNum, @Cast("void**") @ByPtrPtr Pointer arguments, @@ -2437,7 +2441,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPointers, int opNum, @Cast("void**") @ByPtrPtr Pointer arguments, @@ -2450,7 +2454,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPointers, int opNum, @Cast("void**") @ByPtrPtr Pointer arguments, @@ -2463,7 +2467,7 @@ public native void execAggregate(@Cast("Nd4jPointer*") PointerPointer extraPoint int numIntArrays, Pointer realArguments, int numRealArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void 
batchExecutor(@Cast("Nd4jPointer*") PointerPointer extraPointers, @@ -2476,7 +2480,7 @@ public native void batchExecutor(@Cast("Nd4jPointer*") PointerPointer extraPoint int maxIdx, int maxReals, Pointer ptrToArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); public native void execAggregateBatch(@Cast("Nd4jPointer*") PointerPointer extraPointers, int numAggregates, @@ -2488,7 +2492,7 @@ public native void execAggregateBatch(@Cast("Nd4jPointer*") PointerPointer extra int maxIdx, int maxReals, Pointer ptrToArguments, - @Cast("nd4j::DataType") int dtype); + @Cast("sd::DataType") int dtype); /** * Random operations @@ -3069,17 +3073,17 @@ public native void inspectArray(@Cast("Nd4jPointer*") PointerPointer extraPointe public native void inspectArray(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jPointer") Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("Nd4jPointer") Pointer specialBuffer, @Cast("Nd4jLong*") LongBuffer specialShapeInfo, @Cast("Nd4jPointer") Pointer debugInfo); public native void inspectArray(@Cast("Nd4jPointer*") PointerPointer extraPointers, @Cast("Nd4jPointer") Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, @Cast("Nd4jPointer") Pointer specialBuffer, @Cast("Nd4jLong*") long[] specialShapeInfo, @Cast("Nd4jPointer") Pointer debugInfo); -public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer strides, @Cast("nd4j::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); -public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer strides, @Cast("nd4j::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); -public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] strides, @Cast("nd4j::DataType") int dtype, char 
order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); +public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer strides, @Cast("sd::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); +public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer strides, @Cast("sd::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); +public native OpaqueConstantDataBuffer shapeBuffer(int rank, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] strides, @Cast("sd::DataType") int dtype, char order, @Cast("Nd4jLong") long ews, @Cast("bool") boolean empty); -public native OpaqueConstantDataBuffer constantBufferLong(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer data, int length); -public native OpaqueConstantDataBuffer constantBufferLong(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer data, int length); -public native OpaqueConstantDataBuffer constantBufferLong(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] data, int length); -public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("nd4j::DataType") int dtype, DoublePointer data, int length); -public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("nd4j::DataType") int dtype, DoubleBuffer data, int length); -public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("nd4j::DataType") int dtype, double[] data, int length); -public native OpaqueConstantDataBuffer constantBuffer(@Cast("nd4j::DataType") int dtype, ConstantDescriptor descriptor); +public native OpaqueConstantDataBuffer constantBufferLong(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer data, int length); +public native OpaqueConstantDataBuffer constantBufferLong(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer data, int length); 
+public native OpaqueConstantDataBuffer constantBufferLong(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] data, int length); +public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("sd::DataType") int dtype, DoublePointer data, int length); +public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("sd::DataType") int dtype, DoubleBuffer data, int length); +public native OpaqueConstantDataBuffer constantBufferDouble(@Cast("sd::DataType") int dtype, double[] data, int length); +public native OpaqueConstantDataBuffer constantBuffer(@Cast("sd::DataType") int dtype, ConstantDescriptor descriptor); public native @Cast("Nd4jPointer") Pointer getConstantDataBufferPrimary(OpaqueConstantDataBuffer dbf); public native @Cast("Nd4jPointer") Pointer getConstantDataBufferSpecial(OpaqueConstantDataBuffer dbf); @@ -3192,9 +3196,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_EXTERNALWORKSPACE_H // #define LIBND4J_EXTERNALWORKSPACE_H -// #include -// #include - @Namespace("nd4j::memory") @NoOffset public static class ExternalWorkspace extends Pointer { +// #include +// #include + @Namespace("sd::memory") @NoOffset public static class ExternalWorkspace extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ExternalWorkspace(Pointer p) { super(p); } @@ -3253,13 +3257,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include -// #include +// #include +// #include // #include // #include // #include - @Namespace("nd4j::memory") @NoOffset public static class Workspace extends Pointer { + @Namespace("sd::memory") @NoOffset public static class Workspace extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Workspace(Pointer p) { super(p); } @@ -3297,7 +3301,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // bool resizeSupported(); public native Pointer allocateBytes(@Cast("Nd4jLong") long numBytes); - public native Pointer allocateBytes(@Cast("nd4j::memory::MemoryType") int type, @Cast("Nd4jLong") long numBytes); + public native Pointer allocateBytes(@Cast("sd::memory::MemoryType") int type, @Cast("Nd4jLong") long numBytes); public native void scopeIn(); public native void scopeOut(); @@ -3338,10 +3342,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_NDINDEX_H // #define LIBND4J_NDINDEX_H -// #include +// #include // #include -// #include - @Namespace("nd4j") @NoOffset public static class NDIndex extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class NDIndex extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDIndex(Pointer p) { super(p); } @@ -3368,7 +3372,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public static native NDIndex interval(@Cast("Nd4jLong") long start, @Cast("Nd4jLong") long end); } - @Namespace("nd4j") public static class NDIndexAll extends NDIndex { + @Namespace("sd") public static class NDIndexAll extends NDIndex { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDIndexAll(Pointer p) { super(p); } @@ -3385,7 +3389,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); } - @Namespace("nd4j") public static class NDIndexPoint extends NDIndex { + @Namespace("sd") public static class NDIndexPoint extends NDIndex { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public NDIndexPoint(Pointer p) { super(p); } @@ -3395,7 +3399,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @Cast("bool") boolean isInterval(); } - @Namespace("nd4j") public static class NDIndexInterval extends NDIndex { + @Namespace("sd") public static class NDIndexInterval extends NDIndex { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDIndexInterval(Pointer p) { super(p); } @@ -3440,7 +3444,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include "NDIndex.h" - @Namespace("nd4j") @NoOffset public static class IndicesList extends Pointer { + @Namespace("sd") @NoOffset public static class IndicesList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public IndicesList(Pointer p) { super(p); } @@ -3479,7 +3483,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef ND4J_VARIABLE_TYPE_H // #define ND4J_VARIABLE_TYPE_H - /** enum nd4j::graph::VariableType */ + /** enum sd::graph::VariableType */ public static final int NDARRAY = 0, ARRAY_LIST = 1, @@ -3516,12 +3520,12 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_INPUTLIST_H // #define LIBND4J_INPUTLIST_H -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class ArgumentsList extends Pointer { + @Namespace("sd::graph") @NoOffset public static class ArgumentsList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public ArgumentsList(Pointer p) { super(p); } @@ -3581,8 +3585,8 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_PAIR_H // #define LIBND4J_PAIR_H -// #include - @Namespace("nd4j") @NoOffset public static class Pair extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class Pair extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Pair(Pointer p) { super(p); } @@ -3607,7 +3611,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #endif //LIBND4J_PAIR_H -// Parsed from NDArray.h +// Parsed from array/NDArray.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. @@ -3628,11 +3632,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef NDARRAY_H // #define NDARRAY_H -// #include +// #include // #include // #include -// #include -// #include "NativeOpExecutioner.h" +// #include +// #include "legacy/NativeOpExecutioner.h" // #include // #include // #include @@ -3643,13 +3647,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include // #include -// #include -// #include +// #include +// #include // #include // #include // #include @@ -3660,9 +3664,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); - @Namespace("nd4j") public static native @ByVal NDArray mmul(@Const @ByRef NDArray arg0, @Const @ByRef NDArray arg1); + @Namespace("sd") public static native @ByVal NDArray mmul(@Const @ByRef NDArray arg0, @Const @ByRef NDArray arg1); - @Namespace("nd4j") @NoOffset public static class NDArray extends Pointer { + @Namespace("sd") @NoOffset public static class NDArray extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public NDArray(Pointer p) { super(p); } @@ -3680,16 +3684,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * do not allocate memory, memory for array is passed from outside */ - public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } - private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } + private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo) { super((Pointer)null); allocate(buffer, shapeInfo); } private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongPointer shapeInfo); - public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } - private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") 
boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } + private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); public NDArray(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo) { super((Pointer)null); allocate(buffer, shapeInfo); } private native void allocate(Pointer buffer, @Cast("Nd4jLong*") LongBuffer shapeInfo); - public NDArray(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } - private native void allocate(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, shapeInfo, context, isBuffAlloc); } + private native void allocate(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); public NDArray(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(buffer, shapeInfo); } private native void allocate(Pointer buffer, @Cast("Nd4jLong*") long[] shapeInfo); @@ -3697,16 +3701,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * do not allocate memory, memory for array is passed from outside * we suppose the content of both (device and host) buffers is identical */ - public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext 
context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } - private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); + public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } + private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo); } private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongPointer shapeInfo); - public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } - private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, 
@Cast("const bool") boolean isBuffDAlloc/*=false*/); + public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } + private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo); } private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") LongBuffer shapeInfo); - public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } - private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); + public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo, context, isBuffAlloc, isBuffDAlloc); } + private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") 
long[] shapeInfo, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/, @Cast("const bool") boolean isBuffDAlloc/*=false*/); public NDArray(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(buffer, bufferD, shapeInfo); } private native void allocate(Pointer buffer, Pointer bufferD, @Cast("Nd4jLong*") long[] shapeInfo); @@ -3730,16 +3734,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently */ - public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo); - public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); 
allocate(shapeInfo, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo); - public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(@Cast("Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo); @@ -3747,66 +3751,66 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * constructor creates new NDArray using shape information from 
"shapeInfo", set all elements in new array to be zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently * set dtype as array type */ - public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); - public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } - private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); - public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } - private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { 
super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } - private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); - public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } - private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype); + public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } + private native void allocate(@Cast("Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype); + public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, 
dtype); } + private native void allocate(@Cast("Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype); + public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(shapeInfo, dtype, copyStrides, context); } + private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); + public NDArray(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype) { super((Pointer)null); allocate(shapeInfo, dtype); } + private native void allocate(@Cast("Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype); /** * this constructor creates new array using shape information contained in vector argument */ - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape) { 
super((Pointer)null); allocate(order, shape); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape) { super((Pointer)null); allocate(order, shape); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, dtype, context); } + 
private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape) { super((Pointer)null); allocate(order, shape); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); /** * This constructor creates new array with elements copied from data and using shape information stored in shape, elements from data will be casted to dtype */ - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data) { super((Pointer)null); allocate(order, shape, data); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @StdVector DoublePointer data); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, 
@StdVector DoubleBuffer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data) { super((Pointer)null); allocate(order, shape, data); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @StdVector DoubleBuffer data); - public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(order, shape, data, dtype, context); } - private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("nd4j::DataType") int dtype/*=nd4j::DOUBLE*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { 
super((Pointer)null); allocate(order, shape, data, dtype, context); } + private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data, @Cast("sd::DataType") int dtype/*=sd::DOUBLE*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); public NDArray(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data) { super((Pointer)null); allocate(order, shape, data); } private native void allocate(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @StdVector double[] data); /** * this constructor creates new array using given buffer (without memory allocation) and shape information stored in shape */ - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("nd4j::DataType") int dtype); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } - private native void allocate(Pointer buffer, 
byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("nd4j::DataType") int dtype); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); - public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } - private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("nd4j::DataType") int dtype); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, 
@Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("sd::DataType") int dtype); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype) { super((Pointer)null); allocate(buffer, order, shape, dtype); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("sd::DataType") int dtype); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/) { super((Pointer)null); allocate(buffer, order, shape, dtype, context, isBuffAlloc); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isBuffAlloc/*=false*/); + public NDArray(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype) { 
super((Pointer)null); allocate(buffer, order, shape, dtype); } + private native void allocate(Pointer buffer, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("sd::DataType") int dtype); /** * This method returns new array with the same shape & data type @@ -3825,16 +3829,16 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * this constructor creates new NDArray with shape matching "other" array, * doesn't copy "other" elements into new array !!! */ - public NDArray(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(other, copyStrides, context); } - private native void allocate(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/); + public NDArray(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/) { super((Pointer)null); allocate(other, copyStrides, context); } + private native void allocate(@Const NDArray other, @Cast("const bool") boolean copyStrides/*=false*/, LaunchContext context/*=sd::LaunchContext::defaultContext()*/); /** * this constructor creates scalar(and set its value = 0) or empty array depending on bool argument isScalar */ - public NDArray(@Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/) { super((Pointer)null); allocate(dtype, context, isScalar); } - private native void allocate(@Cast("nd4j::DataType") int dtype, LaunchContext context/*=nd4j::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/); - public NDArray(@Cast("nd4j::DataType") int dtype) { super((Pointer)null); allocate(dtype); } - private native void allocate(@Cast("nd4j::DataType") int dtype); + public NDArray(@Cast("sd::DataType") int dtype, 
LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/) { super((Pointer)null); allocate(dtype, context, isScalar); } + private native void allocate(@Cast("sd::DataType") int dtype, LaunchContext context/*=sd::LaunchContext::defaultContext()*/, @Cast("const bool") boolean isScalar/*=true*/); + public NDArray(@Cast("sd::DataType") int dtype) { super((Pointer)null); allocate(dtype); } + private native void allocate(@Cast("sd::DataType") int dtype); /** * This method blocks until asynchronous operation finishes @@ -3947,9 +3951,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * cast array elements to given dtype */ - public native @ByVal NDArray cast(@Cast("nd4j::DataType") int dtype); + public native @ByVal NDArray cast(@Cast("sd::DataType") int dtype); - public native void cast(@ByRef NDArray target, @Cast("nd4j::DataType") int dtype); + public native void cast(@ByRef NDArray target, @Cast("sd::DataType") int dtype); /** * returns _context @@ -4039,6 +4043,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByVal NDArray permute(@Const IntPointer dimensions, int rank); public native @ByVal NDArray permute(@Const IntBuffer dimensions, int rank); public native @ByVal NDArray permute(@Const int[] dimensions, int rank); + + + public native void permute(@Const IntPointer dimensions, int rank, @ByRef NDArray target); public native void permute(@Const IntBuffer dimensions, int rank, @ByRef NDArray target); @@ -4052,6 +4059,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByVal NDArray permute(@Cast("const Nd4jLong*") LongPointer dimensions, int rank); public native @ByVal NDArray permute(@Cast("const Nd4jLong*") LongBuffer dimensions, int rank); public native @ByVal NDArray permute(@Cast("const Nd4jLong*") long[] dimensions, int rank); + + + public native void permute(@Cast("const Nd4jLong*") LongPointer dimensions, int rank, 
@ByRef NDArray target); public native void permute(@Cast("const Nd4jLong*") LongBuffer dimensions, int rank, @ByRef NDArray target); @@ -4145,6 +4155,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * apply transpose operation to the copy of this array, that is this array remains unaffected */ public native @ByVal NDArray transpose(); + /** * perform transpose operation and store result in target, this array remains unaffected @@ -4276,12 +4287,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * * if permute have been applied before or there are weird strides, then new buffer is allocated for new array */ - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); - public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] 
shape, @Cast("const bool") boolean copyToNewBuff/*=true*/); + public native @ByVal NDArray reshape(byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + /** * calculate strides and set given order @@ -4391,6 +4403,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * negative operator, it changes sign of all array elements on opposite */ public native @ByVal @Name("operator -") NDArray subtract(); + /** * pairwise multiplication unary operator array *= other @@ -4470,7 +4483,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef __JAVACPP_HACK__ // #endif - public native @ByVal NDArray asT(@Cast("nd4j::DataType") int dtype); + public native @ByVal NDArray asT(@Cast("sd::DataType") int dtype); public native void linspace(double start); @@ -4498,9 +4511,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setShapeInfo(@Cast("const Nd4jLong*") LongPointer shapeInfo); public native void setShapeInfo(@Cast("const Nd4jLong*") LongBuffer shapeInfo); public native void setShapeInfo(@Cast("const Nd4jLong*") long[] shapeInfo); - public native void setShapeInfo(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public native void setShapeInfo(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtype); - public native void setShapeInfo(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtype); + public native void setShapeInfo(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtype); + public native void setShapeInfo(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtype); + public native void setShapeInfo(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtype); public native void setShapeInfo(@Const @ByRef ShapeDescriptor descriptor); public native void setShapeInfo(@Const @ByRef ConstantDataBuffer shapeBuffer); @@ 
-4695,7 +4708,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); * Returns data type of this array * @return */ - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); /** * This method returns true if value is from Integer space @@ -4892,7 +4905,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #if defined(__CUDACC__) //&& defined(BUILD_TESTS) // for CUDA we need stil stuff inline -// #include "cuda/NDArrayLambda.hpp" +// #include // #endif @@ -4930,10 +4943,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include -// #include - @Namespace("nd4j") @NoOffset public static class NDArrayList extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class NDArrayList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NDArrayList(Pointer p) { super(p); } @@ -4943,7 +4956,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public NDArrayList(int height) { super((Pointer)null); allocate(height); } private native void allocate(int height); - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); public native NDArray read(int idx); public native NDArray readRaw(int idx); @@ -4960,7 +4973,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByRef IntIntPair id(); public native @StdString @ByRef @Cast({"char*", "std::string*"}) BytePointer name(); - //nd4j::memory::Workspace* workspace(); + //sd::memory::Workspace* workspace(); public native LaunchContext context(); public native NDArrayList clone(); @@ -4998,7 +5011,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // // PLESE NOTE: It will delete all stored NDArrays upon destructor call // -// Created by raver119 on 07.09.17. 
+// @author raver119@gmail.com // // #ifndef LIBND4J_RESULTSET_H @@ -5006,10 +5019,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include -// #include // forward declaration of template class NDArray +// #include +// #include // forward declaration of template class NDArray - @Namespace("nd4j") @NoOffset public static class ResultSet extends Pointer { + @Namespace("sd") @NoOffset public static class ResultSet extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ResultSet(Pointer p) { super(p); } @@ -5020,10 +5033,12 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); return (ResultSet)super.position(position); } - // default constructor public ResultSet() { super((Pointer)null); allocate(); } private native void allocate(); +// #ifndef __JAVACPP_HACK__ +// #endif + public ResultSet(@Const @ByRef ResultSet other) { super((Pointer)null); allocate(other); } @NoException private native void allocate(@Const @ByRef ResultSet other); @@ -5074,9 +5089,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_GRAPH_RNG_H // #include -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include // #include @@ -5086,7 +5101,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #endif // #ifdef __CUDACC__ // #else - @Namespace("nd4j::graph") @NoOffset public static class RandomGenerator extends Pointer { + @Namespace("sd::graph") @NoOffset public static class RandomGenerator extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public RandomGenerator(Pointer p) { super(p); } @@ -5170,9 +5185,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); ////// - @Namespace("nd4j::graph") public static native @Cast("uint32_t") int rotl(@Cast("const uint32_t") int x, int k); + @Namespace("sd::graph") public static native @Cast("uint32_t") int rotl(@Cast("const uint32_t") int x, int k); - @Namespace("nd4j::graph") public static native @Cast("uint64_t") long rotl(@Cast("const uint64_t") long x, int k); + @Namespace("sd::graph") public static native @Cast("uint64_t") long rotl(@Cast("const uint64_t") long x, int k); @@ -5211,13 +5226,17 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_VARIABLE_H // #include -// #include +// #include // #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class Variable extends Pointer { + +// #ifndef __JAVACPP_HACK__ + +// #endif + @Namespace("sd::graph") @NoOffset public static class Variable extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Variable(Pointer p) { super(p); } @@ -5245,6 +5264,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public Variable(NDArray array/*=nullptr*/, @Cast("char*") BytePointer name/*=nullptr*/) { super((Pointer)null); allocate(array, name); } private native void allocate(NDArray array/*=nullptr*/, @Cast("char*") BytePointer name/*=nullptr*/); +// #ifndef __JAVACPP_HACK__ +// #endif + public native Variable clone(); public native @Cast("bool") boolean hasNDArray(); @@ -5262,8 +5284,8 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @Cast("bool") boolean isPlaceholder(); - public native @Cast("nd4j::graph::VariableType") int variableType(); - public native void setVariableType(@Cast("nd4j::graph::VariableType") int variableType); + public native @Cast("sd::graph::VariableType") int variableType(); + public native void setVariableType(@Cast("sd::graph::VariableType") int variableType); /** * This method returns InputType of this variable @@ -5324,10 +5346,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include -// #include +// #include +// #include // #include - @Namespace("nd4j::graph") @NoOffset public static class VariablesSet extends Pointer { + @Namespace("sd::graph") @NoOffset public static class VariablesSet extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public VariablesSet(Pointer p) { super(p); } @@ -5379,13 +5401,15 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_FLOWPATH_H // #define LIBND4J_FLOWPATH_H +// #include +// #include // #include -// #include +// #include // #include // #include // #include -// #include - @Namespace("nd4j::graph") @NoOffset public static class FlowPath extends Pointer { +// #include + @Namespace("sd::graph") @NoOffset public static class FlowPath extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public FlowPath(Pointer p) { super(p); } @@ -5466,12 +5490,12 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_INTERVALS_H // #define LIBND4J_INTERVALS_H -// #include +// #include // #include // #include -// #include +// #include - @Namespace("nd4j") @NoOffset public static class Intervals extends Pointer { + @Namespace("sd") @NoOffset public static class Intervals extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Intervals(Pointer p) { super(p); } @@ -5531,12 +5555,14 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_STASH_H //#include -// #include -// #include +// #include +// #include +// #include // #include // #include -// #include - @Namespace("nd4j::graph") @NoOffset public static class KeyPair extends Pointer { +// #include +// #include + @Namespace("sd::graph") @NoOffset public static class KeyPair extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public KeyPair(Pointer p) { super(p); } @@ -5555,9 +5581,19 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); private native void allocate(int node/*=0*/, @Cast("char*") BytePointer name/*=nullptr*/); public native @Cast("bool") @Name("operator <") boolean lessThan(@Const @ByRef KeyPair other); - } - @Namespace("nd4j::graph") @NoOffset public static class Stash extends Pointer { + public native @Cast("bool") @Name("operator ==") boolean equals(@Const @ByRef KeyPair other); + + public native int key(); + public native @StdString BytePointer name(); + } + + + +// #ifndef __JAVACPP_HACK__ + +// #endif + @Namespace("sd::graph") @NoOffset public static class Stash extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Stash(Pointer p) { super(p); } @@ -5571,15 +5607,15 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public Stash() { super((Pointer)null); allocate(); } private native void allocate(); - //void storeArray(nd4j::graph::Block& block, const char *name, nd4j::NDArray *array); + //void storeArray(sd::graph::Block& block, const char *name, sd::NDArray *array); public native void storeArray(int nodeId, @Cast("char*") String name, NDArray array); public native void storeArray(int nodeId, @Cast("char*") BytePointer name, NDArray array); - //bool checkStash(nd4j::graph::Block& block, const char *name); + //bool checkStash(sd::graph::Block& block, const char *name); public native @Cast("bool") boolean checkStash(int nodeId, @Cast("char*") String name); public native @Cast("bool") boolean checkStash(int nodeId, @Cast("char*") BytePointer name); - //nd4j::NDArray* extractArray(nd4j::graph::Block& block, const char *name); + //sd::NDArray* extractArray(sd::graph::Block& block, const char *name); public native NDArray extractArray(int nodeId, @Cast("char*") String name); public native NDArray extractArray(int nodeId, @Cast("char*") BytePointer name); @@ -5591,6 
+5627,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); + // #endif //LIBND4J_STASH_H @@ -5619,20 +5656,21 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_GRAPHSTATE_H // #define LIBND4J_GRAPHSTATE_H -// #include -// #include -// #include +// #include +// #include +// #include // #include // #include +// #include // #include -// #include +// #include // #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class GraphState extends Pointer { + @Namespace("sd::graph") @NoOffset public static class GraphState extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public GraphState(Pointer p) { super(p); } @@ -5741,13 +5779,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include // #include -// #include +// #include // #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class VariableSpace extends Pointer { + @Namespace("sd::graph") @NoOffset public static class VariableSpace extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public VariableSpace(Pointer p) { super(p); } @@ -5764,7 +5802,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @ByRef @Name("operator =") VariableSpace put(@Const @ByRef VariableSpace other); public native int numberOfPlaceholders(); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer getPlaceholders(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer getPlaceholders(); public native void setWorkspace(Workspace workspace); public native LaunchContext launchContext(); @@ -5783,13 +5821,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native Variable getVariable(@ByRef IntIntPair pair); public native Variable getVariable(@StdString @Cast({"char*", "std::string*"}) BytePointer symbol); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer getVariables(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer getVariables(); - public native void putVariable(@ByRef IntIntPair pair, NDArray array); + public native Variable putVariable(@ByRef IntIntPair pair, NDArray array); public native void putVariable(@ByRef IntIntPair pair, Variable variable); public native void putVariable(int id, Variable variable); public native void putVariable(int id, NDArray array); - public native void putVariable(int id, int idx, NDArray array); + public native Variable putVariable(int id, int idx, NDArray array); public native void putVariable(int id, int idx, Variable array); public native void dropVariable(@ByRef IntIntPair pair); @@ -5812,7 +5850,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native VariableSpace clone(); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer handles(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer handles(); public native VariableSpace asT(); @@ -5820,7 +5858,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); 
public native Stash getStash(); - public native @Cast("nd4j::graph::Variable**") @StdVector PointerPointer getExternalVariables(); + public native @Cast("sd::graph::Variable**") @StdVector PointerPointer getExternalVariables(); public native void setFlowPath(FlowPath timers); public native FlowPath flowPath(); @@ -5857,10 +5895,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_HELPER_GENERATOR_H // #define LIBND4J_HELPER_GENERATOR_H -// #include -// #include +// #include +// #include // #include -// #include +// #include // #ifdef _MSC_VER // include for uint64_t on MSVC @@ -5885,7 +5923,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifdef __CUDACC__ // #else - @Namespace("nd4j::random") @NoOffset public static class RandomBuffer extends Pointer { + @Namespace("sd::random") @NoOffset public static class RandomBuffer extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public RandomBuffer(Pointer p) { super(p); } @@ -6046,7 +6084,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); } - @Namespace("nd4j::random") @NoOffset public static class IGenerator extends Pointer { + @Namespace("sd::random") @NoOffset public static class IGenerator extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public IGenerator(Pointer p) { super(p); } @@ -6066,7 +6104,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); - @Namespace("nd4j::random") @NoOffset public static class Xoroshiro128 extends IGenerator { + @Namespace("sd::random") @NoOffset public static class Xoroshiro128 extends IGenerator { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Xoroshiro128(Pointer p) { super(p); } @@ -6107,13 +6145,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define ND4J_GRAPH_PROFILE_H // #include "NodeProfile.h" -// #include -// #include +// #include +// #include // #include // #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class GraphProfile extends Pointer { + @Namespace("sd::graph") @NoOffset public static class GraphProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public GraphProfile(Pointer p) { super(p); } @@ -6212,11 +6250,11 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_NODE_PROFILE_H // #define LIBND4J_NODE_PROFILE_H -// #include -// #include +// #include +// #include // #include // #include - @Namespace("nd4j::graph") @NoOffset public static class NodeProfile extends Pointer { + @Namespace("sd::graph") @NoOffset public static class NodeProfile extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public NodeProfile(Pointer p) { super(p); } @@ -6301,7 +6339,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define LIBND4J_CONTEXT_H // #include -// #include +// #include // #include // #include // #include @@ -6314,7 +6352,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); /** * This class defines input desired for any given node/operation within graph */ - @Namespace("nd4j::graph") @NoOffset public static class Context extends ContextPrototype { + @Namespace("sd::graph") @NoOffset public static class Context extends ContextPrototype { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Context(Pointer p) { super(p); } @@ -6337,10 +6375,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @Cast("Nd4jLong") long getOuterTime(); public native @Cast("Nd4jLong") long getInnerTime(); - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); - public native @Cast("nd4j::DataType") int dataType(int index); - public native void setDataType(int index, @Cast("nd4j::DataType") int type); + public native @Cast("sd::DataType") int dataType(int index); + public native void setDataType(int index, @Cast("sd::DataType") int type); // these methods are related to Workspace abstraction public native @Cast("bool") boolean hasWorkspaceProvided(); public native void attachWorkspace(Workspace workspace); @@ -6434,11 +6472,17 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // methods used in java interop /** - * This method checks, if Context uses fastpath variable access + * This method checks if Context uses fastpath variable access * @return */ public native @Cast("bool") boolean isFastPath(); + /** + * Method allows to forbid FastPath execution + * @param reallyForbid + */ + public native void forbidFastPath(@Cast("bool") boolean reallyForbid); + // #ifndef __JAVACPP_HACK__ // #endif @@ -6460,9 +6504,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setIArguments(@Cast("Nd4jLong*") long[] arguments, int numberOfArguments); public native void setBArguments(@Cast("bool*") BooleanPointer arguments, int numberOfArguments); public native void setBArguments(@Cast("bool*") boolean[] arguments, int numberOfArguments); - public native void setDArguments(@Cast("nd4j::DataType*") IntPointer arguments, int numberOfArguments); - public native void setDArguments(@Cast("nd4j::DataType*") IntBuffer arguments, int numberOfArguments); - public native void setDArguments(@Cast("nd4j::DataType*") int[] arguments, int numberOfArguments); + 
public native void setDArguments(@Cast("sd::DataType*") IntPointer arguments, int numberOfArguments); + public native void setDArguments(@Cast("sd::DataType*") IntBuffer arguments, int numberOfArguments); + public native void setDArguments(@Cast("sd::DataType*") int[] arguments, int numberOfArguments); public native void setTArguments(@StdVector DoublePointer tArgs); public native void setTArguments(@StdVector DoubleBuffer tArgs); @@ -6472,9 +6516,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setIArguments(@Cast("Nd4jLong*") @StdVector long[] tArgs); public native void setBArguments(@Cast("bool*") @StdVector BooleanPointer tArgs); public native void setBArguments(@Cast("bool*") @StdVector boolean[] tArgs); - public native void setDArguments(@Cast("nd4j::DataType*") @StdVector IntPointer dArgs); - public native void setDArguments(@Cast("nd4j::DataType*") @StdVector IntBuffer dArgs); - public native void setDArguments(@Cast("nd4j::DataType*") @StdVector int[] dArgs); + public native void setDArguments(@Cast("sd::DataType*") @StdVector IntPointer dArgs); + public native void setDArguments(@Cast("sd::DataType*") @StdVector IntBuffer dArgs); + public native void setDArguments(@Cast("sd::DataType*") @StdVector int[] dArgs); /** * This method purges fastpath in/out contents and releases all the handles. @@ -6531,10 +6575,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #define ND4J_CONTEXT_PROTOTYPE_H // #include -// #include +// #include // #include -// #include -// #include +// #include +// #include // #include // #include // #include @@ -6543,7 +6587,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #endif - @Namespace("nd4j::graph") @NoOffset public static class ContextPrototype extends Pointer { + @Namespace("sd::graph") @NoOffset public static class ContextPrototype extends Pointer { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public ContextPrototype(Pointer p) { super(p); } @@ -6567,9 +6611,9 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native void setOpDescriptor(OpDescriptor opDescriptor); - public native @Cast("nd4j::DataType") int dataType(); - public native @Cast("nd4j::DataType") int dataType(int index); - public native void setDataType(int index, @Cast("nd4j::DataType") int type); + public native @Cast("sd::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(int index); + public native void setDataType(int index, @Cast("sd::DataType") int type); public native @Cast("bool") boolean isInplace(); public native void markInplace(@Cast("bool") boolean reallyInplace); @@ -6585,7 +6629,7 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); public native @StdVector DoublePointer getTArguments(); public native @StdVector IntPointer getIArguments(); public native @Cast("bool*") @StdVector BooleanPointer getBArguments(); - public native @Cast("nd4j::DataType*") @StdVector IntPointer getDArguments(); + public native @Cast("sd::DataType*") @StdVector IntPointer getDArguments(); public native @StdVector IntPointer getAxis(); public native @Cast("samediff::Engine") int engine(); @@ -6650,10 +6694,10 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #ifndef LIBND4J_RESULTWRAPPER_H // #define LIBND4J_RESULTWRAPPER_H -// #include -// #include -// #include - @Namespace("nd4j::graph") @NoOffset public static class ResultWrapper extends org.nd4j.nativeblas.ResultWrapperAbstraction { +// #include +// #include +// #include + @Namespace("sd::graph") @NoOffset public static class ResultWrapper extends org.nd4j.nativeblas.ResultWrapperAbstraction { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public ResultWrapper(Pointer p) { super(p); } @@ -6702,13 +6746,13 @@ public native @Cast("bool") boolean isOptimalRequirementsMet(); // #include // #include -// #include "../dll.h" -// #include "../nd4jmalloc.h" -// #include "../templatemath.h" +// #include "system/dll.h" +// #include "system/nd4jmalloc.h" +// #include "math/templatemath.h" // #include "../helpers/logger.h" -// #include "../pointercast.h" +// #include "system/pointercast.h" // #include "../cnpy/cnpy.h" -// #include +// #include public static final int MAX_DIMENSION = 0x7fffffff; public static final int MAX_NUM_THREADS = 1024; @@ -6725,7 +6769,7 @@ public static final int PREALLOC_SIZE = 33554432; // #define INLINEDEF inline // #endif -// #include "../pairwise_util.h" +// #include "system/pairwise_util.h" // #include // #include @@ -6871,25 +6915,25 @@ public static final int PREALLOC_SIZE = 33554432; * Get the shape info buffer * for the given rank and shape. */ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") 
LongPointer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer buffer); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer buffer); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBuffer(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] buffer); /** * Get the shape info buffer * for the given rank and shape. 
*/ - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer output); - @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer output); - @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] output); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer output); + @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer shapeBufferFortran(int rank, 
@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer output); + @Namespace("shape") public static native @Cast("Nd4jLong*") long[] shapeBufferFortran(int rank, @Cast("sd::DataType") int dtype, @Cast("Nd4jLong*") long[] shape, @Cast("Nd4jLong*") long[] output); // #ifdef __CUDACC__ // #endif @@ -7745,18 +7789,18 @@ public static final int PREALLOC_SIZE = 33554432; * @return the double at the specified index */ - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long 
getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const 
Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords); @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank); @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank); @@ -7942,20 +7986,20 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native void calcOffsets(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("Nd4jLong*") long[] offsets); // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const char order = 'c'); // ND4J_EXPORT void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order = 'c'); - @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); - @Namespace("shape") public static native void shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); - @Namespace("shape") public static native void 
shapeOldScalar(@Cast("nd4j::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); + @Namespace("shape") public static native void shapeOldScalar(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*const") LongPointer buffer, byte order); + @Namespace("shape") public static native void shapeOldScalar(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*const") LongBuffer buffer, byte order); + @Namespace("shape") public static native void shapeOldScalar(@Cast("sd::DataType") int dtype, @Cast("Nd4jLong*const") long[] buffer, byte order); // deduce order and element-wise stride // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); - @Namespace("shape") public static native void checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); - @Namespace("shape") public static native void 
checkStridesSetEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongPointer shapeNoUnities, @Cast("const Nd4jLong*") LongPointer stridesNoUnities); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") LongBuffer shapeNoUnities, @Cast("const Nd4jLong*") LongBuffer stridesNoUnities); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo, byte proposedOrder, int numOfNonUnitDims, @Cast("const Nd4jLong*") long[] shapeNoUnities, @Cast("const Nd4jLong*") long[] stridesNoUnities); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongPointer shapeInfo); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") LongBuffer shapeInfo); + @Namespace("shape") public static native void checkStridesEwsAndOrder(@Cast("Nd4jLong*") long[] shapeInfo); /** * processes whole set of sub-arrays @@ -7988,7 +8032,7 @@ public static final int PREALLOC_SIZE = 33554432; @Namespace("shape") public static native int excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") long[] inShapeInfo, @Cast("Nd4jLong*&") @ByPtrRef long[] shapeNoUnities, @Cast("Nd4jLong*&") @ByPtrRef long[] stridesNoUnities); /** - * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2 * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} */ @Namespace("shape") public static native void excludeUnitiesFromShapeInfo(@Cast("const Nd4jLong*") LongPointer inShapeInfo, int dimsSize, @Const IntPointer dimsToExclude, @Cast("Nd4jLong*") 
LongPointer outShapeInfo); @@ -8765,7 +8809,7 @@ public static final int PREALLOC_SIZE = 33554432; // target[shape::shapeInfoLength(newRank) - 3] = 0; // target[shape::shapeInfoLength(newRank) - 2] = 0; // target[shape::shapeInfoLength(newRank) - 1] = isFOrder ? 102 : 99; -// nd4j::ArrayOptions::setDataType(target, nd4j::ArrayOptions::dataType(oldShape)); +// sd::ArrayOptions::setDataType(target, sd::ArrayOptions::dataType(oldShape)); // delete[] olddims; // delete[] oldstrides; @@ -9093,10 +9137,10 @@ public static final int PREALLOC_SIZE = 33554432; // #define LIBND4J_OPARGSHOLDER_H -// #include -// #include +// #include +// #include -@Namespace("nd4j") @NoOffset public static class OpArgsHolder extends Pointer { +@Namespace("sd") @NoOffset public static class OpArgsHolder extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public OpArgsHolder(Pointer p) { super(p); } @@ -9197,9 +9241,9 @@ public static final int PREALLOC_SIZE = 33554432; // #define LIBND4J_SHAPELIST_H // #include -// #include -// #include - @Namespace("nd4j") @NoOffset public static class ShapeList extends Pointer { +// #include +// #include + @Namespace("sd") @NoOffset public static class ShapeList extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ShapeList(Pointer p) { super(p); } @@ -9241,7 +9285,7 @@ public static final int PREALLOC_SIZE = 33554432; // #endif //LIBND4J_SHAPELIST_H -// Parsed from type_boilerplate.h +// Parsed from system/type_boilerplate.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. 
@@ -9876,8 +9920,8 @@ public static final int PREALLOC_SIZE = 33554432; // #define RANDOMTRIPLE(NAME, SIGNATURE, TYPES_X, TYPES_Y, TYPE_Z) _RANDOMTRIPLE(NAME, SIGNATURE, TYPE_Z, TYPES_X, TYPES_Y) -// #define BROADCAST(NAME) nd4j::BroadcastOpsTuple::custom(nd4j::scalar::NAME, nd4j::pairwise::NAME, nd4j::broadcast::NAME) -// #define BROADCAST_BOOL(NAME) nd4j::BroadcastBoolOpsTuple::custom(nd4j::scalar::NAME, nd4j::pairwise::NAME, nd4j::broadcast::NAME) +// #define BROADCAST(NAME) sd::BroadcastOpsTuple::custom(sd::scalar::NAME, sd::pairwise::NAME, sd::broadcast::NAME) +// #define BROADCAST_BOOL(NAME) sd::BroadcastBoolOpsTuple::custom(sd::scalar::NAME, sd::pairwise::NAME, sd::broadcast::NAME) public static final int ALL_STRINGS =UTF32; public static final int ALL_INDICES =INT64; @@ -9887,7 +9931,7 @@ public static final int ALL_FLOATS =BFLOAT16; // #endif //TESTS_CPU_TYPE_BOILERPLATE_H -// Parsed from op_boilerplate.h +// Parsed from system/op_boilerplate.h /******************************************************************************* * Copyright (c) 2015-2018 Skymind, Inc. @@ -9956,8 +10000,8 @@ public static final int ALL_FLOATS =BFLOAT16; // #ifndef OP_BOILERPLATE_HH // #define OP_BOILERPLATE_HH -// #include -// #include +// #include +// #include // #include // #include @@ -10002,8 +10046,8 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #define PRINT_FIRST(...) 
printf(__VA_ARGS__); fflush(stdout) // #endif -// #define DEBUG_CALL(STREAM) if (nd4j::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) { throw std::runtime_error(); }; } -// #define DEBUG_KERNEL(STREAM, OP_NUM) if (nd4j::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) {std::string tFile(__FILE__); std::string tOp = "Kernel OpNum failed: [" + nd4j::StringUtils::valueToString(OP_NUM) + std::string("]; File: ") + tFile + std::string(":") + nd4j::StringUtils::valueToString(__LINE__); throw std::runtime_error(tOp.c_str()); }; } +// #define DEBUG_CALL(STREAM) if (sd::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) { throw std::runtime_error(); }; } +// #define DEBUG_KERNEL(STREAM, OP_NUM) if (sd::Environment::getInstance()->isDebug()) { cudaError_t tRes = cudaStreamSynchronize(*STREAM); checkCudaErrors(tRes); if (tRes != 0) {std::string tFile(__FILE__); std::string tOp = "Kernel OpNum failed: [" + sd::StringUtils::valueToString(OP_NUM) + std::string("]; File: ") + tFile + std::string(":") + sd::StringUtils::valueToString(__LINE__); throw std::runtime_error(tOp.c_str()); }; } // #define LAUNCH(A, B, C, D) <<>> @@ -11099,8 +11143,8 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); /** graph definitions */ -// #define REQUIRE_OK(A) if (nd4j::ops::resultHelper( (A), #A, __FILE__, __LINE__ ) != 0) return ND4J_STATUS_VALIDATION; -// #define REQUIRE_TRUE(...) if (nd4j::ops::conditionHelper(__FILE__, __LINE__, __VA_ARGS__) != 0) throw std::invalid_argument("Op validation failed"); +// #define REQUIRE_OK(A) if (sd::ops::resultHelper( (A), #A, __FILE__, __LINE__ ) != 0) return ND4J_STATUS_VALIDATION; +// #define REQUIRE_TRUE(COND, ...) 
if (!(COND)) { if (sd::ops::conditionHelper(__FILE__, __LINE__, COND, __VA_ARGS__) != 0) throw std::invalid_argument("Op validation failed");}; // #define DECLARE_ENTRY(NAME, ...) template struct ND4J_EXPORT __registratorFloat>; // template struct ND4J_EXPORT __registratorHalf>; @@ -11114,13 +11158,13 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #define NOT_EXCLUDED(NAME) 1>0 // #else // for now we don't want minifier mechanics working -//#define NOT_EXCLUDED(NAME) defined(LIBND4J_ALL_OPS) || defined(NAME) +//#define NOT_EXCLUDED(NAME) defined(SD_ALL_OPS) || defined(NAME) // #define NOT_EXCLUDED(NAME) 1>0 // #endif // #ifdef __JAVACPP_HACK__ // #define REGISTER_H(NAME) -// #elif defined(LIBND4J_ALL_OPS) +// #elif defined(SD_ALL_OPS) // #else // #define REGISTER_H(NAME) template // struct __registrator_##NAME { @@ -11129,68 +11173,68 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // OpRegistrator::getInstance()->registerOperation(ptr); // } // }; -// static nd4j::ops::__registrator_##NAME zzz_register_opd_##NAME; +// static sd::ops::__registrator_##NAME zzz_register_opd_##NAME; // #endif // #ifdef __JAVACPP_HACK__ // #define REGISTER_C(NAME) -// #elif defined(LIBND4J_ALL_OPS) +// #elif defined(SD_ALL_OPS) // #else // #define REGISTER_C(NAME) // #endif -// #define DECLARE_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableOp { +// #define DECLARE_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public sd::ops::DeclarableOp { // public: // NAME(); -// nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); +// sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); // protected: // void registerTypes(); -// Nd4jStatus validateAndExecute(nd4j::graph::Context& block); +// Nd4jStatus validateAndExecute(sd::graph::Context& block); // }; // REGISTER_H(NAME) -// #define DECLARE_BOOLEAN_OP(NAME, NIN, SCALAR) class ND4J_EXPORT 
NAME: public nd4j::ops::BooleanOp { +// #define DECLARE_BOOLEAN_OP(NAME, NIN, SCALAR) class ND4J_EXPORT NAME: public sd::ops::BooleanOp { // public: // NAME(); // protected: // void registerTypes(); -// Nd4jStatus validateAndExecute(nd4j::graph::Context& block); +// Nd4jStatus validateAndExecute(sd::graph::Context& block); // }; // REGISTER_H(NAME) -// #define BOOLEAN_OP_IMPL(NAME, NIN, SCALAR) NAME::NAME() : nd4j::ops::BooleanOp(#NAME, NIN, SCALAR) { }; +// #define BOOLEAN_OP_IMPL(NAME, NIN, SCALAR) NAME::NAME() : sd::ops::BooleanOp(#NAME, NIN, SCALAR) { }; // REGISTER_C(NAME) -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) +// Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -// #define DECLARE_LIST_OP(NAME, NIN, NOUT, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableListOp { +// #define DECLARE_LIST_OP(NAME, NIN, NOUT, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableListOp { // public: // NAME(); // protected: -// Nd4jStatus validateAndExecute(nd4j::graph::Context& block); +// Nd4jStatus validateAndExecute(sd::graph::Context& block); // }; // REGISTER_H(NAME) -// #define LIST_OP_IMPL(NAME, NIN, NOUT, TARGS, IARGS) NAME::NAME() : nd4j::ops::DeclarableListOp(NIN, NOUT, #NAME, TARGS, IARGS) { }; +// #define LIST_OP_IMPL(NAME, NIN, NOUT, TARGS, IARGS) NAME::NAME() : sd::ops::DeclarableListOp(NIN, NOUT, #NAME, TARGS, IARGS) { }; // REGISTER_C(NAME) -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) +// Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -// #define DECLARE_LOGIC_OP(NAME) class ND4J_EXPORT NAME: public nd4j::ops::LogicOp { +// #define DECLARE_LOGIC_OP(NAME) class ND4J_EXPORT NAME: public sd::ops::LogicOp { // public: // NAME(); // protected: -// Nd4jStatus validateAndExecute(nd4j::graph::Context& block); +// Nd4jStatus validateAndExecute(sd::graph::Context& block); // }; // REGISTER_H(NAME) -// #define 
LOGIC_OP_IMPL(NAME) NAME::NAME() : nd4j::ops::LogicOp(#NAME) { }; +// #define LOGIC_OP_IMPL(NAME) NAME::NAME() : sd::ops::LogicOp(#NAME) { }; // REGISTER_C(NAME) -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) { return nd4j::ops::LogicOp::validateAndExecute(block); }; +// Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) { return sd::ops::LogicOp::validateAndExecute(block); }; -// #define OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : nd4j::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE) { }; +// #define OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : sd::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE) { }; // REGISTER_C(NAME) -// nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { +// sd::ShapeList* sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) { // auto shapeList = SHAPELIST(); // auto opLimit = this->getOpDescriptor()->getNumberOfOutputs() < 1 ? 
block.width() : this->getOpDescriptor()->getNumberOfOutputs(); // for (int e = 0; e < opLimit; e++) { @@ -11199,7 +11243,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // } // return shapeList; // } -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) +// Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) // #define DECLARE_SYN(NAME, ORIGINAL) template @@ -11209,26 +11253,26 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // if (ptr == nullptr) { // std::string newName(name); // std::string oldName(oname); -// OpRegistrator::getInstance()->updateMSVC(nd4j::ops::HashHelper::getInstance()->getLongHash(newName), oldName); +// OpRegistrator::getInstance()->updateMSVC(sd::ops::HashHelper::getInstance()->getLongHash(newName), oldName); // return; // } // OpRegistrator::getInstance()->registerOperation(name, ptr); // } // }; -// static nd4j::ops::__registratorSynonym_##NAME zzz_register_opd_##NAME(#NAME, #ORIGINAL) +// static sd::ops::__registratorSynonym_##NAME zzz_register_opd_##NAME(#NAME, #ORIGINAL) -// #define DECLARE_DIVERGENT_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableOp { +// #define DECLARE_DIVERGENT_OP(NAME, NIN, NOUT, INPLACEABLE) class ND4J_EXPORT NAME: public sd::ops::DeclarableOp { // public: // NAME(); -// nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); +// sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); // protected: -// Nd4jStatus validateAndExecute(nd4j::graph::Context& block); +// Nd4jStatus validateAndExecute(sd::graph::Context& block); // }; // REGISTER_H(NAME) -// #define DIVERGENT_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : nd4j::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, true) { }; +// #define DIVERGENT_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE) NAME::NAME() : sd::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, true) { }; // 
REGISTER_C(NAME) -// nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { +// sd::ShapeList* sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) { // auto shapeList = SHAPELIST(); // auto opLimit = this->getOpDescriptor()->getNumberOfOutputs() < 1 ? block.width() : this->getOpDescriptor()->getNumberOfOutputs(); // for (int e = 0; e < opLimit; e++) { @@ -11238,21 +11282,21 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // } // return shapeList; // } -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) +// Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -// #define DECLARE_CONFIGURABLE_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableOp { +// #define DECLARE_CONFIGURABLE_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableOp { // public: // NAME(); -// nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); +// sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); // protected: // void registerTypes(); -// Nd4jStatus validateAndExecute(nd4j::graph::Context& block); +// Nd4jStatus validateAndExecute(sd::graph::Context& block); // }; // REGISTER_H(NAME) -// #define CONFIGURABLE_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : nd4j::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; +// #define CONFIGURABLE_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : sd::ops::DeclarableOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; // REGISTER_C(NAME) -// nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) { +// sd::ShapeList* sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) { // auto 
shapeList = SHAPELIST(); // auto opLimit = this->getOpDescriptor()->getNumberOfOutputs() < 1 ? block.width() : this->getOpDescriptor()->getNumberOfOutputs(); // for (int e = 0; e < opLimit; e++) { @@ -11261,9 +11305,9 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // } // return shapeList; // } -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(Context& block) +// Nd4jStatus sd::ops::NAME::validateAndExecute(Context& block) -// #define DECLARE_REDUCTION_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableReductionOp { +// #define DECLARE_REDUCTION_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableReductionOp { // public: // NAME(); // protected: @@ -11272,34 +11316,34 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // }; // REGISTER_H(NAME) -// #define REDUCTION_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : nd4j::ops::DeclarableReductionOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; +// #define REDUCTION_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME() : sd::ops::DeclarableReductionOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; // REGISTER_C(NAME) -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) +// Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) -// #define DECLARE_CUSTOM_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::DeclarableCustomOp { +// #define DECLARE_CUSTOM_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::DeclarableCustomOp { // protected: // void registerTypes(); // Nd4jStatus validateAndExecute(Context& block); // public: // NAME(); -// nd4j::ShapeList* calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block); +// sd::ShapeList* calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block); // }; // REGISTER_H(NAME) -// 
#define CUSTOM_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME(): nd4j::ops::DeclarableCustomOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; +// #define CUSTOM_OP_IMPL(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) NAME::NAME(): sd::ops::DeclarableCustomOp(NIN, NOUT, #NAME, INPLACEABLE, TARGS, IARGS) { }; // REGISTER_C(NAME) -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) +// Nd4jStatus sd::ops::NAME::validateAndExecute(sd::graph::Context& block) // this declaration MUST follow DECLARE_CUSTOM_OP -// #define DECLARE_SHAPE_FN(NAME) nd4j::ShapeList* nd4j::ops::NAME::calculateOutputShape(nd4j::ShapeList* inputShape, nd4j::graph::Context& block) +// #define DECLARE_SHAPE_FN(NAME) sd::ShapeList* sd::ops::NAME::calculateOutputShape(sd::ShapeList* inputShape, sd::graph::Context& block) -// #define DECLARE_SAME_TYPE(NAME) void nd4j::ops::NAME::registerTypes() {this->getOpDescriptor()->setSameMode(true);} +// #define DECLARE_SAME_TYPE(NAME) void sd::ops::NAME::registerTypes() {this->getOpDescriptor()->setSameMode(true);} -// #define DECLARE_TYPES(NAME) void nd4j::ops::NAME::registerTypes() +// #define DECLARE_TYPES(NAME) void sd::ops::NAME::registerTypes() -// #define DECLARE_BROADCASTABLE_OP(NAME,TARGS, IARGS) class ND4J_EXPORT NAME: public nd4j::ops::BroadcastableOp { +// #define DECLARE_BROADCASTABLE_OP(NAME,TARGS, IARGS) class ND4J_EXPORT NAME: public sd::ops::BroadcastableOp { // protected: // void registerTypes(); // Nd4jStatus validateAndExecute(Context& block); @@ -11308,17 +11352,17 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // }; // REGISTER_H(NAME) -// #define BROADCASTABLE_OP_IMPL(NAME, TARGS, IARGS) NAME::NAME(): nd4j::ops::BroadcastableOp(#NAME, TARGS, IARGS) { }; +// #define BROADCASTABLE_OP_IMPL(NAME, TARGS, IARGS) NAME::NAME(): sd::ops::BroadcastableOp(#NAME, TARGS, IARGS) { }; // REGISTER_C(NAME) -// Nd4jStatus nd4j::ops::NAME::validateAndExecute(nd4j::graph::Context& block) +// Nd4jStatus 
sd::ops::NAME::validateAndExecute(sd::graph::Context& block) // #define DECLARE_DEVICE_OP(NAME, NIN, NOUT, INPLACEABLE, TARGS, IARGS) // #define REPLICATE_SHAPE(SRC, TGT) if (shape::order(SRC) == 'c') -// shape::shapeBuffer(shape::rank(SRC), nd4j::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT); +// shape::shapeBuffer(shape::rank(SRC), sd::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT); // else -// shape::shapeBufferFortran(shape::rank(SRC), nd4j::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT); +// shape::shapeBufferFortran(shape::rank(SRC), sd::ArrayOptions::dataType(SRC), shape::shapeOf(SRC), TGT); // #ifdef __CUDABLAS__ @@ -11337,8 +11381,8 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #else -// #define ALLOCATE(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {VARIABLE = new TT[LENGTH]; nd4j::memory::MemoryTracker::getInstance()->countIn(nd4j::memory::MemoryType::HOST, VARIABLE, LENGTH * sizeof(TT)); } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(LENGTH * sizeof(TT))); }; memset(VARIABLE, 0, LENGTH * sizeof(TT)); -// #define RELEASE(VARIABLE, WORKSPACE) if (WORKSPACE == nullptr) { nd4j::memory::MemoryTracker::getInstance()->countOut(VARIABLE); delete[] VARIABLE;}; +// #define ALLOCATE(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {VARIABLE = new TT[LENGTH]; sd::memory::MemoryTracker::getInstance()->countIn(sd::memory::MemoryType::HOST, VARIABLE, LENGTH * sizeof(TT)); } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(LENGTH * sizeof(TT))); }; memset(VARIABLE, 0, LENGTH * sizeof(TT)); +// #define RELEASE(VARIABLE, WORKSPACE) if (WORKSPACE == nullptr) { sd::memory::MemoryTracker::getInstance()->countOut(VARIABLE); delete[] VARIABLE;}; // #endif @@ -11360,9 +11404,9 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #define UNSTASH(NAME) block.getStash()->extractArray(block.getNodeId(), NAME); // #define INPUT_VARIABLE(INDEX) block.array(INDEX) -// #define 
OUTPUT_VARIABLE(INDEX) reinterpret_cast(this->getZ(block, INDEX)) +// #define OUTPUT_VARIABLE(INDEX) reinterpret_cast(this->getZ(block, INDEX)) -// #define INPUT_LIST(INDEX) reinterpret_cast(block.getVariable(INDEX)->getNDArrayList()) +// #define INPUT_LIST(INDEX) reinterpret_cast(block.getVariable(INDEX)->getNDArrayList()) // #define D_ARG(INDEX) block.getDArguments()->at(INDEX) // #define INT_ARG(INDEX) block.getIArguments()->at(INDEX) @@ -11401,7 +11445,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // CUDACC -// #define CHECK_ALLOC(PTR, MSG, BYTES) if (PTR == nullptr) { throw nd4j::allocation_exception::build(MSG, BYTES); }; +// #define CHECK_ALLOC(PTR, MSG, BYTES) if (PTR == nullptr) { throw sd::allocation_exception::build(MSG, BYTES); }; @@ -11475,7 +11519,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #ifndef ND4J_INPUTTYPE_H // #define ND4J_INPUTTYPE_H - /** enum nd4j::ops::InputType */ + /** enum sd::ops::InputType */ public static final int InputType_BOOLEAN = 0, InputType_NUMERIC = 1, @@ -11514,7 +11558,6 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include -// #include // #include // #include // #include @@ -11525,7 +11568,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This class is very basic info holder for ops. bean/pojo pretty much. * */ - @Namespace("nd4j::ops") @NoOffset public static class OpDescriptor extends Pointer { + @Namespace("sd::ops") @NoOffset public static class OpDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public OpDescriptor(Pointer p) { super(p); } @@ -11609,30 +11652,30 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native void setHash(@Cast("Nd4jLong") long hash); - public native @Cast("nd4j::ops::InputType") int inputType(); + public native @Cast("sd::ops::InputType") int inputType(); - public native OpDescriptor setInputType(@Cast("nd4j::ops::InputType") int type); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntPointer dtype); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntBuffer dtype); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType*") @StdVector int[] dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntPointer dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType*") @StdVector IntBuffer dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType*") @StdVector int[] dtype); - public native OpDescriptor setAllowedInputTypes(int index, @Cast("nd4j::DataType") int dtype); - public native OpDescriptor setAllowedOutputTypes(int index, @Cast("nd4j::DataType") int dtype); - public native OpDescriptor setAllowedInputTypes(@Cast("nd4j::DataType") int dtype); - public native OpDescriptor setAllowedOutputTypes(@Cast("nd4j::DataType") int dtype); + public native OpDescriptor setInputType(@Cast("sd::ops::InputType") int type); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType*") @StdVector IntPointer dtype); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType*") @StdVector IntBuffer dtype); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType*") @StdVector int[] dtype); + public native OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType*") @StdVector IntPointer dtype); + public native 
OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType*") @StdVector IntBuffer dtype); + public native OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType*") @StdVector int[] dtype); + public native OpDescriptor setAllowedInputTypes(int index, @Cast("sd::DataType") int dtype); + public native OpDescriptor setAllowedOutputTypes(int index, @Cast("sd::DataType") int dtype); + public native OpDescriptor setAllowedInputTypes(@Cast("sd::DataType") int dtype); + public native OpDescriptor setAllowedOutputTypes(@Cast("sd::DataType") int dtype); public native OpDescriptor allowOverride(@Cast("bool") boolean reallyAllow); public native OpDescriptor setSameMode(@Cast("bool") boolean reallySame); - public native OpDescriptor setInputType(int idx, @Cast("nd4j::DataType") int dtype); - public native OpDescriptor setOutputType(int idx, @Cast("nd4j::DataType") int dtype); + public native OpDescriptor setInputType(int idx, @Cast("sd::DataType") int dtype); + public native OpDescriptor setOutputType(int idx, @Cast("sd::DataType") int dtype); - public native @Cast("nd4j::DataType*") @StdVector IntPointer getOutputTypesForOutput(int index); + public native @Cast("sd::DataType*") @StdVector IntPointer getOutputTypesForOutput(int index); - public native @Cast("bool") boolean checkInputMatch(int index, @Cast("nd4j::DataType") int dataType); - public native @Cast("bool") boolean checkOutputMatch(int index, @Cast("nd4j::DataType") int dataType); + public native @Cast("bool") boolean checkInputMatch(int index, @Cast("sd::DataType") int dataType); + public native @Cast("bool") boolean checkOutputMatch(int index, @Cast("sd::DataType") int dataType); public native @Cast("bool") boolean isSameMode(); public native @Cast("bool") boolean isInherit(int index); @@ -11668,16 +11711,16 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #ifndef SD_PLATFORMHELPER_H // #define SD_PLATFORMHELPER_H -// #include +// #include // #include // #include // #include -// 
#include -// #include +// #include +// #include /** * This abstract class defines methods used by platform-specific helpers implementations */ - @Namespace("nd4j::ops::platforms") @NoOffset public static class PlatformHelper extends Pointer { + @Namespace("sd::ops::platforms") @NoOffset public static class PlatformHelper extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public PlatformHelper(Pointer p) { super(p); } @@ -11750,7 +11793,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include "OpDescriptor.h" // #include "DeclarableOp.h" // #include "DeclarableCustomOp.h" - @Namespace("nd4j::ops") public static class BroadcastableOp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class BroadcastableOp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public BroadcastableOp(Pointer p) { super(p); } @@ -11792,15 +11835,15 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include -// #include -// #include +// #include +// #include // #include // #include "OpDescriptor.h" // #include // #include // #include // #include -// #include +// #include // #include //#include @@ -11808,14 +11851,14 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include - @Namespace("nd4j::ops") public static native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") String file, int line, int condition, int argNumber, @Cast("char*") String format); - @Namespace("nd4j::ops") public static native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") BytePointer file, int line, int condition, int argNumber, @Cast("char*") BytePointer format); + @Namespace("sd::ops") public static native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") String file, int line, int condition, int argNumber, @Cast("char*") String format); + @Namespace("sd::ops") public static 
native @Cast("Nd4jStatus") int conditionHelper(@Cast("char*") BytePointer file, int line, int condition, int argNumber, @Cast("char*") BytePointer format); /** * This class is the basic building block of Graph Operations. Any CustomOp out there is built on top of this "abstract" class. * */ - @Namespace("nd4j::ops") @NoOffset public static class DeclarableOp extends Pointer { + @Namespace("sd::ops") @NoOffset public static class DeclarableOp extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public DeclarableOp(Pointer p) { super(p); } @@ -11870,40 +11913,40 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") 
@StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, 
@Cast("bool") boolean isInplace/*=false*/); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native @Cast("Nd4jStatus") int execute(@Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs); - public native 
ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, 
@StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean 
isInplace/*=false*/); + public native ResultSet evaluate(@Const @ByRef NDArrayVector inputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs/*=std::vector()*/, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native 
@Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector boolean[] bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector 
DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("nd4j::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("sd::DataType*") @StdVector IntPointer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoublePointer tArgs, @Cast("Nd4jLong*") @StdVector LongPointer iArgs, @Cast("bool*") @StdVector boolean[] bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("nd4j::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs, @Cast("sd::DataType*") @StdVector IntBuffer dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef 
NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector DoubleBuffer tArgs, @Cast("Nd4jLong*") @StdVector LongBuffer iArgs, @Cast("bool*") @StdVector BooleanPointer bArgs); - public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("nd4j::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("nd4j::DataType") int type/*=nd4j::DataType::FLOAT32*/); + public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs, @Cast("sd::DataType*") @StdVector int[] dArgs/*=std::vector()*/, @Cast("bool") boolean isInplace/*=false*/, @Cast("sd::DataType") int type/*=sd::DataType::FLOAT32*/); public native @Cast("Nd4jStatus") int execute(@ByRef RandomGenerator rng, @Const @ByRef NDArrayVector inputs, @Const @ByRef NDArrayVector outputs, @StdVector double[] tArgs, @Cast("Nd4jLong*") @StdVector long[] iArgs, @Cast("bool*") @StdVector boolean[] bArgs); public native ResultSet execute(@Const @ByRef OpArgsHolder holder, @Cast("bool") boolean isInplace/*=false*/); @@ -11971,7 +12014,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include // #include - @Namespace("nd4j::ops") public static class DeclarableListOp extends DeclarableOp { + @Namespace("sd::ops") public static class DeclarableListOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public DeclarableListOp(Pointer p) { super(p); } @@ -12016,7 +12059,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #define LIBND4J_DECLARABLE_REDUCTION_OP_H // #include - @Namespace("nd4j::ops") public static class DeclarableReductionOp extends DeclarableOp { + @Namespace("sd::ops") public static class DeclarableReductionOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public DeclarableReductionOp(Pointer p) { super(p); } @@ -12056,7 +12099,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #define LIBND4J_DECLARABLECUSTOMOP_H // #include - @Namespace("nd4j::ops") public static class DeclarableCustomOp extends DeclarableOp { + @Namespace("sd::ops") public static class DeclarableCustomOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public DeclarableCustomOp(Pointer p) { super(p); } @@ -12098,7 +12141,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include "OpDescriptor.h" // #include "DeclarableOp.h" - @Namespace("nd4j::ops") @NoOffset public static class BooleanOp extends DeclarableOp { + @Namespace("sd::ops") @NoOffset public static class BooleanOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public BooleanOp(Pointer p) { super(p); } @@ -12151,7 +12194,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Their code is the part of GraphExecutioner logic. But we still want them to be expressed via Graph * \tparam T */ - @Namespace("nd4j::ops") public static class LogicOp extends DeclarableOp { + @Namespace("sd::ops") public static class LogicOp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public LogicOp(Pointer p) { super(p); } @@ -12195,7 +12238,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #ifndef LIBND4J_OPREGISTRATOR_H // #define LIBND4J_OPREGISTRATOR_H -// #include +// #include // #include // #include // #include @@ -12206,6 +12249,10 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // handlers part // #include // #include + +// #ifndef __JAVACPP_HACK__ + +// #endif /** * This class provides runtime ops lookup, based on opName or opHash. * To build lookup directory we use *_OP_IMPL macro, which puts static structs at compile time in .cpp files, @@ -12213,7 +12260,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * available at runtime via this singleton. * */ - @Namespace("nd4j::ops") @NoOffset public static class OpRegistrator extends Pointer { + @Namespace("sd::ops") @NoOffset public static class OpRegistrator extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public OpRegistrator(Pointer p) { super(p); } @@ -12312,17 +12359,17 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include // #include -// #include +// #include // #include // #include -// #include +// #include // #include // #include -// #include +// #include // #include -// #include -// #include - @Namespace("nd4j") public static class _loader extends Pointer { +// #include +// #include + @Namespace("sd") public static class _loader extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public _loader(Pointer p) { super(p); } @@ -12338,7 +12385,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // logic ops - @Namespace("nd4j::ops") public static class Switch extends DeclarableOp { + @Namespace("sd::ops") public static class Switch extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Switch(Pointer p) { super(p); } @@ -12353,7 +12400,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class While extends LogicOp { + @Namespace("sd::ops") public static class While extends LogicOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public While(Pointer p) { super(p); } @@ -12367,7 +12414,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public While() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class Scope extends LogicOp { + @Namespace("sd::ops") public static class Scope extends LogicOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Scope(Pointer p) { super(p); } @@ -12381,7 +12428,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public Scope() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class Conditional extends LogicOp { + @Namespace("sd::ops") public static class Conditional extends LogicOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Conditional(Pointer p) { super(p); } @@ -12395,7 +12442,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public Conditional() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class Return extends LogicOp { + @Namespace("sd::ops") public static class Return extends LogicOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Return(Pointer p) { super(p); } @@ -12417,7 +12464,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * * PLEASE NOTE: This operation is internal graph operation, and shouldn't be used directly usually. */ - @Namespace("nd4j::ops") public static class expose extends DeclarableCustomOp { + @Namespace("sd::ops") public static class expose extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public expose(Pointer p) { super(p); } @@ -12470,7 +12517,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: 1 / 1 + exp(-x) */ // #if NOT_EXCLUDED(OP_sigmoid) - @Namespace("nd4j::ops") public static class sigmoid extends DeclarableOp { + @Namespace("sd::ops") public static class sigmoid extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sigmoid(Pointer p) { super(p); } @@ -12485,7 +12532,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class sigmoid_bp extends DeclarableOp { + @Namespace("sd::ops") public static class sigmoid_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sigmoid_bp(Pointer p) { super(p); } @@ -12507,7 +12554,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: x / 1 + abs(x) */ // #if NOT_EXCLUDED(OP_softsign) - @Namespace("nd4j::ops") public static class softsign extends DeclarableOp { + @Namespace("sd::ops") public static class softsign extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public softsign(Pointer p) { super(p); } @@ -12522,7 +12569,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class softsign_bp extends DeclarableOp { + @Namespace("sd::ops") public static class softsign_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public softsign_bp(Pointer p) { super(p); } @@ -12543,7 +12590,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This is Tanh activation function implementation */ // #if NOT_EXCLUDED(OP_tanh) - @Namespace("nd4j::ops") public static class tanh extends DeclarableOp { + @Namespace("sd::ops") public static class tanh extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tanh(Pointer p) { super(p); } @@ -12558,7 +12605,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class tanh_bp extends DeclarableOp { + @Namespace("sd::ops") public static class tanh_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tanh_bp(Pointer p) { super(p); } @@ -12580,7 +12627,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: log(1 + exp(x)) */ // #if NOT_EXCLUDED(OP_softplus) - @Namespace("nd4j::ops") public static class softplus extends DeclarableOp { + @Namespace("sd::ops") public static class softplus extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public softplus(Pointer p) { super(p); } @@ -12595,7 +12642,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class softplus_bp extends DeclarableOp { + @Namespace("sd::ops") public static class softplus_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public softplus_bp(Pointer p) { super(p); } @@ -12616,7 +12663,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This is RELU activation function implementation */ // #if NOT_EXCLUDED(OP_relu) - @Namespace("nd4j::ops") public static class relu extends DeclarableOp { + @Namespace("sd::ops") public static class relu extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public relu(Pointer p) { super(p); } @@ -12631,7 +12678,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class relu_bp extends DeclarableOp { + @Namespace("sd::ops") public static class relu_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public relu_bp(Pointer p) { super(p); } @@ -12652,7 +12699,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This is SELU activation function implementation */ // #if NOT_EXCLUDED(OP_selu) - @Namespace("nd4j::ops") public static class selu extends DeclarableOp { + @Namespace("sd::ops") public static class selu extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public selu(Pointer p) { super(p); } @@ -12667,7 +12714,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class selu_bp extends DeclarableOp { + @Namespace("sd::ops") public static class selu_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public selu_bp(Pointer p) { super(p); } @@ -12689,7 +12736,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: x < 0 ? alpha * x : x; */ // #if NOT_EXCLUDED(OP_lrelu) - @Namespace("nd4j::ops") public static class lrelu extends DeclarableOp { + @Namespace("sd::ops") public static class lrelu extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lrelu(Pointer p) { super(p); } @@ -12704,7 +12751,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class lrelu_bp extends DeclarableOp { + @Namespace("sd::ops") public static class lrelu_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lrelu_bp(Pointer p) { super(p); } @@ -12726,7 +12773,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: x >= 0 ? x : exp(x) - 1; */ // #if NOT_EXCLUDED(OP_elu) - @Namespace("nd4j::ops") public static class elu extends DeclarableOp { + @Namespace("sd::ops") public static class elu extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public elu(Pointer p) { super(p); } @@ -12741,7 +12788,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class elu_bp extends DeclarableOp { + @Namespace("sd::ops") public static class elu_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public elu_bp(Pointer p) { super(p); } @@ -12763,7 +12810,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: x^3 */ // #if NOT_EXCLUDED(OP_cube) - @Namespace("nd4j::ops") public static class cube extends DeclarableOp { + @Namespace("sd::ops") public static class cube extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cube(Pointer p) { super(p); } @@ -12778,7 +12825,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class cube_bp extends DeclarableOp { + @Namespace("sd::ops") public static class cube_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cube_bp(Pointer p) { super(p); } @@ -12800,7 +12847,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: max(0, tanh(x)) */ // #if NOT_EXCLUDED(OP_rectifiedtanh) - @Namespace("nd4j::ops") public static class rectifiedtanh extends DeclarableOp { + @Namespace("sd::ops") public static class rectifiedtanh extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public rectifiedtanh(Pointer p) { super(p); } @@ -12815,7 +12862,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class rectifiedtanh_bp extends DeclarableOp { + @Namespace("sd::ops") public static class rectifiedtanh_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public rectifiedtanh_bp(Pointer p) { super(p); } @@ -12836,7 +12883,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This is RationalTanh activation function. */ // #if NOT_EXCLUDED(OP_rationaltanh) - @Namespace("nd4j::ops") public static class rationaltanh extends DeclarableOp { + @Namespace("sd::ops") public static class rationaltanh extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public rationaltanh(Pointer p) { super(p); } @@ -12851,7 +12898,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class rationaltanh_bp extends DeclarableOp { + @Namespace("sd::ops") public static class rationaltanh_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public rationaltanh_bp(Pointer p) { super(p); } @@ -12873,7 +12920,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: x < -1.0 ? -1.0 : x > 1.0 ? 1.0 : x; */ // #if NOT_EXCLUDED(OP_hardtanh) - @Namespace("nd4j::ops") public static class hardtanh extends DeclarableOp { + @Namespace("sd::ops") public static class hardtanh extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public hardtanh(Pointer p) { super(p); } @@ -12888,7 +12935,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class hardtanh_bp extends DeclarableOp { + @Namespace("sd::ops") public static class hardtanh_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public hardtanh_bp(Pointer p) { super(p); } @@ -12910,7 +12957,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: min(1, max(0, 0.2 * x + 0.5)) */ // #if NOT_EXCLUDED(OP_hardsigmoid) - @Namespace("nd4j::ops") public static class hardsigmoid extends DeclarableOp { + @Namespace("sd::ops") public static class hardsigmoid extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public hardsigmoid(Pointer p) { super(p); } @@ -12925,7 +12972,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class hardsigmoid_bp extends DeclarableOp { + @Namespace("sd::ops") public static class hardsigmoid_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public hardsigmoid_bp(Pointer p) { super(p); } @@ -12946,7 +12993,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This is Indentity operation. It passes signal umodified in both directions. */ // #if NOT_EXCLUDED(OP_identity) - @Namespace("nd4j::ops") public static class identity extends DeclarableOp { + @Namespace("sd::ops") public static class identity extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public identity(Pointer p) { super(p); } @@ -12961,7 +13008,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class identity_bp extends DeclarableOp { + @Namespace("sd::ops") public static class identity_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public identity_bp(Pointer p) { super(p); } @@ -12982,7 +13029,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This is Indentity operation. It passes signal umodified in both directions. */ // #if NOT_EXCLUDED(OP_identity_n) - @Namespace("nd4j::ops") public static class identity_n extends DeclarableCustomOp { + @Namespace("sd::ops") public static class identity_n extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public identity_n(Pointer p) { super(p); } @@ -13006,7 +13053,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: Concatenation will double amount of features available in input */ // #if NOT_EXCLUDED(OP_crelu) - @Namespace("nd4j::ops") public static class crelu extends DeclarableCustomOp { + @Namespace("sd::ops") public static class crelu extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public crelu(Pointer p) { super(p); } @@ -13021,7 +13068,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class crelu_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class crelu_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public crelu_bp(Pointer p) { super(p); } @@ -13042,7 +13089,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This is RELU6 activation function implementation */ // #if NOT_EXCLUDED(OP_relu6) - @Namespace("nd4j::ops") public static class relu6 extends DeclarableOp { + @Namespace("sd::ops") public static class relu6 extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public relu6(Pointer p) { super(p); } @@ -13057,7 +13104,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class relu6_bp extends DeclarableOp { + @Namespace("sd::ops") public static class relu6_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public relu6_bp(Pointer p) { super(p); } @@ -13080,7 +13127,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * f(x) = alpha * x for x < 0, f(x) = x for x >= 0 */ // #if NOT_EXCLUDED(OP_prelu) - @Namespace("nd4j::ops") public static class prelu extends DeclarableOp { + @Namespace("sd::ops") public static class prelu extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public prelu(Pointer p) { super(p); } @@ -13095,7 +13142,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class prelu_bp extends DeclarableOp { + @Namespace("sd::ops") public static class prelu_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public prelu_bp(Pointer p) { super(p); } @@ -13118,7 +13165,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * theta must be >= 0 */ // #if NOT_EXCLUDED(OP_thresholdedrelu) - @Namespace("nd4j::ops") public static class thresholdedrelu extends DeclarableOp { + @Namespace("sd::ops") public static class thresholdedrelu extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public thresholdedrelu(Pointer p) { super(p); } @@ -13133,7 +13180,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class thresholdedrelu_bp extends DeclarableOp { + @Namespace("sd::ops") public static class thresholdedrelu_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public thresholdedrelu_bp(Pointer p) { super(p); } @@ -13190,7 +13237,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns true if x < y */ // #if NOT_EXCLUDED(OP_lt_scalar) - @Namespace("nd4j::ops") public static class lt_scalar extends BooleanOp { + @Namespace("sd::ops") public static class lt_scalar extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lt_scalar(Pointer p) { super(p); } @@ -13213,7 +13260,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns true if x > y */ // #if NOT_EXCLUDED(OP_gt_scalar) - @Namespace("nd4j::ops") public static class gt_scalar extends BooleanOp { + @Namespace("sd::ops") public static class gt_scalar extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public gt_scalar(Pointer p) { super(p); } @@ -13236,7 +13283,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns true if x <= y */ // #if NOT_EXCLUDED(OP_lte_scalar) - @Namespace("nd4j::ops") public static class lte_scalar extends BooleanOp { + @Namespace("sd::ops") public static class lte_scalar extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lte_scalar(Pointer p) { super(p); } @@ -13259,7 +13306,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns true if x >= y */ // #if NOT_EXCLUDED(OP_gte_scalar) - @Namespace("nd4j::ops") public static class gte_scalar extends BooleanOp { + @Namespace("sd::ops") public static class gte_scalar extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public gte_scalar(Pointer p) { super(p); } @@ -13282,7 +13329,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns true if both operands are equal. */ // #if NOT_EXCLUDED(OP_eq_scalar) - @Namespace("nd4j::ops") public static class eq_scalar extends BooleanOp { + @Namespace("sd::ops") public static class eq_scalar extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public eq_scalar(Pointer p) { super(p); } @@ -13305,7 +13352,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns true if x != y */ // #if NOT_EXCLUDED(OP_neq_scalar) - @Namespace("nd4j::ops") public static class neq_scalar extends BooleanOp { + @Namespace("sd::ops") public static class neq_scalar extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public neq_scalar(Pointer p) { super(p); } @@ -13326,7 +13373,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * array of the same shape, with elements, either from x or y, depending on the condition. 
*/ // #if NOT_EXCLUDED(OP_where) - @Namespace("nd4j::ops") public static class Where extends DeclarableCustomOp { + @Namespace("sd::ops") public static class Where extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Where(Pointer p) { super(p); } @@ -13344,7 +13391,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_where_np) - @Namespace("nd4j::ops") public static class where_np extends DeclarableCustomOp { + @Namespace("sd::ops") public static class where_np extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public where_np(Pointer p) { super(p); } @@ -13366,7 +13413,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * array of the same shape, with elements, either from x or y, depending on the condition. */ // #if NOT_EXCLUDED(OP_select) - @Namespace("nd4j::ops") public static class select extends DeclarableCustomOp { + @Namespace("sd::ops") public static class select extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public select(Pointer p) { super(p); } @@ -13397,7 +13444,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_choose) - @Namespace("nd4j::ops") public static class choose extends DeclarableCustomOp { + @Namespace("sd::ops") public static class choose extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public choose(Pointer p) { super(p); } @@ -13418,7 +13465,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This op takes 1 n-dimensional array as input, and returns true if for every adjacent pair we have x[i] <= x[i+1]. 
*/ // #if NOT_EXCLUDED(OP_is_non_decreasing) - @Namespace("nd4j::ops") public static class is_non_decreasing extends BooleanOp { + @Namespace("sd::ops") public static class is_non_decreasing extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public is_non_decreasing(Pointer p) { super(p); } @@ -13438,7 +13485,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This op takes 1 n-dimensional array as input, and returns true if for every adjacent pair we have x[i] < x[i+1]. */ // #if NOT_EXCLUDED(OP_is_strictly_increasing) - @Namespace("nd4j::ops") public static class is_strictly_increasing extends BooleanOp { + @Namespace("sd::ops") public static class is_strictly_increasing extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public is_strictly_increasing(Pointer p) { super(p); } @@ -13458,7 +13505,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This op takes 1 n-dimensional array as input, and returns true if input is a numeric array. */ // #if NOT_EXCLUDED(OP_is_numeric_tensor) - @Namespace("nd4j::ops") public static class is_numeric_tensor extends BooleanOp { + @Namespace("sd::ops") public static class is_numeric_tensor extends BooleanOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public is_numeric_tensor(Pointer p) { super(p); } @@ -13478,7 +13525,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_boolean_not) - @Namespace("nd4j::ops") public static class boolean_not extends DeclarableOp { + @Namespace("sd::ops") public static class boolean_not extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public boolean_not(Pointer p) { super(p); } @@ -13539,7 +13586,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Max(X, Y) */ // #if NOT_EXCLUDED(OP_maximum) - @Namespace("nd4j::ops") public static class maximum extends BroadcastableOp { + @Namespace("sd::ops") public static class maximum extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public maximum(Pointer p) { super(p); } @@ -13553,7 +13600,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public maximum() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class maximum_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class maximum_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public maximum_bp(Pointer p) { super(p); } @@ -13580,7 +13627,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Min(X, Y) */ // #if NOT_EXCLUDED(OP_minimum) - @Namespace("nd4j::ops") public static class minimum extends BroadcastableOp { + @Namespace("sd::ops") public static class minimum extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public minimum(Pointer p) { super(p); } @@ -13594,7 +13641,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public minimum() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class minimum_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class minimum_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public minimum_bp(Pointer p) { super(p); } @@ -13621,7 +13668,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Add(X, Y) */ // #if NOT_EXCLUDED(OP_add) - @Namespace("nd4j::ops") public static class add extends BroadcastableOp { + @Namespace("sd::ops") public static class add extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public add(Pointer p) { super(p); } @@ -13635,7 +13682,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public add() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class add_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class add_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public add_bp(Pointer p) { super(p); } @@ -13662,7 +13709,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Subtract(X, Y) */ // #if NOT_EXCLUDED(OP_subtract) - @Namespace("nd4j::ops") public static class subtract extends BroadcastableOp { + @Namespace("sd::ops") public static class subtract extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public subtract(Pointer p) { super(p); } @@ -13676,7 +13723,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public subtract() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class subtract_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class subtract_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public subtract_bp(Pointer p) { super(p); } @@ -13703,7 +13750,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Subtract(Y, X) */ // #if NOT_EXCLUDED(OP_reversesubtract) - @Namespace("nd4j::ops") public static class reversesubtract extends BroadcastableOp { + @Namespace("sd::ops") public static class reversesubtract extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reversesubtract(Pointer p) { super(p); } @@ -13717,7 +13764,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public reversesubtract() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class reversesubtract_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reversesubtract_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reversesubtract_bp(Pointer p) { super(p); } @@ -13744,7 +13791,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = ReverseMod(X, Y) == Mod(Y, X) */ // #if NOT_EXCLUDED(OP_reversemod) - @Namespace("nd4j::ops") public static class reversemod extends BroadcastableOp { + @Namespace("sd::ops") public static class reversemod extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reversemod(Pointer p) { super(p); } @@ -13758,7 +13805,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public reversemod() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class reversemod_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reversemod_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public reversemod_bp(Pointer p) { super(p); } @@ -13786,7 +13833,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Subtract(X, Y) * Subtract(X, Y) */ // #if NOT_EXCLUDED(OP_squaredsubtract) - @Namespace("nd4j::ops") public static class squaredsubtract extends BroadcastableOp { + @Namespace("sd::ops") public static class squaredsubtract extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public squaredsubtract(Pointer p) { super(p); } @@ -13800,7 +13847,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public squaredsubtract() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class squaredsubtract_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class squaredsubtract_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public squaredsubtract_bp(Pointer p) { super(p); } @@ -13827,7 +13874,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Multiply(X, Y) */ // #if NOT_EXCLUDED(OP_multiply) - @Namespace("nd4j::ops") public static class multiply extends BroadcastableOp { + @Namespace("sd::ops") public static class multiply extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public multiply(Pointer p) { super(p); } @@ -13841,7 +13888,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public multiply() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class multiply_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class multiply_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public multiply_bp(Pointer p) { super(p); } @@ -13868,7 +13915,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Divide(X, Y) */ // #if NOT_EXCLUDED(OP_divide) - @Namespace("nd4j::ops") public static class divide extends BroadcastableOp { + @Namespace("sd::ops") public static class divide extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public divide(Pointer p) { super(p); } @@ -13882,7 +13929,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public divide() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class divide_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class divide_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public divide_bp(Pointer p) { super(p); } @@ -13909,7 +13956,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Divide(X, Y) with exception, 0 if Y = 0 */ // #if NOT_EXCLUDED(OP_divide_no_nan) - @Namespace("nd4j::ops") public static class divide_no_nan extends BroadcastableOp { + @Namespace("sd::ops") public static class divide_no_nan extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public divide_no_nan(Pointer p) { super(p); } @@ -13934,7 +13981,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Divide(Y, x) */ // #if NOT_EXCLUDED(OP_reversedivide) - @Namespace("nd4j::ops") public static class reversedivide extends BroadcastableOp { + @Namespace("sd::ops") public static class reversedivide extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public reversedivide(Pointer p) { super(p); } @@ -13948,7 +13995,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public reversedivide() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class reversedivide_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reversedivide_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reversedivide_bp(Pointer p) { super(p); } @@ -13975,7 +14022,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = FloorMod(X, Y) */ // #if NOT_EXCLUDED(OP_floormod) - @Namespace("nd4j::ops") public static class floormod extends BroadcastableOp { + @Namespace("sd::ops") public static class floormod extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public floormod(Pointer p) { super(p); } @@ -13989,7 +14036,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public floormod() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class floormod_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class floormod_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public floormod_bp(Pointer p) { super(p); } @@ -14007,7 +14054,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_mod) - @Namespace("nd4j::ops") public static class mod extends BroadcastableOp { + @Namespace("sd::ops") public static class mod extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public mod(Pointer p) { super(p); } @@ -14021,7 +14068,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public mod() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class mod_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class mod_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public mod_bp(Pointer p) { super(p); } @@ -14048,7 +14095,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = FloorDiv(X, Y) */ // #if NOT_EXCLUDED(OP_floordiv) - @Namespace("nd4j::ops") public static class floordiv extends BroadcastableOp { + @Namespace("sd::ops") public static class floordiv extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public floordiv(Pointer p) { super(p); } @@ -14062,7 +14109,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public floordiv() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class floordiv_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class floordiv_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public floordiv_bp(Pointer p) { super(p); } @@ -14089,7 +14136,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Divide(X, Y) */ // #if NOT_EXCLUDED(OP_realdiv) - @Namespace("nd4j::ops") public static class realdiv extends BroadcastableOp { + @Namespace("sd::ops") public static class realdiv extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public realdiv(Pointer p) { super(p); } @@ -14103,7 +14150,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public realdiv() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class realdiv_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class realdiv_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public realdiv_bp(Pointer p) { super(p); } @@ -14126,7 +14173,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * * \tparam T */ - @Namespace("nd4j::ops") public static class truncatediv extends BroadcastableOp { + @Namespace("sd::ops") public static class truncatediv extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public truncatediv(Pointer p) { super(p); } @@ -14151,7 +14198,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns Z = Assign(X, Y) */ // #if NOT_EXCLUDED(OP_assign) - @Namespace("nd4j::ops") public static class assign extends BroadcastableOp { + @Namespace("sd::ops") public static class assign extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public assign(Pointer p) { super(p); } @@ -14165,7 +14212,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public assign() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class assign_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class assign_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public assign_bp(Pointer p) { super(p); } @@ -14183,7 +14230,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_meshgrid) - @Namespace("nd4j::ops") public static class meshgrid extends DeclarableCustomOp { + @Namespace("sd::ops") public static class meshgrid extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public meshgrid(Pointer p) { super(p); } @@ -14206,7 +14253,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_equals) - @Namespace("nd4j::ops") public static class equals extends BroadcastableOp { + @Namespace("sd::ops") public static class equals extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public equals(Pointer p) { super(p); } @@ -14227,7 +14274,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: _x != _y ? (T) 1.0f : (T) 0.0f; */ // #if NOT_EXCLUDED(OP_not_equals) - @Namespace("nd4j::ops") public static class not_equals extends BroadcastableOp { + @Namespace("sd::ops") public static class not_equals extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public not_equals(Pointer p) { super(p); } @@ -14248,7 +14295,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: _x <= _y ? (T) 1.0f : (T) 0.0f; */ // #if NOT_EXCLUDED(OP_less_equal) - @Namespace("nd4j::ops") public static class less_equal extends BroadcastableOp { + @Namespace("sd::ops") public static class less_equal extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public less_equal(Pointer p) { super(p); } @@ -14269,7 +14316,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: _x >= _y ? 
(T) 1.0f : (T) 0.0f; */ // #if NOT_EXCLUDED(OP_greater_equal) - @Namespace("nd4j::ops") public static class greater_equal extends BroadcastableOp { + @Namespace("sd::ops") public static class greater_equal extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public greater_equal(Pointer p) { super(p); } @@ -14290,7 +14337,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: _x < _y ? (T) 1.0f : (T) 0.0f; */ // #if NOT_EXCLUDED(OP_less) - @Namespace("nd4j::ops") public static class less extends BroadcastableOp { + @Namespace("sd::ops") public static class less extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public less(Pointer p) { super(p); } @@ -14311,7 +14358,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: _x > _y ? (T) 1.0f : (T) 0.0f; */ // #if NOT_EXCLUDED(OP_greater) - @Namespace("nd4j::ops") public static class greater extends BroadcastableOp { + @Namespace("sd::ops") public static class greater extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public greater(Pointer p) { super(p); } @@ -14331,7 +14378,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_boolean_and) - @Namespace("nd4j::ops") public static class boolean_and extends BroadcastableOp { + @Namespace("sd::ops") public static class boolean_and extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public boolean_and(Pointer p) { super(p); } @@ -14351,7 +14398,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_boolean_or) - @Namespace("nd4j::ops") public static class boolean_or extends BroadcastableOp { + @Namespace("sd::ops") public static class boolean_or extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public boolean_or(Pointer p) { super(p); } @@ -14371,7 +14418,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_boolean_xor) - @Namespace("nd4j::ops") public static class boolean_xor extends BroadcastableOp { + @Namespace("sd::ops") public static class boolean_xor extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public boolean_xor(Pointer p) { super(p); } @@ -14400,7 +14447,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_percentile) - @Namespace("nd4j::ops") public static class percentile extends DeclarableCustomOp { + @Namespace("sd::ops") public static class percentile extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public percentile(Pointer p) { super(p); } @@ -14423,7 +14470,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_tf_atan2) - @Namespace("nd4j::ops") public static class tf_atan2 extends BroadcastableOp { + @Namespace("sd::ops") public static class tf_atan2 extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public tf_atan2(Pointer p) { super(p); } @@ -14444,7 +14491,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_Pow) - @Namespace("nd4j::ops") public static class Pow extends BroadcastableOp { + @Namespace("sd::ops") public static class Pow extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Pow(Pointer p) { super(p); } @@ -14458,7 +14505,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public Pow() { super((Pointer)null); allocate(); } private native void allocate(); } - @Namespace("nd4j::ops") public static class Pow_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class Pow_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Pow_bp(Pointer p) { super(p); } @@ -14484,7 +14531,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_igamma) - @Namespace("nd4j::ops") public static class igamma extends BroadcastableOp { + @Namespace("sd::ops") public static class igamma extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public igamma(Pointer p) { super(p); } @@ -14507,7 +14554,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_igammac) - @Namespace("nd4j::ops") public static class igammac extends BroadcastableOp { + @Namespace("sd::ops") public static class igammac extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public igammac(Pointer p) { super(p); } @@ -14567,7 +14614,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 2: padding */ // #if NOT_EXCLUDED(OP_conv1d) - @Namespace("nd4j::ops") public static class conv1d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class conv1d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public conv1d(Pointer p) { super(p); } @@ -14582,7 +14629,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class conv1d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class conv1d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public conv1d_bp(Pointer p) { super(p); } @@ -14619,7 +14666,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 9: data format: 1 NHWC, 0 NCHW */ // #if NOT_EXCLUDED(OP_conv2d) - @Namespace("nd4j::ops") public static class conv2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class conv2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public conv2d(Pointer p) { super(p); } @@ -14634,7 +14681,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class conv2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class conv2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public conv2d_bp(Pointer p) { super(p); } @@ -14649,7 +14696,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class conv2d_input_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class conv2d_input_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public conv2d_input_bp(Pointer p) { super(p); } @@ -14675,7 +14722,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * bias: optional, vector */ // #if NOT_EXCLUDED(OP_sconv2d) - @Namespace("nd4j::ops") public static class sconv2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sconv2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sconv2d(Pointer p) { super(p); } @@ -14690,7 +14737,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class sconv2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sconv2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sconv2d_bp(Pointer p) { super(p); } @@ -14722,7 +14769,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 8: same mode: 0 false, 1 true */ // #if NOT_EXCLUDED(OP_deconv2d) - @Namespace("nd4j::ops") public static class deconv2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class deconv2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public deconv2d(Pointer p) { super(p); } @@ -14737,7 +14784,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class deconv2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class deconv2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public deconv2d_bp(Pointer p) { super(p); } @@ -14775,7 +14822,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_deconv3d) - @Namespace("nd4j::ops") public static class deconv3d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class deconv3d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public deconv3d(Pointer p) { super(p); } @@ -14790,7 +14837,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class deconv3d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class deconv3d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public deconv3d_bp(Pointer p) { super(p); } @@ -14824,7 +14871,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 8: same mode: 0 false, 1 true */ // #if NOT_EXCLUDED(OP_maxpool2d) - @Namespace("nd4j::ops") public static class maxpool2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class maxpool2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public maxpool2d(Pointer p) { super(p); } @@ -14839,7 +14886,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class maxpool2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class maxpool2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public maxpool2d_bp(Pointer p) { super(p); } @@ -14872,7 +14919,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 8: same mode: 0 false, 1 true */ // #if NOT_EXCLUDED(OP_avgpool2d) - @Namespace("nd4j::ops") public static class avgpool2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class avgpool2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public avgpool2d(Pointer p) { super(p); } @@ -14887,7 +14934,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class avgpool2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class avgpool2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public avgpool2d_bp(Pointer p) { super(p); } @@ -14921,7 +14968,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 9: p for p-norm */ // #if NOT_EXCLUDED(OP_pnormpool2d) - @Namespace("nd4j::ops") public static class pnormpool2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class pnormpool2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public pnormpool2d(Pointer p) { super(p); } @@ -14936,7 +14983,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class pnormpool2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class pnormpool2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public pnormpool2d_bp(Pointer p) { super(p); } @@ -14969,7 +15016,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 8: isSameMode */ // #if NOT_EXCLUDED(OP_im2col) - @Namespace("nd4j::ops") public static class im2col extends DeclarableCustomOp { + @Namespace("sd::ops") public static class im2col extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public im2col(Pointer p) { super(p); } @@ -14984,7 +15031,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class im2col_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class im2col_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public im2col_bp(Pointer p) { super(p); } @@ -15016,7 +15063,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 7: dilation width */ // #if NOT_EXCLUDED(OP_col2im) - @Namespace("nd4j::ops") public static class col2im extends DeclarableCustomOp { + @Namespace("sd::ops") public static class col2im extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public col2im(Pointer p) { super(p); } @@ -15042,7 +15089,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 2: data format: 0 NHWC (default), 1 NCHW */ // #if NOT_EXCLUDED(OP_upsampling2d) - @Namespace("nd4j::ops") public static class upsampling2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class upsampling2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public upsampling2d(Pointer p) { super(p); } @@ -15057,7 +15104,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class upsampling2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class upsampling2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public upsampling2d_bp(Pointer p) { super(p); } @@ -15084,7 +15131,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 3: data format: 0 NDHWC (default), 1 NCDHW */ // #if NOT_EXCLUDED(OP_upsampling3d) - @Namespace("nd4j::ops") public static class upsampling3d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class upsampling3d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public upsampling3d(Pointer p) { super(p); } @@ -15099,7 +15146,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class upsampling3d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class upsampling3d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public upsampling3d_bp(Pointer p) { super(p); } @@ -15124,7 +15171,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: axis */ // #if NOT_EXCLUDED(OP_ismax) - @Namespace("nd4j::ops") public static class ismax extends DeclarableOp { + @Namespace("sd::ops") public static class ismax extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ismax(Pointer p) { super(p); } @@ -15148,7 +15195,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: isSameMode */ // #if NOT_EXCLUDED(OP_dilation2d) - @Namespace("nd4j::ops") public static class dilation2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dilation2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dilation2d(Pointer p) { super(p); } @@ -15166,7 +15213,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_conv3dnew) - @Namespace("nd4j::ops") public static class conv3dnew extends DeclarableCustomOp { + @Namespace("sd::ops") public static class conv3dnew extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public conv3dnew(Pointer p) { super(p); } @@ -15181,7 +15228,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class conv3dnew_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class conv3dnew_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public conv3dnew_bp(Pointer p) { super(p); } @@ -15199,7 +15246,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_avgpool3dnew) - @Namespace("nd4j::ops") public static class avgpool3dnew extends DeclarableCustomOp { + @Namespace("sd::ops") public static class avgpool3dnew extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public avgpool3dnew(Pointer p) { super(p); } @@ -15214,7 +15261,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class avgpool3dnew_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class avgpool3dnew_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public avgpool3dnew_bp(Pointer p) { super(p); } @@ -15232,7 +15279,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_maxpool3dnew) - @Namespace("nd4j::ops") public static class maxpool3dnew extends DeclarableCustomOp { + @Namespace("sd::ops") public static class maxpool3dnew extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public maxpool3dnew(Pointer p) { super(p); } @@ -15247,7 +15294,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class maxpool3dnew_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class maxpool3dnew_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public maxpool3dnew_bp(Pointer p) { super(p); } @@ -15276,7 +15323,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 9 int with 2x4 vectors and 1 bool value */ // #if NOT_EXCLUDED(OP_max_pool_woth_argmax) - @Namespace("nd4j::ops") public static class max_pool_with_argmax extends DeclarableCustomOp { + @Namespace("sd::ops") public static class max_pool_with_argmax extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public max_pool_with_argmax(Pointer p) { super(p); } @@ -15295,7 +15342,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #if NOT_EXCLUDED(OP_depthwise_conv2d) - @Namespace("nd4j::ops") public static class depthwise_conv2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class depthwise_conv2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public depthwise_conv2d(Pointer p) { super(p); } @@ -15310,7 +15357,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class depthwise_conv2d_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class depthwise_conv2d_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public depthwise_conv2d_bp(Pointer p) { super(p); } @@ -15337,7 +15384,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * IntArgs: * 0: data format: 1 NHWC, 0 NCHW (optional, by default = NHWC) */ - @Namespace("nd4j::ops") public static class pointwise_conv2d extends DeclarableCustomOp { + @Namespace("sd::ops") public static class pointwise_conv2d extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public pointwise_conv2d(Pointer p) { super(p); } @@ -15353,7 +15400,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class deconv2d_tf extends DeclarableCustomOp { + @Namespace("sd::ops") public static class deconv2d_tf extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public deconv2d_tf(Pointer p) { super(p); } @@ -15408,7 +15455,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * If no NDArrayList was provided - new one will be created */ // #if NOT_EXCLUDED(OP_write_list) - @Namespace("nd4j::ops") public static class write_list extends DeclarableListOp { + @Namespace("sd::ops") public static class write_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public write_list(Pointer p) { super(p); } @@ -15428,7 +15475,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation concatenates given NDArrayList, and returns NDArray as result */ // #if NOT_EXCLUDED(OP_stack_list) - @Namespace("nd4j::ops") public static class stack_list extends DeclarableListOp { + @Namespace("sd::ops") public static class stack_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public stack_list(Pointer p) { super(p); } @@ -15454,7 +15501,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * optional, index */ // #if NOT_EXCLUDED(OP_read_list) - @Namespace("nd4j::ops") public static class read_list extends DeclarableListOp { + @Namespace("sd::ops") public static class read_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public read_list(Pointer p) { super(p); } @@ -15480,7 +15527,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * optional, indices */ // #if NOT_EXCLUDED(OP_pick_list) - @Namespace("nd4j::ops") public static class pick_list extends DeclarableListOp { + @Namespace("sd::ops") public static class pick_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public pick_list(Pointer p) { super(p); } @@ -15502,7 +15549,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * x: list */ // #if NOT_EXCLUDED(OP_size_list) - @Namespace("nd4j::ops") public static class size_list extends DeclarableListOp { + @Namespace("sd::ops") public static class size_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public size_list(Pointer p) { super(p); } @@ -15522,7 +15569,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation creates new empty NDArrayList */ // #if NOT_EXCLUDED(OP_create_list) - @Namespace("nd4j::ops") public static class create_list extends DeclarableListOp { + @Namespace("sd::ops") public static class create_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public create_list(Pointer p) { super(p); } @@ -15542,7 +15589,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation unpacks given NDArray into specified NDArrayList wrt specified indices */ // #if NOT_EXCLUDED(OP_scatter_list) - @Namespace("nd4j::ops") public static class scatter_list extends DeclarableListOp { + @Namespace("sd::ops") public static class scatter_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public scatter_list(Pointer p) { super(p); } @@ -15566,7 +15613,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * sizes: vector with sizes for each chunk */ // #if NOT_EXCLUDED(OP_split_list) - @Namespace("nd4j::ops") public static class split_list extends DeclarableListOp { + @Namespace("sd::ops") public static class split_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public split_list(Pointer p) { super(p); } @@ -15589,7 +15636,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * indices: vector with indices for gather operation */ // #if NOT_EXCLUDED(OP_gather_list) - @Namespace("nd4j::ops") public static class gather_list extends DeclarableListOp { + @Namespace("sd::ops") public static class gather_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public gather_list(Pointer p) { super(p); } @@ -15609,7 +15656,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation clones given NDArrayList */ // #if NOT_EXCLUDED(OP_clone_list) - @Namespace("nd4j::ops") public static class clone_list extends DeclarableListOp { + @Namespace("sd::ops") public static class clone_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public clone_list(Pointer p) { super(p); } @@ -15629,7 +15676,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation unstacks given NDArray into NDArrayList by the first dimension */ // #if NOT_EXCLUDED(OP_unstack_list) - @Namespace("nd4j::ops") public static class unstack_list extends DeclarableListOp { + @Namespace("sd::ops") public static class unstack_list extends DeclarableListOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public unstack_list(Pointer p) { super(p); } @@ -15692,7 +15739,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: 3d tensor of cell state [bS x K x N] */ // #if NOT_EXCLUDED(OP_sru) - @Namespace("nd4j::ops") public static class sru extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sru extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sru(Pointer p) { super(p); } @@ -15725,7 +15772,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: 3d tensor of cell state [N x bS x 2K] */ // #if NOT_EXCLUDED(OP_sru_bi) - @Namespace("nd4j::ops") public static class sru_bi extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sru_bi extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sru_bi(Pointer p) { super(p); } @@ -15764,7 +15811,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 3: 2d, tensor of state gradients [bS x K] */ // #if NOT_EXCLUDED(OP_sru) - @Namespace("nd4j::ops") public static class sru_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sru_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sru_bp(Pointer p) { super(p); } @@ -15802,7 +15849,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 3: 2d, tensor of state gradients [bS x 2K] */ // #if NOT_EXCLUDED(OP_sru_bi) - @Namespace("nd4j::ops") public static class sru_bi_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sru_bi_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public sru_bi_bp(Pointer p) { super(p); } @@ -15852,7 +15899,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: current cell state [batchSize x numUnits], that is at current time step t */ // #if NOT_EXCLUDED(OP_lstmCell) - @Namespace("nd4j::ops") public static class lstmCell extends DeclarableCustomOp { + @Namespace("sd::ops") public static class lstmCell extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lstmCell(Pointer p) { super(p); } @@ -15906,7 +15953,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 6: y (h) - Current cell output [bS, numUnits], time t */ // #if NOT_EXCLUDED(OP_lstmBlockCell) - @Namespace("nd4j::ops") public static class lstmBlockCell extends DeclarableCustomOp { + @Namespace("sd::ops") public static class lstmBlockCell extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lstmBlockCell(Pointer p) { super(p); } @@ -15962,7 +16009,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 6: y (h) - Current cell output, rank 3, shape as per dataFormat */ // #if NOT_EXCLUDED(OP_lstmBlock) - @Namespace("nd4j::ops") public static class lstmBlock extends DeclarableCustomOp { + @Namespace("sd::ops") public static class lstmBlock extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lstmBlock(Pointer p) { super(p); } @@ -15981,7 +16028,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); ////////////////////////////////////////////////////////////////////////// // #if NOT_EXCLUDED(OP_lstmLayer) - @Namespace("nd4j::ops") public static class lstmLayer extends DeclarableCustomOp { + @Namespace("sd::ops") public static class lstmLayer extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public lstmLayer(Pointer p) { super(p); } @@ -16014,7 +16061,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: current cell state [batchSize x inSize], that is at current time step t */ // #if NOT_EXCLUDED(OP_sruCell) - @Namespace("nd4j::ops") public static class sruCell extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sruCell extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sruCell(Pointer p) { super(p); } @@ -16053,7 +16100,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 3: Current cell output [bS, numUnits] */ // #if NOT_EXCLUDED(OP_gruCell) - @Namespace("nd4j::ops") public static class gruCell extends DeclarableCustomOp { + @Namespace("sd::ops") public static class gruCell extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public gruCell(Pointer p) { super(p); } @@ -16071,7 +16118,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_gruCell) - @Namespace("nd4j::ops") public static class gruCell_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class gruCell_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public gruCell_bp(Pointer p) { super(p); } @@ -16116,7 +16163,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: cell states [time x batchSize x numUnits], that is per each time step */ // #if NOT_EXCLUDED(OP_lstm) - @Namespace("nd4j::ops") public static class lstm extends DeclarableCustomOp { + @Namespace("sd::ops") public static class lstm extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public lstm(Pointer p) { super(p); } @@ -16148,7 +16195,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: cell outputs [time x batchSize x numUnits], that is per each time step */ // #if NOT_EXCLUDED(OP_gru) - @Namespace("nd4j::ops") public static class gru extends DeclarableCustomOp { + @Namespace("sd::ops") public static class gru extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public gru(Pointer p) { super(p); } @@ -16181,7 +16228,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: cell outputs [time x batchSize x numUnits] * 1: cell final non-zero output [batchSize x numUnits] */ - @Namespace("nd4j::ops") public static class static_rnn extends DeclarableCustomOp { + @Namespace("sd::ops") public static class static_rnn extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public static_rnn(Pointer p) { super(p); } @@ -16216,7 +16263,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: cell outputs [time x batchSize x numUnits] or [batchSize x time x numUnits] * 1: cell final non-zero output [batchSize x numUnits] */ - @Namespace("nd4j::ops") public static class dynamic_rnn extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dynamic_rnn extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public dynamic_rnn(Pointer p) { super(p); } @@ -16253,7 +16300,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: cell final non-zero output for forward RNN [batchSize x numUnitsFW] * 2: cell final non-zero output for backward RNN [batchSize x numUnitsBW] */ - @Namespace("nd4j::ops") public static class static_bidirectional_rnn extends DeclarableCustomOp { + @Namespace("sd::ops") public static class static_bidirectional_rnn extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public static_bidirectional_rnn(Pointer p) { super(p); } @@ -16294,7 +16341,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 2: cell final non-zero output for forward RNN [batchSize x numUnitsFW] * 3: cell final non-zero output for backward RNN [batchSize x numUnitsBW] */ - @Namespace("nd4j::ops") public static class dynamic_bidirectional_rnn extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dynamic_bidirectional_rnn extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dynamic_bidirectional_rnn(Pointer p) { super(p); } @@ -16341,7 +16388,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #if NOT_EXCLUDED(OP_clipbyvalue) - @Namespace("nd4j::ops") public static class clipbyvalue extends DeclarableOp { + @Namespace("sd::ops") public static class clipbyvalue extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public clipbyvalue(Pointer p) { super(p); } @@ -16359,7 +16406,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_clipbynorm) - @Namespace("nd4j::ops") public static class clipbynorm extends DeclarableOp { + @Namespace("sd::ops") public static class clipbynorm extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public clipbynorm(Pointer p) { super(p); } @@ -16374,7 +16421,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class clipbynorm_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class clipbynorm_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public clipbynorm_bp(Pointer p) { super(p); } @@ -16392,7 +16439,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_clipbyavgnorm) - @Namespace("nd4j::ops") public static class clipbyavgnorm extends DeclarableOp { + @Namespace("sd::ops") public static class clipbyavgnorm extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public clipbyavgnorm(Pointer p) { super(p); } @@ -16410,7 +16457,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_cumsum) - @Namespace("nd4j::ops") public static class cumsum extends DeclarableOp { + @Namespace("sd::ops") public static class cumsum extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cumsum(Pointer p) { super(p); } @@ -16428,7 +16475,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_cumprod) - @Namespace("nd4j::ops") public static class cumprod extends DeclarableOp { + @Namespace("sd::ops") public static class cumprod extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public cumprod(Pointer p) { super(p); } @@ -16446,7 +16493,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_tile) - @Namespace("nd4j::ops") public static class tile extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tile extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tile(Pointer p) { super(p); } @@ -16461,7 +16508,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class tile_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tile_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tile_bp(Pointer p) { super(p); } @@ -16479,7 +16526,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_repeat) - @Namespace("nd4j::ops") public static class repeat extends DeclarableCustomOp { + @Namespace("sd::ops") public static class repeat extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public repeat(Pointer p) { super(p); } @@ -16497,7 +16544,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_invert_permutation) - @Namespace("nd4j::ops") public static class invert_permutation extends DeclarableOp { + @Namespace("sd::ops") public static class invert_permutation extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public invert_permutation(Pointer p) { super(p); } @@ -16514,7 +16561,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif - @Namespace("nd4j::ops") public static class concat extends DeclarableCustomOp { + @Namespace("sd::ops") public static class concat extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public concat(Pointer p) { super(p); } @@ -16529,7 +16576,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class concat_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class concat_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public concat_bp(Pointer p) { super(p); } @@ -16546,7 +16593,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #if NOT_EXCLUDED(OP_mergemax) - @Namespace("nd4j::ops") public static class mergemax extends DeclarableOp { + @Namespace("sd::ops") public static class mergemax extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public mergemax(Pointer p) { super(p); } @@ -16570,7 +16617,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * INT_ARG: result type (one of int), INT32 by default */ // #if NOT_EXCLUDED(OP_mergemaxindex) - @Namespace("nd4j::ops") public static class mergemaxindex extends DeclarableCustomOp { + @Namespace("sd::ops") public static class mergemaxindex extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public mergemaxindex(Pointer p) { super(p); } @@ -16588,7 +16635,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_mergeadd) - @Namespace("nd4j::ops") public static class mergeadd extends DeclarableOp { + @Namespace("sd::ops") public static class mergeadd extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public mergeadd(Pointer p) { super(p); } @@ -16606,7 +16653,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_mergeavg) - @Namespace("nd4j::ops") public static class mergeavg extends DeclarableOp { + @Namespace("sd::ops") public static class mergeavg extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public mergeavg(Pointer p) { super(p); } @@ -16624,7 +16671,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_scatter_update) - @Namespace("nd4j::ops") public static class scatter_update extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_update extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_update(Pointer p) { super(p); } @@ -16642,7 +16689,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_Floor) - @Namespace("nd4j::ops") public static class Floor extends DeclarableOp { + @Namespace("sd::ops") public static class Floor extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public Floor(Pointer p) { super(p); } @@ -16660,7 +16707,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_Log1p) - @Namespace("nd4j::ops") public static class Log1p extends DeclarableOp { + @Namespace("sd::ops") public static class Log1p extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Log1p(Pointer p) { super(p); } @@ -16678,7 +16725,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_reverse) - @Namespace("nd4j::ops") public static class reverse extends DeclarableOp { + @Namespace("sd::ops") public static class reverse extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reverse(Pointer p) { super(p); } @@ -16693,7 +16740,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class reverse_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reverse_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reverse_bp(Pointer p) { super(p); } @@ -16711,7 +16758,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_gather) - @Namespace("nd4j::ops") public static class gather extends DeclarableCustomOp { + @Namespace("sd::ops") public static class gather extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public gather(Pointer p) { super(p); } @@ -16729,7 +16776,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_pad) - @Namespace("nd4j::ops") public static class pad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class pad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public pad(Pointer p) { super(p); } @@ -16762,7 +16809,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * IArgs[3,4,...] - optional, shape of batch, output matrix will have leading batch dimensions of this shape */ // #if NOT_EXCLUDED(OP_eye) - @Namespace("nd4j::ops") public static class eye extends DeclarableCustomOp { + @Namespace("sd::ops") public static class eye extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public eye(Pointer p) { super(p); } @@ -16780,7 +16827,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_gather_nd) - @Namespace("nd4j::ops") public static class gather_nd extends DeclarableCustomOp { + @Namespace("sd::ops") public static class gather_nd extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public gather_nd(Pointer p) { super(p); } @@ -16798,7 +16845,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_reverse_sequence) - @Namespace("nd4j::ops") public static class reverse_sequence extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reverse_sequence extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public reverse_sequence(Pointer p) { super(p); } @@ -16816,7 +16863,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_trace) - @Namespace("nd4j::ops") public static class trace extends DeclarableCustomOp { + @Namespace("sd::ops") public static class trace extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public trace(Pointer p) { super(p); } @@ -16834,7 +16881,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_random_shuffle) - @Namespace("nd4j::ops") public static class random_shuffle extends DeclarableOp { + @Namespace("sd::ops") public static class random_shuffle extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public random_shuffle(Pointer p) { super(p); } @@ -16864,7 +16911,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * and global_norm as scalar tensor at the end */ // #if NOT_EXCLUDED(OP_clip_by_global_norm) - @Namespace("nd4j::ops") public static class clip_by_global_norm extends DeclarableCustomOp { + @Namespace("sd::ops") public static class clip_by_global_norm extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public clip_by_global_norm(Pointer p) { super(p); } @@ -16881,7 +16928,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif - @Namespace("nd4j::ops") public static class tri extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tri extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public tri(Pointer p) { super(p); } @@ -16897,7 +16944,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class triu extends DeclarableCustomOp { + @Namespace("sd::ops") public static class triu extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public triu(Pointer p) { super(p); } @@ -16913,7 +16960,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class triu_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class triu_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public triu_bp(Pointer p) { super(p); } @@ -16930,7 +16977,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #if NOT_EXCLUDED(OP_mirror_pad) - @Namespace("nd4j::ops") public static class mirror_pad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class mirror_pad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public mirror_pad(Pointer p) { super(p); } @@ -16948,7 +16995,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_cumsum) - @Namespace("nd4j::ops") public static class cumsum_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class cumsum_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public cumsum_bp(Pointer p) { super(p); } @@ -16966,7 +17013,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_cumprod) - @Namespace("nd4j::ops") public static class cumprod_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class cumprod_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cumprod_bp(Pointer p) { super(p); } @@ -16985,7 +17032,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #if NOT_EXCLUDED(OP_flatten) - @Namespace("nd4j::ops") public static class flatten extends DeclarableCustomOp { + @Namespace("sd::ops") public static class flatten extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public flatten(Pointer p) { super(p); } @@ -17014,7 +17061,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * nbins (optional) - number of histogram bins, default value is 100 */ // #if NOT_EXCLUDED(OP_histogram_fixed_width) - @Namespace("nd4j::ops") public static class histogram_fixed_width extends DeclarableCustomOp { + @Namespace("sd::ops") public static class histogram_fixed_width extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public histogram_fixed_width(Pointer p) { super(p); } @@ -17038,7 +17085,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_standardize) - @Namespace("nd4j::ops") public static class standardize extends DeclarableOp { + @Namespace("sd::ops") public static class standardize extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public standardize(Pointer p) { super(p); } @@ -17053,7 +17100,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class standardize_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class standardize_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public standardize_bp(Pointer p) { super(p); } @@ -17074,7 +17121,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation calculates hash code, optionally along dimension */ // #if NOT_EXCLUDED(OP_hashcode) - @Namespace("nd4j::ops") public static class hashcode extends DeclarableCustomOp { + @Namespace("sd::ops") public static class hashcode extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public hashcode(Pointer p) { super(p); } @@ -17095,7 +17142,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation calculates number of entries per bin */ // #if NOT_EXCLUDED(OP_histogram) - @Namespace("nd4j::ops") public static class histogram extends DeclarableCustomOp { + @Namespace("sd::ops") public static class histogram extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public histogram(Pointer p) { super(p); } @@ -17153,7 +17200,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: optional axis */ // #if NOT_EXCLUDED(OP_argmax) - @Namespace("nd4j::ops") public static class argmax extends DeclarableCustomOp { + @Namespace("sd::ops") public static class argmax extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public argmax(Pointer p) { super(p); } @@ -17180,7 +17227,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: optional axis */ // #if NOT_EXCLUDED(OP_argmin) - @Namespace("nd4j::ops") public static class argmin extends DeclarableCustomOp { + @Namespace("sd::ops") public static class argmin extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public argmin(Pointer p) { super(p); } @@ -17218,7 +17265,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: p for p-norm */ // #if NOT_EXCLUDED(OP_norm) - @Namespace("nd4j::ops") public static class norm extends DeclarableReductionOp { + @Namespace("sd::ops") public static class norm extends DeclarableReductionOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public norm(Pointer p) { super(p); } @@ -17250,7 +17297,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: has the same shape as input, corresponding diagonal elements are substituted */ // #if NOT_EXCLUDED(OP_matrix_set_diag) - @Namespace("nd4j::ops") public static class matrix_set_diag extends DeclarableOp { + @Namespace("sd::ops") public static class matrix_set_diag extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public matrix_set_diag(Pointer p) { super(p); } @@ -17278,7 +17325,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Output array: * 0: is considered as batch of matrices, if for example diagonal array has shape [A,B,C] then output array has shape [A,B,C,C] */ - @Namespace("nd4j::ops") public static class matrix_diag extends DeclarableCustomOp { + @Namespace("sd::ops") public static class matrix_diag extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public matrix_diag(Pointer p) { super(p); } @@ -17311,7 +17358,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Three input and one output arrays must have the same shape */ // #if NOT_EXCLUDED(OP_betainc) - @Namespace("nd4j::ops") public static class betainc extends DeclarableOp { + @Namespace("sd::ops") public static class betainc extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public betainc(Pointer p) { super(p); } @@ -17336,7 +17383,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: bias vector */ // #if NOT_EXCLUDED(OP_biasadd) - @Namespace("nd4j::ops") public static class biasadd extends DeclarableCustomOp { + @Namespace("sd::ops") public static class biasadd extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public biasadd(Pointer p) { super(p); } @@ -17351,7 +17398,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class biasadd_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class biasadd_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public biasadd_bp(Pointer p) { super(p); } @@ -17372,7 +17419,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns a diagonal tensor with a given diagonal values. Given a diagonal, this operation returns a tensor with the diagonal and everything else padded with zeros. */ // #if NOT_EXCLUDED(OP_diag) - @Namespace("nd4j::ops") public static class diag extends DeclarableCustomOp { + @Namespace("sd::ops") public static class diag extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public diag(Pointer p) { super(p); } @@ -17393,7 +17440,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Returns a diagonal tensor with a given diagonal values. Given a diagonal, this operation returns a tensor with the diagonal and everything else padded with zeros. */ // #if NOT_EXCLUDED(OP_diag_part) - @Namespace("nd4j::ops") public static class diag_part extends DeclarableCustomOp { + @Namespace("sd::ops") public static class diag_part extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public diag_part(Pointer p) { super(p); } @@ -17419,7 +17466,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Output: tensor with rank lesser by 1 from input */ // #if NOT_EXCLUDED(OP_matrix_diag_part) - @Namespace("nd4j::ops") public static class matrix_diag_part extends DeclarableCustomOp { + @Namespace("sd::ops") public static class matrix_diag_part extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public matrix_diag_part(Pointer p) { super(p); } @@ -17448,7 +17495,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1 - float tensor with shape {.,..,...,NxN} - batch of upper triangular matricies {Rs} */ // #if NOT_EXCLUDED(OP_qr) - @Namespace("nd4j::ops") public static class qr extends DeclarableCustomOp { + @Namespace("sd::ops") public static class qr extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public qr(Pointer p) { super(p); } @@ -17472,7 +17519,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: vector with values to exclude */ // #if NOT_EXCLUDED(OP_listdiff) - @Namespace("nd4j::ops") public static class listdiff extends DeclarableCustomOp { + @Namespace("sd::ops") public static class listdiff extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public listdiff(Pointer p) { super(p); } @@ -17497,7 +17544,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_add) - @Namespace("nd4j::ops") public static class scatter_add extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_add extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_add(Pointer p) { super(p); } @@ -17522,7 +17569,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_sub) - @Namespace("nd4j::ops") public static class scatter_sub extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_sub extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_sub(Pointer p) { super(p); } @@ -17547,7 +17594,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_mul) - @Namespace("nd4j::ops") public static class scatter_mul extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_mul extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public scatter_mul(Pointer p) { super(p); } @@ -17572,7 +17619,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_div) - @Namespace("nd4j::ops") public static class scatter_div extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_div extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_div(Pointer p) { super(p); } @@ -17597,7 +17644,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_upd) - @Namespace("nd4j::ops") public static class scatter_upd extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_upd extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_upd(Pointer p) { super(p); } @@ -17622,7 +17669,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_max) - @Namespace("nd4j::ops") public static class scatter_max extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_max extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_max(Pointer p) { super(p); } @@ -17647,7 +17694,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_min) - @Namespace("nd4j::ops") public static class scatter_min extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_min extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public scatter_min(Pointer p) { super(p); } @@ -17672,7 +17719,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * shape: contains shape of output array */ // #if NOT_EXCLUDED(OP_scatter_nd) - @Namespace("nd4j::ops") public static class scatter_nd extends DeclarableCustomOp { + @Namespace("sd::ops") public static class scatter_nd extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_nd(Pointer p) { super(p); } @@ -17697,7 +17744,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be inserted into input array */ // #if NOT_EXCLUDED(OP_scatter_nd_update) - @Namespace("nd4j::ops") public static class scatter_nd_update extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_nd_update extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_nd_update(Pointer p) { super(p); } @@ -17722,7 +17769,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_add) - @Namespace("nd4j::ops") public static class scatter_nd_add extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_nd_add extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public scatter_nd_add(Pointer p) { super(p); } @@ -17747,7 +17794,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * updates: array containing elements to be interfered with input */ // #if NOT_EXCLUDED(OP_scatter_sub) - @Namespace("nd4j::ops") public static class scatter_nd_sub extends DeclarableOp { + @Namespace("sd::ops") public static class scatter_nd_sub extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public scatter_nd_sub(Pointer p) { super(p); } @@ -17773,7 +17820,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: scalar value, used to fill NDArray */ // #if NOT_EXCLUDED(OP_fill_as) - @Namespace("nd4j::ops") public static class fill_as extends DeclarableOp { + @Namespace("sd::ops") public static class fill_as extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public fill_as(Pointer p) { super(p); } @@ -17794,7 +17841,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation applies element-wise rint (round to integral value) operation */ // #if NOT_EXCLUDED(OP_rint) - @Namespace("nd4j::ops") public static class rint extends DeclarableOp { + @Namespace("sd::ops") public static class rint extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public rint(Pointer p) { super(p); } @@ -17817,7 +17864,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * input: N-dimensional array */ // #if NOT_EXCLUDED(OP_unique) - @Namespace("nd4j::ops") public static class unique extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unique extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unique(Pointer p) { super(p); } @@ -17845,7 +17892,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 2 - 1D array with counts for values in array above */ // #if NOT_EXCLUDED(OP_unique_with_counts) - @Namespace("nd4j::ops") public static class unique_with_counts extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unique_with_counts extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public unique_with_counts(Pointer p) { super(p); } @@ -17871,7 +17918,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0..: TAD axis */ // #if NOT_EXCLUDED(OP_tear) - @Namespace("nd4j::ops") public static class tear extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tear extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tear(Pointer p) { super(p); } @@ -17893,7 +17940,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_unstack) - @Namespace("nd4j::ops") public static class unstack extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unstack extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unstack(Pointer p) { super(p); } @@ -17914,7 +17961,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation extracts a strided (optionally) slice from a tensor, */ // #if NOT_EXCLUDED(OP_strided_slice) - @Namespace("nd4j::ops") public static class strided_slice extends DeclarableCustomOp { + @Namespace("sd::ops") public static class strided_slice extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public strided_slice(Pointer p) { super(p); } @@ -17929,7 +17976,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } // TODO: new op type needed. that returns VIEW - @Namespace("nd4j::ops") public static class strided_slice_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class strided_slice_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public strided_slice_bp(Pointer p) { super(p); } @@ -17951,7 +17998,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_slice) - @Namespace("nd4j::ops") public static class slice extends DeclarableCustomOp { + @Namespace("sd::ops") public static class slice extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public slice(Pointer p) { super(p); } @@ -17966,7 +18013,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class slice_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class slice_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public slice_bp(Pointer p) { super(p); } @@ -18001,7 +18048,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: optional scalar witn step value */ // #if NOT_EXCLUDED(OP_range) - @Namespace("nd4j::ops") public static class range extends DeclarableCustomOp { + @Namespace("sd::ops") public static class range extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public range(Pointer p) { super(p); } @@ -18032,7 +18079,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: axis */ // #if NOT_EXCLUDED(OP_onehot) - @Namespace("nd4j::ops") public static class onehot extends DeclarableCustomOp { + @Namespace("sd::ops") public static class onehot extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public onehot(Pointer p) { super(p); } @@ -18063,7 +18110,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_confusion_matrix) - @Namespace("nd4j::ops") public static class confusion_matrix extends DeclarableCustomOp { + @Namespace("sd::ops") public static class confusion_matrix extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public confusion_matrix(Pointer p) { super(p); } @@ -18087,7 +18134,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_stack) - @Namespace("nd4j::ops") public static class stack extends DeclarableCustomOp { + @Namespace("sd::ops") public static class stack extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public stack(Pointer p) { super(p); } @@ -18112,7 +18159,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * TODO: make this operation reduction, to allow TAD -> size */ // #if NOT_EXCLUDED(OP_size) - @Namespace("nd4j::ops") public static class size extends DeclarableCustomOp { + @Namespace("sd::ops") public static class size extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public size(Pointer p) { super(p); } @@ -18134,7 +18181,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This operation returns rank of input array as scalar value. */ // #if NOT_EXCLUDED(OP_rank) - @Namespace("nd4j::ops") public static class rank extends DeclarableCustomOp { + @Namespace("sd::ops") public static class rank extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public rank(Pointer p) { super(p); } @@ -18153,7 +18200,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #if NOT_EXCLUDED(OP_broadcastgradientargs) - @Namespace("nd4j::ops") public static class broadcastgradientargs extends DeclarableOp { + @Namespace("sd::ops") public static class broadcastgradientargs extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public broadcastgradientargs(Pointer p) { super(p); } @@ -18177,7 +18224,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_zeros_as) - @Namespace("nd4j::ops") public static class zeros_as extends DeclarableCustomOp { + @Namespace("sd::ops") public static class zeros_as extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public zeros_as(Pointer p) { super(p); } @@ -18201,7 +18248,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_ones_as) - @Namespace("nd4j::ops") public static class ones_as extends DeclarableCustomOp { + @Namespace("sd::ops") public static class ones_as extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ones_as(Pointer p) { super(p); } @@ -18224,7 +18271,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * input: N-Dimensional array */ // #if NOT_EXCLUDED(OP_square) - @Namespace("nd4j::ops") public static class square extends DeclarableOp { + @Namespace("sd::ops") public static class square extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public square(Pointer p) { super(p); } @@ -18255,7 +18302,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Two input and one output arrays must have the same shape */ // #if NOT_EXCLUDED(OP_zeta) - @Namespace("nd4j::ops") public static class zeta extends DeclarableOp { + @Namespace("sd::ops") public static class zeta extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public zeta(Pointer p) { super(p); } @@ -18286,7 +18333,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Two input and one output arrays have the same shape */ // #if NOT_EXCLUDED(OP_polygamma) - @Namespace("nd4j::ops") public static class polygamma extends DeclarableOp { + @Namespace("sd::ops") public static class polygamma extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public polygamma(Pointer p) { super(p); } @@ -18314,7 +18361,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_lgamma) - @Namespace("nd4j::ops") public static class lgamma extends DeclarableOp { + @Namespace("sd::ops") public static class lgamma extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lgamma(Pointer p) { super(p); } @@ -18342,7 +18389,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_digamma) - @Namespace("nd4j::ops") public static class digamma extends DeclarableOp { + @Namespace("sd::ops") public static class digamma extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public digamma(Pointer p) { super(p); } @@ -18370,7 +18417,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_fill) - @Namespace("nd4j::ops") public static class fill extends DeclarableCustomOp { + @Namespace("sd::ops") public static class fill extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public fill(Pointer p) { super(p); } @@ -18399,7 +18446,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_split_v) - @Namespace("nd4j::ops") public static class split_v extends DeclarableCustomOp { + @Namespace("sd::ops") public static class split_v extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public split_v(Pointer p) { super(p); } @@ -18426,7 +18473,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1 - optional axis */ // #if NOT_EXCLUDED(OP_split) - @Namespace("nd4j::ops") public static class split extends DeclarableCustomOp { + @Namespace("sd::ops") public static class split extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public split(Pointer p) { super(p); } @@ -18457,7 +18504,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - optional argument, corresponds to dimension with 3 channels */ // #if NOT_EXCLUDED(OP_adjust_hue) - @Namespace("nd4j::ops") public static class adjust_hue extends DeclarableOp { + @Namespace("sd::ops") public static class adjust_hue extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public adjust_hue(Pointer p) { super(p); } @@ -18487,7 +18534,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - optional argument, corresponds to dimension with 3 channels */ // #if NOT_EXCLUDED(OP_adjust_saturation) - @Namespace("nd4j::ops") public static class adjust_saturation extends DeclarableOp { + @Namespace("sd::ops") public static class adjust_saturation extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public adjust_saturation(Pointer p) { super(p); } @@ -18515,7 +18562,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_adjust_contrast) - @Namespace("nd4j::ops") public static class adjust_contrast extends DeclarableOp { + @Namespace("sd::ops") public static class adjust_contrast extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public adjust_contrast(Pointer p) { super(p); } @@ -18530,7 +18577,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class adjust_contrast_v2 extends DeclarableOp { + @Namespace("sd::ops") public static class adjust_contrast_v2 extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public adjust_contrast_v2(Pointer p) { super(p); } @@ -18568,7 +18615,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * optional (default 0) */ // #if NOT_EXCLUDED(OP_depth_to_space) - @Namespace("nd4j::ops") public static class depth_to_space extends DeclarableCustomOp { + @Namespace("sd::ops") public static class depth_to_space extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public depth_to_space(Pointer p) { super(p); } @@ -18604,7 +18651,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_space_to_depth) - @Namespace("nd4j::ops") public static class space_to_depth extends DeclarableCustomOp { + @Namespace("sd::ops") public static class space_to_depth extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public space_to_depth(Pointer p) { super(p); } @@ -18628,7 +18675,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1 - vector or tensor B */ // #if NOT_EXCLUDED(OP_cross) - @Namespace("nd4j::ops") public static class cross extends DeclarableOp { + @Namespace("sd::ops") public static class cross extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cross(Pointer p) { super(p); } @@ -18663,7 +18710,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_space_to_batch) - @Namespace("nd4j::ops") public static class space_to_batch extends DeclarableCustomOp { + @Namespace("sd::ops") public static class space_to_batch extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public space_to_batch(Pointer p) { super(p); } @@ -18697,7 +18744,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * * */ // #if NOT_EXCLUDED(OP_space_to_batch_nd) - @Namespace("nd4j::ops") public static class space_to_batch_nd extends DeclarableCustomOp { + @Namespace("sd::ops") public static class space_to_batch_nd extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public space_to_batch_nd(Pointer p) { super(p); } @@ -18719,7 +18766,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_batch_to_space) - @Namespace("nd4j::ops") public static class batch_to_space extends DeclarableCustomOp { + @Namespace("sd::ops") public static class batch_to_space extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public batch_to_space(Pointer p) { super(p); } @@ -18736,7 +18783,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_batch_to_space_nd) - @Namespace("nd4j::ops") public static class batch_to_space_nd extends DeclarableCustomOp { + @Namespace("sd::ops") public static class batch_to_space_nd extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public batch_to_space_nd(Pointer p) { super(p); } @@ -18763,7 +18810,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * The third is boolean value(default is true) (0 - as is, 1 - sorted by value) optional */ // #if NOT_EXCLUDED(OP_top_k) - @Namespace("nd4j::ops") public static class top_k extends DeclarableCustomOp { + @Namespace("sd::ops") public static class top_k extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public top_k(Pointer p) { super(p); } @@ -18788,7 +18835,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * The third is k */ // #if NOT_EXCLUDED(OP_in_top_k) - @Namespace("nd4j::ops") public static class in_top_k extends DeclarableCustomOp { + @Namespace("sd::ops") public static class in_top_k extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public in_top_k(Pointer p) { super(p); } @@ -18815,7 +18862,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * the optional flag "keep_dims" can be set as T param */ // #if NOT_EXCLUDED(OP_moments) - @Namespace("nd4j::ops") public static class moments extends DeclarableCustomOp { + @Namespace("sd::ops") public static class moments extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public moments(Pointer p) { super(p); } @@ -18837,7 +18884,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * accordingly to index array given. */ // #if NOT_EXCLUDED(OP_embedding_lookup) - @Namespace("nd4j::ops") public static class embedding_lookup extends DeclarableCustomOp { + @Namespace("sd::ops") public static class embedding_lookup extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public embedding_lookup(Pointer p) { super(p); } @@ -18865,7 +18912,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * returns a num of NDArrays as output */ // #if NOT_EXCLUDED(OP_dynamic_partition) - @Namespace("nd4j::ops") public static class dynamic_partition extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dynamic_partition extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dynamic_partition(Pointer p) { super(p); } @@ -18883,7 +18930,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_dynamic_partition_bp) - @Namespace("nd4j::ops") public static class dynamic_partition_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dynamic_partition_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public dynamic_partition_bp(Pointer p) { super(p); } @@ -18912,7 +18959,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * the operation is inversion od dynamic_partition */ // #if NOT_EXCLUDED(OP_dynamic_stitch) - @Namespace("nd4j::ops") public static class dynamic_stitch extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dynamic_stitch extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dynamic_stitch(Pointer p) { super(p); } @@ -18937,7 +18984,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output value - a real number with given type (e.g. float or double) */ // #if NOT_EXCLUDED(OP_zero_fraction) - @Namespace("nd4j::ops") public static class zero_fraction extends DeclarableCustomOp { + @Namespace("sd::ops") public static class zero_fraction extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public zero_fraction(Pointer p) { super(p); } @@ -18965,7 +19012,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output value - 2D matrix NxN as multiply of matrixes and add vector */ // #if NOT_EXCLUDED(OP_xw_plus_b) - @Namespace("nd4j::ops") public static class xw_plus_b extends DeclarableCustomOp { + @Namespace("sd::ops") public static class xw_plus_b extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public xw_plus_b(Pointer p) { super(p); } @@ -18988,7 +19035,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Input - NDArray, output - NDArray with the same shape. */ // #if NOT_EXCLUDED(OP_stop_gradient) - @Namespace("nd4j::ops") public static class stop_gradient extends DeclarableOp { + @Namespace("sd::ops") public static class stop_gradient extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public stop_gradient(Pointer p) { super(p); } @@ -19006,7 +19053,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_parallel_stack) - @Namespace("nd4j::ops") public static class parallel_stack extends DeclarableCustomOp { + @Namespace("sd::ops") public static class parallel_stack extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public parallel_stack(Pointer p) { super(p); } @@ -19036,7 +19083,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * returns a normalized pair mean and variance with the same shapes as input */ // #if NOT_EXCLUDED(OP_normalize_moments) - @Namespace("nd4j::ops") public static class normalize_moments extends DeclarableCustomOp { + @Namespace("sd::ops") public static class normalize_moments extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public normalize_moments(Pointer p) { super(p); } @@ -19072,7 +19119,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * - shift (if was given by input floating param) */ // #if NOT_EXCLUDED(OP_sufficient_statistics) - @Namespace("nd4j::ops") public static class sufficient_statistics extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sufficient_statistics extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public sufficient_statistics(Pointer p) { super(p); } @@ -19099,7 +19146,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * return value - a tensor with the same shape as target or input */ // #if NOT_EXCLUDED(OP_weighted_cross_entropy_with_logits) - @Namespace("nd4j::ops") public static class weighted_cross_entropy_with_logits extends DeclarableOp { + @Namespace("sd::ops") public static class weighted_cross_entropy_with_logits extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public weighted_cross_entropy_with_logits(Pointer p) { super(p); } @@ -19127,7 +19174,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * return value - a tensor with the same shape as target or input */ // #if NOT_EXCLUDED(OP_dropout) - @Namespace("nd4j::ops") public static class dropout extends DeclarableOp { + @Namespace("sd::ops") public static class dropout extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dropout(Pointer p) { super(p); } @@ -19144,7 +19191,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_dropout_bp) - @Namespace("nd4j::ops") public static class dropout_bp extends DeclarableOp { + @Namespace("sd::ops") public static class dropout_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dropout_bp(Pointer p) { super(p); } @@ -19169,7 +19216,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); 3 - beta value */ // #if NOT_EXCLUDED(OP_alpha_dropout_bp) - @Namespace("nd4j::ops") public static class alpha_dropout_bp extends DeclarableOp { + @Namespace("sd::ops") public static class alpha_dropout_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public alpha_dropout_bp(Pointer p) { super(p); } @@ -19205,7 +19252,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_bincount) - @Namespace("nd4j::ops") public static class bincount extends DeclarableCustomOp { + @Namespace("sd::ops") public static class bincount extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public bincount(Pointer p) { super(p); } @@ -19233,7 +19280,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * vector with broadcasted shape */ // #if NOT_EXCLUDED(OP_broadcast_dynamic_shape) - @Namespace("nd4j::ops") public static class broadcast_dynamic_shape extends DeclarableCustomOp { + @Namespace("sd::ops") public static class broadcast_dynamic_shape extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public broadcast_dynamic_shape(Pointer p) { super(p); } @@ -19261,7 +19308,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * M x M matricies */ // #if NOT_EXCLUDED(OP_matrix_determinant) - @Namespace("nd4j::ops") public static class matrix_determinant extends DeclarableCustomOp { + @Namespace("sd::ops") public static class matrix_determinant extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public matrix_determinant(Pointer p) { super(p); } @@ -19290,7 +19337,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_log_matrix_determinant) - @Namespace("nd4j::ops") public static class log_matrix_determinant extends DeclarableCustomOp { + @Namespace("sd::ops") public static class log_matrix_determinant extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public log_matrix_determinant(Pointer p) { super(p); } @@ -19319,7 +19366,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_logdet) - @Namespace("nd4j::ops") public static class logdet extends DeclarableCustomOp { + @Namespace("sd::ops") public static class logdet extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public logdet(Pointer p) { super(p); } @@ -19336,6 +19383,76 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif + /** + * matrix_solve_ls op (lstsq) - solves one or more linear least-squares problems. + * + * input params: + * 0 - the tensor with dimension (x * y * z * ::: * M * N) - left parts of equations + * 1 - the tensor with dimension (x * y * z * ::: * M * K) - right parts of equations + * + * float args: + * 0 - l2_regularizer (default 0. and only for 0 implemented) + * + * boolean args: + * 0 - fast - default is true (optional) - use Cholesky decomposition instead QR decomposition of matricies. + * + * return value: + * tensor with dimension (x * y * z * ::: * N * K) with solutions + * + */ +// #if NOT_EXCLUDED(OP_lstsq) + @Namespace("sd::ops") public static class lstsq extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public lstsq(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ + public lstsq(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public lstsq position(long position) { + return (lstsq)super.position(position); + } + + public lstsq() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + + /* solve_ls - analog of lstsq op with another solution approach + * + * input params: + * 0 - the tensor with dimension (x * y * z * ::: * M * N) - left parts of equations + * 1 - the tensor with dimension (x * y * z * ::: * M * K) - right parts of equations + * + * float args: + * 0 - l2_regularizer (default 0. and only for 0 implemented) + * + * boolean args: + * 0 - fast - default is true (optional) - use Cholesky decomposition instead QR decomposition of matricies. + * + * return value: + * tensor with dimension (x * y * z * ::: * N * K) with solutions + * + * Note: if fast is false - then l2_regularizer arg is ignored and used lstsq method due QR decomposition + * */ +// #if NOT_EXCLUDED(OP_solve_ls) + @Namespace("sd::ops") public static class solve_ls extends DeclarableCustomOp { + static { Loader.load(); } + /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ + public solve_ls(Pointer p) { super(p); } + /** Native array allocator. Access with {@link Pointer#position(long)}. */ + public solve_ls(long size) { super((Pointer)null); allocateArray(size); } + private native void allocateArray(long size); + @Override public solve_ls position(long position) { + return (solve_ls)super.position(position); + } + + public solve_ls() { super((Pointer)null); allocate(); } + private native void allocate(); + public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); + } +// #endif + /** * matrix_inverse op. 
- make inverse for all 2D square matricies found in the input tensor * @@ -19346,7 +19463,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with dimension (x * y * z * ::: * M * M) with inverse M x M matricies in it */ // #if NOT_EXCLUDED(OP_matrix_inverse) - @Namespace("nd4j::ops") public static class matrix_inverse extends DeclarableOp { + @Namespace("sd::ops") public static class matrix_inverse extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public matrix_inverse(Pointer p) { super(p); } @@ -19379,7 +19496,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_triangular_solve) - @Namespace("nd4j::ops") public static class triangular_solve extends DeclarableCustomOp { + @Namespace("sd::ops") public static class triangular_solve extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public triangular_solve(Pointer p) { super(p); } @@ -19411,7 +19528,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_solve) - @Namespace("nd4j::ops") public static class solve extends DeclarableCustomOp { + @Namespace("sd::ops") public static class solve extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public solve(Pointer p) { super(p); } @@ -19443,7 +19560,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_matrix_inverse) - @Namespace("nd4j::ops") public static class lu extends DeclarableCustomOp { + @Namespace("sd::ops") public static class lu extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public lu(Pointer p) { super(p); } @@ -19471,7 +19588,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * (N+1)D tensor filled by 0 and 1 accordingly the mask */ // #if NOT_EXCLUDED(OP_sequence_mask) - @Namespace("nd4j::ops") public static class sequence_mask extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sequence_mask extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sequence_mask(Pointer p) { super(p); } @@ -19499,7 +19616,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_segment_max) - @Namespace("nd4j::ops") public static class segment_max extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_max extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_max(Pointer p) { super(p); } @@ -19516,7 +19633,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_segment_max_bp) - @Namespace("nd4j::ops") public static class segment_max_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_max_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_max_bp(Pointer p) { super(p); } @@ -19544,7 +19661,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with min values according to indices sets. */ // #if NOT_EXCLUDED(OP_segment_min) - @Namespace("nd4j::ops") public static class segment_min extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_min extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public segment_min(Pointer p) { super(p); } @@ -19561,7 +19678,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_segment_min_bp) - @Namespace("nd4j::ops") public static class segment_min_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_min_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_min_bp(Pointer p) { super(p); } @@ -19589,7 +19706,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with sum of values according to indices sets. */ // #if NOT_EXCLUDED(OP_segment_sum) - @Namespace("nd4j::ops") public static class segment_sum extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_sum extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_sum(Pointer p) { super(p); } @@ -19606,7 +19723,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_segment_sum_bp) - @Namespace("nd4j::ops") public static class segment_sum_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_sum_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_sum_bp(Pointer p) { super(p); } @@ -19634,7 +19751,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with product of values according to indices sets. */ // #if NOT_EXCLUDED(OP_segment_prod) - @Namespace("nd4j::ops") public static class segment_prod extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_prod extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public segment_prod(Pointer p) { super(p); } @@ -19651,7 +19768,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_segment_prod_bp) - @Namespace("nd4j::ops") public static class segment_prod_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_prod_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_prod_bp(Pointer p) { super(p); } @@ -19678,7 +19795,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with average of values according to indices sets. */ // #if NOT_EXCLUDED(OP_segment_mean) - @Namespace("nd4j::ops") public static class segment_mean extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_mean extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_mean(Pointer p) { super(p); } @@ -19695,7 +19812,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_segment_mean_bp) - @Namespace("nd4j::ops") public static class segment_mean_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class segment_mean_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public segment_mean_bp(Pointer p) { super(p); } @@ -19723,7 +19840,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with max values according to indices sets. */ // #if NOT_EXCLUDED(OP_unsorted_segment_max) - @Namespace("nd4j::ops") public static class unsorted_segment_max extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_max extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public unsorted_segment_max(Pointer p) { super(p); } @@ -19740,7 +19857,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_unsorted_segment_max_bp) - @Namespace("nd4j::ops") public static class unsorted_segment_max_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_max_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_max_bp(Pointer p) { super(p); } @@ -19771,7 +19888,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with min values according to indices sets. */ // #if NOT_EXCLUDED(OP_unsorted_segment_min_bp) - @Namespace("nd4j::ops") public static class unsorted_segment_min extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_min extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_min(Pointer p) { super(p); } @@ -19788,7 +19905,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_unsorted_segment_min_bp) - @Namespace("nd4j::ops") public static class unsorted_segment_min_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_min_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_min_bp(Pointer p) { super(p); } @@ -19819,7 +19936,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with sum of values according to indices sets. 
*/ // #if NOT_EXCLUDED(OP_unsorted_segment_sum) - @Namespace("nd4j::ops") public static class unsorted_segment_sum extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_sum extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_sum(Pointer p) { super(p); } @@ -19836,7 +19953,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_unsorted_segment_sum_bp) - @Namespace("nd4j::ops") public static class unsorted_segment_sum_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_sum_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_sum_bp(Pointer p) { super(p); } @@ -19867,7 +19984,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with product of values according to indices sets. */ // #if NOT_EXCLUDED(OP_unsorted_segment_prod) - @Namespace("nd4j::ops") public static class unsorted_segment_prod extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_prod extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_prod(Pointer p) { super(p); } @@ -19884,7 +20001,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_unsorted_segment_prod_bp) - @Namespace("nd4j::ops") public static class unsorted_segment_prod_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_prod_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public unsorted_segment_prod_bp(Pointer p) { super(p); } @@ -19915,7 +20032,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with average of values according to indices sets. */ // #if NOT_EXCLUDED(OP_unsorted_segment_mean) - @Namespace("nd4j::ops") public static class unsorted_segment_mean extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_mean extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_mean(Pointer p) { super(p); } @@ -19932,7 +20049,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_unsorted_segment_mean_bp) - @Namespace("nd4j::ops") public static class unsorted_segment_mean_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_mean_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_mean_bp(Pointer p) { super(p); } @@ -19963,7 +20080,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * tensor with average of values according to indices sets. */ // #if NOT_EXCLUDED(OP_unsorted_segment_sqrt) - @Namespace("nd4j::ops") public static class unsorted_segment_sqrt_n extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_sqrt_n extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public unsorted_segment_sqrt_n(Pointer p) { super(p); } @@ -19980,7 +20097,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_unsorted_segment_sqrt_n_bp) - @Namespace("nd4j::ops") public static class unsorted_segment_sqrt_n_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class unsorted_segment_sqrt_n_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public unsorted_segment_sqrt_n_bp(Pointer p) { super(p); } @@ -20013,7 +20130,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 6 - padding_type - 0 - equiv 'VALID', 1 - 'SAME' */ // #if NOT_EXCLUDED(OP_extract_image_patches) - @Namespace("nd4j::ops") public static class extract_image_patches extends DeclarableCustomOp { + @Namespace("sd::ops") public static class extract_image_patches extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public extract_image_patches(Pointer p) { super(p); } @@ -20044,7 +20161,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - 4D tensor with same shape as images (input 0) */ // #if NOT_EXCLUDED(OP_draw_bounding_boxes) - @Namespace("nd4j::ops") public static class draw_bounding_boxes extends DeclarableOp { + @Namespace("sd::ops") public static class draw_bounding_boxes extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public draw_bounding_boxes(Pointer p) { super(p); } @@ -20080,7 +20197,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with the same shape as input. */ // #if NOT_EXCLUDED(OP_roll) - @Namespace("nd4j::ops") public static class roll extends DeclarableOp { + @Namespace("sd::ops") public static class roll extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public roll(Pointer p) { super(p); } @@ -20109,7 +20226,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - 1D NDArray with the same type as input and length as given with numOfElements param. */ // #if NOT_EXCLUDED(OP_lin_space) - @Namespace("nd4j::ops") public static class lin_space extends DeclarableCustomOp { + @Namespace("sd::ops") public static class lin_space extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lin_space(Pointer p) { super(p); } @@ -20147,7 +20264,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with reduces shape accordingly to axes (the scalar in default case). */ // #if NOT_EXCLUDED(OP_reduce_sum) - @Namespace("nd4j::ops") public static class reduce_sum extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_sum extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_sum(Pointer p) { super(p); } @@ -20165,7 +20282,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_reduce_sum_bp) - @Namespace("nd4j::ops") public static class reduce_sum_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_sum_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_sum_bp(Pointer p) { super(p); } @@ -20203,7 +20320,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with reduces shape accordingly to axes (the scalar in default case). 
*/ // #if NOT_EXCLUDED(OP_reduce_prod) - @Namespace("nd4j::ops") public static class reduce_prod extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_prod extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_prod(Pointer p) { super(p); } @@ -20221,7 +20338,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_reduce_prod_bp) - @Namespace("nd4j::ops") public static class reduce_prod_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_prod_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_prod_bp(Pointer p) { super(p); } @@ -20254,7 +20371,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * reduced tensor with calculated mins */ // #if NOT_EXCLUDED(OP_reduce_min) - @Namespace("nd4j::ops") public static class reduce_min extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_min extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_min(Pointer p) { super(p); } @@ -20271,7 +20388,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_reduce_min_bp) - @Namespace("nd4j::ops") public static class reduce_min_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_min_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public reduce_min_bp(Pointer p) { super(p); } @@ -20304,7 +20421,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * reduced tensor with calculated maxes */ // #if NOT_EXCLUDED(OP_reduce_max) - @Namespace("nd4j::ops") public static class reduce_max extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_max extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_max(Pointer p) { super(p); } @@ -20321,7 +20438,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_reduce_max_bp) - @Namespace("nd4j::ops") public static class reduce_max_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_max_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_max_bp(Pointer p) { super(p); } @@ -20354,7 +20471,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * reduced tensor with calculated norm1 */ // #if NOT_EXCLUDED(OP_reduce_norm1) - @Namespace("nd4j::ops") public static class reduce_norm1 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_norm1 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_norm1(Pointer p) { super(p); } @@ -20371,7 +20488,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_reduce_norm1_bp) - @Namespace("nd4j::ops") public static class reduce_norm1_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_norm1_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public reduce_norm1_bp(Pointer p) { super(p); } @@ -20404,7 +20521,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * reduced tensor with calculated norm2 */ // #if NOT_EXCLUDED(OP_reduce_norm2) - @Namespace("nd4j::ops") public static class reduce_norm2 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_norm2 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_norm2(Pointer p) { super(p); } @@ -20421,7 +20538,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_reduce_norm2_bp) - @Namespace("nd4j::ops") public static class reduce_norm2_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_norm2_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_norm2_bp(Pointer p) { super(p); } @@ -20455,7 +20572,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * reduced tensor with calculated norm */ // #if NOT_EXCLUDED(OP_reduce_sqnorm) - @Namespace("nd4j::ops") public static class reduce_sqnorm extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_sqnorm extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_sqnorm(Pointer p) { super(p); } @@ -20472,7 +20589,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_reduce_sqnorm_bp) - @Namespace("nd4j::ops") public static class reduce_sqnorm_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_sqnorm_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public reduce_sqnorm_bp(Pointer p) { super(p); } @@ -20505,7 +20622,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * reduced tensor with calculated norm */ // #if NOT_EXCLUDED(OP_reduce_norm_max) - @Namespace("nd4j::ops") public static class reduce_norm_max extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_norm_max extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_norm_max(Pointer p) { super(p); } @@ -20522,7 +20639,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_reduce_norm_max_bp) - @Namespace("nd4j::ops") public static class reduce_norm_max_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_norm_max_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_norm_max_bp(Pointer p) { super(p); } @@ -20555,7 +20672,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * reduced tensor with calculated means */ // #if NOT_EXCLUDED(OP_reduce_mean) - @Namespace("nd4j::ops") public static class reduce_mean extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_mean extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_mean(Pointer p) { super(p); } @@ -20573,7 +20690,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_reduce_mean_bp) - @Namespace("nd4j::ops") public static class reduce_mean_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_mean_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public reduce_mean_bp(Pointer p) { super(p); } @@ -20605,7 +20722,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output array: * reduced tensor with calculated means */ - @Namespace("nd4j::ops") public static class reduce_variance extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_variance extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_variance(Pointer p) { super(p); } @@ -20620,7 +20737,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class reduce_variance_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_variance_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_variance_bp(Pointer p) { super(p); } @@ -20651,7 +20768,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output array: * reduced tensor with calculated means */ - @Namespace("nd4j::ops") public static class reduce_stdev extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_stdev extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_stdev(Pointer p) { super(p); } @@ -20666,7 +20783,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class reduce_stdev_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_stdev_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_stdev_bp(Pointer p) { super(p); } @@ -20700,7 +20817,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_reduce_dot_bp) - @Namespace("nd4j::ops") public static class reduce_dot_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_dot_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_dot_bp(Pointer p) { super(p); } @@ -20739,7 +20856,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with reduces shape accordingly to axes (the scalar in default case). */ // #if NOT_EXCLUDED(OP_reduce_logsumexp) - @Namespace("nd4j::ops") public static class reduce_logsumexp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reduce_logsumexp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reduce_logsumexp(Pointer p) { super(p); } @@ -20775,7 +20892,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * the 4D-Tensor with resized to crop_size images given - float type */ // #if NOT_EXCLUDED(OP_crop_and_resize) - @Namespace("nd4j::ops") public static class crop_and_resize extends DeclarableCustomOp { + @Namespace("sd::ops") public static class crop_and_resize extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public crop_and_resize(Pointer p) { super(p); } @@ -20810,7 +20927,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_resize_bilinear) - @Namespace("nd4j::ops") public static class resize_bilinear extends DeclarableCustomOp { + @Namespace("sd::ops") public static class resize_bilinear extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. 
Invokes {@link Pointer#Pointer(Pointer)}. */ public resize_bilinear(Pointer p) { super(p); } @@ -20845,7 +20962,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_resize_nearest_neighbor) - @Namespace("nd4j::ops") public static class resize_nearest_neighbor extends DeclarableCustomOp { + @Namespace("sd::ops") public static class resize_nearest_neighbor extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public resize_nearest_neighbor(Pointer p) { super(p); } @@ -20874,7 +20991,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_resize_bicubic) - @Namespace("nd4j::ops") public static class resize_bicubic extends DeclarableCustomOp { + @Namespace("sd::ops") public static class resize_bicubic extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public resize_bicubic(Pointer p) { super(p); } @@ -20909,7 +21026,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * */ // #if NOT_EXCLUDED(OP_resize_area) - @Namespace("nd4j::ops") public static class resize_area extends DeclarableCustomOp { + @Namespace("sd::ops") public static class resize_area extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public resize_area(Pointer p) { super(p); } @@ -20947,7 +21064,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_image_resize) - @Namespace("nd4j::ops") public static class image_resize extends DeclarableCustomOp { + @Namespace("sd::ops") public static class image_resize extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public image_resize(Pointer p) { super(p); } @@ -20980,7 +21097,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); */ // #if NOT_EXCLUDED(OP_matrix_band_part) - @Namespace("nd4j::ops") public static class matrix_band_part extends DeclarableOp { + @Namespace("sd::ops") public static class matrix_band_part extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public matrix_band_part(Pointer p) { super(p); } @@ -20999,7 +21116,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #if NOT_EXCLUDED(OP_Assert) - @Namespace("nd4j::ops") public static class Assert extends DeclarableOp { + @Namespace("sd::ops") public static class Assert extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public Assert(Pointer p) { super(p); } @@ -21033,7 +21150,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * * */ // #if NOT_EXCLUDED(OP_image_non_max_suppression) - @Namespace("nd4j::ops") public static class non_max_suppression extends DeclarableCustomOp { + @Namespace("sd::ops") public static class non_max_suppression extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public non_max_suppression(Pointer p) { super(p); } @@ -21050,7 +21167,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); } // #endif // #if NOT_EXCLUDED(OP_image_non_max_suppression_v3) - @Namespace("nd4j::ops") public static class non_max_suppression_v3 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class non_max_suppression_v3 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public non_max_suppression_v3(Pointer p) { super(p); } @@ -21083,7 +21200,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - 1D integer tensor with shape [M], epresenting the selected indices from the overlaps tensor, where M <= max_output_size * */ // #if NOT_EXCLUDED(OP_image_non_max_suppression_overlaps) - @Namespace("nd4j::ops") public static class non_max_suppression_overlaps extends DeclarableCustomOp { + @Namespace("sd::ops") public static class non_max_suppression_overlaps extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public non_max_suppression_overlaps(Pointer p) { super(p); } @@ -21108,7 +21225,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output - lower triangular matrix (matricies when rank > 2) with the same shape as input. * */ // #if NOT_EXCLUDED(OP_cholesky) - @Namespace("nd4j::ops") public static class cholesky extends DeclarableOp { + @Namespace("sd::ops") public static class cholesky extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cholesky(Pointer p) { super(p); } @@ -21134,7 +21251,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with the same shape as input */ // #if NOT_EXCLUDED(OP_nth_element) - @Namespace("nd4j::ops") public static class nth_element extends DeclarableCustomOp { + @Namespace("sd::ops") public static class nth_element extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public nth_element(Pointer p) { super(p); } @@ -21155,7 +21272,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * This op checks for Inf/NaN values within input array, and throws exception if there's at least one */ // #if NOT_EXCLUDED(OP_check_numerics) - @Namespace("nd4j::ops") public static class check_numerics extends DeclarableCustomOp { + @Namespace("sd::ops") public static class check_numerics extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public check_numerics(Pointer p) { super(p); } @@ -21187,7 +21304,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with the same shape as input */ // #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars) - @Namespace("nd4j::ops") public static class fake_quant_with_min_max_vars extends DeclarableOp { + @Namespace("sd::ops") public static class fake_quant_with_min_max_vars extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public fake_quant_with_min_max_vars(Pointer p) { super(p); } @@ -21220,7 +21337,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with the same shape as input */ // #if NOT_EXCLUDED(OP_fake_quant_with_min_max_vars_per_channel) - @Namespace("nd4j::ops") public static class fake_quant_with_min_max_vars_per_channel extends DeclarableOp { + @Namespace("sd::ops") public static class fake_quant_with_min_max_vars_per_channel extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public fake_quant_with_min_max_vars_per_channel(Pointer p) { super(p); } @@ -21249,7 +21366,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - NDArray with the same shape as input and type uint8 */ // #if NOT_EXCLUDED(OP_compare_and_bitpack) - @Namespace("nd4j::ops") public static class compare_and_bitpack extends DeclarableCustomOp { + @Namespace("sd::ops") public static class compare_and_bitpack extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public compare_and_bitpack(Pointer p) { super(p); } @@ -21298,7 +21415,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #if NOT_EXCLUDED(OP_permute) - @Namespace("nd4j::ops") public static class permute extends DeclarableCustomOp { + @Namespace("sd::ops") public static class permute extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public permute(Pointer p) { super(p); } @@ -21316,7 +21433,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_reshapeas) - @Namespace("nd4j::ops") public static class reshapeas extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reshapeas extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reshapeas(Pointer p) { super(p); } @@ -21334,7 +21451,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_transpose) - @Namespace("nd4j::ops") public static class transpose extends DeclarableCustomOp { + @Namespace("sd::ops") public static class transpose extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public transpose(Pointer p) { super(p); } @@ -21352,7 +21469,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_shape_of) - @Namespace("nd4j::ops") public static class shape_of extends DeclarableCustomOp { + @Namespace("sd::ops") public static class shape_of extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public shape_of(Pointer p) { super(p); } @@ -21370,7 +21487,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_shapes_of) - @Namespace("nd4j::ops") public static class shapes_of extends DeclarableCustomOp { + @Namespace("sd::ops") public static class shapes_of extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public shapes_of(Pointer p) { super(p); } @@ -21388,7 +21505,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_squeeze) - @Namespace("nd4j::ops") public static class squeeze extends DeclarableCustomOp { + @Namespace("sd::ops") public static class squeeze extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public squeeze(Pointer p) { super(p); } @@ -21406,7 +21523,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_expand_dims) - @Namespace("nd4j::ops") public static class expand_dims extends DeclarableCustomOp { + @Namespace("sd::ops") public static class expand_dims extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public expand_dims(Pointer p) { super(p); } @@ -21424,7 +21541,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_reshape) - @Namespace("nd4j::ops") public static class reshape extends DeclarableCustomOp { + @Namespace("sd::ops") public static class reshape extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public reshape(Pointer p) { super(p); } @@ -21442,7 +21559,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_size_at) - @Namespace("nd4j::ops") public static class size_at extends DeclarableCustomOp { + @Namespace("sd::ops") public static class size_at extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public size_at(Pointer p) { super(p); } @@ -21469,7 +21586,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_order) - @Namespace("nd4j::ops") public static class order extends DeclarableCustomOp { + @Namespace("sd::ops") public static class order extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public order(Pointer p) { super(p); } @@ -21492,7 +21609,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_tile_to_shape) - @Namespace("nd4j::ops") public static class tile_to_shape extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tile_to_shape extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public tile_to_shape(Pointer p) { super(p); } @@ -21507,7 +21624,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class tile_to_shape_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tile_to_shape_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tile_to_shape_bp(Pointer p) { super(p); } @@ -21532,7 +21649,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * shape array - array containing shape be broadcasted to */ // #if NOT_EXCLUDED(OP_broadcast_to) - @Namespace("nd4j::ops") public static class broadcast_to extends DeclarableCustomOp { + @Namespace("sd::ops") public static class broadcast_to extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public broadcast_to(Pointer p) { super(p); } @@ -21551,7 +21668,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #if NOT_EXCLUDED(OP_evaluate_reduction_shape) - @Namespace("nd4j::ops") public static class evaluate_reduction_shape extends DeclarableCustomOp { + @Namespace("sd::ops") public static class evaluate_reduction_shape extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public evaluate_reduction_shape(Pointer p) { super(p); } @@ -21581,7 +21698,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * initialization option */ // #if NOT_EXCLUDED(OP_create) - @Namespace("nd4j::ops") public static class create extends DeclarableCustomOp { + @Namespace("sd::ops") public static class create extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public create(Pointer p) { super(p); } @@ -21630,7 +21747,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #if NOT_EXCLUDED(OP_set_seed) - @Namespace("nd4j::ops") public static class set_seed extends DeclarableCustomOp { + @Namespace("sd::ops") public static class set_seed extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public set_seed(Pointer p) { super(p); } @@ -21648,7 +21765,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_get_seed) - @Namespace("nd4j::ops") public static class get_seed extends DeclarableCustomOp { + @Namespace("sd::ops") public static class get_seed extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public get_seed(Pointer p) { super(p); } @@ -21678,7 +21795,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - uniformly distributed values of given type (between min and max) */ // #if NOT_EXCLUDED(OP_randomuniform) - @Namespace("nd4j::ops") public static class randomuniform extends DeclarableCustomOp { + @Namespace("sd::ops") public static class randomuniform extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public randomuniform(Pointer p) { super(p); } @@ -21708,7 +21825,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0 - 2D ndarray with the drawn samples of shape [batch_size, num_samples] */ // #if NOT_EXCLUDED(OP_random_multinomial) - @Namespace("nd4j::ops") public static class random_multinomial extends DeclarableCustomOp { + @Namespace("sd::ops") public static class random_multinomial extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public random_multinomial(Pointer p) { super(p); } @@ -21726,7 +21843,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_random_normal) - @Namespace("nd4j::ops") public static class random_normal extends DeclarableCustomOp { + @Namespace("sd::ops") public static class random_normal extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public random_normal(Pointer p) { super(p); } @@ -21744,7 +21861,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_random_bernoulli) - @Namespace("nd4j::ops") public static class random_bernoulli extends DeclarableCustomOp { + @Namespace("sd::ops") public static class random_bernoulli extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public random_bernoulli(Pointer p) { super(p); } @@ -21762,7 +21879,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_random_exponential) - @Namespace("nd4j::ops") public static class random_exponential extends DeclarableCustomOp { + @Namespace("sd::ops") public static class random_exponential extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public random_exponential(Pointer p) { super(p); } @@ -21780,7 +21897,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_random_crop) - @Namespace("nd4j::ops") public static class random_crop extends DeclarableCustomOp { + @Namespace("sd::ops") public static class random_crop extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public random_crop(Pointer p) { super(p); } @@ -21801,7 +21918,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * random_gamma op. 
*/ // #if NOT_EXCLUDED(OP_random_gamma) - @Namespace("nd4j::ops") public static class random_gamma extends DeclarableCustomOp { + @Namespace("sd::ops") public static class random_gamma extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public random_gamma(Pointer p) { super(p); } @@ -21822,7 +21939,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * random_poisson op. */ // #if NOT_EXCLUDED(OP_random_poisson) - @Namespace("nd4j::ops") public static class random_poisson extends DeclarableCustomOp { + @Namespace("sd::ops") public static class random_poisson extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public random_poisson(Pointer p) { super(p); } @@ -21872,7 +21989,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #if NOT_EXCLUDED(OP_softmax) - @Namespace("nd4j::ops") public static class softmax extends DeclarableOp { + @Namespace("sd::ops") public static class softmax extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public softmax(Pointer p) { super(p); } @@ -21887,7 +22004,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class softmax_bp extends DeclarableOp { + @Namespace("sd::ops") public static class softmax_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public softmax_bp(Pointer p) { super(p); } @@ -21919,7 +22036,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output - 4D array */ // #if NOT_EXCLUDED(OP_lrn) - @Namespace("nd4j::ops") public static class lrn extends DeclarableOp { + @Namespace("sd::ops") public static class lrn extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lrn(Pointer p) { super(p); } @@ -21953,7 +22070,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output - next approximation as 4D array */ // #if NOT_EXCLUDED(OP_lrn) - @Namespace("nd4j::ops") public static class lrn_bp extends DeclarableOp { + @Namespace("sd::ops") public static class lrn_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public lrn_bp(Pointer p) { super(p); } @@ -21990,7 +22107,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: epsilon */ // #if NOT_EXCLUDED(OP_batchnorm) - @Namespace("nd4j::ops") public static class batchnorm extends DeclarableCustomOp { + @Namespace("sd::ops") public static class batchnorm extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public batchnorm(Pointer p) { super(p); } @@ -22033,7 +22150,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * dL/dBeta, optional */ // #if NOT_EXCLUDED(OP_batchnorm) - @Namespace("nd4j::ops") public static class batchnorm_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class batchnorm_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public batchnorm_bp(Pointer p) { super(p); } @@ -22062,7 +22179,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: optional, learning rate */ // #if NOT_EXCLUDED(OP_apply_sgd) - @Namespace("nd4j::ops") public static class apply_sgd extends DeclarableOp { + @Namespace("sd::ops") public static class apply_sgd extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public apply_sgd(Pointer p) { super(p); } @@ -22100,7 +22217,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: isTraining, may have two values: zero -> inference, unity -> training */ // #if NOT_EXCLUDED(OP_fused_batch_norm) - @Namespace("nd4j::ops") public static class fused_batch_norm extends DeclarableCustomOp { + @Namespace("sd::ops") public static class fused_batch_norm extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public fused_batch_norm(Pointer p) { super(p); } @@ -22118,7 +22235,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_log_softmax) - @Namespace("nd4j::ops") public static class log_softmax extends DeclarableOp { + @Namespace("sd::ops") public static class log_softmax extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public log_softmax(Pointer p) { super(p); } @@ -22133,7 +22250,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class log_softmax_bp extends DeclarableOp { + @Namespace("sd::ops") public static class log_softmax_bp extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public log_softmax_bp(Pointer p) { super(p); } @@ -22154,7 +22271,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); /** * relu_layer = relu(x*w + b) */ - @Namespace("nd4j::ops") public static class relu_layer extends DeclarableCustomOp { + @Namespace("sd::ops") public static class relu_layer extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public relu_layer(Pointer p) { super(p); } @@ -22174,11 +22291,11 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * applies layer normalization to input * y = g * standardize(x) + b * - * see nd4j::ops::standardize + * see sd::ops::standardize * */ // #if NOT_EXCLUDED(OP_layer_norm) - @Namespace("nd4j::ops") public static class layer_norm extends DeclarableOp { + @Namespace("sd::ops") public static class layer_norm extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public layer_norm(Pointer p) { super(p); } @@ -22193,7 +22310,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class layer_norm_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class layer_norm_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public layer_norm_bp(Pointer p) { super(p); } @@ -22242,7 +22359,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: OPTIONAL; Attention weights of shape [batchSize, timesteps, queryCount] or [batchSize, numHeads, timesteps, queryCount] */ // #if NOT_EXCLUDED(OP_dot_product_attention) - @Namespace("nd4j::ops") public static class dot_product_attention extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dot_product_attention extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dot_product_attention(Pointer p) { super(p); } @@ -22257,7 +22374,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class dot_product_attention_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class dot_product_attention_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dot_product_attention_bp(Pointer p) { super(p); } @@ -22305,7 +22422,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: OPTIONAL; Attention weights of shape [batchSize, numHeads, timesteps, queryCount] */ // #if NOT_EXCLUDED(OP_multi_head_dot_product_attention) - @Namespace("nd4j::ops") public static class multi_head_dot_product_attention extends DeclarableCustomOp { + @Namespace("sd::ops") public static class multi_head_dot_product_attention extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public multi_head_dot_product_attention(Pointer p) { super(p); } @@ -22320,7 +22437,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class multi_head_dot_product_attention_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class multi_head_dot_product_attention_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public multi_head_dot_product_attention_bp(Pointer p) { super(p); } @@ -22384,7 +22501,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 1: transB (where applicable) */ // #if NOT_EXCLUDED(OP_matmul) - @Namespace("nd4j::ops") public static class matmul extends DeclarableCustomOp { + @Namespace("sd::ops") public static class matmul extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public matmul(Pointer p) { super(p); } @@ -22399,7 +22516,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class matmul_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class matmul_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public matmul_bp(Pointer p) { super(p); } @@ -22427,7 +22544,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * IArgs[1]... 
axes values for second array */ // #if NOT_EXCLUDED(OP_tensormmul) - @Namespace("nd4j::ops") public static class tensormmul extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tensormmul extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tensormmul(Pointer p) { super(p); } @@ -22442,7 +22559,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class tensormmul_bp extends DeclarableCustomOp { + @Namespace("sd::ops") public static class tensormmul_bp extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public tensormmul_bp(Pointer p) { super(p); } @@ -22464,7 +22581,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Math is: y += a * x; */ // #if NOT_EXCLUDED(OP_axpy) - @Namespace("nd4j::ops") public static class axpy extends DeclarableOp { + @Namespace("sd::ops") public static class axpy extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public axpy(Pointer p) { super(p); } @@ -22495,7 +22612,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: M, N, K, ldA, ldB, ldC should be equal for all matrices within batch. */ // #if NOT_EXCLUDED(OP_batched_gemm) - @Namespace("nd4j::ops") public static class batched_gemm extends DeclarableCustomOp { + @Namespace("sd::ops") public static class batched_gemm extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public batched_gemm(Pointer p) { super(p); } @@ -22532,7 +22649,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Recommended value is 16. 
*/ // #if NOT_EXCLUDED(OP_svd) - @Namespace("nd4j::ops") public static class svd extends DeclarableCustomOp { + @Namespace("sd::ops") public static class svd extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public svd(Pointer p) { super(p); } @@ -22576,7 +22693,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // // #include // #if NOT_EXCLUDED(OP_test_output_reshape) - @Namespace("nd4j::ops") public static class test_output_reshape extends DeclarableOp { + @Namespace("sd::ops") public static class test_output_reshape extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public test_output_reshape(Pointer p) { super(p); } @@ -22594,7 +22711,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_test_scalar) - @Namespace("nd4j::ops") public static class test_scalar extends DeclarableCustomOp { + @Namespace("sd::ops") public static class test_scalar extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public test_scalar(Pointer p) { super(p); } @@ -22612,7 +22729,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_testreduction) - @Namespace("nd4j::ops") public static class testreduction extends DeclarableReductionOp { + @Namespace("sd::ops") public static class testreduction extends DeclarableReductionOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public testreduction(Pointer p) { super(p); } @@ -22629,7 +22746,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_noop) - @Namespace("nd4j::ops") public static class noop extends DeclarableOp { + @Namespace("sd::ops") public static class noop extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public noop(Pointer p) { super(p); } @@ -22647,7 +22764,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_testop2i2o) - @Namespace("nd4j::ops") public static class testop2i2o extends DeclarableOp { + @Namespace("sd::ops") public static class testop2i2o extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public testop2i2o(Pointer p) { super(p); } @@ -22665,7 +22782,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #endif // #if NOT_EXCLUDED(OP_testcustom) - @Namespace("nd4j::ops") public static class testcustom extends DeclarableCustomOp { + @Namespace("sd::ops") public static class testcustom extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public testcustom(Pointer p) { super(p); } @@ -22718,7 +22835,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_toggle_bits) - @Namespace("nd4j::ops") public static class toggle_bits extends DeclarableOp { + @Namespace("sd::ops") public static class toggle_bits extends DeclarableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public toggle_bits(Pointer p) { super(p); } @@ -22744,7 +22861,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_shift_bits) - @Namespace("nd4j::ops") public static class shift_bits extends BroadcastableOp { + @Namespace("sd::ops") public static class shift_bits extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public shift_bits(Pointer p) { super(p); } @@ -22768,7 +22885,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_rshift_bits) - @Namespace("nd4j::ops") public static class rshift_bits extends BroadcastableOp { + @Namespace("sd::ops") public static class rshift_bits extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public rshift_bits(Pointer p) { super(p); } @@ -22792,7 +22909,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_cyclic_shift_bits) - @Namespace("nd4j::ops") public static class cyclic_shift_bits extends BroadcastableOp { + @Namespace("sd::ops") public static class cyclic_shift_bits extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cyclic_shift_bits(Pointer p) { super(p); } @@ -22816,7 +22933,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_cyclic_rshift_bits) - @Namespace("nd4j::ops") public static class cyclic_rshift_bits extends BroadcastableOp { + @Namespace("sd::ops") public static class cyclic_rshift_bits extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public cyclic_rshift_bits(Pointer p) { super(p); } @@ -22840,7 +22957,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_bitwise_and) - @Namespace("nd4j::ops") public static class bitwise_and extends BroadcastableOp { + @Namespace("sd::ops") public static class bitwise_and extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public bitwise_and(Pointer p) { super(p); } @@ -22864,7 +22981,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_bitwise_or) - @Namespace("nd4j::ops") public static class bitwise_or extends BroadcastableOp { + @Namespace("sd::ops") public static class bitwise_or extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public bitwise_or(Pointer p) { super(p); } @@ -22888,7 +23005,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_bitwise_xor) - @Namespace("nd4j::ops") public static class bitwise_xor extends BroadcastableOp { + @Namespace("sd::ops") public static class bitwise_xor extends BroadcastableOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public bitwise_xor(Pointer p) { super(p); } @@ -22912,7 +23029,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * \tparam T */ // #if NOT_EXCLUDED(OP_bits_hamming_distance) - @Namespace("nd4j::ops") public static class bits_hamming_distance extends DeclarableCustomOp { + @Namespace("sd::ops") public static class bits_hamming_distance extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public bits_hamming_distance(Pointer p) { super(p); } @@ -22983,7 +23100,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as logits or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_hinge_loss) - @Namespace("nd4j::ops") public static class hinge_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class hinge_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public hinge_loss(Pointer p) { super(p); } @@ -22998,7 +23115,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class hinge_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class hinge_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public hinge_loss_grad(Pointer p) { super(p); } @@ -23044,7 +23161,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as predictions or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_huber_loss) - @Namespace("nd4j::ops") public static class huber_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class huber_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public huber_loss(Pointer p) { super(p); } @@ -23059,7 +23176,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class huber_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class huber_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public huber_loss_grad(Pointer p) { super(p); } @@ -23103,7 +23220,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as predictions or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_log_loss) - @Namespace("nd4j::ops") public static class log_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class log_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public log_loss(Pointer p) { super(p); } @@ -23118,7 +23235,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class log_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class log_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public log_loss_grad(Pointer p) { super(p); } @@ -23143,7 +23260,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * output value - a real number with given type (e.g. 
float or double) */ // #if NOT_EXCLUDED(OP_l2_loss) - @Namespace("nd4j::ops") public static class l2_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class l2_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public l2_loss(Pointer p) { super(p); } @@ -23183,7 +23300,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as log_predictions or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_log_poisson_loss) - @Namespace("nd4j::ops") public static class log_poisson_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class log_poisson_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public log_poisson_loss(Pointer p) { super(p); } @@ -23198,7 +23315,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class log_poisson_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class log_poisson_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public log_poisson_loss_grad(Pointer p) { super(p); } @@ -23230,7 +23347,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: loss value, it is just single scalar, type float. */ // #if NOT_EXCLUDED(OP_mean_pairwssqerr_loss) - @Namespace("nd4j::ops") public static class mean_pairwssqerr_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class mean_pairwssqerr_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public mean_pairwssqerr_loss(Pointer p) { super(p); } @@ -23245,7 +23362,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class mean_pairwssqerr_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class mean_pairwssqerr_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public mean_pairwssqerr_loss_grad(Pointer p) { super(p); } @@ -23286,7 +23403,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as predictions or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_mean_sqerr_loss) - @Namespace("nd4j::ops") public static class mean_sqerr_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class mean_sqerr_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public mean_sqerr_loss(Pointer p) { super(p); } @@ -23301,7 +23418,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class mean_sqerr_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class mean_sqerr_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public mean_sqerr_loss_grad(Pointer p) { super(p); } @@ -23345,7 +23462,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as logits or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_sigm_cross_entropy_loss) - @Namespace("nd4j::ops") public static class sigm_cross_entropy_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sigm_cross_entropy_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sigm_cross_entropy_loss(Pointer p) { super(p); } @@ -23360,7 +23477,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class sigm_cross_entropy_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sigm_cross_entropy_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sigm_cross_entropy_loss_grad(Pointer p) { super(p); } @@ -23404,7 +23521,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with shape as in logits except last dimension is equal to unity or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_softmax_cross_entropy_loss) - @Namespace("nd4j::ops") public static class softmax_cross_entropy_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class softmax_cross_entropy_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public softmax_cross_entropy_loss(Pointer p) { super(p); } @@ -23419,7 +23536,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class softmax_cross_entropy_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class softmax_cross_entropy_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public softmax_cross_entropy_loss_grad(Pointer p) { super(p); } @@ -23460,7 +23577,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as predictions or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_absolute_difference_loss) - @Namespace("nd4j::ops") public static class absolute_difference_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class absolute_difference_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public absolute_difference_loss(Pointer p) { super(p); } @@ -23475,7 +23592,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class absolute_difference_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class absolute_difference_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public absolute_difference_loss_grad(Pointer p) { super(p); } @@ -23517,7 +23634,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * Can be an array with the same shape as predictions or just single scalar, depending on reduction mode (see input integer argument) */ // #if NOT_EXCLUDED(OP_cosine_distance_loss) - @Namespace("nd4j::ops") public static class cosine_distance_loss extends DeclarableCustomOp { + @Namespace("sd::ops") public static class cosine_distance_loss extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cosine_distance_loss(Pointer p) { super(p); } @@ -23532,7 +23649,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class cosine_distance_loss_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class cosine_distance_loss_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cosine_distance_loss_grad(Pointer p) { super(p); } @@ -23565,7 +23682,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: loss values, type float. An array with shape resulting from reducing of logits shape along dimension with classes */ // #if NOT_EXCLUDED(OP_softmax_cross_entropy_loss_with_logits) - @Namespace("nd4j::ops") public static class softmax_cross_entropy_loss_with_logits extends DeclarableCustomOp { + @Namespace("sd::ops") public static class softmax_cross_entropy_loss_with_logits extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public softmax_cross_entropy_loss_with_logits(Pointer p) { super(p); } @@ -23580,7 +23697,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class softmax_cross_entropy_loss_with_logits_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class softmax_cross_entropy_loss_with_logits_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public softmax_cross_entropy_loss_with_logits_grad(Pointer p) { super(p); } @@ -23610,7 +23727,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: loss values, type float. Has the same shape as labels */ // #if NOT_EXCLUDED(OP_sparse_softmax_cross_entropy_loss_with_logits) - @Namespace("nd4j::ops") public static class sparse_softmax_cross_entropy_loss_with_logits extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sparse_softmax_cross_entropy_loss_with_logits extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public sparse_softmax_cross_entropy_loss_with_logits(Pointer p) { super(p); } @@ -23625,7 +23742,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(); public native ShapeList calculateOutputShape(ShapeList inputShape, @ByRef Context block); } - @Namespace("nd4j::ops") public static class sparse_softmax_cross_entropy_loss_with_logits_grad extends DeclarableCustomOp { + @Namespace("sd::ops") public static class sparse_softmax_cross_entropy_loss_with_logits_grad extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public sparse_softmax_cross_entropy_loss_with_logits_grad(Pointer p) { super(p); } @@ -23679,7 +23796,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: This op is disabled atm, and reserved for future releases. */ // #if NOT_EXCLUDED(OP_to_double) - @Namespace("nd4j::ops") public static class to_double extends DeclarableCustomOp { + @Namespace("sd::ops") public static class to_double extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public to_double(Pointer p) { super(p); } @@ -23702,7 +23819,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: This op is disabled atm, and reserved for future releases. */ // #if NOT_EXCLUDED(OP_to_float16) - @Namespace("nd4j::ops") public static class to_float16 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class to_float16 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public to_float16(Pointer p) { super(p); } @@ -23725,7 +23842,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: This op is disabled atm, and reserved for future releases. */ // #if NOT_EXCLUDED(OP_to_float32) - @Namespace("nd4j::ops") public static class to_float32 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class to_float32 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public to_float32(Pointer p) { super(p); } @@ -23748,7 +23865,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: This op is disabled atm, and reserved for future releases. 
*/ // #if NOT_EXCLUDED(OP_to_int32) - @Namespace("nd4j::ops") public static class to_int32 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class to_int32 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public to_int32(Pointer p) { super(p); } @@ -23771,7 +23888,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: This op is disabled atm, and reserved for future releases. */ // #if NOT_EXCLUDED(OP_to_int64) - @Namespace("nd4j::ops") public static class to_int64 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class to_int64 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public to_int64(Pointer p) { super(p); } @@ -23794,7 +23911,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: This op is disabled atm, and reserved for future releases. */ // #if NOT_EXCLUDED(OP_to_uint32) - @Namespace("nd4j::ops") public static class to_uint32 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class to_uint32 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public to_uint32(Pointer p) { super(p); } @@ -23817,7 +23934,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * PLEASE NOTE: This op is disabled atm, and reserved for future releases. */ // #if NOT_EXCLUDED(OP_to_uint64) - @Namespace("nd4j::ops") public static class to_uint64 extends DeclarableCustomOp { + @Namespace("sd::ops") public static class to_uint64 extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public to_uint64(Pointer p) { super(p); } @@ -23844,7 +23961,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * 0: target DataType */ // #if NOT_EXCLUDED(OP_cast) - @Namespace("nd4j::ops") public static class cast extends DeclarableCustomOp { + @Namespace("sd::ops") public static class cast extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cast(Pointer p) { super(p); } @@ -23866,7 +23983,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); * all as above op * */ // #if NOT_EXCLUDED(OP_bitcast) - @Namespace("nd4j::ops") public static class bitcast extends DeclarableCustomOp { + @Namespace("sd::ops") public static class bitcast extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public bitcast(Pointer p) { super(p); } @@ -23912,10 +24029,10 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #ifndef LIBND4J_CONTEXTBUFFERS_H // #define LIBND4J_CONTEXTBUFFERS_H -// #include -// #include +// #include +// #include // #include - @Namespace("nd4j") @NoOffset public static class ContextBuffers extends Pointer { + @Namespace("sd") @NoOffset public static class ContextBuffers extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ContextBuffers(Pointer p) { super(p); } @@ -23998,16 +24115,16 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include "config.h" // #endif -// #include +// #include // #include -// #include +// #include // #include // #include // #include // #include // #include -@Namespace("nd4j") @NoOffset public static class LaunchContext extends Pointer { +@Namespace("sd") @NoOffset public static class LaunchContext extends Pointer { static { Loader.load(); } /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ public LaunchContext(long size) { super((Pointer)null); allocateArray(size); } @@ -24078,12 +24195,12 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #include -// #include -// #include -// #include +// #include +// #include +// #include // #include -@Namespace("nd4j") @NoOffset public static class ShapeDescriptor extends Pointer { +@Namespace("sd") @NoOffset public static class ShapeDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ShapeDescriptor(Pointer p) { super(p); } @@ -24108,12 +24225,12 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("bool") boolean inheritDtype/*=true*/); public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo) { super((Pointer)null); allocate(shapeInfo); } private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo); - public ShapeDescriptor(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } - private native void allocate(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride); - public ShapeDescriptor(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } - private native void allocate(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride); - public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } - private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const nd4j::DataType") int dtypeOverride); + public ShapeDescriptor(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const 
sd::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } + private native void allocate(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const sd::DataType") int dtypeOverride); + public ShapeDescriptor(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } + private native void allocate(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const sd::DataType") int dtypeOverride); + public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } + private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const sd::DataType") int dtypeOverride); public ShapeDescriptor(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } private native void allocate(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer dtypeOverride); public ShapeDescriptor(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer dtypeOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride); } @@ -24126,38 +24243,38 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); private native void allocate(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer dtypeOverride, @Cast("const Nd4jLong*") LongBuffer orderOverride); public ShapeDescriptor(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] dtypeOverride, @Cast("const Nd4jLong*") long[] orderOverride) { super((Pointer)null); allocate(shapeInfo, dtypeOverride, orderOverride); } private native void allocate(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] dtypeOverride, @Cast("const Nd4jLong*") long[] orderOverride); - 
public ShapeDescriptor(@Cast("const nd4j::DataType") int type, @Cast("const Nd4jLong") long length) { super((Pointer)null); allocate(type, length); } - private native void allocate(@Cast("const nd4j::DataType") int type, @Cast("const Nd4jLong") long length); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, 
@Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape) { super((Pointer)null); allocate(type, order, shape); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape) { super((Pointer)null); allocate(type, order, shape); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape) { super((Pointer)null); allocate(type, order, shape); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, 
@Cast("Nd4jLong*") @StdVector LongPointer strides) { super((Pointer)null); allocate(type, order, shape, strides); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides) { super((Pointer)null); allocate(type, order, shape, strides); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides) { super((Pointer)null); allocate(type, order, shape, strides); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, 
@Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews); - public ShapeDescriptor(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } - private native void allocate(@Cast("const nd4j::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews); + public ShapeDescriptor(@Cast("const sd::DataType") int type, @Cast("const Nd4jLong") long length) { super((Pointer)null); allocate(type, length); } + private native void allocate(@Cast("const sd::DataType") int type, @Cast("const Nd4jLong") long length); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, int rank); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, int rank); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank) { super((Pointer)null); allocate(type, order, shape, rank); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, int rank); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer 
strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongPointer shape, @Cast("const Nd4jLong*") LongPointer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") LongBuffer shape, @Cast("const Nd4jLong*") LongBuffer strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty) { super((Pointer)null); allocate(type, order, shape, strides, rank, ews, empty); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("const Nd4jLong*") long[] shape, @Cast("const Nd4jLong*") long[] strides, int rank, @Cast("Nd4jLong") long ews, @Cast("const bool") boolean empty); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape) { super((Pointer)null); allocate(type, order, shape); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape) { super((Pointer)null); allocate(type, order, 
shape); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape) { super((Pointer)null); allocate(type, order, shape); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides) { super((Pointer)null); allocate(type, order, shape, strides); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides) { super((Pointer)null); allocate(type, order, shape, strides); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides) { super((Pointer)null); allocate(type, order, shape, strides); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } + private native void 
allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongPointer shape, @Cast("Nd4jLong*") @StdVector LongPointer strides, @Cast("const Nd4jLong") long ews); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector LongBuffer shape, @Cast("Nd4jLong*") @StdVector LongBuffer strides, @Cast("const Nd4jLong") long ews); + public ShapeDescriptor(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews) { super((Pointer)null); allocate(type, order, shape, strides, ews); } + private native void allocate(@Cast("const sd::DataType") int type, byte order, @Cast("Nd4jLong*") @StdVector long[] shape, @Cast("Nd4jLong*") @StdVector long[] strides, @Cast("const Nd4jLong") long ews); public ShapeDescriptor() { super((Pointer)null); allocate(); } private native void allocate(); @@ -24165,7 +24282,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native @Cast("Nd4jLong") long ews(); public native @Cast("Nd4jLong") long arrLength(); public native char order(); - public native @Cast("nd4j::DataType") int dataType(); + public native @Cast("sd::DataType") int dataType(); public native @Cast("bool") boolean isEmpty(); public native @Cast("Nd4jLong*") @StdVector LongPointer shape(); public native @Cast("Nd4jLong*") @StdVector LongPointer strides(); @@ -24184,13 +24301,15 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native @Cast("Nd4jLong*") LongPointer toShapeInfo(); - public static native @ByVal ShapeDescriptor emptyDescriptor(@Cast("const nd4j::DataType") int type); - 
public static native @ByVal ShapeDescriptor scalarDescriptor(@Cast("const nd4j::DataType") int type); - public static native @ByVal ShapeDescriptor vectorDescriptor(@Cast("const Nd4jLong") long length, @Cast("const nd4j::DataType") int type); + public static native @ByVal ShapeDescriptor emptyDescriptor(@Cast("const sd::DataType") int type); + public static native @ByVal ShapeDescriptor scalarDescriptor(@Cast("const sd::DataType") int type); + public static native @ByVal ShapeDescriptor vectorDescriptor(@Cast("const Nd4jLong") long length, @Cast("const sd::DataType") int type); } +// #ifndef __JAVACPP_HACK__ +// #endif // #endif //DEV_TESTS_SHAPEDESCRIPTOR_H @@ -24222,8 +24341,8 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #define DEV_TESTS_TADDESCRIPTOR_H // #include "ShapeDescriptor.h" -// #include - @Namespace("nd4j") @NoOffset public static class TadDescriptor extends Pointer { +// #include + @Namespace("sd") @NoOffset public static class TadDescriptor extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public TadDescriptor(Pointer p) { super(p); } @@ -24268,10 +24387,15 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native @StdVector IntPointer axis(); public native @ByRef ShapeDescriptor originalShape(); + public native @Const @ByRef ShapeDescriptor originalShapeConst(); public native @Cast("bool") boolean areUnitiesinShape(); } +// #ifndef __JAVACPP_HACK__ + +// #endif + // #endif //DEV_TESTS_TADDESCRIPTOR_H @@ -24301,18 +24425,18 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #ifndef LIBND4J__DEBUG_INFO_HELPER__H // #define LIBND4J__DEBUG_INFO_HELPER__H -// #include -// #include -// #include -// #include +// #include +// #include +// #include +// #include // #include -// #include -// #include +// #include +// #include // #ifdef __CUDACC__ // #endif - @Namespace("nd4j") public static class DebugInfo extends Pointer { + @Namespace("sd") public static class DebugInfo extends Pointer { static { Loader.load(); } /** Default native constructor. */ public DebugInfo() { super((Pointer)null); allocate(); } @@ -24337,7 +24461,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); public native @Cast("Nd4jLong") long _nanCount(); public native DebugInfo _nanCount(long setter); } - @Namespace("nd4j") public static native @Cast("bool") @Name("operator ==") boolean equals(@Const @ByRef DebugInfo first, @Const @ByRef DebugInfo second); + @Namespace("sd") public static native @Cast("bool") @Name("operator ==") boolean equals(@Const @ByRef DebugInfo first, @Const @ByRef DebugInfo second); @@ -24372,7 +24496,7 @@ public static final int TAD_THRESHOLD = TAD_THRESHOLD(); // #include // #if NOT_EXCLUDED(OP_firas_sparse) - @Namespace("nd4j::ops") public static class firas_sparse extends DeclarableCustomOp { + @Namespace("sd::ops") public static class firas_sparse extends DeclarableCustomOp { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public firas_sparse(Pointer p) { super(p); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java index 577626864..057dd5b95 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-native/src/main/java/org/nd4j/nativeblas/Nd4jCpuPresets.java @@ -43,9 +43,9 @@ import java.util.Scanner; "execution/ErrorReference.h", "execution/Engine.h", "execution/ExecutionMode.h", - "Environment.h", + "system/Environment.h", "types/utf8string.h", - "NativeOps.h", + "legacy/NativeOps.h", "memory/ExternalWorkspace.h", "memory/Workspace.h", "indexing/NDIndex.h", @@ -53,7 +53,7 @@ import java.util.Scanner; "graph/VariableType.h", "graph/ArgumentsList.h", "types/pair.h", - "NDArray.h", + "array/NDArray.h", "array/NDArrayList.h", "array/ResultSet.h", "types/pair.h", @@ -74,8 +74,8 @@ import java.util.Scanner; "helpers/shape.h", "helpers/OpArgsHolder.h", "array/ShapeList.h", - "type_boilerplate.h", - "op_boilerplate.h", + "system/type_boilerplate.h", + "system/op_boilerplate.h", //"enum_boilerplate.h", //"op_enums.h", "ops/InputType.h", @@ -183,39 +183,39 @@ public class Nd4jCpuPresets implements InfoMapper, BuildEnabled { infoMap.put(new Info("__CUDACC__", "MAX_UINT", "HAVE_MKLDNN", "__CUDABLAS__").define(false)) .put(new Info("__JAVACPP_HACK__", "LIBND4J_ALL_OPS").define(true)) - .put(new Info("std::initializer_list", "cnpy::NpyArray", "nd4j::NDArray::applyLambda", "nd4j::NDArray::applyPairwiseLambda", - "nd4j::graph::FlatResult", "nd4j::graph::FlatVariable", "nd4j::NDArray::subarray").skip()) + .put(new Info("std::initializer_list", "cnpy::NpyArray", "sd::NDArray::applyLambda", "sd::NDArray::applyPairwiseLambda", + "sd::graph::FlatResult", "sd::graph::FlatVariable", "sd::NDArray::subarray").skip()) .put(new 
Info("std::string").annotations("@StdString").valueTypes("BytePointer", "String") .pointerTypes("@Cast({\"char*\", \"std::string*\"}) BytePointer")) .put(new Info("std::pair").pointerTypes("IntIntPair").define()) .put(new Info("std::vector >").pointerTypes("IntVectorVector").define()) .put(new Info("std::vector >").pointerTypes("LongVectorVector").define()) - .put(new Info("std::vector").pointerTypes("ConstNDArrayVector").define()) - .put(new Info("std::vector").pointerTypes("NDArrayVector").define()) - .put(new Info("nd4j::graph::ResultWrapper").base("org.nd4j.nativeblas.ResultWrapperAbstraction").define()) + .put(new Info("std::vector").pointerTypes("ConstNDArrayVector").define()) + .put(new Info("std::vector").pointerTypes("NDArrayVector").define()) + .put(new Info("sd::graph::ResultWrapper").base("org.nd4j.nativeblas.ResultWrapperAbstraction").define()) .put(new Info("bool").cast().valueTypes("boolean").pointerTypes("BooleanPointer", "boolean[]")) - .put(new Info("nd4j::IndicesList").purify()); + .put(new Info("sd::IndicesList").purify()); /* String classTemplates[] = { - "nd4j::NDArray", - "nd4j::NDArrayList", - "nd4j::ResultSet", - "nd4j::OpArgsHolder", - "nd4j::graph::GraphState", - "nd4j::graph::Variable", - "nd4j::graph::VariablesSet", - "nd4j::graph::Stash", - "nd4j::graph::VariableSpace", - "nd4j::graph::Context", - "nd4j::graph::ContextPrototype", - "nd4j::ops::DeclarableOp", - "nd4j::ops::DeclarableListOp", - "nd4j::ops::DeclarableReductionOp", - "nd4j::ops::DeclarableCustomOp", - "nd4j::ops::BooleanOp", - "nd4j::ops::BroadcastableOp", - "nd4j::ops::LogicOp"}; + "sd::NDArray", + "sd::NDArrayList", + "sd::ResultSet", + "sd::OpArgsHolder", + "sd::graph::GraphState", + "sd::graph::Variable", + "sd::graph::VariablesSet", + "sd::graph::Stash", + "sd::graph::VariableSpace", + "sd::graph::Context", + "sd::graph::ContextPrototype", + "sd::ops::DeclarableOp", + "sd::ops::DeclarableListOp", + "sd::ops::DeclarableReductionOp", + "sd::ops::DeclarableCustomOp", + 
"sd::ops::BooleanOp", + "sd::ops::BroadcastableOp", + "sd::ops::LogicOp"}; for (String t : classTemplates) { String s = t.substring(t.lastIndexOf(':') + 1); infoMap.put(new Info(t + "").pointerTypes("Float" + s)) @@ -265,7 +265,7 @@ public class Nd4jCpuPresets implements InfoMapper, BuildEnabled { /* String floatOps = "", halfOps = "", doubleOps = ""; for (String t : opTemplates) { - String s = "nd4j::ops::" + t; + String s = "sd::ops::" + t; infoMap.put(new Info(s + "").pointerTypes("float_" + t)) .put(new Info(s + "").pointerTypes("half_" + t)) .put(new Info(s + "").pointerTypes("double_" + t)); @@ -279,6 +279,6 @@ public class Nd4jCpuPresets implements InfoMapper, BuildEnabled { + " Class[] halfOps = {" + halfOps + "};" + "\n" + " Class[] doubleOps = {" + doubleOps + "};")); */ - infoMap.put(new Info("nd4j::ops::OpRegistrator::updateMSVC").skip()); + infoMap.put(new Info("sd::ops::OpRegistrator::updateMSVC").skip()); } }