diff --git a/libnd4j/include/execution/cuda/ContextBuffers.cu b/libnd4j/include/execution/cuda/ContextBuffers.cu index c8a149a5f..ed84d511a 100644 --- a/libnd4j/include/execution/cuda/ContextBuffers.cu +++ b/libnd4j/include/execution/cuda/ContextBuffers.cu @@ -30,13 +30,13 @@ namespace nd4j { ContextBuffers::ContextBuffers() { - nd4j_printf("Creating ContextBuffers for device [%i]\n", AffinityManager::currentDeviceId()); + //nd4j_printf("Creating ContextBuffers for device [%i]\n", AffinityManager::currentDeviceId()); _deviceId = AffinityManager::currentDeviceId(); } ContextBuffers::~ContextBuffers() { if (_allocated) { - nd4j_printf("Releasing ContextBuffers\n",""); + //nd4j_printf("Releasing ContextBuffers\n",""); if (_allocationPointer != nullptr) cudaFree(_allocationPointer); @@ -69,7 +69,7 @@ namespace nd4j { } void ContextBuffers::initialize() { - nd4j_printf("Initializing buffers on deviceId [%i]\n", AffinityManager::currentNativeDeviceId()); + //nd4j_printf("Initializing buffers on deviceId [%i]\n", AffinityManager::currentNativeDeviceId()); auto res = cudaMalloc(reinterpret_cast(&_reductionPointer), 1024 * 1024 * 8); if (res != 0) diff --git a/libnd4j/include/helpers/impl/Parameters.cpp b/libnd4j/include/helpers/impl/Parameters.cpp index f5bcb1014..d2678832f 100644 --- a/libnd4j/include/helpers/impl/Parameters.cpp +++ b/libnd4j/include/helpers/impl/Parameters.cpp @@ -19,6 +19,7 @@ // #include "../benchmark/Parameters.h" +#include namespace nd4j { Parameters* Parameters::addIntParam(std::string string, int param) { diff --git a/libnd4j/include/op_boilerplate.h b/libnd4j/include/op_boilerplate.h index d9c8dee62..4f70d9bf2 100644 --- a/libnd4j/include/op_boilerplate.h +++ b/libnd4j/include/op_boilerplate.h @@ -1461,7 +1461,7 @@ #ifdef _RELEASE -#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT))); } +#define ALLOCATE_SPECIAL(VARIABLE, WORKSPACE, LENGTH, TT) if (WORKSPACE == nullptr) {auto erc_##VARIABLE = cudaMalloc(reinterpret_cast(&VARIABLE), LENGTH * sizeof(TT) + 8); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] allocation failed", erc_##VARIABLE);} else { }; } else {VARIABLE = reinterpret_cast(WORKSPACE->allocateBytes(nd4j::memory::MemoryType::DEVICE, LENGTH * sizeof(TT) + 8)); } #define RELEASE_SPECIAL(VARIABLE, WORKSPACE) if (VARIABLE != nullptr) {if (WORKSPACE == nullptr) { auto erc_##VARIABLE = cudaFree(reinterpret_cast(VARIABLE)); if (erc_##VARIABLE != 0) {throw cuda_exception::build("[DEVICE] deallocation failed", erc_##VARIABLE);}; }; }; #else diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu index 1bf40ba7b..354f360c3 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu @@ -177,7 +177,9 @@ namespace helpers { } void invertLowerMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) { + NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix}); BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE); + NDArray::registerSpecialUse({invertedMatrix}, {inputMatrix}); } template @@ -195,7 +197,9 @@ namespace helpers { } void invertUpperMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) { + NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix}); BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertUpperMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE); + NDArray::prepareSpecialUse({invertedMatrix}, {inputMatrix}); } // template @@ -242,11 +246,7 @@ namespace helpers { template static __global__ void determinantKernel(T* compound, T* result, Nd4jLong len) { - __shared__ F tempRes; - if (blockIdx.x == 0) { - tempRes = (F)result[0]; - } - __syncthreads(); + F tempRes = (F)result[0]; auto start = blockIdx.x * blockDim.x + threadIdx.x; auto step = blockDim.x * gridDim.x; @@ -256,18 +256,14 @@ namespace helpers { } __syncthreads(); - if (blockIdx.x == 0) { + if (threadIdx.x == 0) { result[0] = (T)tempRes; } } template static __global__ void determinantLogKernel(T* compound, T* result, Nd4jLong len) { - __shared__ F tempRes; - if (blockIdx.x == 0) { - tempRes = (F)result[0]; - } - __syncthreads(); + F tempRes = (F)result[0]; auto start = blockIdx.x * blockDim.x + threadIdx.x; auto step = blockDim.x * gridDim.x; @@ -277,7 +273,7 @@ namespace helpers { } __syncthreads(); - if (blockIdx.x == 0) { + if (threadIdx.x == 0) { result[0] = (T)math::nd4j_log(math::nd4j_abs(tempRes)); } } @@ -520,7 +516,9 @@ namespace helpers { } int determinant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), return determinant_, (context, input, output), FLOAT_NATIVE); + NDArray::registerSpecialUse({output}, {input}); } template @@ -568,7 +566,9 @@ namespace helpers { } int logAbsDeterminant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), return logAbsDeterminant_, (context, input, output), FLOAT_NATIVE); + NDArray::registerSpecialUse({output}, {input}); } template @@ -631,26 +631,27 @@ namespace helpers { auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {output->rankOf() - 2, output->rankOf() - 1}); auto stream = context->getCudaStream(); -// PRAGMA_OMP_PARALLEL_FOR for (auto i = 0LL; i < packX.numberOfTads(); i++) { - fillMatrix<<<1, n2, 128, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n); + fillMatrix<<<1, n2, 1024, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n); matrix.tickWriteDevice(); compound.assign(matrix); lup_(context, &compound, nullptr, nullptr); - fillLowerUpperKernel<<>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), compound.specialBuffer(), compound.specialShapeInfo(), n); + fillLowerUpperKernel<<>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), compound.specialBuffer(), compound.specialShapeInfo(), n); matrix.assign(0); invertUpperMatrix(&upper, &matrix); // U^{-1} compound.assign(0); invertLowerMatrix(&lower, &compound); // L{-1} nd4j::MmulHelper::mmul(&matrix, &compound, &upper, 1.0, 0.0); - returnMatrix<<<1, n2, 128, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n); + returnMatrix<<<1, n2, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n); } return Status::OK(); } int inverse(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { + NDArray::prepareSpecialUse({output}, {input}); BUILD_SINGLE_SELECTOR(input->dataType(), return inverse_, (context, input, output), FLOAT_NATIVE); + NDArray::registerSpecialUse({output}, {input}); } bool checkCholeskyInput(nd4j::LaunchContext * context, NDArray const* input) { @@ -795,10 +796,12 @@ namespace helpers { double* output = outputBuf; double* input = inputBuf; + Nd4jLong* shapeOf = shape::shapeOf(tadShape); + Nd4jLong* strideOf = shape::stride(tadShape); + for (auto i = blockIdx.x; i < batchNum; i += gridDim.x) { double* current = input + tadOffsets[i]; - Nd4jLong* shapeOf = shape::shapeOf(tadShape); - Nd4jLong* strideOf = shape::stride(tadShape); + auto zIndex = shape::getIndexOffset(i, outputShape, batchNum); for (auto e = threadIdx.x; e < n; e += blockDim.x) { Nd4jLong diag[] = {e, e}; diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h index b690f4f6e..6a543b35d 100644 --- a/libnd4j/include/templatemath.h +++ b/libnd4j/include/templatemath.h @@ -1057,6 +1057,9 @@ inline __device__ uint64_t nd4j_atomicAdd(uint64_t* address, uint64_t template <> inline __device__ float16 nd4j_atomicAdd(float16* address, float16 val) { +#if __CUDA_ARCH__ >= 700 + atomicAdd(reinterpret_cast<__half*>(address), val.data); +#else int* address_as_ull = (int*) address; long addr = (long) address; @@ -1086,6 +1089,7 @@ inline __device__ float16 nd4j_atomicAdd(float16* address, float16 val) if (!misaligned) return old.B.H; else return old.B.L; +#endif } template <> diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp index 48996f2a5..bc716cc8e 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp @@ -1846,8 +1846,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_3) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - //z->printIndexedBuffer("Output "); - //exp.printIndexedBuffer("Expected "); + exp.printIndexedBuffer("Expected "); + z->printIndexedBuffer("Output "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z));