diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index 626b0ea26..7e74c3237 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -2728,11 +2728,13 @@ int execCustomOp2(Nd4jPointer* extraPointers, Nd4jLong hash, Nd4jPointer opConte throw nd4j::cuda_exception::build("customOp execution failed", res); for (auto v:context->fastpath_in()) { - v->syncToDevice(); + if (!v->isEmpty()) + v->syncToDevice(); } for (auto v:context->fastpath_out()) { - v->syncToDevice(); + if (!v->isEmpty()) + v->syncToDevice(); } return result; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu index fbb45a375..19869f646 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu @@ -74,14 +74,14 @@ namespace helpers { template static __global__ void unsortedSegmentMeanLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; + __shared__ Nd4jLong xLen, zLen, zIndex; __shared__ T* x; __shared__ T* z; __shared__ I* y; //int threadsPerSegment, start, finish; - + auto segment = blockIdx.x;// / if (threadIdx.x == 0) { // threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x;// / threadsPerSegment; +// threadsPerSegment; x = reinterpret_cast(input); z = reinterpret_cast(output); y = reinterpret_cast(indices); @@ -117,12 +117,12 @@ namespace helpers { template static __global__ void segmentMeanTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; + __shared__ Nd4jLong len, zIndex, total; __shared__ T* z; __shared__ int threadsPerSegment, start, finish; + auto segment = indices[blockIdx.x]; // / threadsPerSegment; if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; len = shape::length(inputTads); start = starts[segment]; @@ -139,7 +139,7 @@ namespace helpers { for (auto e = threadIdx.x; e < len; e += blockDim.x) { auto xIndex = shape::getIndexOffset(e, inputTads, len); auto zIndex = shape::getIndexOffset(e, outputTads, len); - z[zIndex] = T(x[xIndex]/lengths[segment]); + nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); } } else { @@ -197,7 +197,6 @@ namespace helpers { static void unsortedSegmentMeanFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { auto stream = context->getCudaStream(); // NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); - NDArray::prepareSpecialUse({output}, {input, indices}); NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); @@ -226,7 +225,6 @@ namespace helpers { dims.x = input->sizeAt(0); segmentMeanTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); } - NDArray::registerSpecialUse({output}, {input, indices}); } // -------------------------------------------------------------------------------------------------------------- // @@ -234,7 +232,7 @@ namespace helpers { NDArray::prepareSpecialUse({output}, {input, indices}); BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMeanFunctor_, (context, input, indices, numOfClasses, output), NUMERIC_TYPES, INDEXING_TYPES); - NDArray::prepareSpecialUse({output}, {input, indices}); + NDArray::registerSpecialUse({output}, {input, indices}); } // -------------------------------------------------------------------------------------------------------------- // diff --git a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp index 4fe28df8c..b313acd9c 100644 --- a/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp +++ b/libnd4j/include/ops/declarable/impl/DeclarableOp.cpp @@ -372,6 +372,9 @@ namespace nd4j { if (_descriptor->isSameMode()) { if (index >= block.width()) { + if (block.fastpath_in().size() == 0) + continue; + auto ia = block.fastpath_in()[0]; if (ia->dataType() != cType) { @@ -423,6 +426,9 @@ namespace nd4j { if (_descriptor->isSameMode()) { if (index >= block.width()) { + if (block.width() == 0) + continue; + auto iv = block.variable(0); if (iv->getNDArray()->dataType() != cType) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index cff84c69b..a2772a734 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -88,3 +88,33 @@ TEST_F(DeclarableOpsTests16, test_size_dtype_1) { ASSERT_EQ(e, z); } + +TEST_F(DeclarableOpsTests16, test_empty_noop_1) { + auto z = NDArrayFactory::empty(); + + nd4j::ops::noop op; + auto status = op.execute({}, {&z}, {}, {}, {}); + ASSERT_EQ(Status::OK(), status); +} + +TEST_F(DeclarableOpsTests16, test_empty_noop_2) { + auto z = NDArrayFactory::empty(); + + Context ctx(1); + ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo()); + + nd4j::ops::noop op; + auto status = op.execute(&ctx); + + ASSERT_EQ(Status::OK(), status); +} + +TEST_F(DeclarableOpsTests16, test_svd_1) { + auto x = NDArrayFactory::create('c', {3, 3}, {0.7787856f, 0.80119777f, 0.72437465f, 0.23089433f, 0.72714126f, 0.18039072f,0.50563407f, 0.89252293f, 0.5461209f}); + auto z = NDArrayFactory::create('c', {3}); + + nd4j::ops::svd op; + auto status = op.execute({&x}, {&z}, {}, {0, 0, 16}, {}); + + ASSERT_EQ(Status::OK(), status); +} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index b0488c23a..c80d75372 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -1459,6 +1459,25 @@ TEST_F(DeclarableOpsTests7, TestSegmentMean_021) { delete result; } +TEST_F(DeclarableOpsTests7, TestSegmentMean_022) { + auto x = NDArrayFactory::create('c', {6, 3});//, {1, 2, 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18.}); + auto idx = NDArrayFactory::create({0, 0, 1, 1, 2,2}); + auto z = NDArrayFactory::create('c', {3, 3}); //, { 2.5, 3.5, 4.5, 8.5, 9.5, 10.5, 14.5, 15.5, 16.5}); + auto exp = NDArrayFactory::create('c', {3, 3}, { 2.5, 3.5, 4.5, 8.5, 9.5, 10.5, 14.5, 15.5, 16.5}); + + nd4j::ops::segment_mean op; + x.linspace(1.); + auto result = op.execute({&x, &idx}, {&z}, {}, {}, {}, false, nd4j::DataType::FLOAT32); + ASSERT_EQ(result, Status::OK()); + + exp.printIndexedBuffer("Expect Mean"); + z.printIndexedBuffer("Output Mean"); +// exp.printShapeInfo("Exp Shape"); + ASSERT_TRUE(exp.equalsTo(z)); + +// delete result; +} + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests7, TestSegmentMeanBP_2) { auto x = NDArrayFactory::create('c', {4, 4}, {1.8, 2.5, 4., 9.,2.1, 2.4, 3., 9.,2.1, 2.1, 0.7, 0.1,3., 4.2, 2.2, 1.}); diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java index 0ec1876ca..8ec8734f7 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/jita/allocator/impl/AtomicAllocator.java @@ -40,6 +40,7 @@ import org.nd4j.jita.workspace.CudaWorkspace; import org.nd4j.linalg.api.buffer.BaseDataBuffer; import org.nd4j.linalg.api.buffer.DataBuffer; import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.buffer.Utf8Buffer; import org.nd4j.linalg.api.memory.enums.MemoryKind; import org.nd4j.linalg.api.memory.pointers.PagedPointer; import org.nd4j.linalg.api.ndarray.INDArray; @@ -284,10 +285,16 @@ public class AtomicAllocator implements Allocator { */ @Override public Pointer getPointer(@NonNull DataBuffer buffer, CudaContext context) { + if (buffer instanceof Utf8Buffer) + return null; + return memoryHandler.getDevicePointer(buffer, context); } public Pointer getPointer(DataBuffer buffer) { + if (buffer instanceof Utf8Buffer) + return null; + return memoryHandler.getDevicePointer(buffer, getDeviceContext()); } diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java index afeff4d8b..c5b02a82f 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/linalg/jcublas/ops/executioner/CudaExecutioner.java @@ -961,12 +961,12 @@ public class CudaExecutioner extends DefaultOpExecutioner { if (CudaEnvironment.getInstance().getConfiguration().isDebug()) lastOp.set(op.opName()); - val tadBuffers = tadManager.getTADOnlyShapeInfo(op.x(), dimension); + val tadBuffers = op.x().isEmpty() ? Pair.makePair(op.x().data(), null) : tadManager.getTADOnlyShapeInfo(op.x(), dimension); val hostTadShapeInfo = AddressRetriever.retrieveHostPointer(tadBuffers.getFirst()); val devTadShapeInfo = AtomicAllocator.getInstance().getPointer(tadBuffers.getFirst(), context); - val offsets = tadBuffers.getSecond(); + val offsets = op.x().isEmpty() ? null : tadBuffers.getSecond(); val devTadOffsets = offsets == null ? null : AtomicAllocator.getInstance().getPointer(offsets, context); Pointer x = AtomicAllocator.getInstance().getPointer(op.x(), context); diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/EmptyTests.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/EmptyTests.java index e7e8f8288..3bef69c19 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/EmptyTests.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/EmptyTests.java @@ -309,6 +309,17 @@ public class EmptyTests extends BaseNd4jTest { assertEquals(x, out); } + @Test + public void testEmptyNoop() { + val output = Nd4j.empty(DataType.LONG); + + val op = DynamicCustomOp.builder("noop") + .addOutputs(output) + .build(); + + Nd4j.exec(op); + } + @Override public char ordering() { return 'c';