diff --git a/libnd4j/blas/CMakeLists.txt b/libnd4j/blas/CMakeLists.txt index 3dbe89226..d07ec57bc 100755 --- a/libnd4j/blas/CMakeLists.txt +++ b/libnd4j/blas/CMakeLists.txt @@ -101,16 +101,17 @@ ELSE() endif() ENDIF() -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang" AND X86_BUILD) + # apple clang but not ios-arm + SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}") +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # using Clang SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}") - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") # using Intel C++ SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE} -O3 -fp-model fast") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") # using Visual Studio C++ - set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_TUNE}") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # using GCC diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp index 97cd2f84e..911230367 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp @@ -29,26 +29,16 @@ namespace nd4j { auto result = reinterpret_cast(zBuffer); int length = shape::length(xShapeInfo); - // FIXME: 2??? - int _threads = 2; - - int span = (length / _threads) + 8; X binSize = (max_val - min_val) / (numBins); - PRAGMA_OMP_PARALLEL_THREADS(_threads) + // FIXME: this op should be parallelized { - int tid, start, end; - int *bins = new int[numBins]; std::memset(bins, 0, sizeof(int) * numBins); - tid = omp_get_thread_num(); - start = span * tid; - end = span * (tid + 1); - if (end > length) end = length; PRAGMA_OMP_SIMD - for (int x = start; x < end; x++) { + for (int x = 0; x < length; x++) { int idx = (int) ((dx[x] - min_val) / binSize); if (idx < 0) idx = 0; @@ -58,15 +48,12 @@ namespace nd4j { bins[idx]++; } - PRAGMA_OMP_CRITICAL - { - PRAGMA_OMP_SIMD - for (int x = 0; x < numBins; x++) { - result[x] += bins[x]; - } - + PRAGMA_OMP_SIMD + for (int x = 0; x < numBins; x++) { + result[x] += bins[x]; } + delete[] bins; } }