From 8fd0e3e1829a53bf0b612bdaadb6d4cb5a0cce78 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 7 Jul 2015 11:10:18 -0600 Subject: [PATCH 01/26] Initial hacky OpenMP backend Just threw a #pragma omp for around the for loops in the serial backend to get an intitial super rough backend together so I could get it integrated into vtk-m (eg. run all tests, setup CMake stuff, etc). From here I'll work on making the backend actually good (hopefully) --- CMake/UseVTKmOpenMP.cmake | 43 ++++++ CMakeLists.txt | 4 + vtkm/cont/CMakeLists.txt | 3 + vtkm/cont/DeviceAdapterAlgorithm.h | 4 +- vtkm/cont/internal/ArrayManagerExecution.h | 4 +- vtkm/cont/internal/DeviceAdapterTag.h | 5 + vtkm/cont/openmp/CMakeLists.txt | 30 ++++ vtkm/cont/openmp/DeviceAdapterOpenMP.h | 28 ++++ .../internal/ArrayManagerExecutionOpenMP.h | 81 +++++++++++ vtkm/cont/openmp/internal/CMakeLists.txt | 28 ++++ .../internal/DeviceAdapterAlgorithmOpenMP.h | 131 ++++++++++++++++++ .../openmp/internal/DeviceAdapterTagOpenMP.h | 28 ++++ vtkm/cont/openmp/testing/CMakeLists.txt | 27 ++++ .../testing/UnitTestDeviceAdapterOpenMP.cxx | 32 +++++ .../testing/UnitTestOpenMPArrayHandle.cxx | 31 +++++ .../UnitTestOpenMPArrayHandleFancy.cxx | 31 +++++ vtkm/worklet/testing/CMakeLists.txt | 4 + 17 files changed, 510 insertions(+), 4 deletions(-) create mode 100644 CMake/UseVTKmOpenMP.cmake create mode 100644 vtkm/cont/openmp/CMakeLists.txt create mode 100644 vtkm/cont/openmp/DeviceAdapterOpenMP.h create mode 100644 vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h create mode 100644 vtkm/cont/openmp/internal/CMakeLists.txt create mode 100644 vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h create mode 100644 vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h create mode 100644 vtkm/cont/openmp/testing/CMakeLists.txt create mode 100644 vtkm/cont/openmp/testing/UnitTestDeviceAdapterOpenMP.cxx create mode 100644 vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandle.cxx create mode 100644 vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandleFancy.cxx diff --git a/CMake/UseVTKmOpenMP.cmake b/CMake/UseVTKmOpenMP.cmake new file mode 100644 index 000000000..94b18bbdd --- /dev/null +++ b/CMake/UseVTKmOpenMP.cmake @@ -0,0 +1,43 @@ +##============================================================================ +## Copyright (c) Kitware, Inc. +## All rights reserved. +## See LICENSE.txt for details. +## This software is distributed WITHOUT ANY WARRANTY; without even +## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +## PURPOSE. See the above copyright notice for more information. +## +## Copyright 2014 Sandia Corporation. +## Copyright 2014 UT-Battelle, LLC. +## Copyright 2014 Los Alamos National Security. +## +## Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +## the U.S. Government retains certain rights in this software. +## +## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +## Laboratory (LANL), the U.S. Government retains certain rights in +## this software. +##============================================================================ + +if (VTKm_OpenMP_initialize_complete) + return() +endif (VTKm_OpenMP_initialize_complete) + +#----------------------------------------------------------------------------- +# Find OpenMP. 
+#----------------------------------------------------------------------------- +if (NOT VTKm_OpenMP_FOUND) + find_package(OpenMP REQUIRED) + set (VTKm_OpenMP_FOUND ${OPENMP_FOUND}) +endif() + +#----------------------------------------------------------------------------- +# Set up all these dependent packages (if they were all found). +#----------------------------------------------------------------------------- +if (VTKm_OpenMP_FOUND) + include_directories( + ${VTKm_INCLUDE_DIRS} + ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(VTKm_OpenMP_initialize_complete TRUE) +endif() + diff --git a/CMakeLists.txt b/CMakeLists.txt index 15a2d42d0..0ebd5c604 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,7 @@ include(CMake/VTKmCompilerExtras.cmake) # Configurable Options option(VTKm_ENABLE_CUDA "Enable Cuda support" OFF) option(VTKm_ENABLE_TBB "Enable TBB support" OFF) +option(VTKm_ENABLE_OPENMP "Enable OpenMP support" OFF) option(VTKm_ENABLE_TESTING "Enable VTKm Testing" ON) option(VTKm_ENABLE_BENCHMARKS "Enable VTKm Benchmarking" OFF) @@ -91,6 +92,9 @@ endif (VTKm_ENABLE_CUDA) if (VTKm_ENABLE_TBB) vtkm_configure_device(TBB) endif (VTKm_ENABLE_TBB) +if (VTKm_ENABLE_OPENMP) + vtkm_configure_device(OpenMP) +endif() #----------------------------------------------------------------------------- diff --git a/vtkm/cont/CMakeLists.txt b/vtkm/cont/CMakeLists.txt index c87c6b789..f35c7a015 100644 --- a/vtkm/cont/CMakeLists.txt +++ b/vtkm/cont/CMakeLists.txt @@ -77,6 +77,9 @@ endif () if (VTKm_ENABLE_TBB) add_subdirectory(tbb) endif () +if (VTKm_ENABLE_OPENMP) + add_subdirectory(openmp) +endif() #----------------------------------------------------------------------------- add_subdirectory(testing) diff --git a/vtkm/cont/DeviceAdapterAlgorithm.h b/vtkm/cont/DeviceAdapterAlgorithm.h index 5cceb25fe..18e5ce6ca 100644 --- a/vtkm/cont/DeviceAdapterAlgorithm.h +++ b/vtkm/cont/DeviceAdapterAlgorithm.h @@ -509,8 +509,8 @@ public: #include #elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_CUDA #include -// #elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP -// #include +#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP +#include #elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB #include #endif diff --git a/vtkm/cont/internal/ArrayManagerExecution.h b/vtkm/cont/internal/ArrayManagerExecution.h index a50af43be..0a0dcd577 100644 --- a/vtkm/cont/internal/ArrayManagerExecution.h +++ b/vtkm/cont/internal/ArrayManagerExecution.h @@ -160,8 +160,8 @@ public: #include #elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_CUDA #include -// #elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP -// #include +#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP +#include #elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_TBB #include #endif diff --git a/vtkm/cont/internal/DeviceAdapterTag.h b/vtkm/cont/internal/DeviceAdapterTag.h index 966a33b70..a12eefd91 100644 --- a/vtkm/cont/internal/DeviceAdapterTag.h +++ b/vtkm/cont/internal/DeviceAdapterTag.h @@ -106,6 +106,11 @@ struct DeviceAdapterTagCheck #include #define VTKM_DEFAULT_DEVICE_ADAPTER_TAG ::vtkm::cont::DeviceAdapterTagTBB +#elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_OPENMP + +#include +#define VTKM_DEFAULT_DEVICE_ADAPTER_TAG ::vtkm::cont::DeviceAdapterTagOpenMP + #elif VTKM_DEVICE_ADAPTER == VTKM_DEVICE_ADAPTER_ERROR #include diff --git a/vtkm/cont/openmp/CMakeLists.txt b/vtkm/cont/openmp/CMakeLists.txt new file mode 100644 index 000000000..3bb73ffbe --- /dev/null +++ b/vtkm/cont/openmp/CMakeLists.txt @@ -0,0 +1,30 
@@ +##============================================================================ +## Copyright (c) Kitware, Inc. +## All rights reserved. +## See LICENSE.txt for details. +## This software is distributed WITHOUT ANY WARRANTY; without even +## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +## PURPOSE. See the above copyright notice for more information. +## +## Copyright 2014 Sandia Corporation. +## Copyright 2014 UT-Battelle, LLC. +## Copyright 2014 Los Alamos National Security. +## +## Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +## the U.S. Government retains certain rights in this software. +## +## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +## Laboratory (LANL), the U.S. Government retains certain rights in +## this software. +##============================================================================ + +set(headers + DeviceAdapterOpenMP.h + ) + +add_subdirectory(internal) + +vtkm_declare_headers(${headers}) + +add_subdirectory(testing) + diff --git a/vtkm/cont/openmp/DeviceAdapterOpenMP.h b/vtkm/cont/openmp/DeviceAdapterOpenMP.h new file mode 100644 index 000000000..8b1bc3201 --- /dev/null +++ b/vtkm/cont/openmp/DeviceAdapterOpenMP.h @@ -0,0 +1,28 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ +#ifndef vtk_m_cont_openmp_DeviceAdapterOpenMP_h +#define vtk_m_cont_openmp_DeviceAdapterOpenMP_h + +#include +#include +#include + +#endif //vtk_m_cont_openmp_DeviceAdapterOpenMP_h + diff --git a/vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h b/vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h new file mode 100644 index 000000000..3f2eadbde --- /dev/null +++ b/vtkm/cont/openmp/internal/ArrayManagerExecutionOpenMP.h @@ -0,0 +1,81 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. 
+//============================================================================ +#ifndef vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_h +#define vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_h + +#include + +#include +#include + +// These must be placed in the vtkm::cont::internal namespace so that +// the template can be found. + +// TODO Will: This is just ArrayManagerExecutionTBB, I think they should +// behave the same since they're both multithreaded CPU devices +// But I could be totally wrong here. + +namespace vtkm { +namespace cont { +namespace internal { + +template +class ArrayManagerExecution + + : public vtkm::cont::internal::ArrayManagerExecutionShareWithControl + +{ +public: + typedef vtkm::cont::internal::ArrayManagerExecutionShareWithControl + Superclass; + typedef typename Superclass::ValueType ValueType; + typedef typename Superclass::PortalType PortalType; + typedef typename Superclass::PortalConstType PortalConstType; + typedef typename Superclass::StorageType StorageType; + + VTKM_CONT_EXPORT + ArrayManagerExecution(StorageType *storage) + : Superclass(storage) { } + + VTKM_CONT_EXPORT + PortalConstType PrepareForInput(bool updateData) + { + return this->Superclass::PrepareForInput(updateData); + } + + VTKM_CONT_EXPORT + PortalType PrepareForInPlace(bool updateData) + { + return this->Superclass::PrepareForInPlace(updateData); + } + + VTKM_CONT_EXPORT + PortalType PrepareForOutput(vtkm::Id numberOfValues) + { + return this->Superclass::PrepareForOutput(numberOfValues); + } +}; + +} +} +} // namespace vtkm::cont::internal + + +#endif //vtk_m_cont_openmp_internal_ArrayManagerExecutionOpenMP_h diff --git a/vtkm/cont/openmp/internal/CMakeLists.txt b/vtkm/cont/openmp/internal/CMakeLists.txt new file mode 100644 index 000000000..7f184e087 --- /dev/null +++ b/vtkm/cont/openmp/internal/CMakeLists.txt @@ -0,0 +1,28 @@ +##============================================================================ +## Copyright (c) Kitware, Inc. +## All rights reserved. +## See LICENSE.txt for details. +## This software is distributed WITHOUT ANY WARRANTY; without even +## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +## PURPOSE. See the above copyright notice for more information. +## +## Copyright 2014 Sandia Corporation. +## Copyright 2014 UT-Battelle, LLC. +## Copyright 2014 Los Alamos National Security. +## +## Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +## the U.S. Government retains certain rights in this software. +## +## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +## Laboratory (LANL), the U.S. Government retains certain rights in +## this software. +##============================================================================ + +set(headers + ArrayManagerExecutionOpenMP.h + DeviceAdapterAlgorithmOpenMP.h + DeviceAdapterTagOpenMP.h + ) + +vtkm_declare_headers(${headers}) + diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h new file mode 100644 index 000000000..011f8d5b5 --- /dev/null +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -0,0 +1,131 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. 
See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ +#ifndef vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h +#define vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace vtkm { +namespace cont { + +template<> +struct DeviceAdapterAlgorithm : + vtkm::cont::internal::DeviceAdapterAlgorithmGeneral< + DeviceAdapterAlgorithm, + vtkm::cont::DeviceAdapterTagOpenMP> +{ +private: + // This is basically the Serial ScheduleKernel + template + class ScheduleKernel + { + public: + ScheduleKernel(const FunctorType &functor) + : Functor(functor) {} + + VTKM_EXEC_EXPORT void operator()(vtkm::Id index) const + { + this->Functor(index); + } + + private: + const FunctorType Functor; + }; + +public: + template + VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id numInstances) + { + // Literally just Serial's Schedule with a parallel for slapped on + const vtkm::Id MESSAGE_SIZE = 1024; + char errorString[MESSAGE_SIZE]; + errorString[0] = '\0'; + vtkm::exec::internal::ErrorMessageBuffer + errorMessage(errorString, MESSAGE_SIZE); + + functor.SetErrorMessageBuffer(errorMessage); + + ScheduleKernel kernel(functor); + + const vtkm::Id size = numInstances; +#pragma omp for schedule(static) + for(vtkm::Id i=0; i < size; ++i) + { + kernel(i); + } + + if (errorMessage.IsErrorRaised()) + { + throw vtkm::cont::ErrorExecution(errorString); + } + } + + template + VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id3 rangeMax) + { + // Literally just Serial's Schedule with a parallel for slapped on + // and the nested loops flattened + const vtkm::Id MESSAGE_SIZE = 1024; + char errorString[MESSAGE_SIZE]; + errorString[0] = '\0'; + vtkm::exec::internal::ErrorMessageBuffer + errorMessage(errorString, MESSAGE_SIZE); + + functor.SetErrorMessageBuffer(errorMessage); + + ScheduleKernel kernel(functor); + + // TODO Will: This is ok right? I've just sort of glanced at this + const vtkm::Id size = rangeMax[0] * rangeMax[1] * rangeMax[2]; +#pragma omp for schedule(static) + for(vtkm::Id i=0; i < size; ++i) + { + kernel(i); + } + + if (errorMessage.IsErrorRaised()) + { + throw vtkm::cont::ErrorExecution(errorString); + } + } + + VTKM_CONT_EXPORT static void Synchronize() + { + // We do split/join here so there's nothing to synchronize. The OpenMP + // backend should behave identically to Intel TBB where we've re-joined + // all threads before returning from an execution + } +}; +} +}// namespace vtkm::cont + +#endif //vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h diff --git a/vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h new file mode 100644 index 000000000..7afbb2344 --- /dev/null +++ b/vtkm/cont/openmp/internal/DeviceAdapterTagOpenMP.h @@ -0,0 +1,28 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. 
+// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ +#ifndef vtk_m_cont_openmp_internal_DeviceAdapterTagOpenMP_h +#define vtk_m_cont_openmp_internal_DeviceAdapterTagOpenMP_h + +#include + + +VTKM_CREATE_DEVICE_ADAPTER(OpenMP); + +#endif //vtk_m_cont_openmp_internal_DeviceAdapterTagOpenMP_h diff --git a/vtkm/cont/openmp/testing/CMakeLists.txt b/vtkm/cont/openmp/testing/CMakeLists.txt new file mode 100644 index 000000000..6c4beaf17 --- /dev/null +++ b/vtkm/cont/openmp/testing/CMakeLists.txt @@ -0,0 +1,27 @@ +##============================================================================ +## Copyright (c) Kitware, Inc. +## All rights reserved. +## See LICENSE.txt for details. +## This software is distributed WITHOUT ANY WARRANTY; without even +## the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +## PURPOSE. See the above copyright notice for more information. +## +## Copyright 2014 Sandia Corporation. +## Copyright 2014 UT-Battelle, LLC. +## Copyright 2014 Los Alamos National Security. +## +## Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +## the U.S. Government retains certain rights in this software. +## +## Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +## Laboratory (LANL), the U.S. Government retains certain rights in +## this software. +##============================================================================ + +set(unit_tests + UnitTestDeviceAdapterOpenMP.cxx + UnitTestOpenMPArrayHandle.cxx + UnitTestOpenMPArrayHandleFancy.cxx + ) +vtkm_unit_tests(OpenMP SOURCES ${unit_tests}) + diff --git a/vtkm/cont/openmp/testing/UnitTestDeviceAdapterOpenMP.cxx b/vtkm/cont/openmp/testing/UnitTestDeviceAdapterOpenMP.cxx new file mode 100644 index 000000000..5ac7902ce --- /dev/null +++ b/vtkm/cont/openmp/testing/UnitTestDeviceAdapterOpenMP.cxx @@ -0,0 +1,32 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. 
+//============================================================================ + +#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR + +#include + +#include + +int UnitTestDeviceAdapterOpenMP(int, char *[]) +{ + return vtkm::cont::testing::TestingDeviceAdapter + ::Run(); +} + diff --git a/vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandle.cxx b/vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandle.cxx new file mode 100644 index 000000000..9c36d36ad --- /dev/null +++ b/vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandle.cxx @@ -0,0 +1,31 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ + +#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR + +#include +#include + +int UnitTestOpenMPArrayHandle(int, char *[]) +{ + return vtkm::cont::testing::TestingArrayHandles + ::Run(); +} + diff --git a/vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandleFancy.cxx b/vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandleFancy.cxx new file mode 100644 index 000000000..24ac34e1c --- /dev/null +++ b/vtkm/cont/openmp/testing/UnitTestOpenMPArrayHandleFancy.cxx @@ -0,0 +1,31 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. 
+//============================================================================ + +#define VTKM_DEVICE_ADAPTER VTKM_DEVICE_ADAPTER_ERROR + +#include +#include + +int UnitTestOpenMPArrayHandleFancy(int, char *[]) +{ + return vtkm::cont::testing::TestingFancyArrayHandles + ::Run(); +} + diff --git a/vtkm/worklet/testing/CMakeLists.txt b/vtkm/worklet/testing/CMakeLists.txt index f2773d148..1034e7535 100644 --- a/vtkm/worklet/testing/CMakeLists.txt +++ b/vtkm/worklet/testing/CMakeLists.txt @@ -38,3 +38,7 @@ endif() if (VTKm_ENABLE_TBB) vtkm_worklet_unit_tests( VTKM_DEVICE_ADAPTER_TBB ) endif() +if (VTKm_ENABLE_OPENMP) + vtkm_worklet_unit_tests( VTKM_DEVICE_ADAPTER_OpenMP ) +endif() + -- GitLab From bfef6221dc33d5316d2c7446cae3686715da73af Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 7 Jul 2015 13:44:13 -0600 Subject: [PATCH 02/26] Now actually run in parallel with OpenMP It helps a lot to read the manual first! --- vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 011f8d5b5..6c2b02edb 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -20,7 +20,6 @@ #ifndef vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h #define vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h - #include #include #include @@ -77,7 +76,7 @@ public: ScheduleKernel kernel(functor); const vtkm::Id size = numInstances; -#pragma omp for schedule(static) +#pragma omp parallel for schedule(static, 1024) for(vtkm::Id i=0; i < size; ++i) { kernel(i); @@ -106,7 +105,7 @@ public: // TODO Will: This is ok right? 
I've just sort of glanced at this const vtkm::Id size = rangeMax[0] * rangeMax[1] * rangeMax[2]; -#pragma omp for schedule(static) +#pragma omp parallel for schedule(static, 1024) for(vtkm::Id i=0; i < size; ++i) { kernel(i); -- GitLab From 250dcdcba6c35cfb2d7efdf41ffbfb8107908b0a Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 7 Jul 2015 14:32:24 -0600 Subject: [PATCH 03/26] Some minor tweaking to improve performance a bit --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 6c2b02edb..3db96bbcb 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -62,60 +62,60 @@ private: public: template - VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id numInstances) + VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id numInstances) + { + // Literally just Serial's Schedule with a parallel for slapped on + const vtkm::Id MESSAGE_SIZE = 1024; + char errorString[MESSAGE_SIZE]; + errorString[0] = '\0'; + vtkm::exec::internal::ErrorMessageBuffer + errorMessage(errorString, MESSAGE_SIZE); + + functor.SetErrorMessageBuffer(errorMessage); + + ScheduleKernel kernel(functor); + + const vtkm::Id size = numInstances; +#pragma omp parallel for schedule(guided, 1512) + for(vtkm::Id i=0; i < size; ++i) { - // Literally just Serial's Schedule with a parallel for slapped on - const vtkm::Id MESSAGE_SIZE = 1024; - char errorString[MESSAGE_SIZE]; - errorString[0] = '\0'; - vtkm::exec::internal::ErrorMessageBuffer - errorMessage(errorString, MESSAGE_SIZE); - - functor.SetErrorMessageBuffer(errorMessage); - - ScheduleKernel kernel(functor); - - const vtkm::Id size = numInstances; -#pragma omp parallel for schedule(static, 1024) - for(vtkm::Id i=0; i < size; ++i) - { - kernel(i); - } - - if (errorMessage.IsErrorRaised()) - { - throw vtkm::cont::ErrorExecution(errorString); - } + kernel(i); } + if (errorMessage.IsErrorRaised()) + { + throw vtkm::cont::ErrorExecution(errorString); + } + } + template - VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id3 rangeMax) + VTKM_CONT_EXPORT static void Schedule(Functor functor, vtkm::Id3 rangeMax) + { + // Literally just Serial's Schedule with a parallel for slapped on + // and the nested loops flattened + const vtkm::Id MESSAGE_SIZE = 1024; + char errorString[MESSAGE_SIZE]; + errorString[0] = '\0'; + vtkm::exec::internal::ErrorMessageBuffer + errorMessage(errorString, MESSAGE_SIZE); + + functor.SetErrorMessageBuffer(errorMessage); + + ScheduleKernel kernel(functor); + + // TODO Will: This is ok right? I've just sort of glanced at this + const vtkm::Id size = rangeMax[0] * rangeMax[1] * rangeMax[2]; +#pragma omp parallel for schedule(guided, 1512) + for(vtkm::Id i=0; i < size; ++i) { - // Literally just Serial's Schedule with a parallel for slapped on - // and the nested loops flattened - const vtkm::Id MESSAGE_SIZE = 1024; - char errorString[MESSAGE_SIZE]; - errorString[0] = '\0'; - vtkm::exec::internal::ErrorMessageBuffer - errorMessage(errorString, MESSAGE_SIZE); - - functor.SetErrorMessageBuffer(errorMessage); - - ScheduleKernel kernel(functor); - - // TODO Will: This is ok right? 
I've just sort of glanced at this - const vtkm::Id size = rangeMax[0] * rangeMax[1] * rangeMax[2]; -#pragma omp parallel for schedule(static, 1024) - for(vtkm::Id i=0; i < size; ++i) - { - kernel(i); - } - - if (errorMessage.IsErrorRaised()) + kernel(i); + } + + if (errorMessage.IsErrorRaised()) { throw vtkm::cont::ErrorExecution(errorString); } - } + } VTKM_CONT_EXPORT static void Synchronize() { -- GitLab From 0b4b0b1f82d97026cdbdb115b88a524366165998 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 9 Jul 2015 17:01:11 -0600 Subject: [PATCH 04/26] Small Sort perf. gain by specializing and keeping threads around Instead of letting the General adapter call Schedule a bunch and produce a lot of fork/join barrier events the specialized version spawns the threads once and then distributes the work in the loops to them. This doesn't give a huge boost though since we basically trade fork barriers for regular barriers at the end of the for loops since we still need to synchronize. There's probably more we can do here to improve things (hopefully). Maybe better scheduling to reduce how long we wait at the barriers for? Or a different algorithm or way of mapping the algorithm onto OpenMP? --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 148 +++++++++++++++++- 1 file changed, 146 insertions(+), 2 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 3db96bbcb..c7ff176f8 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -43,6 +43,9 @@ struct DeviceAdapterAlgorithm : vtkm::cont::DeviceAdapterTagOpenMP> { private: + const static vtkm::Id GRAIN_SIZE = 4096; + typedef vtkm::cont::DeviceAdapterTagOpenMP DeviceAdapterTag; + // This is basically the Serial ScheduleKernel template class ScheduleKernel @@ -76,7 +79,7 @@ public: ScheduleKernel kernel(functor); const vtkm::Id size = numInstances; -#pragma omp parallel for schedule(guided, 1512) +#pragma omp parallel for schedule(guided, GRAIN_SIZE) for(vtkm::Id i=0; i < size; ++i) { kernel(i); @@ -105,7 +108,7 @@ public: // TODO Will: This is ok right? 
I've just sort of glanced at this const vtkm::Id size = rangeMax[0] * rangeMax[1] * rangeMax[2]; -#pragma omp parallel for schedule(guided, 1512) +#pragma omp parallel for schedule(guided, GRAIN_SIZE) for(vtkm::Id i=0; i < size; ++i) { kernel(i); @@ -117,6 +120,147 @@ public: } } + //-------------------------------------------------------------------------- + // Sort +private: + template + struct BitonicSortMergeKernel : vtkm::exec::FunctorBase + { + PortalType Portal; + BinaryCompare Compare; + vtkm::Id GroupSize; + + VTKM_CONT_EXPORT + BitonicSortMergeKernel(const PortalType &portal, + const BinaryCompare &compare, + vtkm::Id groupSize) + : Portal(portal), Compare(compare), GroupSize(groupSize) { } + + VTKM_EXEC_EXPORT + void operator()(vtkm::Id index) const + { + typedef typename PortalType::ValueType ValueType; + + vtkm::Id groupIndex = index%this->GroupSize; + vtkm::Id blockSize = 2*this->GroupSize; + vtkm::Id blockIndex = index/this->GroupSize; + + vtkm::Id lowIndex = blockIndex * blockSize + groupIndex; + vtkm::Id highIndex = lowIndex + this->GroupSize; + + if (highIndex < this->Portal.GetNumberOfValues()) + { + ValueType lowValue = this->Portal.Get(lowIndex); + ValueType highValue = this->Portal.Get(highIndex); + if (this->Compare(highValue, lowValue)) + { + this->Portal.Set(highIndex, lowValue); + this->Portal.Set(lowIndex, highValue); + } + } + } + }; + + template + struct BitonicSortCrossoverKernel : vtkm::exec::FunctorBase + { + PortalType Portal; + BinaryCompare Compare; + vtkm::Id GroupSize; + + VTKM_CONT_EXPORT + BitonicSortCrossoverKernel(const PortalType &portal, + const BinaryCompare &compare, + vtkm::Id groupSize) + : Portal(portal), Compare(compare), GroupSize(groupSize) { } + + VTKM_EXEC_EXPORT + void operator()(vtkm::Id index) const + { + typedef typename PortalType::ValueType ValueType; + + vtkm::Id groupIndex = index%this->GroupSize; + vtkm::Id blockSize = 2*this->GroupSize; + vtkm::Id blockIndex = index/this->GroupSize; + + vtkm::Id lowIndex = blockIndex*blockSize + groupIndex; + vtkm::Id highIndex = blockIndex*blockSize + (blockSize - groupIndex - 1); + + if (highIndex < this->Portal.GetNumberOfValues()) + { + ValueType lowValue = this->Portal.Get(lowIndex); + ValueType highValue = this->Portal.Get(highIndex); + if (this->Compare(highValue, lowValue)) + { + this->Portal.Set(highIndex, lowValue); + this->Portal.Set(lowIndex, highValue); + } + } + } + }; + + struct DefaultCompareFunctor + { + + template + VTKM_EXEC_EXPORT + bool operator()(const T& first, const T& second) const + { + return first < second; + } + }; + +public: + template + VTKM_CONT_EXPORT static void Sort( + vtkm::cont::ArrayHandle &values, + BinaryCompare binary_compare) + { + typedef typename vtkm::cont::ArrayHandle ArrayType; + typedef typename ArrayType::template ExecutionTypes + ::Portal PortalType; + + vtkm::Id numValues = values.GetNumberOfValues(); + if (numValues < 2) { return; } + + PortalType portal = values.PrepareForInPlace(DeviceAdapterTag()); + + vtkm::Id numThreads = 1; + while (numThreads < numValues) { numThreads *= 2; } + numThreads /= 2; + + typedef BitonicSortMergeKernel MergeKernel; + typedef BitonicSortCrossoverKernel CrossoverKernel; + + // TODO: Probably more we can do to improve this here +#pragma omp parallel + { + for (vtkm::Id crossoverSize = 1; crossoverSize < numValues; crossoverSize *= 2) + { + CrossoverKernel cross = CrossoverKernel(portal,binary_compare,crossoverSize); +#pragma omp for schedule(guided, GRAIN_SIZE) + for (vtkm::Id i = 0; i < numThreads; ++i){ + 
cross(i); + } + for (vtkm::Id mergeSize = crossoverSize/2; mergeSize > 0; mergeSize /= 2) + { + MergeKernel merge = MergeKernel(portal,binary_compare,mergeSize); +#pragma omp for schedule(guided, GRAIN_SIZE) + for (vtkm::Id i = 0; i < numThreads; ++i){ + merge(i); + } + } + } + } + } + + template + VTKM_CONT_EXPORT static void Sort( + vtkm::cont::ArrayHandle &values) + { + Sort(values, DefaultCompareFunctor()); + } + VTKM_CONT_EXPORT static void Synchronize() { // We do split/join here so there's nothing to synchronize. The OpenMP -- GitLab From f6c95e54d4a2cbb6dd70cd6b92d0a2d2a560a603 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 9 Jul 2015 17:34:02 -0600 Subject: [PATCH 05/26] Same approach for ScanInclusive as I took for Sort's specialization Again we get a bit of a performance gain but not a huge one, smaller than I got with sort actually. Probably more we can do here as well. --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index c7ff176f8..024968857 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -120,6 +120,102 @@ public: } } + //-------------------------------------------------------------------------- + // Scan Inclusive +private: + template + struct ScanKernel : vtkm::exec::FunctorBase + { + PortalType Portal; + BinaryFunctor BinaryOperator; + vtkm::Id Stride; + vtkm::Id Offset; + vtkm::Id Distance; + + VTKM_CONT_EXPORT + ScanKernel(const PortalType &portal, BinaryFunctor binary_functor, + vtkm::Id stride, vtkm::Id offset) + : Portal(portal), + BinaryOperator(binary_functor), + Stride(stride), + Offset(offset), + Distance(stride/2) + { } + + VTKM_EXEC_EXPORT + void operator()(vtkm::Id index) const + { + typedef typename PortalType::ValueType ValueType; + + vtkm::Id leftIndex = this->Offset + index*this->Stride; + vtkm::Id rightIndex = leftIndex + this->Distance; + + if (rightIndex < this->Portal.GetNumberOfValues()) + { + ValueType leftValue = this->Portal.Get(leftIndex); + ValueType rightValue = this->Portal.Get(rightIndex); + this->Portal.Set(rightIndex, BinaryOperator(leftValue,rightValue) ); + } + } + }; + +public: + template + VTKM_CONT_EXPORT static T ScanInclusive( + const vtkm::cont::ArrayHandle &input, + vtkm::cont::ArrayHandle& output) + { + return ScanInclusive(input, output, vtkm::internal::Add()); + } + + template + VTKM_CONT_EXPORT static T ScanInclusive( + const vtkm::cont::ArrayHandle &input, + vtkm::cont::ArrayHandle& output, + BinaryFunctor binary_functor) + { + typedef typename + vtkm::cont::ArrayHandle + ::template ExecutionTypes::Portal PortalType; + + typedef ScanKernel ScanKernelType; + + Copy(input, output); + + vtkm::Id numValues = output.GetNumberOfValues(); + if (numValues < 1) + { + return output.GetPortalConstControl().Get(0); + } + + PortalType portal = output.PrepareForInPlace(DeviceAdapterTag()); + +#pragma omp parallel + { + vtkm::Id stride; + for (stride = 2; stride-1 < numValues; stride *= 2) + { + ScanKernelType kernel(portal, binary_functor, stride, stride/2 - 1); + //DerivedAlgorithm::Schedule(kernel, numValues/stride); +#pragma omp for schedule(guided, GRAIN_SIZE) + for (vtkm::Id i = 0; i < numValues / stride; ++i){ + kernel(i); + } + } + // Do reverse operation on odd indices. Start at stride we were just at. 
+ for (stride /= 2; stride > 1; stride /= 2) + { + ScanKernelType kernel(portal, binary_functor, stride, stride - 1); + //DerivedAlgorithm::Schedule(kernel, numValues/stride); +#pragma omp for schedule(guided, GRAIN_SIZE) + for (vtkm::Id i = 0; i < numValues / stride; ++i){ + kernel(i); + } + } + } + return output.GetPortalConstControl().Get(numValues - 1); + } + //-------------------------------------------------------------------------- // Sort private: -- GitLab From 33191a474ff2e56bfffbf16fba13669d1228e084 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Fri, 10 Jul 2015 13:57:33 -0600 Subject: [PATCH 06/26] Tweaking scheduling of sort does give a bit more improvement But maybe bitonic sort is a poor choice? Is there another method that will be faster and require less synchronization? --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 024968857..63738d8fa 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -334,14 +334,14 @@ public: for (vtkm::Id crossoverSize = 1; crossoverSize < numValues; crossoverSize *= 2) { CrossoverKernel cross = CrossoverKernel(portal,binary_compare,crossoverSize); -#pragma omp for schedule(guided, GRAIN_SIZE) +#pragma omp for schedule(dynamic, GRAIN_SIZE) for (vtkm::Id i = 0; i < numThreads; ++i){ cross(i); } for (vtkm::Id mergeSize = crossoverSize/2; mergeSize > 0; mergeSize /= 2) { MergeKernel merge = MergeKernel(portal,binary_compare,mergeSize); -#pragma omp for schedule(guided, GRAIN_SIZE) +#pragma omp for schedule(dynamic, GRAIN_SIZE) for (vtkm::Id i = 0; i < numThreads; ++i){ merge(i); } @@ -357,12 +357,12 @@ public: Sort(values, DefaultCompareFunctor()); } - VTKM_CONT_EXPORT static void Synchronize() - { - // We do split/join here so there's nothing to synchronize. The OpenMP - // backend should behave identically to Intel TBB where we've re-joined - // all threads before returning from an execution - } + VTKM_CONT_EXPORT static void Synchronize() + { + // We do split/join here so there's nothing to synchronize. 
The OpenMP + // backend should behave identically to Intel TBB where we've re-joined + // all threads before returning from an execution + } }; } }// namespace vtkm::cont -- GitLab From 8f01bcb6919779eba616aa115adca6c4d6b65226 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Mon, 13 Jul 2015 16:44:22 -0600 Subject: [PATCH 07/26] Some minor tweaks for a bit of performance improvement Some small tweaks to scheduling and grain size seem to help somewhat --- vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 63738d8fa..ff9da7d0a 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -196,8 +196,7 @@ public: for (stride = 2; stride-1 < numValues; stride *= 2) { ScanKernelType kernel(portal, binary_functor, stride, stride/2 - 1); - //DerivedAlgorithm::Schedule(kernel, numValues/stride); -#pragma omp for schedule(guided, GRAIN_SIZE) +#pragma omp for schedule(guided, GRAIN_SIZE / 4) for (vtkm::Id i = 0; i < numValues / stride; ++i){ kernel(i); } @@ -206,7 +205,6 @@ public: for (stride /= 2; stride > 1; stride /= 2) { ScanKernelType kernel(portal, binary_functor, stride, stride - 1); - //DerivedAlgorithm::Schedule(kernel, numValues/stride); #pragma omp for schedule(guided, GRAIN_SIZE) for (vtkm::Id i = 0; i < numValues / stride; ++i){ kernel(i); @@ -334,14 +332,14 @@ public: for (vtkm::Id crossoverSize = 1; crossoverSize < numValues; crossoverSize *= 2) { CrossoverKernel cross = CrossoverKernel(portal,binary_compare,crossoverSize); -#pragma omp for schedule(dynamic, GRAIN_SIZE) +#pragma omp for schedule(guided, GRAIN_SIZE) for (vtkm::Id i = 0; i < numThreads; ++i){ cross(i); } for (vtkm::Id mergeSize = crossoverSize/2; mergeSize > 0; mergeSize /= 2) { MergeKernel merge = MergeKernel(portal,binary_compare,mergeSize); -#pragma omp for schedule(dynamic, GRAIN_SIZE) +#pragma omp for schedule(guided, GRAIN_SIZE / 4) for (vtkm::Id i = 0; i < numThreads; ++i){ merge(i); } -- GitLab From c3650030c06a9c358674feab7dbb45896f5450b6 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 14 Jul 2015 14:27:52 -0600 Subject: [PATCH 08/26] Benchmark the OpenMP backend --- vtkm/benchmarking/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vtkm/benchmarking/CMakeLists.txt b/vtkm/benchmarking/CMakeLists.txt index 8934552d3..e28db0ba0 100644 --- a/vtkm/benchmarking/CMakeLists.txt +++ b/vtkm/benchmarking/CMakeLists.txt @@ -32,4 +32,7 @@ endif() if (VTKm_ENABLE_TBB) vtkm_benchmarks(VTKM_DEVICE_ADAPTER_TBB) endif() +if (VTKm_ENABLE_OPENMP) + vtkm_benchmarks(VTKM_DEVICE_ADAPTER_OpenMP) +endif() -- GitLab From 3730d10fd587b9e294cad6ca64ea20e570d0c425 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Wed, 22 Jul 2015 13:04:55 -0600 Subject: [PATCH 09/26] Working on getting Jeff's MIC scan implemented, almost have it working I just am getting values off by one, it seems like he may have implemented an exclusive scan? So maybe I could just specialize exclusive scan and implement inclusive scan using that. 
--- .../internal/DeviceAdapterAlgorithmOpenMP.h | 366 +++++++++++++++++- 1 file changed, 351 insertions(+), 15 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index ff9da7d0a..57ae3eb22 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,14 @@ #include +#include +#include +#include + +#define SPECIALIZE_OPENMP +#define USE_IVDEP +//#define USE_OLD_SCAN + namespace vtkm { namespace cont { @@ -76,9 +85,12 @@ public: functor.SetErrorMessageBuffer(errorMessage); - ScheduleKernel kernel(functor); + const ScheduleKernel kernel(functor); const vtkm::Id size = numInstances; +#ifdef USE_IVDEP +#pragma ivdep +#endif #pragma omp parallel for schedule(guided, GRAIN_SIZE) for(vtkm::Id i=0; i < size; ++i) { @@ -104,10 +116,12 @@ public: functor.SetErrorMessageBuffer(errorMessage); - ScheduleKernel kernel(functor); + const ScheduleKernel kernel(functor); - // TODO Will: This is ok right? I've just sort of glanced at this const vtkm::Id size = rangeMax[0] * rangeMax[1] * rangeMax[2]; +#ifdef USE_IVDEP +#pragma ivdep +#endif #pragma omp parallel for schedule(guided, GRAIN_SIZE) for(vtkm::Id i=0; i < size; ++i) { @@ -159,7 +173,90 @@ private: } }; + /* + * This kernel will execute the tiled scan function from Jeff's MIC tuned scan + * (the functon named `simd_scan`) + */ + template + struct TiledScanKernel : vtkm::exec::FunctorBase + { + typedef typename PortalType::ValueType ValueType; + + PortalType Portal; + const BinaryFunctor BinaryOperator; + const ValueType &CarryIn; + ValueType &CarryOut; + + VTKM_CONT_EXPORT + TiledScanKernel(PortalType &portal, BinaryFunctor binary_functor, + const ValueType &carry_in, ValueType &carry_out) + : Portal(portal), BinaryOperator(binary_functor), + CarryIn(carry_in), CarryOut(carry_out) + {} + + VTKM_EXEC_EXPORT + void operator()(vtkm::Id index) const + { + // Perform a scan within the tile + const vtkm::Id offset = index * VectorSize; + ValueType vector[VectorSize]; + // Load up the elements for this 'vector' + for (vtkm::Id i = 0; i < 16; ++i){ + vector[i] = Portal.Get(i + offset); + } + // Just do a standard tree-scan instead of using the scan kernels + // Up-sweep + const vtkm::Id N = vtkm::Log2(vtkm::Float32(VectorSize)); + for (vtkm::Id i = 0; i < N; ++i){ + const vtkm::Id stride = 1 << (i + 1); + const vtkm::Id tree_offset = 1 << i; + const vtkm::Id N_TREE = VectorSize / stride; +#ifdef USE_IVDEP +#pragma ivdep +#endif + for (vtkm::Id j = 0; j < N_TREE; ++j){ + const vtkm::Id base = j * stride - 1; + ValueType left = vector[base + tree_offset]; + ValueType right = vector[base + stride]; + vector[base + stride] = BinaryOperator(left, right); + } + } + CarryOut = vector[VectorSize - 1]; + vector[VectorSize - 1] = ValueType(); + + // Do down sweep + for (vtkm::Id i = N - 1; i >= 0; --i){ + const vtkm::Id stride = 1 << (i + 1); + const vtkm::Id tree_offset = 1 << i; + const vtkm::Id N_TREE = VectorSize / stride; +#ifdef USE_IVDEP +#pragma ivdep +#endif + for (vtkm::Id j = 0; j < N_TREE; ++j){ + const vtkm::Id base = j * stride - 1; + // We get scatter/gather here, is that what's hurting performance? 
+ // Jeff's version has no scatter/gather + ValueType left = vector[base + tree_offset]; + ValueType right = vector[base + stride]; + vector[base + tree_offset] = right; + vector[base + stride] = BinaryOperator(left, right); + } + } + + // Add the carry in value to the scan and write our values + // back into the portal +#ifdef USE_IVDEP +#pragma ivdep +#endif + for (vtkm::Id i = 0; i < VectorSize; ++i){ + ValueType value = vector[i]; + Portal.Set(i + offset, BinaryOperator(CarryIn, value)); + } + } + }; + public: +#ifdef SPECIALIZE_OPENMP template VTKM_CONT_EXPORT static T ScanInclusive( const vtkm::cont::ArrayHandle &input, @@ -168,6 +265,8 @@ public: return ScanInclusive(input, output, vtkm::internal::Add()); } +#ifdef USE_OLD_SCAN + // Original scan template VTKM_CONT_EXPORT static T ScanInclusive( const vtkm::cont::ArrayHandle &input, @@ -178,35 +277,43 @@ public: vtkm::cont::ArrayHandle ::template ExecutionTypes::Portal PortalType; - typedef ScanKernel ScanKernelType; + typedef ScanKernel ScanKernelType; Copy(input, output); + omp_set_num_threads(1); - vtkm::Id numValues = output.GetNumberOfValues(); + const vtkm::Id numValues = output.GetNumberOfValues(); if (numValues < 1) { return output.GetPortalConstControl().Get(0); } PortalType portal = output.PrepareForInPlace(DeviceAdapterTag()); - #pragma omp parallel { vtkm::Id stride; - for (stride = 2; stride-1 < numValues; stride *= 2) + for (stride = 2; stride - 1 < numValues; stride *= 2) { - ScanKernelType kernel(portal, binary_functor, stride, stride/2 - 1); + const ScanKernelType kernel(portal, binary_functor, stride, stride/2 - 1); + const vtkm::Id N = numValues / stride; +#ifdef USE_IVDEP +#pragma ivdep +#endif #pragma omp for schedule(guided, GRAIN_SIZE / 4) - for (vtkm::Id i = 0; i < numValues / stride; ++i){ + for (vtkm::Id i = 0; i < N; ++i){ kernel(i); } } // Do reverse operation on odd indices. Start at stride we were just at. for (stride /= 2; stride > 1; stride /= 2) { - ScanKernelType kernel(portal, binary_functor, stride, stride - 1); + const ScanKernelType kernel(portal, binary_functor, stride, stride - 1); + const vtkm::Id N = numValues / stride; +#ifdef USE_IVDEP +#pragma ivdep +#endif #pragma omp for schedule(guided, GRAIN_SIZE) - for (vtkm::Id i = 0; i < numValues / stride; ++i){ + for (vtkm::Id i = 0; i < N; ++i){ kernel(i); } } @@ -214,6 +321,226 @@ public: return output.GetPortalConstControl().Get(numValues - 1); } +#else + // Port of Jeff's optimized MIC scan + template + VTKM_CONT_EXPORT static T ScanInclusive( + const vtkm::cont::ArrayHandle &input, + vtkm::cont::ArrayHandle& output, + BinaryFunctor binary_functor) + { + typedef typename + vtkm::cont::ArrayHandle + ::template ExecutionTypes::Portal PortalType; + typedef typename vtkm::cont::ArrayHandle::PortalControl PortalControlType; + + const vtkm::Id VECTOR_SIZE = 16; + typedef TiledScanKernel TiledScanKernelType; + + typedef ScanKernel ScanKernelType; + + Copy(input, output); + + const vtkm::Id numValues = output.GetNumberOfValues(); + if (numValues < 1) + { + return output.GetPortalConstControl().Get(0); + } + + PortalType portal = output.PrepareForInPlace(DeviceAdapterTag()); + const vtkm::Id MAX_THREADS = omp_get_max_threads(); + + // Number of vectors that fit in 32Kb cache + size_t L1_vectors = (32 * 1024 * 1024) / (VECTOR_SIZE * 4); + + // TODO: Handling non-divisible by 16 array? Could we just pad with zeros out to something + // divisible by 16? 
+ const vtkm::Id COUNT512 = output.GetNumberOfValues() / VECTOR_SIZE; + // number of 512-bit vectors in a "tile" + const vtkm::Id TILE_SIZE = COUNT512 > L1_vectors ? L1_vectors : COUNT512; + const vtkm::Id NUM_THREADS = MAX_THREADS > TILE_SIZE ? TILE_SIZE : MAX_THREADS; + + const vtkm::Id round_off_mask = VECTOR_SIZE == 16 ? ~0x0f + : VECTOR_SIZE == 8 ? ~0x07 : ~0x03; + + // meta_scan[i] will get the carry-out for the sub-scan performed by + // thread[i]. NOTE: This *must* have a 32-bit value for every thread. + // Round-up size of meta_scan[] to the nearest multiple of VECTOR_SIZE, to allow + // use of simd_scan(). + const vtkm::Id meta_scan_count = NUM_THREADS % VECTOR_SIZE + ? (NUM_THREADS + VECTOR_SIZE) & round_off_mask : NUM_THREADS; + + omp_set_num_threads(NUM_THREADS); + /* + std::cout << "Scan using tile size of " << TILE_SIZE << ", " << NUM_THREADS + << " threads and meta scan count of " << meta_scan_count + << ", COUNT512 = " << COUNT512 << "\n"; + */ + + vtkm::cont::ArrayHandle meta_scan_handle; + meta_scan_handle.Allocate(meta_scan_count); + PortalType meta_scan_portal = meta_scan_handle.PrepareForInPlace(vtkm::cont::DeviceAdapterTagOpenMP()); + PortalControlType meta_scan_control = meta_scan_handle.GetPortalControl(); + T carry_in = T(); + T carry_out = T(); + T tile_offset = T(); + + // Loop over each tile + for (vtkm::Id t = 0; t < COUNT512; t += TILE_SIZE){ + // PASS1 (per tile) parallel + // + // A "tile" represents a subset of the total input. Tiles are + // processed sequentially, and their size is chosen so as to restrict + // the amount of memory that is written through the cache. The goal is + // that whenever we write and subsequently read, the read should find + // the value in cache. Tiles are chosen with sizes to allow this. + // + // Within a given tile, multiple threads create independent scans of + // their local region of iterations. This results in N distinct local + // scans, where N is the number of threads. Each scan produces a + // "carry-out" value into meta_scan, such that meta_scan[i] is the + // carry-out value of thread[i]. + + // NOTE: In "static" schedule, each thread is assigned a chunk of + // contiguous iterations, of a size such that all threads have the same + // chunk-size (assuming n_iterations/n_threads has no remainder). + // Furthermore, subsequent static-parallel loops will assign iterations + // to threads in exactly the same way (assuming n_iterations is the + // same). +#pragma omp parallel firstprivate(carry_in), private(carry_out) + { + TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); +#pragma omp for schedule(static) + for (vtkm::Id i = 0; i < TILE_SIZE; ++i){ + kernel(t + i); + carry_in = binary_functor(carry_in, carry_out); + //std::cout << "Tile " << i << " carry out = " << carry_out << std::endl; + } + // Now share our carry out with everyone else in the meta scan + // It's a bit awkward but the carry out of this thread is actually in + // `carry_in` + const vtkm::Id thread = omp_get_thread_num(); + meta_scan_portal.Set(thread, carry_in); + } + + /* + for (vtkm::Id i = 0; i < meta_scan_count; ++i){ + std::cout << "meta_scan[" << i << "] = " + << meta_scan_handle.GetPortalConstControl().Get(i) << std::endl; + } + */ + + // META-PASS: (per tile) serial + // + // Still within the given tile, we now run a scan on the values in + // meta-scan. This produces offsets that will be added back into the + // sub-scans within the tile. 
We also want to add the global offset + // for this tile to all the values in the sub-scans. This can be done + // with the simd_scan2(), which also supports PASS2 (below). + // + // After the meta-scan, values in the meta-scan provide fully + // "relocated" offsets, for the corresponding threads within the + // current tile. The threads will add these offsets to their sub-scan + // elements in PASS2. + // + // The meta-scan also produces a carry-out. This is the carry-out + // value for this tile, which is also the global offset of the next + // tile. + { + // perform the meta-scan + T carry_in_meta = T(); + TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, + binary_functor, carry_in_meta, carry_out); +#ifdef USE_IVDEP +#pragma ivdep +#endif + for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ + //std::cout << "meta scan iter " << i << std::endl; + kernel(i); + carry_in_meta = binary_functor(carry_in_meta, carry_out); + } + + //std::cout << "carry_in_meta = " << carry_in_meta << std::endl; + + // Add the tile offset to each element in the meta scan + for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ +#ifdef USE_IVDEP +#pragma ivdep +#endif + for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ + T value = meta_scan_control.Get(i * VECTOR_SIZE + j); + /* + std::cout << "index = " << t + i * VECTOR_SIZE + j << ", value = " << value + << ", meta scan value = " << binary_functor(tile_offset, value) + << std::endl; + */ + meta_scan_control.Set(i * VECTOR_SIZE + j, binary_functor(tile_offset, value)); + } + } + // increment offset for the next tile by the carry-out from the + // meta-scan + tile_offset = binary_functor(tile_offset, carry_in_meta); + } + /* + std::cout << "Post meta-scan\n"; + for (vtkm::Id i = 0; i < meta_scan_count; ++i){ + std::cout << "meta_scan[" << i << "] = " << meta_scan_portal.Get(i) << std::endl; + } + */ + + // PASS2 (per tile) parallel + // + // Now each thread just adds the values from the corresponding + // meta-scan, to all the elements within its chunk, within the current + // tile. (See NOTE in PASS1, re repeatability of OMP static schedule.) + // The resulting elements are "fully relocated", such that, when we are + // finished with a tile, we don't need to revisit it again. Thus, we + // can now forget about cache for this tile. +#pragma omp parallel shared(carry_out) + { + const vtkm::Id thread = omp_get_thread_num(); + T thread_offset = meta_scan_portal.Get(thread); + +#pragma omp for schedule(static) + for (size_t i = 0; i < TILE_SIZE; ++i){ +#ifdef USE_IVDEP +#pragma ivdep +#endif + for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ + T value = portal.Get(t + i * VECTOR_SIZE + j); + portal.Set(t + i * VECTOR_SIZE + j, binary_functor(thread_offset, value)); + } + } + // carry-out of final iteration becomes new global carry-out + if (thread == omp_get_num_threads() - 1) { + carry_out = binary_functor(carry_out, thread_offset); + } + } + + // CLEANUP (per-tile) serial + { + // If meta_scan was rounded up, then the last several values are not + // reset by any thread, during PASS1. Reset them to zero now. +#ifdef USE_IVDEP +#pragma ivdep +#endif + for (vtkm::Id i = meta_scan_count - VECTOR_SIZE; i < meta_scan_count; ++i){ + meta_scan_control.Set(i, T()); + } + // carry_in should be reset to zeros, for the next tile. You'd think + // we could use the tile_offset as the new carry_in, but that would + // mess up the meta-scan. 
+ carry_in = T(); + } + } + + // Reset the max # of threads + omp_set_num_threads(MAX_THREADS); + return output.GetPortalConstControl().Get(numValues - 1); + } +#endif +#endif + //-------------------------------------------------------------------------- // Sort private: @@ -305,6 +632,7 @@ private: }; public: +#ifdef SPECIALIZE_OPENMP template VTKM_CONT_EXPORT static void Sort( vtkm::cont::ArrayHandle &values, @@ -322,6 +650,7 @@ public: vtkm::Id numThreads = 1; while (numThreads < numValues) { numThreads *= 2; } numThreads /= 2; + const vtkm::Id NUM_THREADS = numThreads; typedef BitonicSortMergeKernel MergeKernel; typedef BitonicSortCrossoverKernel CrossoverKernel; @@ -331,16 +660,22 @@ public: { for (vtkm::Id crossoverSize = 1; crossoverSize < numValues; crossoverSize *= 2) { - CrossoverKernel cross = CrossoverKernel(portal,binary_compare,crossoverSize); + const CrossoverKernel cross = CrossoverKernel(portal,binary_compare,crossoverSize); +#ifdef USE_IVDEP +#pragma ivdep +#endif #pragma omp for schedule(guided, GRAIN_SIZE) - for (vtkm::Id i = 0; i < numThreads; ++i){ + for (vtkm::Id i = 0; i < NUM_THREADS; ++i){ cross(i); } for (vtkm::Id mergeSize = crossoverSize/2; mergeSize > 0; mergeSize /= 2) { - MergeKernel merge = MergeKernel(portal,binary_compare,mergeSize); + const MergeKernel merge = MergeKernel(portal,binary_compare,mergeSize); +#ifdef USE_IVDEP +#pragma ivdep +#endif #pragma omp for schedule(guided, GRAIN_SIZE / 4) - for (vtkm::Id i = 0; i < numThreads; ++i){ + for (vtkm::Id i = 0; i < NUM_THREADS; ++i){ merge(i); } } @@ -354,6 +689,7 @@ public: { Sort(values, DefaultCompareFunctor()); } +#endif VTKM_CONT_EXPORT static void Synchronize() { -- GitLab From 44a13abe029caaa30e11b83d7caf701fa881c401 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 28 Jul 2015 10:30:30 -0600 Subject: [PATCH 10/26] Trying just directly running Jeff's hand-vectorized kernel on MIC --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 167 +-------------- vtkm/cont/openmp/internal/TiledScanKernel.h | 198 ++++++++++++++++++ 2 files changed, 206 insertions(+), 159 deletions(-) create mode 100644 vtkm/cont/openmp/internal/TiledScanKernel.h diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 57ae3eb22..a460f9ab2 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -32,6 +32,8 @@ #include #include +#include + #include #include @@ -39,8 +41,7 @@ #include #define SPECIALIZE_OPENMP -#define USE_IVDEP -//#define USE_OLD_SCAN +//#define USE_IVDEP namespace vtkm { namespace cont { @@ -173,88 +174,6 @@ private: } }; - /* - * This kernel will execute the tiled scan function from Jeff's MIC tuned scan - * (the functon named `simd_scan`) - */ - template - struct TiledScanKernel : vtkm::exec::FunctorBase - { - typedef typename PortalType::ValueType ValueType; - - PortalType Portal; - const BinaryFunctor BinaryOperator; - const ValueType &CarryIn; - ValueType &CarryOut; - - VTKM_CONT_EXPORT - TiledScanKernel(PortalType &portal, BinaryFunctor binary_functor, - const ValueType &carry_in, ValueType &carry_out) - : Portal(portal), BinaryOperator(binary_functor), - CarryIn(carry_in), CarryOut(carry_out) - {} - - VTKM_EXEC_EXPORT - void operator()(vtkm::Id index) const - { - // Perform a scan within the tile - const vtkm::Id offset = index * VectorSize; - ValueType vector[VectorSize]; - // Load up the elements for this 'vector' - for (vtkm::Id i = 0; i < 16; 
++i){ - vector[i] = Portal.Get(i + offset); - } - // Just do a standard tree-scan instead of using the scan kernels - // Up-sweep - const vtkm::Id N = vtkm::Log2(vtkm::Float32(VectorSize)); - for (vtkm::Id i = 0; i < N; ++i){ - const vtkm::Id stride = 1 << (i + 1); - const vtkm::Id tree_offset = 1 << i; - const vtkm::Id N_TREE = VectorSize / stride; -#ifdef USE_IVDEP -#pragma ivdep -#endif - for (vtkm::Id j = 0; j < N_TREE; ++j){ - const vtkm::Id base = j * stride - 1; - ValueType left = vector[base + tree_offset]; - ValueType right = vector[base + stride]; - vector[base + stride] = BinaryOperator(left, right); - } - } - CarryOut = vector[VectorSize - 1]; - vector[VectorSize - 1] = ValueType(); - - // Do down sweep - for (vtkm::Id i = N - 1; i >= 0; --i){ - const vtkm::Id stride = 1 << (i + 1); - const vtkm::Id tree_offset = 1 << i; - const vtkm::Id N_TREE = VectorSize / stride; -#ifdef USE_IVDEP -#pragma ivdep -#endif - for (vtkm::Id j = 0; j < N_TREE; ++j){ - const vtkm::Id base = j * stride - 1; - // We get scatter/gather here, is that what's hurting performance? - // Jeff's version has no scatter/gather - ValueType left = vector[base + tree_offset]; - ValueType right = vector[base + stride]; - vector[base + tree_offset] = right; - vector[base + stride] = BinaryOperator(left, right); - } - } - - // Add the carry in value to the scan and write our values - // back into the portal -#ifdef USE_IVDEP -#pragma ivdep -#endif - for (vtkm::Id i = 0; i < VectorSize; ++i){ - ValueType value = vector[i]; - Portal.Set(i + offset, BinaryOperator(CarryIn, value)); - } - } - }; - public: #ifdef SPECIALIZE_OPENMP template @@ -265,63 +184,6 @@ public: return ScanInclusive(input, output, vtkm::internal::Add()); } -#ifdef USE_OLD_SCAN - // Original scan - template - VTKM_CONT_EXPORT static T ScanInclusive( - const vtkm::cont::ArrayHandle &input, - vtkm::cont::ArrayHandle& output, - BinaryFunctor binary_functor) - { - typedef typename - vtkm::cont::ArrayHandle - ::template ExecutionTypes::Portal PortalType; - - typedef ScanKernel ScanKernelType; - - Copy(input, output); - omp_set_num_threads(1); - - const vtkm::Id numValues = output.GetNumberOfValues(); - if (numValues < 1) - { - return output.GetPortalConstControl().Get(0); - } - - PortalType portal = output.PrepareForInPlace(DeviceAdapterTag()); -#pragma omp parallel - { - vtkm::Id stride; - for (stride = 2; stride - 1 < numValues; stride *= 2) - { - const ScanKernelType kernel(portal, binary_functor, stride, stride/2 - 1); - const vtkm::Id N = numValues / stride; -#ifdef USE_IVDEP -#pragma ivdep -#endif -#pragma omp for schedule(guided, GRAIN_SIZE / 4) - for (vtkm::Id i = 0; i < N; ++i){ - kernel(i); - } - } - // Do reverse operation on odd indices. Start at stride we were just at. 
- for (stride /= 2; stride > 1; stride /= 2) - { - const ScanKernelType kernel(portal, binary_functor, stride, stride - 1); - const vtkm::Id N = numValues / stride; -#ifdef USE_IVDEP -#pragma ivdep -#endif -#pragma omp for schedule(guided, GRAIN_SIZE) - for (vtkm::Id i = 0; i < N; ++i){ - kernel(i); - } - } - } - return output.GetPortalConstControl().Get(numValues - 1); - } - -#else // Port of Jeff's optimized MIC scan template VTKM_CONT_EXPORT static T ScanInclusive( @@ -335,7 +197,7 @@ public: typedef typename vtkm::cont::ArrayHandle::PortalControl PortalControlType; const vtkm::Id VECTOR_SIZE = 16; - typedef TiledScanKernel TiledScanKernelType; + typedef internal::TiledScanKernel TiledScanKernelType; typedef ScanKernel ScanKernelType; @@ -351,7 +213,7 @@ public: const vtkm::Id MAX_THREADS = omp_get_max_threads(); // Number of vectors that fit in 32Kb cache - size_t L1_vectors = (32 * 1024 * 1024) / (VECTOR_SIZE * 4); + const size_t L1_vectors = (32 * 1024 * 1024) / (VECTOR_SIZE * 4); // TODO: Handling non-divisible by 16 array? Could we just pad with zeros out to something // divisible by 16? @@ -360,8 +222,8 @@ public: const vtkm::Id TILE_SIZE = COUNT512 > L1_vectors ? L1_vectors : COUNT512; const vtkm::Id NUM_THREADS = MAX_THREADS > TILE_SIZE ? TILE_SIZE : MAX_THREADS; - const vtkm::Id round_off_mask = VECTOR_SIZE == 16 ? ~0x0f - : VECTOR_SIZE == 8 ? ~0x07 : ~0x03; + const vtkm::Id round_off_mask = VECTOR_SIZE == 16 ? ~0x0f + : VECTOR_SIZE == 8 ? ~0x07 : ~0x03; // meta_scan[i] will get the carry-out for the sub-scan performed by // thread[i]. NOTE: This *must* have a 32-bit value for every thread. @@ -410,7 +272,7 @@ public: #pragma omp parallel firstprivate(carry_in), private(carry_out) { TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); -#pragma omp for schedule(static) +#pragma omp for schedule(static, 16) for (vtkm::Id i = 0; i < TILE_SIZE; ++i){ kernel(t + i); carry_in = binary_functor(carry_in, carry_out); @@ -451,9 +313,6 @@ public: T carry_in_meta = T(); TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, binary_functor, carry_in_meta, carry_out); -#ifdef USE_IVDEP -#pragma ivdep -#endif for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ //std::cout << "meta scan iter " << i << std::endl; kernel(i); @@ -464,9 +323,6 @@ public: // Add the tile offset to each element in the meta scan for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ -#ifdef USE_IVDEP -#pragma ivdep -#endif for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ T value = meta_scan_control.Get(i * VECTOR_SIZE + j); /* @@ -503,9 +359,6 @@ public: #pragma omp for schedule(static) for (size_t i = 0; i < TILE_SIZE; ++i){ -#ifdef USE_IVDEP -#pragma ivdep -#endif for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ T value = portal.Get(t + i * VECTOR_SIZE + j); portal.Set(t + i * VECTOR_SIZE + j, binary_functor(thread_offset, value)); @@ -521,9 +374,6 @@ public: { // If meta_scan was rounded up, then the last several values are not // reset by any thread, during PASS1. Reset them to zero now. 
-#ifdef USE_IVDEP -#pragma ivdep -#endif for (vtkm::Id i = meta_scan_count - VECTOR_SIZE; i < meta_scan_count; ++i){ meta_scan_control.Set(i, T()); } @@ -538,7 +388,6 @@ public: omp_set_num_threads(MAX_THREADS); return output.GetPortalConstControl().Get(numValues - 1); } -#endif #endif //-------------------------------------------------------------------------- diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h new file mode 100644 index 000000000..757612dd2 --- /dev/null +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -0,0 +1,198 @@ +#ifndef vtk_m_cont_openmp_internal_TiledScanKernel_h +#define vtk_m_cont_openmp_internal_TiledScanKernel_h + +#include + +//#define USE_JEFF_SCAN + +namespace vtkm { +namespace cont { +namespace internal { +/* + * This kernel will execute the tiled scan function from Jeff's MIC tuned scan + * (the functon named `simd_scan`) + * Could we specialize this for vectorizable types and use Jeff's tuned scan? + * Would want to move the tiled scan kernel out of the class though to make + * writing the specializations easier + */ +template +struct TiledScanKernel : vtkm::exec::FunctorBase +{ + typedef typename PortalType::ValueType ValueType; + + PortalType Portal; + const BinaryFunctor BinaryOperator; + const ValueType &CarryIn; + ValueType &CarryOut; + + VTKM_CONT_EXPORT + TiledScanKernel(PortalType &portal, BinaryFunctor binary_functor, + const ValueType &carry_in, ValueType &carry_out) + : Portal(portal), BinaryOperator(binary_functor), + CarryIn(carry_in), CarryOut(carry_out) + {} + + VTKM_EXEC_EXPORT + void operator()(vtkm::Id index) const + { + // Perform a scan within the tile + const vtkm::Id offset = index * VectorSize; + ValueType vector[VectorSize]; + // Load up the elements for this 'vector' + for (vtkm::Id i = 0; i < 16; ++i){ + vector[i] = Portal.Get(i + offset); + } + // Just do a standard tree-scan instead of using the scan kernels + // Up-sweep + const vtkm::Id N = vtkm::Log2(vtkm::Float32(VectorSize)); + for (vtkm::Id i = 0; i < N; ++i){ + const vtkm::Id stride = 1 << (i + 1); + const vtkm::Id tree_offset = 1 << i; + const vtkm::Id N_TREE = VectorSize / stride; + for (vtkm::Id j = 0; j < N_TREE; ++j){ + const vtkm::Id base = j * stride - 1; + ValueType left = vector[base + tree_offset]; + ValueType right = vector[base + stride]; + vector[base + stride] = BinaryOperator(left, right); + } + } + CarryOut = vector[VectorSize - 1]; + vector[VectorSize - 1] = Portal.Get(offset); + + // Do down sweep + for (vtkm::Id i = N - 1; i >= 0; --i){ + const vtkm::Id stride = 1 << (i + 1); + const vtkm::Id tree_offset = 1 << i; + const vtkm::Id N_TREE = VectorSize / stride; + for (vtkm::Id j = 0; j < N_TREE; ++j){ + const vtkm::Id base = j * stride - 1; + // We get scatter/gather here if we do ivdep, is that what's hurting performance? 
+ // Jeff's version has no scatter/gather + ValueType left = vector[base + tree_offset]; + ValueType right = vector[base + stride]; + vector[base + tree_offset] = right; + vector[base + stride] = BinaryOperator(left, right); + } + } + + // Add the carry in value to the scan and write our values + // back into the portal + for (vtkm::Id i = 0; i < VectorSize; ++i){ + ValueType value = vector[i]; + Portal.Set(i + offset, BinaryOperator(CarryIn, value)); + } + } +}; + +#ifdef USE_JEFF_SCAN +#ifdef __MIC__ +#include + +template<> +VTKM_EXEC_EXPORT void TiledScanKernel:: + template ExecutionTypes::Portal, + vtkm::internal::Add, 16>::operator()(vtkm::Id index) const { + const static vtkm::Id VectorSize = 16; + + register __m512i a; + register __m512i tmp; + + // initialize permutation mask + // indices increase from right-to-left in the vector. + // Q: Which end is the effing "high-order" end? + // A: low-order value (RHS) in the register gets read/stored first. + // Therefore, the RHS element (you know, the one with index 0) is really the effing "high-order" end. + // therefore, we need to think of the scan as right-to-left, within the vector. + // NOTE: we have values here that are unused in upward-pass, but used in downward-pass + static const __m512i perm_mask4 = _mm512_set_epi32( 7, 0, 0, 0, 0, 0, 0, 0,15, 0, 0, 0, 0, 0, 0, 0 ); + static const __m512i perm_mask3 = _mm512_set_epi32( 11, 0, 0, 0,15, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 ); + static const __m512i perm_mask2 = _mm512_set_epi32( 13, 0,15, 0, 9, 0,11, 0, 5, 0, 7, 0, 1, 0, 3, 0 ); + static const __m512i perm_mask1 = _mm512_set_epi32( 14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 ); + + /// print_vec("", perm_mask1); + + // would rather build this efficiently from scratch, using arithmetic operators provided via intrinsics. + // hard to see how, though. There's no NOT. Comparisons set bits in a mask, rather than a vector. etc. + static const __m512i fifteen = _mm512_set_epi32( 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 ); + + static const __m512i identity = _mm512_set_epi32( 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); + + // Perform a scan within the tile + const vtkm::Id offset = index * VectorSize; + __declspec(align(64)) vtkm::Int32 vector[VectorSize]; + // Load up the elements for this 'vector' + for (vtkm::Id i = 0; i < 16; ++i){ + vector[i] = Portal.Get(i + offset); + } + + // load 16 values + a = _mm512_load_epi32(vector); + + // (1) upward-pass + tmp = _mm512_permutevar_epi32(perm_mask1, a); + a = _mm512_mask_add_epi32(a, 0xaaaa, a, tmp); // was 0x5555 + + tmp = _mm512_permutevar_epi32(perm_mask2, a); + a = _mm512_mask_add_epi32(a, 0x8888, a, tmp); // was 0x1111 + + tmp = _mm512_permutevar_epi32(perm_mask3, a); + a = _mm512_mask_add_epi32(a, 0x8080, a, tmp); // was 0x0101 + + tmp = _mm512_permutevar_epi32(perm_mask4, a); + a = _mm512_mask_add_epi32(a, 0x8000, a, tmp); // was 0x0001 + + tmp = _mm512_permutevar_epi32(fifteen, a); // all positions of get a[15] ? 
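+  // With `fifteen` as the index vector, the permute above broadcasts a[15]
+  // (the tile total left in the top lane by the upward pass) into every lane;
+  // the store below is just how that carry-out gets back out to scalar code.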
+ _mm512_store_epi32(vector, tmp); + CarryOut = vector[15]; + + // zero-out high-order element + // a = _mm512_mask_xor_epi32(a, 0x8000, a, a); // high-order value (LHS) becomes 0 + a = _mm512_mask_slli_epi32(a, 0x8000, a, 32); // high-order value (LHS) becomes 0 + + // (2) downward pass + tmp = _mm512_permutevar_epi32(perm_mask4, a); + a = _mm512_mask_add_epi32(a, 0x8000, a, tmp); // was 0x0001 + a = _mm512_mask_xor_epi32(a, 0x0080, a, a); + a = _mm512_mask_or_epi32 (a, 0x0080, a, tmp); + + tmp = _mm512_permutevar_epi32(perm_mask3, a); + a = _mm512_mask_add_epi32(a, 0x8080, a, tmp); // was 0x0101 + a = _mm512_mask_xor_epi32(a, 0x0808, a, a); + a = _mm512_mask_or_epi32 (a, 0x0808, a, tmp); + + tmp = _mm512_permutevar_epi32(perm_mask2, a); + a = _mm512_mask_add_epi32(a, 0x8888, a, tmp); // was 0x1111 + a = _mm512_mask_xor_epi32(a, 0x2222, a, a); + a = _mm512_mask_or_epi32 (a, 0x2222, a, tmp); + + tmp = _mm512_permutevar_epi32(perm_mask1, a); + a = _mm512_mask_add_epi32(a, 0xaaaa, a, tmp); // was 0x5555 + a = _mm512_mask_xor_epi32(a, 0x5555, a, a); + a = _mm512_mask_or_epi32 (a, 0x5555, a, tmp); + +#pragma unroll + for (vtkm::Id i = 0; i < 16; ++i){ + vector[i] = CarryIn; + } + tmp = _mm512_load_epi32(vector); + a = _mm512_add_epi32(a, tmp); + + // store 16 values of completed scan + _mm512_store_epi32(vector, a); + + // Write our result back into the array portal + for (vtkm::Id i = 0; i < VectorSize; ++i){ + ValueType value = vector[i]; + Portal.Set(i + offset, value); + } +} +#endif +#endif + +} +} +} + +#endif + -- GitLab From 91414c8af562ec5450cffe3a8982d24fe7f39229 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Wed, 29 Jul 2015 13:06:40 -0600 Subject: [PATCH 11/26] Use VectorSize instead of 16 magic number, also clarify comment on Scan --- vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h | 2 +- vtkm/cont/openmp/internal/TiledScanKernel.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index a460f9ab2..508be207f 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -184,7 +184,7 @@ public: return ScanInclusive(input, output, vtkm::internal::Add()); } - // Port of Jeff's optimized MIC scan + // Port of Jeff Inman's optimized MIC scan template VTKM_CONT_EXPORT static T ScanInclusive( const vtkm::cont::ArrayHandle &input, diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 757612dd2..1ec127e4f 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -39,7 +39,7 @@ struct TiledScanKernel : vtkm::exec::FunctorBase const vtkm::Id offset = index * VectorSize; ValueType vector[VectorSize]; // Load up the elements for this 'vector' - for (vtkm::Id i = 0; i < 16; ++i){ + for (vtkm::Id i = 0; i < VectorSize; ++i){ vector[i] = Portal.Get(i + offset); } // Just do a standard tree-scan instead of using the scan kernels @@ -88,6 +88,7 @@ struct TiledScanKernel : vtkm::exec::FunctorBase #ifdef __MIC__ #include +// Specialized hand tuned kernel used by Jeff Inman's scan template<> VTKM_EXEC_EXPORT void TiledScanKernel:: -- GitLab From 4f7a8b80c28e8124b8eb6df8584e530346384346 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Wed, 29 Jul 2015 17:26:21 -0600 Subject: [PATCH 12/26] Working inclusive scan using tiled scheduling Remaining todos: - How to get efficient 
vectorization within TiledInclusiveScanKernel to match performance of Jeff's hand tuned version. - How to handle missing carry-in on the first iteration. This gives wrong results where the identity value for the operator isn't T(), ie 0. For example a *-scan we get wrong results because we have a carry in of 0. --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 28 +++++-- vtkm/cont/openmp/internal/TiledScanKernel.h | 78 ++++++++++++++++++- 2 files changed, 95 insertions(+), 11 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 508be207f..00f015df6 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -174,8 +174,8 @@ private: } }; -public: #ifdef SPECIALIZE_OPENMP +public: template VTKM_CONT_EXPORT static T ScanInclusive( const vtkm::cont::ArrayHandle &input, @@ -184,7 +184,6 @@ public: return ScanInclusive(input, output, vtkm::internal::Add()); } - // Port of Jeff Inman's optimized MIC scan template VTKM_CONT_EXPORT static T ScanInclusive( const vtkm::cont::ArrayHandle &input, @@ -197,7 +196,7 @@ public: typedef typename vtkm::cont::ArrayHandle::PortalControl PortalControlType; const vtkm::Id VECTOR_SIZE = 16; - typedef internal::TiledScanKernel TiledScanKernelType; + typedef internal::TiledInclusiveScanKernel TiledScanKernelType; typedef ScanKernel ScanKernelType; @@ -230,7 +229,7 @@ public: // Round-up size of meta_scan[] to the nearest multiple of VECTOR_SIZE, to allow // use of simd_scan(). const vtkm::Id meta_scan_count = NUM_THREADS % VECTOR_SIZE - ? (NUM_THREADS + VECTOR_SIZE) & round_off_mask : NUM_THREADS; + ? 1 + ((NUM_THREADS + VECTOR_SIZE) & round_off_mask) : NUM_THREADS + 1; omp_set_num_threads(NUM_THREADS); /* @@ -243,10 +242,19 @@ public: meta_scan_handle.Allocate(meta_scan_count); PortalType meta_scan_portal = meta_scan_handle.PrepareForInPlace(vtkm::cont::DeviceAdapterTagOpenMP()); PortalControlType meta_scan_control = meta_scan_handle.GetPortalControl(); + // Should we not have a carry in if no value has been set for it? + // Same for tile offset and carry out usage. Assuming T() is the identity + // type for the operator is not correct, eg. 0 is not the identity type + // for a *-scan T carry_in = T(); T carry_out = T(); T tile_offset = T(); + // TODO: Change this to schedule a clear array kernel + for (vtkm::Id i = 0; i < meta_scan_count; ++i){ + meta_scan_control.Set(i, T()); + } + // Loop over each tile for (vtkm::Id t = 0; t < COUNT512; t += TILE_SIZE){ // PASS1 (per tile) parallel @@ -271,8 +279,10 @@ public: // same). #pragma omp parallel firstprivate(carry_in), private(carry_out) { + // TODO: On the first tile we don't have a carry in value, how to express + // this to the kernel? TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); -#pragma omp for schedule(static, 16) +#pragma omp for schedule(static) for (vtkm::Id i = 0; i < TILE_SIZE; ++i){ kernel(t + i); carry_in = binary_functor(carry_in, carry_out); @@ -282,7 +292,7 @@ public: // It's a bit awkward but the carry out of this thread is actually in // `carry_in` const vtkm::Id thread = omp_get_thread_num(); - meta_scan_portal.Set(thread, carry_in); + meta_scan_portal.Set(thread + 1, carry_in); } /* @@ -374,7 +384,8 @@ public: { // If meta_scan was rounded up, then the last several values are not // reset by any thread, during PASS1. Reset them to zero now. 
- for (vtkm::Id i = meta_scan_count - VECTOR_SIZE; i < meta_scan_count; ++i){ + // TODO: Maybe schedule a clear kernel? + for (vtkm::Id i = 0; i < meta_scan_count; ++i){ meta_scan_control.Set(i, T()); } // carry_in should be reset to zeros, for the next tile. You'd think @@ -386,7 +397,8 @@ public: // Reset the max # of threads omp_set_num_threads(MAX_THREADS); - return output.GetPortalConstControl().Get(numValues - 1); + // The offset that would be used for the next tile is the result of the scan on the array + return output.GetPortalConstControl().Get(numValues - 1); } #endif diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 1ec127e4f..2f127b9e7 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -8,6 +8,78 @@ namespace vtkm { namespace cont { namespace internal { + +template +struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase +{ + typedef typename PortalType::ValueType ValueType; + + PortalType Portal; + const BinaryFunctor BinaryOperator; + const ValueType &CarryIn; + ValueType &CarryOut; + + VTKM_CONT_EXPORT + TiledInclusiveScanKernel(PortalType &portal, BinaryFunctor binary_functor, + const ValueType &carry_in, ValueType &carry_out) + : Portal(portal), BinaryOperator(binary_functor), + CarryIn(carry_in), CarryOut(carry_out) + {} + + VTKM_EXEC_EXPORT + void operator()(vtkm::Id index) const + { + // Perform a scan within the tile + const vtkm::Id offset = index * VectorSize; + ValueType vector[VectorSize]; + // Load up the elements for this 'vector' + for (vtkm::Id i = 0; i < VectorSize; ++i){ + vector[i] = Portal.Get(i + offset); + } + + vtkm::Id stride; + for (stride = 2; stride - 1 < VectorSize; stride *= 2) + { + //ScanKernelType kernel(portal, binary_functor, stride, stride/2 - 1); + const vtkm::Id scan_offset = stride / 2 - 1; + const vtkm::Id scan_distance = stride / 2; + //DerivedAlgorithm::Schedule(kernel, numValues/stride); + for (vtkm::Id i = 0; i < VectorSize / stride; ++i){ + const vtkm::Id left = scan_offset + i * stride; + const vtkm::Id right = left + scan_distance; + if (right < VectorSize){ + vector[right] = BinaryOperator(vector[left], vector[right]); + } + } + } + + // Do reverse operation on odd indices. Start at stride we were just at. 
+ for (stride /= 2; stride > 1; stride /= 2) + { + //ScanKernelType kernel(portal, binary_functor, stride, stride - 1); + const vtkm::Id scan_offset = stride - 1; + const vtkm::Id scan_distance = stride / 2; + //DerivedAlgorithm::Schedule(kernel, numValues/stride); + for (vtkm::Id i = 0; i < VectorSize / stride; ++i){ + const vtkm::Id left = scan_offset + i * stride; + const vtkm::Id right = left + scan_distance; + if (right < VectorSize){ + vector[right] = BinaryOperator(vector[left], vector[right]); + } + } + } + + CarryOut = vector[VectorSize - 1]; + + // Add the carry in value to the scan and write our values + // back into the portal + for (vtkm::Id i = 0; i < VectorSize; ++i){ + ValueType value = vector[i]; + Portal.Set(i + offset, BinaryOperator(CarryIn, value)); + } + } +}; + /* * This kernel will execute the tiled scan function from Jeff's MIC tuned scan * (the functon named `simd_scan`) @@ -16,7 +88,7 @@ namespace internal { * writing the specializations easier */ template -struct TiledScanKernel : vtkm::exec::FunctorBase +struct TiledExclusiveScanKernel : vtkm::exec::FunctorBase { typedef typename PortalType::ValueType ValueType; @@ -26,7 +98,7 @@ struct TiledScanKernel : vtkm::exec::FunctorBase ValueType &CarryOut; VTKM_CONT_EXPORT - TiledScanKernel(PortalType &portal, BinaryFunctor binary_functor, + TiledExclusiveScanKernel(PortalType &portal, BinaryFunctor binary_functor, const ValueType &carry_in, ValueType &carry_out) : Portal(portal), BinaryOperator(binary_functor), CarryIn(carry_in), CarryOut(carry_out) @@ -57,7 +129,7 @@ struct TiledScanKernel : vtkm::exec::FunctorBase } } CarryOut = vector[VectorSize - 1]; - vector[VectorSize - 1] = Portal.Get(offset); + vector[VectorSize - 1] = ValueType(); // Do down sweep for (vtkm::Id i = N - 1; i >= 0; --i){ -- GitLab From 5f2ba3cb1720aa2b45d0067f61a3100635d4890d Mon Sep 17 00:00:00 2001 From: Will Usher Date: Wed, 29 Jul 2015 17:50:05 -0600 Subject: [PATCH 13/26] Working on figuring out how to create a permute inclusive scan kernel --- vtkm/cont/openmp/internal/TiledScanKernel.h | 91 +++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 2f127b9e7..4348b5f69 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -37,6 +37,97 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase vector[i] = Portal.Get(i + offset); } + /* + * Execution of this kernel on a 16 wide vector: + * TODO: Double check this, then write version using permutes + * + * UPWARD PASS + * ------ + * outer iter = 0 + * stride = 2 + * scan_offset = 0 + * scan_distance = 1 + * VectorSize / stride = 8 + * inner loop: + * vec[1] = op(vec[0], vec[1]) + * vec[3] = op(vec[2], vec[3]) + * vec[5] = op(vec[4], vec[5]) + * vec[7] = op(vec[6], vec[7]) + * vec[9] = op(vec[8], vec[9]) + * vec[11] = op(vec[10], vec[11]) + * vec[13] = op(vec[12], vec[13]) + * vec[15] = op(vec[14], vec[15]) + * -- + * outer iter = 1 + * stride = 4 + * scan_offset = 1 + * scan_distance = 2 + * VectorSize / stride = 4 + * inner loop: + * vec[3] = op(vec[1], vec[3]) + * vec[7] = op(vec[5], vec[7]) + * vec[11] = op(vec[9], vec[11]) + * vec[15] = op(vec[13], vec[15]) + * -- + * outer iter = 2 + * stride = 8 + * scan_offset = 3 + * scan_distance = 4 + * VectorSize / stride = 2 + * inner loop: + * vec[7] = op(vec[3], vec[7]) + * vec[15] = op(vec[11], vec[15]) + * -- + * outer iter = 3 + * stride = 16 + * scan_offset = 7 + * scan_distance 
= 8 + * VectorSize / stride = 1 + * inner loop: + * vec[15] = op(vec[7], vec[15]) + * -- + * END UPWARD PASS + * + * DOWNWARD PASS + * ------ + * outer iter = 0 + * stride = 8 + * scan_offset = 7 + * scan_distance = 4 + * VectorSize / stride = 2 + * inner loop: + * vec[11] = op(vec[7], vec[11]) + * //vec[19] = op(vec[15], vec[19]) --> out of bounds, not executed + * -- + * outer iter = 1 + * stride = 4 + * scan_offset = 3 + * scan_distance = 2 + * VectorSize / stride = 4 + * inner loop: + * vec[5] = op(vec[3], vec[5]) + * vec[9] = op(vec[7], vec[9]) + * vec[13] = op(vec[11], vec[13]) + * //vec[17] = op(vec[15], vec[17]) --> out of bounds, not executed + * -- + * outer iter = 2 + * stride = 2 + * scan_offset = 1 + * scan_distance = 1 + * VectorSize / stride = 8 + * inner loop: + * vec[2] = op(vec[1], vec[2]) + * vec[4] = op(vec[3], vec[4]) + * vec[6] = op(vec[5], vec[6]) + * vec[8] = op(vec[7], vec[8]) + * vec[10] = op(vec[9], vec[10]) + * vec[12] = op(vec[11], vec[12]) + * vec[14] = op(vec[13], vec[14]) + * //vec[16] = op(vec[15], vec[16]) --> out of bounds, not executed + * -- + * END DOWNWARD PASS + */ + vtkm::Id stride; for (stride = 2; stride - 1 < VectorSize; stride *= 2) { -- GitLab From 137fc5ddf833cceca74f15057aa268cc1832e3b8 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 30 Jul 2015 11:24:24 -0600 Subject: [PATCH 14/26] Using a hand-unrolled version of the scan gets us very close to Jeff We're ~0.01s off the version that uses JEff's kernel although it's also a scan exclusive so is there a slight difference in operations performed? Remaining todo: try to get compiler to emit permutation instructions so we can get masked adds (or is it already emitting this? double check!) --- vtkm/cont/openmp/internal/TiledScanKernel.h | 58 ++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 4348b5f69..8ac73a295 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -4,6 +4,7 @@ #include //#define USE_JEFF_SCAN +#define UNROLLED_SCAN namespace vtkm { namespace cont { @@ -33,6 +34,11 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase const vtkm::Id offset = index * VectorSize; ValueType vector[VectorSize]; // Load up the elements for this 'vector' + // TODO: To handle non-divisible by 16 can we pad this array out with + // ValueType() and then just ignore the values after where we went + // beyond the # of elems in the array? + // The uninitialized values shouldn't effect the elements below them right? 
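+      // (Probably fine: every update in the unrolled scan below writes
+      //  vector[right] from a strictly lower vector[left], so values past the
+      //  end of the real data never feed into the results below them.)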
+#pragma unroll for (vtkm::Id i = 0; i < VectorSize; ++i){ vector[i] = Portal.Get(i + offset); } @@ -128,6 +134,54 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase * END DOWNWARD PASS */ +#ifdef UNROLLED_SCAN + vector[1] = BinaryOperator(vector[0], vector[1]); + vector[3] = BinaryOperator(vector[2], vector[3]); + vector[5] = BinaryOperator(vector[4], vector[5]); + vector[7] = BinaryOperator(vector[6], vector[7]); + vector[9] = BinaryOperator(vector[8], vector[9]); + vector[11] = BinaryOperator(vector[10], vector[11]); + vector[13] = BinaryOperator(vector[12], vector[13]); + vector[15] = BinaryOperator(vector[14], vector[15]); + + // Outer loop iter 1 + // stride = 4 + vector[3] = BinaryOperator(vector[1], vector[3]); + vector[7] = BinaryOperator(vector[5], vector[7]); + vector[11] = BinaryOperator(vector[9], vector[11]); + vector[15] = BinaryOperator(vector[13], vector[15]); + + // Outer loop iter 2 + // stride = 8 + vector[7] = BinaryOperator(vector[3], vector[7]); + vector[15] = BinaryOperator(vector[11], vector[15]); + + // Outer loop iter 3 + // stride = 16 + vector[15] = BinaryOperator(vector[7], vector[15]); + + // DOWNWARD PASS + // Outer loop iter 0 + // stride = 8 + vector[11] = BinaryOperator(vector[7], vector[11]); + + // Outer loop iter 1 + // stride = 4 + vector[5] = BinaryOperator(vector[3], vector[5]); + vector[9] = BinaryOperator(vector[7], vector[9]); + vector[13] = BinaryOperator(vector[11], vector[13]); + + // Outer loop iter 2 + // stride = 2 + vector[2] = BinaryOperator(vector[1], vector[2]); + vector[4] = BinaryOperator(vector[3], vector[4]); + vector[6] = BinaryOperator(vector[5], vector[6]); + vector[8] = BinaryOperator(vector[7], vector[8]); + vector[10] = BinaryOperator(vector[9], vector[10]); + vector[12] = BinaryOperator(vector[11], vector[12]); + vector[14] = BinaryOperator(vector[13], vector[14]); + +#else vtkm::Id stride; for (stride = 2; stride - 1 < VectorSize; stride *= 2) { @@ -159,11 +213,13 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase } } } +#endif CarryOut = vector[VectorSize - 1]; // Add the carry in value to the scan and write our values // back into the portal +#pragma unroll for (vtkm::Id i = 0; i < VectorSize; ++i){ ValueType value = vector[i]; Portal.Set(i + offset, BinaryOperator(CarryIn, value)); @@ -253,7 +309,7 @@ struct TiledExclusiveScanKernel : vtkm::exec::FunctorBase // Specialized hand tuned kernel used by Jeff Inman's scan template<> -VTKM_EXEC_EXPORT void TiledScanKernel:: template ExecutionTypes::Portal, vtkm::internal::Add, 16>::operator()(vtkm::Id index) const { -- GitLab From 846c41f707cbb3d20acd1410144df041d79e5f2c Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 30 Jul 2015 16:58:48 -0600 Subject: [PATCH 15/26] Can now handle other operator scans and non-multiple of 16 sized scans Although the *-scan test seems to fail on MIC for unclear reasons. 
The exact same code works on CPU and the MIC is run as just a native MIC executable (not offload), so not sure what is happening --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 109 +++++++++++------- vtkm/cont/openmp/internal/TiledScanKernel.h | 29 +++-- 2 files changed, 87 insertions(+), 51 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 00f015df6..b8d91ff92 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -41,7 +41,6 @@ #include #define SPECIALIZE_OPENMP -//#define USE_IVDEP namespace vtkm { namespace cont { @@ -89,9 +88,8 @@ public: const ScheduleKernel kernel(functor); const vtkm::Id size = numInstances; -#ifdef USE_IVDEP -#pragma ivdep -#endif + // Should we have a VectorSize inner loop with ivdep and then scheudle + // size / VectorSize runs? Then have a remaining overflow loop? #pragma omp parallel for schedule(guided, GRAIN_SIZE) for(vtkm::Id i=0; i < size; ++i) { @@ -120,9 +118,6 @@ public: const ScheduleKernel kernel(functor); const vtkm::Id size = rangeMax[0] * rangeMax[1] * rangeMax[2]; -#ifdef USE_IVDEP -#pragma ivdep -#endif #pragma omp parallel for schedule(guided, GRAIN_SIZE) for(vtkm::Id i=0; i < size; ++i) { @@ -216,7 +211,8 @@ public: // TODO: Handling non-divisible by 16 array? Could we just pad with zeros out to something // divisible by 16? - const vtkm::Id COUNT512 = output.GetNumberOfValues() / VECTOR_SIZE; + const vtkm::Id COUNT512 = output.GetNumberOfValues() / VECTOR_SIZE + + (output.GetNumberOfValues() % VECTOR_SIZE == 0 ? 0 : 1); // number of 512-bit vectors in a "tile" const vtkm::Id TILE_SIZE = COUNT512 > L1_vectors ? L1_vectors : COUNT512; const vtkm::Id NUM_THREADS = MAX_THREADS > TILE_SIZE ? TILE_SIZE : MAX_THREADS; @@ -236,7 +232,7 @@ public: std::cout << "Scan using tile size of " << TILE_SIZE << ", " << NUM_THREADS << " threads and meta scan count of " << meta_scan_count << ", COUNT512 = " << COUNT512 << "\n"; - */ + */ vtkm::cont::ArrayHandle meta_scan_handle; meta_scan_handle.Allocate(meta_scan_count); @@ -249,10 +245,11 @@ public: T carry_in = T(); T carry_out = T(); T tile_offset = T(); + bool tile_offset_valid; // TODO: Change this to schedule a clear array kernel for (vtkm::Id i = 0; i < meta_scan_count; ++i){ - meta_scan_control.Set(i, T()); + meta_scan_portal.Set(i, T()); } // Loop over each tile @@ -279,26 +276,33 @@ public: // same). #pragma omp parallel firstprivate(carry_in), private(carry_out) { - // TODO: On the first tile we don't have a carry in value, how to express - // this to the kernel? 
- TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); + // On the first tile for each thread we don't have a valid carry in value + bool carry_in_valid = false; + TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out, + carry_in_valid); #pragma omp for schedule(static) for (vtkm::Id i = 0; i < TILE_SIZE; ++i){ kernel(t + i); - carry_in = binary_functor(carry_in, carry_out); + if (carry_in_valid){ + carry_in = binary_functor(carry_in, carry_out); + } + else { + carry_in = carry_out; + carry_in_valid = true; + } //std::cout << "Tile " << i << " carry out = " << carry_out << std::endl; } // Now share our carry out with everyone else in the meta scan // It's a bit awkward but the carry out of this thread is actually in // `carry_in` const vtkm::Id thread = omp_get_thread_num(); - meta_scan_portal.Set(thread + 1, carry_in); + meta_scan_portal.Set(thread, carry_in); } /* for (vtkm::Id i = 0; i < meta_scan_count; ++i){ std::cout << "meta_scan[" << i << "] = " - << meta_scan_handle.GetPortalConstControl().Get(i) << std::endl; + << meta_scan_portal.Get(i) << std::endl; } */ @@ -321,31 +325,47 @@ public: { // perform the meta-scan T carry_in_meta = T(); + bool carry_in_meta_valid = false; TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, - binary_functor, carry_in_meta, carry_out); + binary_functor, carry_in_meta, carry_out, carry_in_meta_valid); for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ //std::cout << "meta scan iter " << i << std::endl; kernel(i); - carry_in_meta = binary_functor(carry_in_meta, carry_out); + if (carry_in_meta_valid){ + carry_in_meta = binary_functor(carry_in_meta, carry_out); + } + else { + carry_in_meta = carry_out; + carry_in_meta_valid = true; + } } //std::cout << "carry_in_meta = " << carry_in_meta << std::endl; // Add the tile offset to each element in the meta scan - for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ - for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ - T value = meta_scan_control.Get(i * VECTOR_SIZE + j); - /* - std::cout << "index = " << t + i * VECTOR_SIZE + j << ", value = " << value - << ", meta scan value = " << binary_functor(tile_offset, value) - << std::endl; - */ - meta_scan_control.Set(i * VECTOR_SIZE + j, binary_functor(tile_offset, value)); + if (tile_offset_valid){ + for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ +#pragma unroll + for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ + T value = meta_scan_portal.Get(i * VECTOR_SIZE + j); + /* + std::cout << "index = " << t + i * VECTOR_SIZE + j << ", value = " << value + << ", meta scan value = " << binary_functor(tile_offset, value) + << std::endl; + */ + meta_scan_portal.Set(i * VECTOR_SIZE + j, binary_functor(tile_offset, value)); + } } } // increment offset for the next tile by the carry-out from the // meta-scan - tile_offset = binary_functor(tile_offset, carry_in_meta); + if (tile_offset_valid){ + tile_offset = binary_functor(tile_offset, carry_in_meta); + } + else { + tile_offset = carry_in_meta; + tile_offset_valid = true; + } } /* std::cout << "Post meta-scan\n"; @@ -364,14 +384,27 @@ public: // can now forget about cache for this tile. #pragma omp parallel shared(carry_out) { + // TODO: Thread 0 has no thread offset. Could I instead do a parallel for + // on the meta scan count and then have each thread do its tile? That's + // essentially what we're getting. 
Although that might thrash cache then because + // we'd switch which thread was getting which elements const vtkm::Id thread = omp_get_thread_num(); - T thread_offset = meta_scan_portal.Get(thread); + T thread_offset; + if (thread > 0){ + thread_offset = meta_scan_portal.Get(thread - 1); + } #pragma omp for schedule(static) for (size_t i = 0; i < TILE_SIZE; ++i){ - for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ - T value = portal.Get(t + i * VECTOR_SIZE + j); - portal.Set(t + i * VECTOR_SIZE + j, binary_functor(thread_offset, value)); + if (thread > 0){ + const vtkm::Id offset = i * VECTOR_SIZE; + const vtkm::Id N_VALID = offset + VECTOR_SIZE > portal.GetNumberOfValues() ? + VECTOR_SIZE - (offset + VECTOR_SIZE - portal.GetNumberOfValues()) : VECTOR_SIZE; + + for (vtkm::Id j = 0; j < N_VALID; ++j){ + T value = portal.Get(t + offset + j); + portal.Set(t + offset + j, binary_functor(thread_offset, value)); + } } } // carry-out of final iteration becomes new global carry-out @@ -384,9 +417,8 @@ public: { // If meta_scan was rounded up, then the last several values are not // reset by any thread, during PASS1. Reset them to zero now. - // TODO: Maybe schedule a clear kernel? for (vtkm::Id i = 0; i < meta_scan_count; ++i){ - meta_scan_control.Set(i, T()); + meta_scan_portal.Set(i, T()); } // carry_in should be reset to zeros, for the next tile. You'd think // we could use the tile_offset as the new carry_in, but that would @@ -398,7 +430,7 @@ public: // Reset the max # of threads omp_set_num_threads(MAX_THREADS); // The offset that would be used for the next tile is the result of the scan on the array - return output.GetPortalConstControl().Get(numValues - 1); + return output.GetPortalConstControl().Get(numValues - 1); } #endif @@ -522,9 +554,6 @@ public: for (vtkm::Id crossoverSize = 1; crossoverSize < numValues; crossoverSize *= 2) { const CrossoverKernel cross = CrossoverKernel(portal,binary_compare,crossoverSize); -#ifdef USE_IVDEP -#pragma ivdep -#endif #pragma omp for schedule(guided, GRAIN_SIZE) for (vtkm::Id i = 0; i < NUM_THREADS; ++i){ cross(i); @@ -532,9 +561,6 @@ public: for (vtkm::Id mergeSize = crossoverSize/2; mergeSize > 0; mergeSize /= 2) { const MergeKernel merge = MergeKernel(portal,binary_compare,mergeSize); -#ifdef USE_IVDEP -#pragma ivdep -#endif #pragma omp for schedule(guided, GRAIN_SIZE / 4) for (vtkm::Id i = 0; i < NUM_THREADS; ++i){ merge(i); @@ -563,3 +589,4 @@ public: }// namespace vtkm::cont #endif //vtk_m_cont_openmp_internal_DeviceAdapterAlgorithmOpenMP_h + diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 8ac73a295..219134756 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -19,12 +19,13 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase const BinaryFunctor BinaryOperator; const ValueType &CarryIn; ValueType &CarryOut; + bool &CarryInValid; VTKM_CONT_EXPORT TiledInclusiveScanKernel(PortalType &portal, BinaryFunctor binary_functor, - const ValueType &carry_in, ValueType &carry_out) + const ValueType &carry_in, ValueType &carry_out, bool &carry_in_valid) : Portal(portal), BinaryOperator(binary_functor), - CarryIn(carry_in), CarryOut(carry_out) + CarryIn(carry_in), CarryOut(carry_out), CarryInValid(carry_in_valid) {} VTKM_EXEC_EXPORT @@ -32,20 +33,22 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase { // Perform a scan within the tile const vtkm::Id offset = index * VectorSize; + const vtkm::Id n_valid = offset + VectorSize > 
Portal.GetNumberOfValues() ? + VectorSize - (offset + VectorSize - Portal.GetNumberOfValues()) : VectorSize; ValueType vector[VectorSize]; + // Load up the elements for this 'vector' // TODO: To handle non-divisible by 16 can we pad this array out with // ValueType() and then just ignore the values after where we went // beyond the # of elems in the array? // The uninitialized values shouldn't effect the elements below them right? -#pragma unroll - for (vtkm::Id i = 0; i < VectorSize; ++i){ + for (vtkm::Id i = 0; i < n_valid; ++i){ vector[i] = Portal.Get(i + offset); } /* * Execution of this kernel on a 16 wide vector: - * TODO: Double check this, then write version using permutes + * TODO: Version using permutes * * UPWARD PASS * ------ @@ -215,14 +218,20 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase } #endif - CarryOut = vector[VectorSize - 1]; + CarryOut = vector[n_valid - 1]; // Add the carry in value to the scan and write our values // back into the portal -#pragma unroll - for (vtkm::Id i = 0; i < VectorSize; ++i){ - ValueType value = vector[i]; - Portal.Set(i + offset, BinaryOperator(CarryIn, value)); + if (CarryInValid){ + for (vtkm::Id i = 0; i < n_valid; ++i){ + ValueType value = vector[i]; + Portal.Set(i + offset, BinaryOperator(CarryIn, value)); + } + } + else { + for (vtkm::Id i = 0; i < n_valid; ++i){ + Portal.Set(i + offset, vector[i]); + } } } }; -- GitLab From 93ba74a02621f945d4d49e1b4c4511e76d4f93c3 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Fri, 31 Jul 2015 12:32:39 -0600 Subject: [PATCH 16/26] Unrolled+tiled scaninclusive works on all types, ops and array sizes Unfortunately the additional checking slows us down a bit, on MIC we're ~0.04s slower and on CPU we're ~0.006s slower. --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 44 +------ vtkm/cont/openmp/internal/TiledScanKernel.h | 123 +----------------- 2 files changed, 11 insertions(+), 156 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index b8d91ff92..1bbe7c0d2 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -198,8 +198,7 @@ public: Copy(input, output); const vtkm::Id numValues = output.GetNumberOfValues(); - if (numValues < 1) - { + if (numValues < 1){ return output.GetPortalConstControl().Get(0); } @@ -209,16 +208,12 @@ public: // Number of vectors that fit in 32Kb cache const size_t L1_vectors = (32 * 1024 * 1024) / (VECTOR_SIZE * 4); - // TODO: Handling non-divisible by 16 array? Could we just pad with zeros out to something - // divisible by 16? const vtkm::Id COUNT512 = output.GetNumberOfValues() / VECTOR_SIZE - + (output.GetNumberOfValues() % VECTOR_SIZE == 0 ? 0 : 1); + + (output.GetNumberOfValues() % VECTOR_SIZE == 0 ? 0 : 1); // number of 512-bit vectors in a "tile" const vtkm::Id TILE_SIZE = COUNT512 > L1_vectors ? L1_vectors : COUNT512; const vtkm::Id NUM_THREADS = MAX_THREADS > TILE_SIZE ? TILE_SIZE : MAX_THREADS; - - const vtkm::Id round_off_mask = VECTOR_SIZE == 16 ? ~0x0f - : VECTOR_SIZE == 8 ? ~0x07 : ~0x03; + const vtkm::Id round_off_mask = ~0x0f; // meta_scan[i] will get the carry-out for the sub-scan performed by // thread[i]. NOTE: This *must* have a 32-bit value for every thread. @@ -228,16 +223,10 @@ public: ? 
1 + ((NUM_THREADS + VECTOR_SIZE) & round_off_mask) : NUM_THREADS + 1; omp_set_num_threads(NUM_THREADS); - /* - std::cout << "Scan using tile size of " << TILE_SIZE << ", " << NUM_THREADS - << " threads and meta scan count of " << meta_scan_count - << ", COUNT512 = " << COUNT512 << "\n"; - */ vtkm::cont::ArrayHandle meta_scan_handle; meta_scan_handle.Allocate(meta_scan_count); PortalType meta_scan_portal = meta_scan_handle.PrepareForInPlace(vtkm::cont::DeviceAdapterTagOpenMP()); - PortalControlType meta_scan_control = meta_scan_handle.GetPortalControl(); // Should we not have a carry in if no value has been set for it? // Same for tile offset and carry out usage. Assuming T() is the identity // type for the operator is not correct, eg. 0 is not the identity type @@ -245,9 +234,8 @@ public: T carry_in = T(); T carry_out = T(); T tile_offset = T(); - bool tile_offset_valid; + bool tile_offset_valid = false; - // TODO: Change this to schedule a clear array kernel for (vtkm::Id i = 0; i < meta_scan_count; ++i){ meta_scan_portal.Set(i, T()); } @@ -290,7 +278,6 @@ public: carry_in = carry_out; carry_in_valid = true; } - //std::cout << "Tile " << i << " carry out = " << carry_out << std::endl; } // Now share our carry out with everyone else in the meta scan // It's a bit awkward but the carry out of this thread is actually in @@ -299,13 +286,6 @@ public: meta_scan_portal.Set(thread, carry_in); } - /* - for (vtkm::Id i = 0; i < meta_scan_count; ++i){ - std::cout << "meta_scan[" << i << "] = " - << meta_scan_portal.Get(i) << std::endl; - } - */ - // META-PASS: (per tile) serial // // Still within the given tile, we now run a scan on the values in @@ -329,7 +309,6 @@ public: TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, binary_functor, carry_in_meta, carry_out, carry_in_meta_valid); for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ - //std::cout << "meta scan iter " << i << std::endl; kernel(i); if (carry_in_meta_valid){ carry_in_meta = binary_functor(carry_in_meta, carry_out); @@ -340,19 +319,12 @@ public: } } - //std::cout << "carry_in_meta = " << carry_in_meta << std::endl; - // Add the tile offset to each element in the meta scan if (tile_offset_valid){ for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ #pragma unroll for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ T value = meta_scan_portal.Get(i * VECTOR_SIZE + j); - /* - std::cout << "index = " << t + i * VECTOR_SIZE + j << ", value = " << value - << ", meta scan value = " << binary_functor(tile_offset, value) - << std::endl; - */ meta_scan_portal.Set(i * VECTOR_SIZE + j, binary_functor(tile_offset, value)); } } @@ -367,12 +339,6 @@ public: tile_offset_valid = true; } } - /* - std::cout << "Post meta-scan\n"; - for (vtkm::Id i = 0; i < meta_scan_count; ++i){ - std::cout << "meta_scan[" << i << "] = " << meta_scan_portal.Get(i) << std::endl; - } - */ // PASS2 (per tile) parallel // @@ -382,7 +348,7 @@ public: // The resulting elements are "fully relocated", such that, when we are // finished with a tile, we don't need to revisit it again. Thus, we // can now forget about cache for this tile. -#pragma omp parallel shared(carry_out) +#pragma omp parallel { // TODO: Thread 0 has no thread offset. Could I instead do a parallel for // on the meta scan count and then have each thread do its tile? 
That's diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 219134756..73a64210c 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -4,12 +4,14 @@ #include //#define USE_JEFF_SCAN -#define UNROLLED_SCAN namespace vtkm { namespace cont { namespace internal { +/* + * This kernel performs an inclusive scan on 16 elements, if the + */ template struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase { @@ -38,17 +40,13 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase ValueType vector[VectorSize]; // Load up the elements for this 'vector' - // TODO: To handle non-divisible by 16 can we pad this array out with - // ValueType() and then just ignore the values after where we went - // beyond the # of elems in the array? - // The uninitialized values shouldn't effect the elements below them right? for (vtkm::Id i = 0; i < n_valid; ++i){ vector[i] = Portal.Get(i + offset); } /* * Execution of this kernel on a 16 wide vector: - * TODO: Version using permutes + * TODO: Version using permutes? * * UPWARD PASS * ------ @@ -137,7 +135,8 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase * END DOWNWARD PASS */ -#ifdef UNROLLED_SCAN + // Outer loop iter 0 + // stride = 2 vector[1] = BinaryOperator(vector[0], vector[1]); vector[3] = BinaryOperator(vector[2], vector[3]); vector[5] = BinaryOperator(vector[4], vector[5]); @@ -184,40 +183,6 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase vector[12] = BinaryOperator(vector[11], vector[12]); vector[14] = BinaryOperator(vector[13], vector[14]); -#else - vtkm::Id stride; - for (stride = 2; stride - 1 < VectorSize; stride *= 2) - { - //ScanKernelType kernel(portal, binary_functor, stride, stride/2 - 1); - const vtkm::Id scan_offset = stride / 2 - 1; - const vtkm::Id scan_distance = stride / 2; - //DerivedAlgorithm::Schedule(kernel, numValues/stride); - for (vtkm::Id i = 0; i < VectorSize / stride; ++i){ - const vtkm::Id left = scan_offset + i * stride; - const vtkm::Id right = left + scan_distance; - if (right < VectorSize){ - vector[right] = BinaryOperator(vector[left], vector[right]); - } - } - } - - // Do reverse operation on odd indices. Start at stride we were just at. - for (stride /= 2; stride > 1; stride /= 2) - { - //ScanKernelType kernel(portal, binary_functor, stride, stride - 1); - const vtkm::Id scan_offset = stride - 1; - const vtkm::Id scan_distance = stride / 2; - //DerivedAlgorithm::Schedule(kernel, numValues/stride); - for (vtkm::Id i = 0; i < VectorSize / stride; ++i){ - const vtkm::Id left = scan_offset + i * stride; - const vtkm::Id right = left + scan_distance; - if (right < VectorSize){ - vector[right] = BinaryOperator(vector[left], vector[right]); - } - } - } -#endif - CarryOut = vector[n_valid - 1]; // Add the carry in value to the scan and write our values @@ -236,82 +201,6 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase } }; -/* - * This kernel will execute the tiled scan function from Jeff's MIC tuned scan - * (the functon named `simd_scan`) - * Could we specialize this for vectorizable types and use Jeff's tuned scan? 
- * Would want to move the tiled scan kernel out of the class though to make - * writing the specializations easier - */ -template -struct TiledExclusiveScanKernel : vtkm::exec::FunctorBase -{ - typedef typename PortalType::ValueType ValueType; - - PortalType Portal; - const BinaryFunctor BinaryOperator; - const ValueType &CarryIn; - ValueType &CarryOut; - - VTKM_CONT_EXPORT - TiledExclusiveScanKernel(PortalType &portal, BinaryFunctor binary_functor, - const ValueType &carry_in, ValueType &carry_out) - : Portal(portal), BinaryOperator(binary_functor), - CarryIn(carry_in), CarryOut(carry_out) - {} - - VTKM_EXEC_EXPORT - void operator()(vtkm::Id index) const - { - // Perform a scan within the tile - const vtkm::Id offset = index * VectorSize; - ValueType vector[VectorSize]; - // Load up the elements for this 'vector' - for (vtkm::Id i = 0; i < VectorSize; ++i){ - vector[i] = Portal.Get(i + offset); - } - // Just do a standard tree-scan instead of using the scan kernels - // Up-sweep - const vtkm::Id N = vtkm::Log2(vtkm::Float32(VectorSize)); - for (vtkm::Id i = 0; i < N; ++i){ - const vtkm::Id stride = 1 << (i + 1); - const vtkm::Id tree_offset = 1 << i; - const vtkm::Id N_TREE = VectorSize / stride; - for (vtkm::Id j = 0; j < N_TREE; ++j){ - const vtkm::Id base = j * stride - 1; - ValueType left = vector[base + tree_offset]; - ValueType right = vector[base + stride]; - vector[base + stride] = BinaryOperator(left, right); - } - } - CarryOut = vector[VectorSize - 1]; - vector[VectorSize - 1] = ValueType(); - - // Do down sweep - for (vtkm::Id i = N - 1; i >= 0; --i){ - const vtkm::Id stride = 1 << (i + 1); - const vtkm::Id tree_offset = 1 << i; - const vtkm::Id N_TREE = VectorSize / stride; - for (vtkm::Id j = 0; j < N_TREE; ++j){ - const vtkm::Id base = j * stride - 1; - // We get scatter/gather here if we do ivdep, is that what's hurting performance? - // Jeff's version has no scatter/gather - ValueType left = vector[base + tree_offset]; - ValueType right = vector[base + stride]; - vector[base + tree_offset] = right; - vector[base + stride] = BinaryOperator(left, right); - } - } - - // Add the carry in value to the scan and write our values - // back into the portal - for (vtkm::Id i = 0; i < VectorSize; ++i){ - ValueType value = vector[i]; - Portal.Set(i + offset, BinaryOperator(CarryIn, value)); - } - } -}; - #ifdef USE_JEFF_SCAN #ifdef __MIC__ #include -- GitLab From e8483d65cfe8089577e9d6bc859d94eadfc4ccab Mon Sep 17 00:00:00 2001 From: Will Usher Date: Fri, 31 Jul 2015 14:14:15 -0600 Subject: [PATCH 17/26] Applying a few tweaks and tricks to get some perf back We still produce the correct result but run faster than the crummier version I wrote previously. 
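
For reference, the overall shape of the scan is unchanged by these tweaks: each
thread scans its own chunk and records a carry-out (PASS1), a serial meta-scan
turns those carry-outs into per-thread offsets (META-PASS), and a second
parallel pass folds the offsets back in (PASS2). The sketch below is only that
three-pass structure in plain OpenMP, using std::vector and operator+ instead
of the portals, BinaryFunctor, the 16-wide unrolled kernel and the per-tile
cache blocking of the real code; the names in it are made up for illustration.

#include <algorithm>
#include <omp.h>
#include <vector>

// Minimal three-pass inclusive scan over ints; illustration only.
int three_pass_scan(std::vector<int> &data)
{
  const int n = static_cast<int>(data.size());
  if (n == 0) { return 0; }

  const int num_threads = omp_get_max_threads();
  std::vector<int> carry(num_threads, 0);

#pragma omp parallel num_threads(num_threads)
  {
    const int tid = omp_get_thread_num();
    const int chunk = (n + num_threads - 1) / num_threads;
    const int begin = std::min(n, tid * chunk);
    const int end = std::min(n, begin + chunk);

    // PASS1: inclusive scan of this thread's chunk, remember its carry-out.
    int sum = 0;
    for (int i = begin; i < end; ++i)
    {
      sum += data[i];
      data[i] = sum;
    }
    carry[tid] = sum;

#pragma omp barrier
#pragma omp single
    {
      // META-PASS: exclusive scan of the carry-outs gives each thread the
      // offset contributed by all of the chunks to its left.
      int running = 0;
      for (int t = 0; t < num_threads; ++t)
      {
        const int c = carry[t];
        carry[t] = running;
        running += c;
      }
    } // implicit barrier here, so every thread sees the finished offsets

    // PASS2: fold the offset back into this thread's chunk.
    const int offset = carry[tid];
    for (int i = begin; i < end; ++i)
    {
      data[i] += offset;
    }
  }
  return data[n - 1];
}

In the real backend PASS1 additionally runs the unrolled 16-element kernel on
each 512-bit vector, and all three passes sit inside the per-tile loop so the
writes for one tile stay in L1 before we move on.
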
--- .../internal/DeviceAdapterAlgorithmOpenMP.h | 40 ++-- vtkm/cont/openmp/internal/TiledScanKernel.h | 182 +++++++----------- 2 files changed, 86 insertions(+), 136 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 1bbe7c0d2..8d3f9b2a8 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -265,19 +265,17 @@ public: #pragma omp parallel firstprivate(carry_in), private(carry_out) { // On the first tile for each thread we don't have a valid carry in value - bool carry_in_valid = false; - TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out, - carry_in_valid); + TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); #pragma omp for schedule(static) - for (vtkm::Id i = 0; i < TILE_SIZE; ++i){ - kernel(t + i); - if (carry_in_valid){ - carry_in = binary_functor(carry_in, carry_out); - } - else { - carry_in = carry_out; - carry_in_valid = true; - } + for (vtkm::Id i = 0; i < omp_get_num_threads(); ++i){ + kernel.no_carry_in(i); + } + carry_in = carry_out; + +#pragma omp for schedule(static) + for (vtkm::Id i = omp_get_num_threads(); i < TILE_SIZE; ++i){ + kernel.with_carry_in(t + i); + carry_in = binary_functor(carry_in, carry_out); } // Now share our carry out with everyone else in the meta scan // It's a bit awkward but the carry out of this thread is actually in @@ -305,18 +303,13 @@ public: { // perform the meta-scan T carry_in_meta = T(); - bool carry_in_meta_valid = false; TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, - binary_functor, carry_in_meta, carry_out, carry_in_meta_valid); - for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ - kernel(i); - if (carry_in_meta_valid){ - carry_in_meta = binary_functor(carry_in_meta, carry_out); - } - else { - carry_in_meta = carry_out; - carry_in_meta_valid = true; - } + binary_functor, carry_in_meta, carry_out); + kernel.no_carry_in(0); + carry_in_meta = carry_out; + for (vtkm::Id i = 1; i < meta_scan_count / VECTOR_SIZE; ++i){ + kernel.with_carry_in(i); + carry_in_meta = binary_functor(carry_in_meta, carry_out); } // Add the tile offset to each element in the meta scan @@ -331,6 +324,7 @@ public: } // increment offset for the next tile by the carry-out from the // meta-scan + // TODO: MErge this condition with the above if (tile_offset_valid){ tile_offset = binary_functor(tile_offset, carry_in_meta); } diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 73a64210c..c8538bb10 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -21,122 +21,94 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase const BinaryFunctor BinaryOperator; const ValueType &CarryIn; ValueType &CarryOut; - bool &CarryInValid; VTKM_CONT_EXPORT TiledInclusiveScanKernel(PortalType &portal, BinaryFunctor binary_functor, - const ValueType &carry_in, ValueType &carry_out, bool &carry_in_valid) + const ValueType &carry_in, ValueType &carry_out) : Portal(portal), BinaryOperator(binary_functor), - CarryIn(carry_in), CarryOut(carry_out), CarryInValid(carry_in_valid) + CarryIn(carry_in), CarryOut(carry_out) {} VTKM_EXEC_EXPORT - void operator()(vtkm::Id index) const + void no_carry_in(vtkm::Id index) const { + // Perform a scan within the tile + const vtkm::Id offset = index * 
VectorSize; + const vtkm::Id n_valid = offset + VectorSize > Portal.GetNumberOfValues() ? + VectorSize - (offset + VectorSize - Portal.GetNumberOfValues()) : VectorSize; + ValueType vector[VectorSize]; + if (n_valid == 16){ + scan16(vector, offset); + } + else { + scan_remainder(vector, offset, n_valid); + } + + CarryOut = vector[n_valid - 1]; + + store_vector(vector, offset, n_valid); + } + + VTKM_EXEC_EXPORT + void with_carry_in(vtkm::Id index) const { // Perform a scan within the tile const vtkm::Id offset = index * VectorSize; const vtkm::Id n_valid = offset + VectorSize > Portal.GetNumberOfValues() ? VectorSize - (offset + VectorSize - Portal.GetNumberOfValues()) : VectorSize; ValueType vector[VectorSize]; + if (n_valid == 16){ + scan16(vector, offset); + } + else { + scan_remainder(vector, offset, n_valid); + } + + CarryOut = vector[n_valid - 1]; + + // Add the carry in value to the scan and write our values + // back into the portal +#pragma unroll + for (vtkm::Id i = 0; i < VectorSize; ++i){ + vector[i] = BinaryOperator(CarryIn, vector[i]); + } + + store_vector(vector, offset, n_valid); + } + +private: + VTKM_EXEC_EXPORT + void scan16(ValueType *vector, const vtkm::Id offset) const { + // Load up the elements for this 'vector' +#pragma unroll + for (vtkm::Id i = 0; i < VectorSize; ++i){ + vector[i] = Portal.Get(i + offset); + } + scan_vector(vector); + } + + VTKM_EXEC_EXPORT + void scan_remainder(ValueType *vector, const vtkm::Id offset, const vtkm::Id n_valid) const { // Load up the elements for this 'vector' for (vtkm::Id i = 0; i < n_valid; ++i){ vector[i] = Portal.Get(i + offset); } + scan_vector(vector); + } + + VTKM_EXEC_EXPORT + void store_vector(ValueType *vector, const vtkm::Id offset, const vtkm::Id n_valid) const { + // Load up the elements for this 'vector' + for (vtkm::Id i = 0; i < n_valid; ++i){ + Portal.Set(i + offset, vector[i]); + } + } - /* - * Execution of this kernel on a 16 wide vector: - * TODO: Version using permutes? 
- * - * UPWARD PASS - * ------ - * outer iter = 0 - * stride = 2 - * scan_offset = 0 - * scan_distance = 1 - * VectorSize / stride = 8 - * inner loop: - * vec[1] = op(vec[0], vec[1]) - * vec[3] = op(vec[2], vec[3]) - * vec[5] = op(vec[4], vec[5]) - * vec[7] = op(vec[6], vec[7]) - * vec[9] = op(vec[8], vec[9]) - * vec[11] = op(vec[10], vec[11]) - * vec[13] = op(vec[12], vec[13]) - * vec[15] = op(vec[14], vec[15]) - * -- - * outer iter = 1 - * stride = 4 - * scan_offset = 1 - * scan_distance = 2 - * VectorSize / stride = 4 - * inner loop: - * vec[3] = op(vec[1], vec[3]) - * vec[7] = op(vec[5], vec[7]) - * vec[11] = op(vec[9], vec[11]) - * vec[15] = op(vec[13], vec[15]) - * -- - * outer iter = 2 - * stride = 8 - * scan_offset = 3 - * scan_distance = 4 - * VectorSize / stride = 2 - * inner loop: - * vec[7] = op(vec[3], vec[7]) - * vec[15] = op(vec[11], vec[15]) - * -- - * outer iter = 3 - * stride = 16 - * scan_offset = 7 - * scan_distance = 8 - * VectorSize / stride = 1 - * inner loop: - * vec[15] = op(vec[7], vec[15]) - * -- - * END UPWARD PASS - * - * DOWNWARD PASS - * ------ - * outer iter = 0 - * stride = 8 - * scan_offset = 7 - * scan_distance = 4 - * VectorSize / stride = 2 - * inner loop: - * vec[11] = op(vec[7], vec[11]) - * //vec[19] = op(vec[15], vec[19]) --> out of bounds, not executed - * -- - * outer iter = 1 - * stride = 4 - * scan_offset = 3 - * scan_distance = 2 - * VectorSize / stride = 4 - * inner loop: - * vec[5] = op(vec[3], vec[5]) - * vec[9] = op(vec[7], vec[9]) - * vec[13] = op(vec[11], vec[13]) - * //vec[17] = op(vec[15], vec[17]) --> out of bounds, not executed - * -- - * outer iter = 2 - * stride = 2 - * scan_offset = 1 - * scan_distance = 1 - * VectorSize / stride = 8 - * inner loop: - * vec[2] = op(vec[1], vec[2]) - * vec[4] = op(vec[3], vec[4]) - * vec[6] = op(vec[5], vec[6]) - * vec[8] = op(vec[7], vec[8]) - * vec[10] = op(vec[9], vec[10]) - * vec[12] = op(vec[11], vec[12]) - * vec[14] = op(vec[13], vec[14]) - * //vec[16] = op(vec[15], vec[16]) --> out of bounds, not executed - * -- - * END DOWNWARD PASS - */ - - // Outer loop iter 0 - // stride = 2 + // Perform an inclusive scan on a 16 element ValueType array + VTKM_EXEC_EXPORT + void scan_vector(ValueType *vector) const { + // Outer loop iter 0 + // stride = 2 vector[1] = BinaryOperator(vector[0], vector[1]); vector[3] = BinaryOperator(vector[2], vector[3]); vector[5] = BinaryOperator(vector[4], vector[5]); @@ -182,22 +154,6 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase vector[10] = BinaryOperator(vector[9], vector[10]); vector[12] = BinaryOperator(vector[11], vector[12]); vector[14] = BinaryOperator(vector[13], vector[14]); - - CarryOut = vector[n_valid - 1]; - - // Add the carry in value to the scan and write our values - // back into the portal - if (CarryInValid){ - for (vtkm::Id i = 0; i < n_valid; ++i){ - ValueType value = vector[i]; - Portal.Set(i + offset, BinaryOperator(CarryIn, value)); - } - } - else { - for (vtkm::Id i = 0; i < n_valid; ++i){ - Portal.Set(i + offset, vector[i]); - } - } } }; -- GitLab From 10c49578d8546c5f6178b7ac36891ffc6663d14e Mon Sep 17 00:00:00 2001 From: Will Usher Date: Fri, 31 Jul 2015 14:33:21 -0600 Subject: [PATCH 18/26] Hard code vector size to 16, the kernel doesn't support any others --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 61 ++++++++----------- vtkm/cont/openmp/internal/TiledScanKernel.h | 37 +++++------ 2 files changed, 46 insertions(+), 52 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h 
b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 8d3f9b2a8..1677b9249 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -189,11 +189,8 @@ public: vtkm::cont::ArrayHandle ::template ExecutionTypes::Portal PortalType; typedef typename vtkm::cont::ArrayHandle::PortalControl PortalControlType; - - const vtkm::Id VECTOR_SIZE = 16; - typedef internal::TiledInclusiveScanKernel TiledScanKernelType; - - typedef ScanKernel ScanKernelType; + typedef internal::TiledInclusiveScanKernel TiledScanKernelType; + const vtkm::Id vector_size = TiledScanKernelType::VECTOR_SIZE; Copy(input, output); @@ -203,26 +200,26 @@ public: } PortalType portal = output.PrepareForInPlace(DeviceAdapterTag()); - const vtkm::Id MAX_THREADS = omp_get_max_threads(); + const vtkm::Id max_threads = omp_get_max_threads(); // Number of vectors that fit in 32Kb cache - const size_t L1_vectors = (32 * 1024 * 1024) / (VECTOR_SIZE * 4); + const size_t l1_vectors = (32 * 1024 * 1024) / (vector_size * 4); - const vtkm::Id COUNT512 = output.GetNumberOfValues() / VECTOR_SIZE - + (output.GetNumberOfValues() % VECTOR_SIZE == 0 ? 0 : 1); + const vtkm::Id count512 = output.GetNumberOfValues() / vector_size + + (output.GetNumberOfValues() % vector_size == 0 ? 0 : 1); // number of 512-bit vectors in a "tile" - const vtkm::Id TILE_SIZE = COUNT512 > L1_vectors ? L1_vectors : COUNT512; - const vtkm::Id NUM_THREADS = MAX_THREADS > TILE_SIZE ? TILE_SIZE : MAX_THREADS; + const vtkm::Id tile_size = count512 > l1_vectors ? l1_vectors : count512; + const vtkm::Id num_threads = max_threads > tile_size ? tile_size : max_threads; const vtkm::Id round_off_mask = ~0x0f; // meta_scan[i] will get the carry-out for the sub-scan performed by // thread[i]. NOTE: This *must* have a 32-bit value for every thread. // Round-up size of meta_scan[] to the nearest multiple of VECTOR_SIZE, to allow // use of simd_scan(). - const vtkm::Id meta_scan_count = NUM_THREADS % VECTOR_SIZE - ? 1 + ((NUM_THREADS + VECTOR_SIZE) & round_off_mask) : NUM_THREADS + 1; + const vtkm::Id meta_scan_count = num_threads % vector_size + ? 1 + ((num_threads + vector_size) & round_off_mask) : num_threads + 1; - omp_set_num_threads(NUM_THREADS); + omp_set_num_threads(num_threads); vtkm::cont::ArrayHandle meta_scan_handle; meta_scan_handle.Allocate(meta_scan_count); @@ -241,7 +238,7 @@ public: } // Loop over each tile - for (vtkm::Id t = 0; t < COUNT512; t += TILE_SIZE){ + for (vtkm::Id t = 0; t < count512; t += tile_size){ // PASS1 (per tile) parallel // // A "tile" represents a subset of the total input. 
Tiles are @@ -273,7 +270,7 @@ public: carry_in = carry_out; #pragma omp for schedule(static) - for (vtkm::Id i = omp_get_num_threads(); i < TILE_SIZE; ++i){ + for (vtkm::Id i = omp_get_num_threads(); i < tile_size; ++i){ kernel.with_carry_in(t + i); carry_in = binary_functor(carry_in, carry_out); } @@ -307,28 +304,24 @@ public: binary_functor, carry_in_meta, carry_out); kernel.no_carry_in(0); carry_in_meta = carry_out; - for (vtkm::Id i = 1; i < meta_scan_count / VECTOR_SIZE; ++i){ + + for (vtkm::Id i = 1; i < meta_scan_count / vector_size; ++i){ kernel.with_carry_in(i); carry_in_meta = binary_functor(carry_in_meta, carry_out); } // Add the tile offset to each element in the meta scan if (tile_offset_valid){ - for (vtkm::Id i = 0; i < meta_scan_count / VECTOR_SIZE; ++i){ + for (vtkm::Id i = 0; i < meta_scan_count / vector_size; ++i){ #pragma unroll - for (vtkm::Id j = 0; j < VECTOR_SIZE; ++j){ - T value = meta_scan_portal.Get(i * VECTOR_SIZE + j); - meta_scan_portal.Set(i * VECTOR_SIZE + j, binary_functor(tile_offset, value)); + for (vtkm::Id j = 0; j < vector_size; ++j){ + T value = meta_scan_portal.Get(i * vector_size + j); + meta_scan_portal.Set(i * vector_size + j, binary_functor(tile_offset, value)); } } + tile_offset = binary_functor(tile_offset, carry_in_meta); } - // increment offset for the next tile by the carry-out from the - // meta-scan - // TODO: MErge this condition with the above - if (tile_offset_valid){ - tile_offset = binary_functor(tile_offset, carry_in_meta); - } - else { + else { tile_offset = carry_in_meta; tile_offset_valid = true; } @@ -355,13 +348,13 @@ public: } #pragma omp for schedule(static) - for (size_t i = 0; i < TILE_SIZE; ++i){ + for (size_t i = 0; i < tile_size; ++i){ if (thread > 0){ - const vtkm::Id offset = i * VECTOR_SIZE; - const vtkm::Id N_VALID = offset + VECTOR_SIZE > portal.GetNumberOfValues() ? - VECTOR_SIZE - (offset + VECTOR_SIZE - portal.GetNumberOfValues()) : VECTOR_SIZE; + const vtkm::Id offset = i * vector_size; + const vtkm::Id n_valid = offset + vector_size > portal.GetNumberOfValues() ? 
+ vector_size - (offset + vector_size - portal.GetNumberOfValues()) : vector_size; - for (vtkm::Id j = 0; j < N_VALID; ++j){ + for (vtkm::Id j = 0; j < n_valid; ++j){ T value = portal.Get(t + offset + j); portal.Set(t + offset + j, binary_functor(thread_offset, value)); } @@ -388,7 +381,7 @@ public: } // Reset the max # of threads - omp_set_num_threads(MAX_THREADS); + omp_set_num_threads(max_threads); // The offset that would be used for the next tile is the result of the scan on the array return output.GetPortalConstControl().Get(numValues - 1); } diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index c8538bb10..f1c951b56 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -10,12 +10,13 @@ namespace cont { namespace internal { /* - * This kernel performs an inclusive scan on 16 elements, if the + * This kernel performs an inclusive scan on 16 elements */ -template +template struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase { typedef typename PortalType::ValueType ValueType; + const static vtkm::Id VECTOR_SIZE = 16; PortalType Portal; const BinaryFunctor BinaryOperator; @@ -32,11 +33,11 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase VTKM_EXEC_EXPORT void no_carry_in(vtkm::Id index) const { // Perform a scan within the tile - const vtkm::Id offset = index * VectorSize; - const vtkm::Id n_valid = offset + VectorSize > Portal.GetNumberOfValues() ? - VectorSize - (offset + VectorSize - Portal.GetNumberOfValues()) : VectorSize; - ValueType vector[VectorSize]; - if (n_valid == 16){ + const vtkm::Id offset = index * VECTOR_SIZE; + const vtkm::Id n_valid = offset + VECTOR_SIZE > Portal.GetNumberOfValues() ? + VECTOR_SIZE - (offset + VECTOR_SIZE - Portal.GetNumberOfValues()) : VECTOR_SIZE; + ValueType vector[VECTOR_SIZE]; + if (n_valid == VECTOR_SIZE){ scan16(vector, offset); } else { @@ -52,11 +53,11 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase void with_carry_in(vtkm::Id index) const { // Perform a scan within the tile - const vtkm::Id offset = index * VectorSize; - const vtkm::Id n_valid = offset + VectorSize > Portal.GetNumberOfValues() ? - VectorSize - (offset + VectorSize - Portal.GetNumberOfValues()) : VectorSize; - ValueType vector[VectorSize]; - if (n_valid == 16){ + const vtkm::Id offset = index * VECTOR_SIZE; + const vtkm::Id n_valid = offset + VECTOR_SIZE > Portal.GetNumberOfValues() ? 
+ VECTOR_SIZE - (offset + VECTOR_SIZE - Portal.GetNumberOfValues()) : VECTOR_SIZE; + ValueType vector[VECTOR_SIZE]; + if (n_valid == VECTOR_SIZE){ scan16(vector, offset); } else { @@ -68,7 +69,7 @@ struct TiledInclusiveScanKernel : vtkm::exec::FunctorBase // Add the carry in value to the scan and write our values // back into the portal #pragma unroll - for (vtkm::Id i = 0; i < VectorSize; ++i){ + for (vtkm::Id i = 0; i < VECTOR_SIZE; ++i){ vector[i] = BinaryOperator(CarryIn, vector[i]); } @@ -81,7 +82,7 @@ private: void scan16(ValueType *vector, const vtkm::Id offset) const { // Load up the elements for this 'vector' #pragma unroll - for (vtkm::Id i = 0; i < VectorSize; ++i){ + for (vtkm::Id i = 0; i < VECTOR_SIZE; ++i){ vector[i] = Portal.Get(i + offset); } scan_vector(vector); @@ -167,7 +168,7 @@ VTKM_EXEC_EXPORT void TiledInclusiveScanKernel:: template ExecutionTypes::Portal, vtkm::internal::Add, 16>::operator()(vtkm::Id index) const { - const static vtkm::Id VectorSize = 16; + const static vtkm::Id VECTOR_SIZE = 16; register __m512i a; register __m512i tmp; @@ -193,8 +194,8 @@ VTKM_EXEC_EXPORT void TiledInclusiveScanKernel Date: Fri, 31 Jul 2015 16:25:27 -0600 Subject: [PATCH 19/26] Fix some bad formatting/tabbing --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 1677b9249..f71eb0a69 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -190,7 +190,7 @@ public: ::template ExecutionTypes::Portal PortalType; typedef typename vtkm::cont::ArrayHandle::PortalControl PortalControlType; typedef internal::TiledInclusiveScanKernel TiledScanKernelType; - const vtkm::Id vector_size = TiledScanKernelType::VECTOR_SIZE; + const vtkm::Id vector_size = TiledScanKernelType::VECTOR_SIZE; Copy(input, output); @@ -224,10 +224,10 @@ public: vtkm::cont::ArrayHandle meta_scan_handle; meta_scan_handle.Allocate(meta_scan_count); PortalType meta_scan_portal = meta_scan_handle.PrepareForInPlace(vtkm::cont::DeviceAdapterTagOpenMP()); - // Should we not have a carry in if no value has been set for it? - // Same for tile offset and carry out usage. Assuming T() is the identity - // type for the operator is not correct, eg. 0 is not the identity type - // for a *-scan + + // Note: Carry in and tile offset don't have valid values until they're actually set + // by the output of a previous tile/thread. 
This is because T() is not the identity + // type for all operators we may do a scan with T carry_in = T(); T carry_out = T(); T tile_offset = T(); @@ -264,15 +264,15 @@ public: // On the first tile for each thread we don't have a valid carry in value TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); #pragma omp for schedule(static) - for (vtkm::Id i = 0; i < omp_get_num_threads(); ++i){ - kernel.no_carry_in(i); - } - carry_in = carry_out; + for (vtkm::Id i = 0; i < omp_get_num_threads(); ++i){ + kernel.no_carry_in(i); + } + carry_in = carry_out; #pragma omp for schedule(static) for (vtkm::Id i = omp_get_num_threads(); i < tile_size; ++i){ kernel.with_carry_in(t + i); - carry_in = binary_functor(carry_in, carry_out); + carry_in = binary_functor(carry_in, carry_out); } // Now share our carry out with everyone else in the meta scan // It's a bit awkward but the carry out of this thread is actually in @@ -302,12 +302,12 @@ public: T carry_in_meta = T(); TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, binary_functor, carry_in_meta, carry_out); - kernel.no_carry_in(0); - carry_in_meta = carry_out; + kernel.no_carry_in(0); + carry_in_meta = carry_out; for (vtkm::Id i = 1; i < meta_scan_count / vector_size; ++i){ kernel.with_carry_in(i); - carry_in_meta = binary_functor(carry_in_meta, carry_out); + carry_in_meta = binary_functor(carry_in_meta, carry_out); } // Add the tile offset to each element in the meta scan @@ -319,9 +319,9 @@ public: meta_scan_portal.Set(i * vector_size + j, binary_functor(tile_offset, value)); } } - tile_offset = binary_functor(tile_offset, carry_in_meta); + tile_offset = binary_functor(tile_offset, carry_in_meta); } - else { + else { tile_offset = carry_in_meta; tile_offset_valid = true; } -- GitLab From b21387ee1f14477729376bc239d8e5181ab20609 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 4 Aug 2015 15:53:54 -0600 Subject: [PATCH 20/26] Fix scheduling of scaninclusive, will now compute correct results --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 100 +++++++++++++----- 1 file changed, 73 insertions(+), 27 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index f71eb0a69..b28ce0a37 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -203,7 +203,7 @@ public: const vtkm::Id max_threads = omp_get_max_threads(); // Number of vectors that fit in 32Kb cache - const size_t l1_vectors = (32 * 1024 * 1024) / (vector_size * 4); + const vtkm::Id l1_vectors = (32 * 1024 * 1024) / (vector_size * 4); const vtkm::Id count512 = output.GetNumberOfValues() / vector_size + (output.GetNumberOfValues() % vector_size == 0 ? 
0 : 1); @@ -231,14 +231,26 @@ public: T carry_in = T(); T carry_out = T(); T tile_offset = T(); + T tile_offset_next = T(); bool tile_offset_valid = false; for (vtkm::Id i = 0; i < meta_scan_count; ++i){ meta_scan_portal.Set(i, T()); } + /* + std::cout << "Using " << num_threads << " threads and tile size of " + << tile_size << ", count512 = " << count512 + << ", meta_scan_count = " << meta_scan_count << std::endl; + */ + // Loop over each tile for (vtkm::Id t = 0; t < count512; t += tile_size){ + /* + std::cout << "tile " << t << " tile offset " << tile_offset + << " current scan:\n"; + std::cout << "global carry out = " << carry_out << std::endl; + */ // PASS1 (per tile) parallel // // A "tile" represents a subset of the total input. Tiles are @@ -263,16 +275,22 @@ public: { // On the first tile for each thread we don't have a valid carry in value TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); + // TODO: Splitting this into two loops breaks the continuity of a thread's tile breaking the + // assumptions we have about meta scan and how a thread's scan works. + // Will the branching the loop be an issue? If it is then we may want to manually + // schedule the loop + bool carry_in_valid = false; #pragma omp for schedule(static) - for (vtkm::Id i = 0; i < omp_get_num_threads(); ++i){ - kernel.no_carry_in(i); - } - carry_in = carry_out; - -#pragma omp for schedule(static) - for (vtkm::Id i = omp_get_num_threads(); i < tile_size; ++i){ - kernel.with_carry_in(t + i); - carry_in = binary_functor(carry_in, carry_out); + for (vtkm::Id i = 0; i < tile_size; ++i){ + if (!carry_in_valid){ + kernel.no_carry_in(t + i); + carry_in = carry_out; + carry_in_valid = true; + } + else { + kernel.with_carry_in(t + i); + carry_in = binary_functor(carry_in, carry_out); + } } // Now share our carry out with everyone else in the meta scan // It's a bit awkward but the carry out of this thread is actually in @@ -298,34 +316,43 @@ public: // value for this tile, which is also the global offset of the next // tile. 
{ - // perform the meta-scan + /* + std::cout << "pre-scan of meta scan\n"; + for (vtkm::Id i = 0; i < meta_scan_count; ++i){ + std::cout << "meta_scan[" << i << "] = " << meta_scan_portal.Get(i) << "\n"; + } + */ + T carry_in_meta = T(); TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, binary_functor, carry_in_meta, carry_out); kernel.no_carry_in(0); carry_in_meta = carry_out; - for (vtkm::Id i = 1; i < meta_scan_count / vector_size; ++i){ + for (vtkm::Id i = 1; i < meta_scan_count / vector_size + 1; ++i){ kernel.with_carry_in(i); carry_in_meta = binary_functor(carry_in_meta, carry_out); } // Add the tile offset to each element in the meta scan if (tile_offset_valid){ - for (vtkm::Id i = 0; i < meta_scan_count / vector_size; ++i){ -#pragma unroll - for (vtkm::Id j = 0; j < vector_size; ++j){ - T value = meta_scan_portal.Get(i * vector_size + j); - meta_scan_portal.Set(i * vector_size + j, binary_functor(tile_offset, value)); - } + for (vtkm::Id i = 1; i < meta_scan_count; ++i){ + T value = meta_scan_portal.Get(i); + meta_scan_portal.Set(i, binary_functor(tile_offset, value)); } - tile_offset = binary_functor(tile_offset, carry_in_meta); + tile_offset_next = binary_functor(tile_offset, carry_in_meta); } else { - tile_offset = carry_in_meta; + tile_offset_next = carry_in_meta; tile_offset_valid = true; } } + /* + std::cout << "post-scan of meta scan\n"; + for (vtkm::Id i = 0; i < meta_scan_count; ++i){ + std::cout << "meta_scan[" << i << "] = " << meta_scan_portal.Get(i) << "\n"; + } + */ // PASS2 (per tile) parallel // @@ -338,25 +365,43 @@ public: #pragma omp parallel { // TODO: Thread 0 has no thread offset. Could I instead do a parallel for + // CORRECTION: IT has no offset on the first tile! After that it has offset of `tile_offset` // on the meta scan count and then have each thread do its tile? That's // essentially what we're getting. Although that might thrash cache then because // we'd switch which thread was getting which elements const vtkm::Id thread = omp_get_thread_num(); - T thread_offset; - if (thread > 0){ - thread_offset = meta_scan_portal.Get(thread - 1); + T thread_offset = thread == 0 ? tile_offset : meta_scan_portal.Get(thread - 1); + + /* +#pragma omp critical + { + std::cout << "thread #" << thread << " offset = " << thread_offset << std::endl; } + */ #pragma omp for schedule(static) for (size_t i = 0; i < tile_size; ++i){ - if (thread > 0){ - const vtkm::Id offset = i * vector_size; + /* +#pragma omp critical + { + std::cout << "thread #" << thread << " got index " << i << "\n"; + } + */ + if (thread != 0 || t != 0){ + const vtkm::Id offset = (t + i) * vector_size; const vtkm::Id n_valid = offset + vector_size > portal.GetNumberOfValues() ? vector_size - (offset + vector_size - portal.GetNumberOfValues()) : vector_size; + /* +#pragma omp critical + { + std::cout << "thread #" << thread << " operating on indices [" << offset + << ".." 
<< offset + n_valid - 1 << "], n_valid = " << n_valid << "\n"; + } + */ for (vtkm::Id j = 0; j < n_valid; ++j){ - T value = portal.Get(t + offset + j); - portal.Set(t + offset + j, binary_functor(thread_offset, value)); + T value = portal.Get(offset + j); + portal.Set(offset + j, binary_functor(thread_offset, value)); } } } @@ -365,6 +410,7 @@ public: carry_out = binary_functor(carry_out, thread_offset); } } + tile_offset = tile_offset_next; // CLEANUP (per-tile) serial { -- GitLab From bccfeb91d25635fe1ec7222b584b15abc6fa7830 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 13 Aug 2015 17:35:29 -0600 Subject: [PATCH 21/26] Starting to work on a specialization of Sort It's just a parallel quicksort but might be some improvements we can make. --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 186 +++++++++--------- 1 file changed, 88 insertions(+), 98 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index b28ce0a37..f8d811857 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -38,6 +38,8 @@ #include #include +#include +#include #include #define SPECIALIZE_OPENMP @@ -437,89 +439,99 @@ public: // Sort private: template - struct BitonicSortMergeKernel : vtkm::exec::FunctorBase - { + struct QuickSortKernel { + typedef typename PortalType::ValueType ValueType; PortalType Portal; BinaryCompare Compare; - vtkm::Id GroupSize; - - VTKM_CONT_EXPORT - BitonicSortMergeKernel(const PortalType &portal, - const BinaryCompare &compare, - vtkm::Id groupSize) - : Portal(portal), Compare(compare), GroupSize(groupSize) { } - - VTKM_EXEC_EXPORT - void operator()(vtkm::Id index) const - { - typedef typename PortalType::ValueType ValueType; - vtkm::Id groupIndex = index%this->GroupSize; - vtkm::Id blockSize = 2*this->GroupSize; - vtkm::Id blockIndex = index/this->GroupSize; + QuickSortKernel(PortalType &portal, + const BinaryCompare &compare) + : Portal(portal), Compare(compare){} - vtkm::Id lowIndex = blockIndex * blockSize + groupIndex; - vtkm::Id highIndex = lowIndex + this->GroupSize; - - if (highIndex < this->Portal.GetNumberOfValues()) - { - ValueType lowValue = this->Portal.Get(lowIndex); - ValueType highValue = this->Portal.Get(highIndex); - if (this->Compare(highValue, lowValue)) + // Sort the range of indices passed, range is inclusive + void sort(const vtkm::Id low, const vtkm::Id high){ + if (low >= high){ + return; + } + if (high - low > GRAIN_SIZE){ + vtkm::Id p = PartitionArray(low, high); +#pragma omp task firstprivate(p, low, high) { - this->Portal.Set(highIndex, lowValue); - this->Portal.Set(lowIndex, highValue); + sort(low, p); } + sort(p + 1, high); + } + else { + vtkm::cont::ArrayPortalToIterators iterators(Portal); + internal::WrappedBinaryOperator wrappedCompare(Compare); + typedef typename vtkm::cont::ArrayPortalToIterators::IteratorType Iterator; + Iterator begin = iterators.GetBegin(); + std::advance(begin, low); + Iterator end = iterators.GetBegin(); + std::advance(end, high + 1); + std::sort(begin, end, wrappedCompare); } } - }; - template - struct BitonicSortCrossoverKernel : vtkm::exec::FunctorBase - { - PortalType Portal; - BinaryCompare Compare; - vtkm::Id GroupSize; - - VTKM_CONT_EXPORT - BitonicSortCrossoverKernel(const PortalType &portal, - const BinaryCompare &compare, - vtkm::Id groupSize) - : Portal(portal), Compare(compare), GroupSize(groupSize) { } - - VTKM_EXEC_EXPORT - void operator()(vtkm::Id 
index) const - { - typedef typename PortalType::ValueType ValueType; - - vtkm::Id groupIndex = index%this->GroupSize; - vtkm::Id blockSize = 2*this->GroupSize; - vtkm::Id blockIndex = index/this->GroupSize; - - vtkm::Id lowIndex = blockIndex*blockSize + groupIndex; - vtkm::Id highIndex = blockIndex*blockSize + (blockSize - groupIndex - 1); - - if (highIndex < this->Portal.GetNumberOfValues()) - { - ValueType lowValue = this->Portal.Get(lowIndex); - ValueType highValue = this->Portal.Get(highIndex); - if (this->Compare(highValue, lowValue)) - { - this->Portal.Set(highIndex, lowValue); - this->Portal.Set(lowIndex, highValue); + private: + vtkm::Id MedianOfThree(vtkm::Id l, vtkm::Id m, vtkm::Id h){ + if (Compare(Portal.Get(l), Portal.Get(m))){ + if (Compare(Portal.Get(m), Portal.Get(h))){ + return m; + } + else { + return Compare(Portal.Get(l), Portal.Get(h)) ? h : l; + } + } + else { + if (Compare(Portal.Get(h), Portal.Get(m))){ + return m; + } + else { + return Compare(Portal.Get(h), Portal.Get(l)) ? h : l; } } } - }; - - struct DefaultCompareFunctor - { - - template - VTKM_EXEC_EXPORT - bool operator()(const T& first, const T& second) const - { - return first < second; + vtkm::Id PartitionArray(vtkm::Id low, vtkm::Id high){ + vtkm::Id length = high - low; + vtkm::Id offset = length / 8; + vtkm::Id pivot = MedianOfThree( + MedianOfThree(low, low + offset, low + offset * 2), + MedianOfThree(low + offset * 3, low + offset * 4, low + offset * 5), + MedianOfThree(low + offset * 6, low + offset * 7, high - 1)); + vtkm::Id i = low - 1; + vtkm::Id j = high + 1; + if (pivot > low){ + ValueType t = Portal.Get(low); + Portal.Set(low, Portal.Get(pivot)); + Portal.Set(pivot, t); + // std::swap won't take a vtkm::Pair<> ?? + //std::swap(Portal.Get(low), Portal.Get(pivot)); + } + while (true){ + do { + --j; + } + while (Compare(Portal.Get(low), Portal.Get(j))); + do { + ++i; + } + while (Compare(Portal.Get(i), Portal.Get(low))); + if (i < j){ + ValueType t = Portal.Get(i); + Portal.Set(i, Portal.Get(j)); + Portal.Set(j, t); + //std::swap(Portal.Get(i), Portal.Get(j)); + } + else { + break; + } + } + ValueType t = Portal.Get(low); + Portal.Set(low, Portal.Get(j)); + Portal.Set(j, t); + //std::swap(Portal.Get(low), Portal.Get(j)); + return j; } }; @@ -533,39 +545,17 @@ public: typedef typename vtkm::cont::ArrayHandle ArrayType; typedef typename ArrayType::template ExecutionTypes ::Portal PortalType; + typedef QuickSortKernel QuickSortKernelType; vtkm::Id numValues = values.GetNumberOfValues(); if (numValues < 2) { return; } PortalType portal = values.PrepareForInPlace(DeviceAdapterTag()); - - vtkm::Id numThreads = 1; - while (numThreads < numValues) { numThreads *= 2; } - numThreads /= 2; - const vtkm::Id NUM_THREADS = numThreads; - - typedef BitonicSortMergeKernel MergeKernel; - typedef BitonicSortCrossoverKernel CrossoverKernel; - - // TODO: Probably more we can do to improve this here + QuickSortKernelType kernel = QuickSortKernelType(portal, binary_compare); #pragma omp parallel { - for (vtkm::Id crossoverSize = 1; crossoverSize < numValues; crossoverSize *= 2) - { - const CrossoverKernel cross = CrossoverKernel(portal,binary_compare,crossoverSize); -#pragma omp for schedule(guided, GRAIN_SIZE) - for (vtkm::Id i = 0; i < NUM_THREADS; ++i){ - cross(i); - } - for (vtkm::Id mergeSize = crossoverSize/2; mergeSize > 0; mergeSize /= 2) - { - const MergeKernel merge = MergeKernel(portal,binary_compare,mergeSize); -#pragma omp for schedule(guided, GRAIN_SIZE / 4) - for (vtkm::Id i = 0; i < NUM_THREADS; ++i){ - 
merge(i); - } - } - } +#pragma omp single + kernel.sort(0, portal.GetNumberOfValues() - 1); } } @@ -573,7 +563,7 @@ public: VTKM_CONT_EXPORT static void Sort( vtkm::cont::ArrayHandle &values) { - Sort(values, DefaultCompareFunctor()); + Sort(values, std::less()); } #endif -- GitLab From dbd72dbe9077502c2e25111dab3c73917ebb47cc Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 18 Aug 2015 09:53:46 -0600 Subject: [PATCH 22/26] Clean up some comments and testing code --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 265 +++++++----------- vtkm/cont/openmp/internal/TiledScanKernel.h | 110 -------- 2 files changed, 104 insertions(+), 271 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index f8d811857..adc8b2c87 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -54,7 +54,7 @@ struct DeviceAdapterAlgorithm : vtkm::cont::DeviceAdapterTagOpenMP> { private: - const static vtkm::Id GRAIN_SIZE = 4096; + const static vtkm::Id GRAIN_SIZE = 2048; typedef vtkm::cont::DeviceAdapterTagOpenMP DeviceAdapterTag; // This is basically the Serial ScheduleKernel @@ -181,6 +181,11 @@ public: return ScanInclusive(input, output, vtkm::internal::Add()); } + /* + * This ScanInclusive is a port of Jeff Inman's vectorized MIC scan, however + * we seem to perform quite a bit worse than Intel's TBB scan on MIC, so + * this may need to be revisted. On CPU this scan performs close to theirs + */ template VTKM_CONT_EXPORT static T ScanInclusive( const vtkm::cont::ArrayHandle &input, @@ -212,14 +217,13 @@ public: // number of 512-bit vectors in a "tile" const vtkm::Id tile_size = count512 > l1_vectors ? l1_vectors : count512; const vtkm::Id num_threads = max_threads > tile_size ? tile_size : max_threads; - const vtkm::Id round_off_mask = ~0x0f; // meta_scan[i] will get the carry-out for the sub-scan performed by - // thread[i]. NOTE: This *must* have a 32-bit value for every thread. + // thread[i]. NOTE: This *must* have a value for every thread. // Round-up size of meta_scan[] to the nearest multiple of VECTOR_SIZE, to allow // use of simd_scan(). const vtkm::Id meta_scan_count = num_threads % vector_size - ? 1 + ((num_threads + vector_size) & round_off_mask) : num_threads + 1; + ? 1 + ((num_threads + vector_size) & ~0x0f) : num_threads + 1; omp_set_num_threads(num_threads); @@ -240,19 +244,8 @@ public: meta_scan_portal.Set(i, T()); } - /* - std::cout << "Using " << num_threads << " threads and tile size of " - << tile_size << ", count512 = " << count512 - << ", meta_scan_count = " << meta_scan_count << std::endl; - */ - // Loop over each tile for (vtkm::Id t = 0; t < count512; t += tile_size){ - /* - std::cout << "tile " << t << " tile offset " << tile_offset - << " current scan:\n"; - std::cout << "global carry out = " << carry_out << std::endl; - */ // PASS1 (per tile) parallel // // A "tile" represents a subset of the total input. Tiles are @@ -266,24 +259,13 @@ public: // scans, where N is the number of threads. Each scan produces a // "carry-out" value into meta_scan, such that meta_scan[i] is the // carry-out value of thread[i]. - - // NOTE: In "static" schedule, each thread is assigned a chunk of - // contiguous iterations, of a size such that all threads have the same - // chunk-size (assuming n_iterations/n_threads has no remainder). 
- // Furthermore, subsequent static-parallel loops will assign iterations - // to threads in exactly the same way (assuming n_iterations is the - // same). #pragma omp parallel firstprivate(carry_in), private(carry_out) { - // On the first tile for each thread we don't have a valid carry in value TiledScanKernelType kernel = TiledScanKernelType(portal, binary_functor, carry_in, carry_out); - // TODO: Splitting this into two loops breaks the continuity of a thread's tile breaking the - // assumptions we have about meta scan and how a thread's scan works. - // Will the branching the loop be an issue? If it is then we may want to manually - // schedule the loop bool carry_in_valid = false; #pragma omp for schedule(static) for (vtkm::Id i = 0; i < tile_size; ++i){ + // On the first tile for each thread we don't have a valid carry in value if (!carry_in_valid){ kernel.no_carry_in(t + i); carry_in = carry_out; @@ -306,37 +288,29 @@ public: // Still within the given tile, we now run a scan on the values in // meta-scan. This produces offsets that will be added back into the // sub-scans within the tile. We also want to add the global offset - // for this tile to all the values in the sub-scans. This can be done - // with the simd_scan2(), which also supports PASS2 (below). + // for this tile to all the values in the sub-scans. // // After the meta-scan, values in the meta-scan provide fully // "relocated" offsets, for the corresponding threads within the - // current tile. The threads will add these offsets to their sub-scan + // current tile. The threads will add these offsets to their sub-scan // elements in PASS2. // // The meta-scan also produces a carry-out. This is the carry-out - // value for this tile, which is also the global offset of the next - // tile. + // value for this tile, which is also the global offset of the next tile { - /* - std::cout << "pre-scan of meta scan\n"; - for (vtkm::Id i = 0; i < meta_scan_count; ++i){ - std::cout << "meta_scan[" << i << "] = " << meta_scan_portal.Get(i) << "\n"; - } - */ - T carry_in_meta = T(); TiledScanKernelType kernel = TiledScanKernelType(meta_scan_portal, binary_functor, carry_in_meta, carry_out); kernel.no_carry_in(0); carry_in_meta = carry_out; + // On the first chunk of the scan we don't have a valid carry in for (vtkm::Id i = 1; i < meta_scan_count / vector_size + 1; ++i){ kernel.with_carry_in(i); carry_in_meta = binary_functor(carry_in_meta, carry_out); } - // Add the tile offset to each element in the meta scan + // Add the tile offset to each element in the meta scan if we have a valid one if (tile_offset_valid){ for (vtkm::Id i = 1; i < meta_scan_count; ++i){ T value = meta_scan_portal.Get(i); @@ -349,57 +323,28 @@ public: tile_offset_valid = true; } } - /* - std::cout << "post-scan of meta scan\n"; - for (vtkm::Id i = 0; i < meta_scan_count; ++i){ - std::cout << "meta_scan[" << i << "] = " << meta_scan_portal.Get(i) << "\n"; - } - */ // PASS2 (per tile) parallel // // Now each thread just adds the values from the corresponding // meta-scan, to all the elements within its chunk, within the current - // tile. (See NOTE in PASS1, re repeatability of OMP static schedule.) + // tile. + // // The resulting elements are "fully relocated", such that, when we are // finished with a tile, we don't need to revisit it again. Thus, we // can now forget about cache for this tile. #pragma omp parallel { - // TODO: Thread 0 has no thread offset. Could I instead do a parallel for - // CORRECTION: IT has no offset on the first tile! 
After that it has offset of `tile_offset` - // on the meta scan count and then have each thread do its tile? That's - // essentially what we're getting. Although that might thrash cache then because - // we'd switch which thread was getting which elements const vtkm::Id thread = omp_get_thread_num(); + // On the first tile thread 0 has no valid offset to apply T thread_offset = thread == 0 ? tile_offset : meta_scan_portal.Get(thread - 1); - /* -#pragma omp critical - { - std::cout << "thread #" << thread << " offset = " << thread_offset << std::endl; - } - */ - #pragma omp for schedule(static) for (size_t i = 0; i < tile_size; ++i){ - /* -#pragma omp critical - { - std::cout << "thread #" << thread << " got index " << i << "\n"; - } - */ if (thread != 0 || t != 0){ const vtkm::Id offset = (t + i) * vector_size; const vtkm::Id n_valid = offset + vector_size > portal.GetNumberOfValues() ? vector_size - (offset + vector_size - portal.GetNumberOfValues()) : vector_size; - /* -#pragma omp critical - { - std::cout << "thread #" << thread << " operating on indices [" << offset - << ".." << offset + n_valid - 1 << "], n_valid = " << n_valid << "\n"; - } - */ for (vtkm::Id j = 0; j < n_valid; ++j){ T value = portal.Get(offset + j); @@ -427,7 +372,6 @@ public: carry_in = T(); } } - // Reset the max # of threads omp_set_num_threads(max_threads); // The offset that would be used for the next tile is the result of the scan on the array @@ -439,101 +383,100 @@ public: // Sort private: template - struct QuickSortKernel { - typedef typename PortalType::ValueType ValueType; - PortalType Portal; - BinaryCompare Compare; - - QuickSortKernel(PortalType &portal, - const BinaryCompare &compare) - : Portal(portal), Compare(compare){} - - // Sort the range of indices passed, range is inclusive - void sort(const vtkm::Id low, const vtkm::Id high){ - if (low >= high){ - return; + static vtkm::Id MedianOfThree(const PortalType *portal, BinaryCompare compare, + const vtkm::Id l, const vtkm::Id m, const vtkm::Id h) + { + if (compare(portal->Get(l), portal->Get(m))){ + if (compare(portal->Get(m), portal->Get(h))){ + return m; } - if (high - low > GRAIN_SIZE){ - vtkm::Id p = PartitionArray(low, high); -#pragma omp task firstprivate(p, low, high) - { - sort(low, p); - } - sort(p + 1, high); + else { + return compare(portal->Get(l), portal->Get(h)) ? h : l; + } + } + else { + if (compare(portal->Get(h), portal->Get(m))){ + return m; } else { - vtkm::cont::ArrayPortalToIterators iterators(Portal); - internal::WrappedBinaryOperator wrappedCompare(Compare); - typedef typename vtkm::cont::ArrayPortalToIterators::IteratorType Iterator; - Iterator begin = iterators.GetBegin(); - std::advance(begin, low); - Iterator end = iterators.GetBegin(); - std::advance(end, high + 1); - std::sort(begin, end, wrappedCompare); + return compare(portal->Get(h), portal->Get(l)) ? h : l; } } + } - private: - vtkm::Id MedianOfThree(vtkm::Id l, vtkm::Id m, vtkm::Id h){ - if (Compare(Portal.Get(l), Portal.Get(m))){ - if (Compare(Portal.Get(m), Portal.Get(h))){ - return m; - } - else { - return Compare(Portal.Get(l), Portal.Get(h)) ? 
h : l; - } + template + static vtkm::Id PartitionArray(PortalType *portal, BinaryCompare compare, + const vtkm::Id low, const vtkm::Id high) + { + typedef typename PortalType::ValueType ValueType; + const vtkm::Id length = high - low; + const vtkm::Id offset = length / 8; + const vtkm::Id pivot = MedianOfThree(portal, compare, + MedianOfThree(portal, compare, low, low + offset, low + offset * 2), + MedianOfThree(portal, compare, low + offset * 3, low + offset * 4, low + offset * 5), + MedianOfThree(portal, compare, low + offset * 6, low + offset * 7, high)); + vtkm::Id i = low - 1; + vtkm::Id j = high + 1; + if (pivot > low){ + ValueType t = portal->Get(low); + portal->Set(low, portal->Get(pivot)); + portal->Set(pivot, t); + // std::swap won't take a vtkm::Pair<> ?? + //std::swap(portal->Get(low), portal->Get(pivot)); + } + while (true){ + do { + --j; + } + while (compare(portal->Get(low), portal->Get(j))); + do { + ++i; + } + while (compare(portal->Get(i), portal->Get(low))); + if (i < j){ + ValueType t = portal->Get(i); + portal->Set(i, portal->Get(j)); + portal->Set(j, t); + //std::swap(portal->Get(i), portal->Get(j)); } else { - if (Compare(Portal.Get(h), Portal.Get(m))){ - return m; - } - else { - return Compare(Portal.Get(h), Portal.Get(l)) ? h : l; - } + break; } } - vtkm::Id PartitionArray(vtkm::Id low, vtkm::Id high){ - vtkm::Id length = high - low; - vtkm::Id offset = length / 8; - vtkm::Id pivot = MedianOfThree( - MedianOfThree(low, low + offset, low + offset * 2), - MedianOfThree(low + offset * 3, low + offset * 4, low + offset * 5), - MedianOfThree(low + offset * 6, low + offset * 7, high - 1)); - vtkm::Id i = low - 1; - vtkm::Id j = high + 1; - if (pivot > low){ - ValueType t = Portal.Get(low); - Portal.Set(low, Portal.Get(pivot)); - Portal.Set(pivot, t); - // std::swap won't take a vtkm::Pair<> ?? 
- //std::swap(Portal.Get(low), Portal.Get(pivot)); - } - while (true){ - do { - --j; - } - while (Compare(Portal.Get(low), Portal.Get(j))); - do { - ++i; - } - while (Compare(Portal.Get(i), Portal.Get(low))); - if (i < j){ - ValueType t = Portal.Get(i); - Portal.Set(i, Portal.Get(j)); - Portal.Set(j, t); - //std::swap(Portal.Get(i), Portal.Get(j)); - } - else { - break; - } + ValueType t = portal->Get(low); + portal->Set(low, portal->Get(j)); + portal->Set(j, t); + //std::swap(portal->Get(low), portal->Get(j)); + return j; + } + + // Sort the range of indices passed, range is inclusive + template + static void QuicksortParallel(PortalType *portal, BinaryCompare compare, + const vtkm::Id low, const vtkm::Id high) + { + if (low >= high || low < 0 || high < 0){ + return; + } + if (high - low > GRAIN_SIZE){ + const vtkm::Id p = PartitionArray(portal, compare, low, high); +#pragma omp task firstprivate(p, low, high, compare) + { + QuicksortParallel(portal, compare, low, p); } - ValueType t = Portal.Get(low); - Portal.Set(low, Portal.Get(j)); - Portal.Set(j, t); - //std::swap(Portal.Get(low), Portal.Get(j)); - return j; + QuicksortParallel(portal, compare, p + 1, high); } - }; + else { + vtkm::cont::ArrayPortalToIterators iterators(*portal); + internal::WrappedBinaryOperator wrappedCompare(compare); + typedef typename vtkm::cont::ArrayPortalToIterators::IteratorType Iterator; + Iterator begin = iterators.GetBegin(); + std::advance(begin, low); + Iterator end = iterators.GetBegin(); + std::advance(end, high + 1); + std::sort(begin, end, wrappedCompare); + } + } public: #ifdef SPECIALIZE_OPENMP @@ -545,17 +488,17 @@ public: typedef typename vtkm::cont::ArrayHandle ArrayType; typedef typename ArrayType::template ExecutionTypes ::Portal PortalType; - typedef QuickSortKernel QuickSortKernelType; vtkm::Id numValues = values.GetNumberOfValues(); if (numValues < 2) { return; } - PortalType portal = values.PrepareForInPlace(DeviceAdapterTag()); - QuickSortKernelType kernel = QuickSortKernelType(portal, binary_compare); -#pragma omp parallel + +#pragma omp parallel shared(values, portal) { #pragma omp single - kernel.sort(0, portal.GetNumberOfValues() - 1); + { + QuicksortParallel(&portal, binary_compare, 0, portal.GetNumberOfValues() - 1); + } } } diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index f1c951b56..15ba37093 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -3,8 +3,6 @@ #include -//#define USE_JEFF_SCAN - namespace vtkm { namespace cont { namespace internal { @@ -157,114 +155,6 @@ private: vector[14] = BinaryOperator(vector[13], vector[14]); } }; - -#ifdef USE_JEFF_SCAN -#ifdef __MIC__ -#include - -// Specialized hand tuned kernel used by Jeff Inman's scan -template<> -VTKM_EXEC_EXPORT void TiledInclusiveScanKernel:: - template ExecutionTypes::Portal, - vtkm::internal::Add, 16>::operator()(vtkm::Id index) const { - const static vtkm::Id VECTOR_SIZE = 16; - - register __m512i a; - register __m512i tmp; - - // initialize permutation mask - // indices increase from right-to-left in the vector. - // Q: Which end is the effing "high-order" end? - // A: low-order value (RHS) in the register gets read/stored first. - // Therefore, the RHS element (you know, the one with index 0) is really the effing "high-order" end. - // therefore, we need to think of the scan as right-to-left, within the vector. 
- // NOTE: we have values here that are unused in upward-pass, but used in downward-pass - static const __m512i perm_mask4 = _mm512_set_epi32( 7, 0, 0, 0, 0, 0, 0, 0,15, 0, 0, 0, 0, 0, 0, 0 ); - static const __m512i perm_mask3 = _mm512_set_epi32( 11, 0, 0, 0,15, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 ); - static const __m512i perm_mask2 = _mm512_set_epi32( 13, 0,15, 0, 9, 0,11, 0, 5, 0, 7, 0, 1, 0, 3, 0 ); - static const __m512i perm_mask1 = _mm512_set_epi32( 14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 ); - - /// print_vec("", perm_mask1); - - // would rather build this efficiently from scratch, using arithmetic operators provided via intrinsics. - // hard to see how, though. There's no NOT. Comparisons set bits in a mask, rather than a vector. etc. - static const __m512i fifteen = _mm512_set_epi32( 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 ); - - static const __m512i identity = _mm512_set_epi32( 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ); - - // Perform a scan within the tile - const vtkm::Id offset = index * VECTOR_SIZE; - __declspec(align(64)) vtkm::Int32 vector[VECTOR_SIZE]; - // Load up the elements for this 'vector' - for (vtkm::Id i = 0; i < 16; ++i){ - vector[i] = Portal.Get(i + offset); - } - - // load 16 values - a = _mm512_load_epi32(vector); - - // (1) upward-pass - tmp = _mm512_permutevar_epi32(perm_mask1, a); - a = _mm512_mask_add_epi32(a, 0xaaaa, a, tmp); // was 0x5555 - - tmp = _mm512_permutevar_epi32(perm_mask2, a); - a = _mm512_mask_add_epi32(a, 0x8888, a, tmp); // was 0x1111 - - tmp = _mm512_permutevar_epi32(perm_mask3, a); - a = _mm512_mask_add_epi32(a, 0x8080, a, tmp); // was 0x0101 - - tmp = _mm512_permutevar_epi32(perm_mask4, a); - a = _mm512_mask_add_epi32(a, 0x8000, a, tmp); // was 0x0001 - - tmp = _mm512_permutevar_epi32(fifteen, a); // all positions of get a[15] ? 
- _mm512_store_epi32(vector, tmp); - CarryOut = vector[15]; - - // zero-out high-order element - // a = _mm512_mask_xor_epi32(a, 0x8000, a, a); // high-order value (LHS) becomes 0 - a = _mm512_mask_slli_epi32(a, 0x8000, a, 32); // high-order value (LHS) becomes 0 - - // (2) downward pass - tmp = _mm512_permutevar_epi32(perm_mask4, a); - a = _mm512_mask_add_epi32(a, 0x8000, a, tmp); // was 0x0001 - a = _mm512_mask_xor_epi32(a, 0x0080, a, a); - a = _mm512_mask_or_epi32 (a, 0x0080, a, tmp); - - tmp = _mm512_permutevar_epi32(perm_mask3, a); - a = _mm512_mask_add_epi32(a, 0x8080, a, tmp); // was 0x0101 - a = _mm512_mask_xor_epi32(a, 0x0808, a, a); - a = _mm512_mask_or_epi32 (a, 0x0808, a, tmp); - - tmp = _mm512_permutevar_epi32(perm_mask2, a); - a = _mm512_mask_add_epi32(a, 0x8888, a, tmp); // was 0x1111 - a = _mm512_mask_xor_epi32(a, 0x2222, a, a); - a = _mm512_mask_or_epi32 (a, 0x2222, a, tmp); - - tmp = _mm512_permutevar_epi32(perm_mask1, a); - a = _mm512_mask_add_epi32(a, 0xaaaa, a, tmp); // was 0x5555 - a = _mm512_mask_xor_epi32(a, 0x5555, a, a); - a = _mm512_mask_or_epi32 (a, 0x5555, a, tmp); - -#pragma unroll - for (vtkm::Id i = 0; i < 16; ++i){ - vector[i] = CarryIn; - } - tmp = _mm512_load_epi32(vector); - a = _mm512_add_epi32(a, tmp); - - // store 16 values of completed scan - _mm512_store_epi32(vector, a); - - // Write our result back into the array portal - for (vtkm::Id i = 0; i < VECTOR_SIZE; ++i){ - ValueType value = vector[i]; - Portal.Set(i + offset, value); - } -} -#endif -#endif - } } } -- GitLab From 882ba457a16b6cd378b0ef1c3a1e214e826f12cd Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 18 Aug 2015 09:59:59 -0600 Subject: [PATCH 23/26] Remove the specialize OpenMP toggle define --- vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index adc8b2c87..86afee036 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -42,8 +42,6 @@ #include #include -#define SPECIALIZE_OPENMP - namespace vtkm { namespace cont { @@ -171,7 +169,6 @@ private: } }; -#ifdef SPECIALIZE_OPENMP public: template VTKM_CONT_EXPORT static T ScanInclusive( @@ -377,7 +374,6 @@ public: // The offset that would be used for the next tile is the result of the scan on the array return output.GetPortalConstControl().Get(numValues - 1); } -#endif //-------------------------------------------------------------------------- // Sort @@ -479,7 +475,6 @@ private: } public: -#ifdef SPECIALIZE_OPENMP template VTKM_CONT_EXPORT static void Sort( vtkm::cont::ArrayHandle &values, @@ -508,7 +503,6 @@ public: { Sort(values, std::less()); } -#endif VTKM_CONT_EXPORT static void Synchronize() { -- GitLab From 90e65e8ea0bf65b5234f1d9807c6d9924e7ced4c Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 18 Aug 2015 10:27:53 -0600 Subject: [PATCH 24/26] Fix bad tabbing --- .../internal/DeviceAdapterAlgorithmOpenMP.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 86afee036..6262187ae 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -55,22 +55,22 @@ private: const static vtkm::Id GRAIN_SIZE = 2048; typedef 
vtkm::cont::DeviceAdapterTagOpenMP DeviceAdapterTag; - // This is basically the Serial ScheduleKernel - template - class ScheduleKernel - { - public: - ScheduleKernel(const FunctorType &functor) - : Functor(functor) {} - - VTKM_EXEC_EXPORT void operator()(vtkm::Id index) const - { - this->Functor(index); - } - - private: - const FunctorType Functor; - }; + // This is basically the Serial ScheduleKernel + template + class ScheduleKernel + { + public: + ScheduleKernel(const FunctorType &functor) + : Functor(functor) {} + + VTKM_EXEC_EXPORT void operator()(vtkm::Id index) const + { + this->Functor(index); + } + + private: + const FunctorType Functor; + }; public: template -- GitLab From 70e2dfa4c49b783a0d8b4a1b703a3153341bf5a6 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 18 Aug 2015 10:48:42 -0600 Subject: [PATCH 25/26] Fix copyright statement --- vtkm/cont/openmp/internal/TiledScanKernel.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/vtkm/cont/openmp/internal/TiledScanKernel.h b/vtkm/cont/openmp/internal/TiledScanKernel.h index 15ba37093..c738a586a 100644 --- a/vtkm/cont/openmp/internal/TiledScanKernel.h +++ b/vtkm/cont/openmp/internal/TiledScanKernel.h @@ -1,3 +1,22 @@ +//============================================================================ +// Copyright (c) Kitware, Inc. +// All rights reserved. +// See LICENSE.txt for details. +// This software is distributed WITHOUT ANY WARRANTY; without even +// the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +// PURPOSE. See the above copyright notice for more information. +// +// Copyright 2014 Sandia Corporation. +// Copyright 2014 UT-Battelle, LLC. +// Copyright 2014 Los Alamos National Security. +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Under the terms of Contract DE-AC52-06NA25396 with Los Alamos National +// Laboratory (LANL), the U.S. Government retains certain rights in +// this software. +//============================================================================ #ifndef vtk_m_cont_openmp_internal_TiledScanKernel_h #define vtk_m_cont_openmp_internal_TiledScanKernel_h -- GitLab From 45369d2126d26c48861dc5897bac4bec4e5abb2a Mon Sep 17 00:00:00 2001 From: Will Usher Date: Tue, 18 Aug 2015 11:07:16 -0600 Subject: [PATCH 26/26] Clean up some unneeded comments and code --- vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h index 6262187ae..5fc3d5a6c 100644 --- a/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h +++ b/vtkm/cont/openmp/internal/DeviceAdapterAlgorithmOpenMP.h @@ -87,11 +87,8 @@ public: const ScheduleKernel kernel(functor); - const vtkm::Id size = numInstances; - // Should we have a VectorSize inner loop with ivdep and then scheudle - // size / VectorSize runs? Then have a remaining overflow loop? #pragma omp parallel for schedule(guided, GRAIN_SIZE) - for(vtkm::Id i=0; i < size; ++i) + for(vtkm::Id i=0; i < numInstances; ++i) { kernel(i); } -- GitLab
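The task-based Sort specialization introduced in patches 21 and 22 above
boils down to a standard OpenMP parallel/single/task quicksort with serial
std::sort leaves. A minimal stand-alone sketch of that pattern (iterator
based rather than portal based, with std::partition and a plain
median-of-three pivot standing in for the portal swaps and the
median-of-medians pivot; parallel_quicksort and sort_values are
hypothetical names, not VTK-m API):

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <iterator>
    #include <vector>

    template <typename It, typename Compare>
    void parallel_quicksort(It first, It last, Compare comp, std::ptrdiff_t grain)
    {
      if (last - first <= grain)
      {
        std::sort(first, last, comp);        // small ranges: serial leaf sort
        return;
      }
      using T = typename std::iterator_traits<It>::value_type;
      It mid = first + (last - first) / 2;
      const T pivot = std::max(std::min(*first, *mid, comp),
                               std::min(std::max(*first, *mid, comp), *(last - 1), comp),
                               comp);        // median of first/middle/last
      // Three-way split: [first,s1) < pivot, [s1,s2) == pivot, [s2,last) > pivot
      It s1 = std::partition(first, last, [&](const T &v) { return comp(v, pivot); });
      It s2 = std::partition(s1, last, [&](const T &v) { return !comp(pivot, v); });
    #pragma omp task                         // locals are firstprivate by default
      parallel_quicksort(first, s1, comp, grain);
      parallel_quicksort(s2, last, comp, grain);
    #pragma omp taskwait                     // collect the spawned half
    }

    void sort_values(std::vector<float> &values)
    {
    #pragma omp parallel
      {
    #pragma omp single                       // one thread seeds the task tree
        parallel_quicksort(values.begin(), values.end(), std::less<float>(), 2048);
      }
    }

The taskwait is only there so the helper is safe to call on its own; the
patch omits it and instead relies on the implicit barrier at the end of the
parallel region, which guarantees all outstanding tasks have completed.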